In [None]:
from pingouin import ttest, bayesfactor_binom

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from io import BytesIO

In [83]:
# Excel workbook with the mock data; the path is relative to the working directory.
FILE_DIR = 'test/mock_data.xlsx'

df = pd.read_excel(FILE_DIR)

# Header names as a plain list so later cells can pick columns by position.
COLUMNS = df.columns.values.tolist()

# List every column as "<position> - <name>", one per line.
for position, name in enumerate(COLUMNS):
    print(f'{position} - {name}')

0 - Treatment       (1-Exp; 2-SOC)
1 - Age (Yrs)
2 - Gender          (1-M; 2-F)
3 - Race             (1-W; 2-B; 3-AI/AN; 4-A)
4 - Weight (kg)
5 - Subject no
6 - HgbA1c (%) Wk0
7 - HgbA1c (%) Wk12
8 - HgbA1c (%) Wk20
9 - HgbA1c (%) Wk40


In [84]:
def get_bayes_interp(val):
    """Return the verbal evidence category for a Bayes-factor value.

    ``val`` is matched against a ladder of half-open bands ``[lower, upper)``
    that starts at 0. Bands below 1 are worded as evidence that the two
    treatment groups are statistically different, bands from 1 upwards as
    evidence that they are statistically similar (``analysis`` passes BF01
    values here).

    A value that lies in no band -- negative, NaN, or >= 10000000 -- gets the
    wording of the last band.
    """
    # (exclusive upper bound, wording); each band begins where the previous one ends.
    bands = (
        (0.01, 'extreme evidence that the two treatment groups are statistically different'),
        (0.03, 'very strong evidence that the two treatment groups are statistically different'),
        (0.1, 'strong evidence that the two treatment groups are statistically different'),
        (0.3, 'moderate evidence that the two treatment groups are statistically different'),
        (1, 'anecdotal evidence that the two treatment groups are statistically different'),
        (3, 'anecdotal evidence that the two treatment groups are statistically similar'),
        (10, 'moderate evidence that the two treatment groups are statistically similar'),
        (30, 'strong evidence that the two treatment groups are statistically similar'),
        (100, 'very strong evidence that the two treatment groups are statistically similar'),
        (10000000, 'extreme evidence that the two treatment groups are statistically similar'),
    )
    fallback = bands[-1][1]

    # The ladder starts at 0, so negatives (and NaN, which fails every
    # comparison) match no band at all.
    if not val >= 0:
        return fallback

    for upper, wording in bands:
        if val < upper:
            return wording
    return fallback

def interpolate_color(val):
    """Return the hex colour assigned to the band that contains ``val``.

    The bands are half-open intervals ``[lower, upper)`` starting at 0, with
    the same cut points as ``get_bayes_interp``. A value in no band (negative,
    NaN, or >= 10000000) gets the colour of the last band.
    """
    # Exclusive upper bound of each band and, position by position, its colour.
    upper_bounds = (0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 10000000)
    colours = (
        '#006400', '#338000', '#669900', '#99B300', '#CCCC00',
        '#FFCC00', '#FF9900', '#FF6600', '#993300', '#800000',
    )

    # Only non-negative values can fall inside a band; NaN fails this check too.
    if val >= 0:
        for bound, colour in zip(upper_bounds, colours):
            if val < bound:
                return colour
    return colours[-1]

In [85]:
class _UnsupportedInput(ValueError):
    """Raised when the requested columns fit none of the layouts ``analysis`` handles."""


def _bf01(bf10):
    """Invert a BF10 value into BF01, rounded to two decimals (``inf`` when BF10 is 0)."""
    bf10 = float(bf10)
    return round(1 / bf10, 2) if bf10 != 0 else float('inf')


def _iqr(values):
    """Error-bar callable for seaborn: the lower and upper quartile of ``values``."""
    return values.quantile(0.25), values.quantile(0.75)


def _run_test(df, columns):
    """Pick the test matching ``columns`` and run it; return ``(stats, cols, variant)``.

    Exactly one layout is used, checked in this order:

    1. every column has dtype ``object``: a binomial Bayes factor per crosstab
       cell (``variant == 'allcat'``, ``stats`` is a dict of BF10 values);
    2. the columns are exactly ``EXP`` and ``SOC``: t-test between them;
    3. two columns, one of which has exactly two distinct non-null values:
       t-test of the other column split by that class column.

    Raises ``_UnsupportedInput`` when none of the layouts applies.
    """
    col_types = [df[col].dtype.name for col in columns]

    if len(set(col_types)) == 1 and col_types[0] == 'object':
        # The column with the fewest distinct values acts as the class column.
        by_cardinality = sorted(
            [(col, df[col].dropna().nunique()) for col in columns], key=lambda x: x[1]
        )
        class_col = by_cardinality[0][0]
        data_cols = [name for name, _ in by_cardinality[1:]]

        cx_tab = pd.crosstab([df[col] for col in data_cols], df[class_col])

        bf_dict = {}
        for cls_val in sorted(df[class_col].dropna().unique()):
            for idx, row in cx_tab.iterrows():
                # With several data columns the row index is a tuple of values;
                # str() keeps the join safe if those values are not strings.
                label = ', '.join(map(str, idx)) if isinstance(idx, tuple) else idx
                bf_dict[f'{cls_val} vs {label}'] = bayesfactor_binom(
                    int(row[cls_val]), int(row.sum())
                )
        cols = {'style': 'class/data', 'class': class_col, 'data': ', '.join(data_cols)}
        return bf_dict, cols, 'allcat'

    if set(columns) == {'EXP', 'SOC'}:
        cols = {'style': 'EXP/SOC', 'class': None, 'data': None}
        return ttest(df['EXP'], df['SOC']), cols, 'normal'

    if len(columns) != 2:
        raise _UnsupportedInput('Can only process 2 columns.')

    class_col = next((col for col in columns if df[col].dropna().nunique() == 2), None)
    data_col = next((col for col in columns if col != class_col), None)
    if class_col is None or data_col is None:
        raise _UnsupportedInput('ERROR! Data not in proper format.')

    class_a, class_b = df[class_col].dropna().unique()
    result = ttest(
        df.loc[df[class_col] == class_a, data_col],
        df.loc[df[class_col] == class_b, data_col],
    )
    return result, {'style': 'class/data', 'class': class_col, 'data': data_col}, 'normal'


def _summarise(stats, cols, variant):
    """Build the result dictionary (BF01, colour and wording) from the raw test output."""
    if variant == 'allcat':
        bf01_dict = {key: _bf01(value) for key, value in stats.items()}
        return {
            'variant': 'allcat',
            'bf_dict': stats,
            'keys': list(stats.keys()),
            'cols': cols,
            'colour': {key: interpolate_color(value) for key, value in bf01_dict.items()},
            'interp': {
                key: get_bayes_interp(value).capitalize() for key, value in bf01_dict.items()
            },
            'bf01_dict': bf01_dict,
        }

    # The t-test result is a one-row frame; flatten that row into a dict.
    res_dict = stats.iloc[0].to_dict()
    res_dict['variant'] = 'normal'
    res_dict['cols'] = cols
    res_dict['BF01'] = _bf01(res_dict['BF10'])
    res_dict['BF10_rounded'] = round(float(res_dict['BF10']), 2)
    res_dict['colour'] = interpolate_color(res_dict['BF01'])
    res_dict['interp'] = get_bayes_interp(res_dict['BF01']).capitalize()
    return res_dict


def _plot_groups(df, cols):
    """Return ``([(label, values), (label, values)], x_label)`` for the two plotted groups."""
    style = cols['style']
    if style == 'EXP/SOC':
        # NOTE(review): the x label for this layout is a choice made here; the
        # previous code produced empty frames for it, so there is no precedent.
        return [('EXP', df['EXP']), ('SOC', df['SOC'])], 'EXP/SOC'

    class_col, data_col = cols['class'], cols['data']
    if style == 'class/data' and class_col is not None and data_col is not None:
        labels = df[class_col].dropna().unique()
        if len(labels) != 2:
            raise ValueError(
                f'expected exactly 2 classes in {class_col!r}, found {len(labels)}'
            )
        groups = [(label, df.loc[df[class_col] == label, data_col]) for label in labels]
        return groups, data_col

    raise ValueError('ERROR! Data not in proper format.')


def _save_figure(draw, title):
    """Draw one 12x9 figure via ``draw(ax)``, save it to a rewound in-memory buffer."""
    buffer = BytesIO()
    fig, ax = plt.subplots(figsize=(12, 9))
    try:
        draw(ax)
        ax.set_title(title)
        fig.savefig(buffer)
    finally:
        # Close even when drawing fails so figures do not pile up in the kernel.
        plt.close(fig)
    buffer.seek(0)
    return buffer


def analysis(df, columns):
    """Run the statistical comparison for ``columns`` of ``df`` and render its plots.

    The function works in three stages:

    1. choose and run the test that fits the columns (see ``_run_test``);
    2. derive BF01, a colour and a verbal interpretation from the result;
    3. render four figures (point, bar, box, box + strip) into in-memory
       buffers.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    columns : list
        Names of the columns to analyse.

    Returns
    -------
    True when all four figures were rendered. None when plotting failed; the
    error is printed in that case.

    Raises
    ------
    ValueError
        When the columns fit none of the supported layouts.
    RuntimeError
        When the statistical test itself fails; the original error is chained.
    """
    try:
        stats, cols, variant = _run_test(df, columns)
    except _UnsupportedInput:
        raise
    except Exception as e:
        raise RuntimeError(f'Encountered error: {e}') from e

    # Not returned yet, but building it validates the test output (e.g. BF10).
    res_dict = _summarise(stats, cols, variant)

    try:
        groups, x_label = _plot_groups(df, res_dict['cols'])

        # Long format: one row per observation, tagged with its group.
        df_long = pd.concat(
            [
                pd.DataFrame({'class': label, 'time': x_label, 'value': values.to_numpy()})
                for label, values in groups
            ],
            ignore_index=True,
        )
        shared = {'data': df_long, 'x': 'time', 'y': 'value', 'hue': 'class', 'palette': 'bright'}

        def draw_line(ax):
            sns.pointplot(
                errorbar=_iqr, capsize=0.1, markers=['x', 's'], linestyles=['-', '--'],
                dodge=True, ax=ax, **shared
            )

        def draw_bar(ax):
            sns.barplot(errorbar=_iqr, capsize=0.1, ax=ax, **shared)

        def draw_box(ax):
            sns.boxplot(fill=True, gap=0.1, ax=ax, **shared)

        def draw_box_scatter(ax):
            sns.stripplot(dodge=True, jitter=0.25, legend=False, ax=ax, **shared)
            sns.boxplot(fill=True, gap=0.1, saturation=0.5, ax=ax, **shared)

        figures = [
            _save_figure(draw_line, 'Line + IQR over time'),
            _save_figure(draw_bar, 'Bar + IQR over time'),
            _save_figure(draw_box, 'Box CI95 over time'),
            _save_figure(draw_box_scatter, 'Box + Scatter over time'),
        ]
    except Exception as e:
        print(f'Error! {e}')
        print('ERROR! STYLE MISMATCH.')
        return None

    # PLOT OUTPUT: ``figures`` holds the four image buffers; they are not
    # displayed or returned yet.
    return True


In [86]:
# Analyse the columns listed at positions 1 and 2 of COLUMNS.
selected_columns = [COLUMNS[1], COLUMNS[2]]
result = analysis(df, selected_columns)
print(result)

True
