In [1]:
# Third-party imports are explicit: `np` and `pd` are used throughout the
# notebook and must not depend on whatever the star import below re-exports.
import numpy as np
import pandas as pd

from src.auto_statstest import StatsTest
# NOTE(review): the star import presumably supplies pearsonr_test, chisquare_test,
# chisquare_posthoc_test and print_significant_categories used further down;
# consider importing those names explicitly.
from src.statstest import *

# Seed NumPy's legacy global random state so repeated runs start from the same state.
np.random.seed(14)

In [2]:
# import datasets (both CSV paths are relative to the notebook's working directory)

df = pd.read_csv("synthetic_data.csv") # synthetic dataset to showcase various cases below
df2 = pd.read_csv("case_control.csv") # NOTE(review): df2 is not used by any cell shown here - confirm it is still needed

df.info() # lists the column names and dtypes that the example cases refer to

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   normal1                         10000 non-null  float64
 1   normal2                         10000 non-null  float64
 2   normal3                         10000 non-null  float64
 3   non-normal                      10000 non-null  float64
 4   categorical_2_levels1           10000 non-null  object 
 5   dependent_categorical_3_levels  10000 non-null  object 
 6   categorical_2_levels2           10000 non-null  object 
 7   categorical_multi_levels        10000 non-null  object 
 8   diff_mean_normal_samples        10000 non-null  float64
 9   categorical_fishers             10000 non-null  object 
 10  categorical_nofishers           10000 non-null  object 
 11  boolean                         10000 non-null  bool   
dtypes: bool(1), float64(5), object(6)

# AUTOSTATSTEST

*1. Create class instance - StatsTest*

parameter 1 = data (in pandas dataframe/numpy array)

parameter 2 = level of significance (optional)

In [3]:
test = StatsTest(df) # default level of significance = 0.05
test2 = StatsTest(df, alpha = 0.2) # custom level of significance = 0.2

*2. Call class method - autostatstest*

parameter 1 = dependent variable 

parameter 2 = independent variable 

*3. View & retrieve stored results (if needed)*

**EXAMPLE CASES**

In [4]:
# case 1 - column does not exist in dataset
# The error is the point of this example, so it is caught and displayed rather
# than left uncaught: an uncaught exception stops "Restart & Run All" here and
# none of the later cases would execute.

try:
    result1 = test.autostatstest("normal1", "normal")
except ValueError as err:
    result1 = None  # keep the name defined even though the call failed
    print(f"{type(err).__name__}: {err}")

ValueError: Input column 'normal' is not in the dataset. Please try again.

In [None]:
# case 2 - column has inappropriate datatype ("boolean" is the dataset's only bool column)
# NOTE(review): no output is recorded for this cell; presumably it raises like case 1 -
# confirm, and if so catch and display the error so that Run All can continue

result2 = test.autostatstest("normal1", "boolean")

In [None]:
# case 3 - continuous normal vs continuous normal (pearson's correlation)

result3 = test.autostatstest("normal1", "normal2")

*Extracting results without post-hoc tests:*

In [None]:
result3

In [None]:
result3.pvalue

In [None]:
# case 4 - continuous normal vs continuous non-normal (spearman's correlation)

result4 = test.autostatstest("normal1", "non-normal")

In [None]:
# case 5 - categorical 2 levels vs categorical 2 levels (chi-square test) (results insignificant)

result5 = test.autostatstest("categorical_2_levels1", "categorical_2_levels2")

In [None]:
# case 6 - categorical 2 levels vs categorical > 2 levels (chi-square test) (results insignificant)

result6 = test.autostatstest("categorical_2_levels1", "categorical_multi_levels")

In [None]:
# case 7 - categorical 2 levels vs categorical > 2 levels (chi-square test) (results significant) 

result7 = test.autostatstest("categorical_2_levels1", "dependent_categorical_3_levels")

*Extracting results with post-hoc tests:*

In [None]:
result7

In [None]:
result7["chi2"].pvalue

In [None]:
result7["post-hoc"][1].corrected_pvalue

In [None]:
# function to print all significant categories from post-hoc test results

print_significant_categories(result7)

In [None]:
# case 8 - categorical 2 levels vs categorical 2 levels (fisher's exact test) [chi-square assumption fails and contingency table is 2x2]

result8 = test.autostatstest("categorical_2_levels2", "categorical_fishers", verbose = False)

In [None]:
result8

In [None]:
# case 9 - categorical 2 levels vs categorical > 2 levels (chi-square test) [chi-square assumption fails but contingency table is not 2x2]

result9 = test.autostatstest("categorical_2_levels1", "categorical_nofishers")

In [None]:
# case 10 - continuous normal vs categorical 2 levels (student's independent t-test)

result10 = test.autostatstest("normal1", "categorical_2_levels1")

In [None]:
# case 11 - continuous non-normal vs categorical 2 levels (mann-whitney u test)

result11 = test.autostatstest("non-normal", "categorical_2_levels1")

In [None]:
# case 12 - continuous normal vs categorical > 2 levels (one-way anova) (results insignificant)

result12 = test.autostatstest("normal1", "categorical_multi_levels")

In [None]:
# case 13 - continuous normal vs categorical > 2 levels (one-way anova) (results significant)
# Uses test2 (alpha = 0.2): the autostatstest_all output for "normal1" reports
# p of about 0.159 for this pair, which is not significant at the default 0.05.

result13 = test2.autostatstest("normal1", "dependent_categorical_3_levels", verbose=False)

In [None]:
result13

In [None]:
print_significant_categories(result13)

In [None]:
# case 14 - continuous non-normal vs categorical > 2 levels (kruskal-wallis test) (results insignificant)

result14 = test.autostatstest("non-normal", "categorical_multi_levels")

In [None]:
# case 15 - continuous non-normal vs categorical > 2 levels (kruskal-wallis test) (results significant)
# NOTE(review): despite its name, "diff_mean_normal_samples" appears to be handled as
# non-normal - the autostatstest_all output pairs it with a Spearman result, as it does
# for "non-normal", rather than a Pearson result. Confirm.

result15 = test.autostatstest("diff_mean_normal_samples", "categorical_multi_levels")

# AUTOSTATSTEST_ALL

**This class method takes one column as input (dependent variable). Autostatstest is then conducted with all other columns in the dataset, assuming datatypes are appropriate. Results of statistical tests are stored in a dictionary. All errors raised are printed and stored as results as well.**

*1. Create class instance - StatsTest*

parameter 1 = data (in pandas dataframe/numpy array)

parameter 2 = level of significance (optional)

*2. Call class method - autostatstest_all*

parameter 1 = dependent variable

*3. View & retrieve stored results in the dictionary (if needed)*

**EXAMPLE CASES**

In [None]:
result_all_1 = test.autostatstest_all("normal1", verbose = False)

In [None]:
result_all_1

{'normal2': DatkPearsonrResult(statistic=0.006323105498256652, pvalue=0.527231483649771, significant=False),
 'normal3': DatkPearsonrResult(statistic=0.007523414893723428, pvalue=0.45189586823596, significant=False),
 'non-normal': DatkSpearmanrResult(statistic=0.013384566625845667, pvalue=0.18078260126597606, significant=False),
 'categorical_2_levels1': DatkTTestResult(statistic=0.6964982784951681, pvalue=0.4861330268721993, significant=False),
 'dependent_categorical_3_levels': DatkOneWayAnovaResult(statistic=1.8362901753835394, pvalue=0.15946146883768222, significant=False),
 'categorical_2_levels2': DatkTTestResult(statistic=-2.2604870701425077, pvalue=0.023812450776105082, significant=True),
 'categorical_multi_levels': DatkOneWayAnovaResult(statistic=2.3005161004505177, pvalue=0.07517058079934379, significant=False),
 'diff_mean_normal_samples': DatkSpearmanrResult(statistic=-0.00358831799988318, pvalue=0.7197536168000374, significant=False),
 'categorical_fishers': DatkTTestRes

In [None]:
# Print every column whose test against "normal1" has a p-value below 0.05.
# Iterating a dict directly yields only its keys, so .items() is needed to get
# (column, result) pairs; unpacking a bare key string into two names raises
# ValueError. String entries are skipped (presumably the stored error messages,
# which have no .pvalue - confirm).
for column, outcome in result_all_1.items():
    if not isinstance(outcome, str) and outcome.pvalue < 0.05:
        print(column)

In [None]:
result_all_2 = test.autostatstest_all("categorical_2_levels1", verbose = False)

# list of cols = None


Assumption of Chi-square Test of Independence is violated (Each cell in contingency table should be at least 5).


In [None]:
result_all_2

# INDIVIDUAL STATISTICAL TESTS

In [None]:
df.info()

In [None]:
result_pearsons = pearsonr_test(var1 = df["normal1"], var2 = df["normal2"], alpha = 0.05)

In [None]:
# Cross-tabulate the two categorical columns into a table of observed counts
# (rows: categorical_2_levels1, columns: dependent_categorical_3_levels).
observed = pd.crosstab(df["categorical_2_levels1"], df["dependent_categorical_3_levels"])

# Run chisquare_test on that table with alpha = 0.05.
result_chi2 = chisquare_test(observed = observed, alpha = 0.05)

In [None]:
result_chi2_posthoc = chisquare_posthoc_test(df = observed, alpha = 0.05)