In [3]:
# Hypothesis testing is a core data analysis activity behind experimentation; its goal is to determine whether two
# different conditions in an experiment have resulted in different impacts

# Library import
import numpy as np
import pandas as pd

from scipy import stats

In [4]:
# When we do hypothesis testing, we have two statements of interest: the first is our actual explanation, which we call
# the alternative hypothesis, and the second is that the explanation we have is not sufficient, and we call this the null
# hypothesis. The testing method is to determine whether the data let us reject the null hypothesis, so if we find there
# is a significant difference between groups, then we can reject the null hypothesis and accept our alternative

### Example

# Read the grades CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='datasets/grades.csv')
df.head(n=5)

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000


In [5]:
# Report the dimensions of the dataframe; df.shape is a (rows, columns) tuple,
# so it can be unpacked straight into the two placeholders
print("There are {} rows and {} columns".format(*df.shape))

There are 2315 rows and 13 columns


In [7]:
# Early finishers: rows whose first assignment was submitted before 2016 (i.e. by the end of December 2015).
# The submission column is parsed into datetimes before being compared against the cut-off.
early_finishers = df.loc[pd.to_datetime(df['assignment1_submission']) < '2016']
early_finishers.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000
5,D09000A0-827B-C0FF-3433-BF8FF286E15B,71.647278,2015-12-28 04:35:32.836000000,64.05255,2016-01-03 21:05:38.392000000,64.75255,2016-01-07 08:55:43.692000000,57.467295,2016-01-11 00:45:28.706000000,57.467295,2016-01-11 00:54:13.579000000,57.467295,2016-01-20 19:54:46.166000000
8,C9D51293-BD58-F113-4167-A7C0BAFCB6E5,66.595568,2015-12-25 02:29:28.415000000,52.916454,2015-12-31 01:42:30.046000000,48.344809,2016-01-05 23:34:02.180000000,47.444809,2016-01-02 07:48:42.517000000,37.955847,2016-01-03 21:27:04.266000000,37.955847,2016-01-19 15:24:31.060000000


In [9]:
# Late finishers: rows whose first assignment was submitted in 2016 or later
late_finishers = df.loc[pd.to_datetime(df['assignment1_submission']) >= '2016']
late_finishers.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
6,3217BE3F-E4B0-C3B6-9F64-462456819CE4,87.498744,2016-03-05 11:05:25.408000000,69.998995,2016-03-09 07:29:52.405000000,55.999196,2016-03-16 22:31:24.316000000,50.399276,2016-03-18 07:19:26.032000000,45.359349,2016-03-19 10:35:41.869000000,45.359349,2016-03-23 14:02:00.987000000
7,F1CB5AA1-B3DE-5460-FAFF-BE951FD38B5F,80.57609,2016-01-24 18:24:25.619000000,72.518481,2016-01-27 13:37:12.943000000,65.266633,2016-01-30 14:34:36.581000000,65.266633,2016-02-03 22:08:49.002000000,65.266633,2016-02-16 14:22:23.664000000,65.266633,2016-02-18 08:35:04.796000000
9,E2C617C2-4654-622C-AB50-1550C4BE42A0,59.270882,2016-03-06 12:06:26.185000000,59.270882,2016-03-13 02:07:25.289000000,53.343794,2016-03-17 07:30:09.241000000,53.343794,2016-03-20 21:45:56.229000000,42.675035,2016-03-27 15:55:04.414000000,38.407532,2016-03-30 20:33:13.554000000


In [None]:
# Another solution: keep every row whose index label is not among the early finishers.
# Index.isin() returns a NumPy boolean array, which must be inverted with `~`;
# unary minus (`-`) is not supported on boolean arrays and raises a TypeError.
late_finishers=df[~df.index.isin(early_finishers.index)]
late_finishers.head()

In [10]:
# Compare the average first-assignment grade of the two groups
print('Early finishers mean {}'.format(early_finishers.loc[:, 'assignment1_grade'].mean()))
print('Late finishers mean {}'.format(late_finishers.loc[:, 'assignment1_grade'].mean()))

Early finishers mean 74.94728457024303
Late finishers mean 74.0450648477065


In [11]:
# We run Student's t-test: "These are the same" (null hypothesis) vs "These are different" (alternative hypothesis)
# When doing hypothesis testing, we have to choose a significance level as a threshold for how much of a chance we're
# willing to accept. This significance level is called alpha, and is usually fixed at 0.05 (a 5% probability)

# We use the ttest_ind() function, which does an independent t-test (meaning the populations are not related to each other)
# The results of ttest_ind() are the t-statistic and a p-value. The latter is the probability of observing a difference
# at least as extreme as the one we measured if the null hypothesis were true (it is NOT the probability that the null
# hypothesis itself is true)

from scipy.stats import ttest_ind

# Independent two-sample t-test on the first-assignment grades of both groups
ttest_ind(
    a=early_finishers['assignment1_grade'],
    b=late_finishers['assignment1_grade'],
)

Ttest_indResult(statistic=1.322354085372139, pvalue=0.1861810110171455)

In [None]:
# The p-value is about 0.19 (19%), which is greater than our alpha value of 0.05, so we can't reject the null hypothesis.
# We now compare both populations for the other assignments

In [12]:
# Run the same independent t-test for assignments 2 through 6, one result per line
for grade_column in (
    'assignment2_grade',
    'assignment3_grade',
    'assignment4_grade',
    'assignment5_grade',
    'assignment6_grade',
):
    print(ttest_ind(early_finishers[grade_column], late_finishers[grade_column]))

Ttest_indResult(statistic=1.2514717608216366, pvalue=0.2108889627004424)
Ttest_indResult(statistic=1.6133726558705392, pvalue=0.10679998102227865)
Ttest_indResult(statistic=0.049671157386456125, pvalue=0.960388729789337)
Ttest_indResult(statistic=-0.05279315545404755, pvalue=0.9579012739746492)
Ttest_indResult(statistic=-0.11609743352612056, pvalue=0.9075854011989656)


In [14]:
# P-values are increasingly being replaced by confidence intervals and Bayesian analyses.
# To a certain extent, this is because p-values can come out as statistically significant just by chance

# We simulate this with a 100 x 100 data frame of uniform random numbers drawn from [0, 1).
# No seed is set, so the values differ on every run.
df1 = pd.DataFrame(np.random.random((100, 100)))
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.581311,0.743677,0.574313,0.830782,0.64071,0.038266,0.390913,0.241194,0.970414,0.868899,...,0.911083,0.224093,0.411807,0.081407,0.188717,0.423747,0.443961,0.830691,0.006412,0.501419
1,0.895667,0.092221,0.388887,0.471189,0.094813,0.336801,0.987982,0.721411,0.196705,0.709043,...,0.281798,0.049231,0.532779,0.542215,0.581133,0.340511,0.121178,0.678905,0.36552,0.771978
2,0.06138,0.465369,0.877793,0.751639,0.191279,0.735716,0.351122,0.146411,0.04248,0.980769,...,0.684948,0.652942,0.802162,0.203423,0.750375,0.184508,0.184437,0.03405,0.846986,0.950061
3,0.280769,0.908559,0.923416,0.1855,0.153278,0.062157,0.605002,0.108329,0.049231,0.384284,...,0.80415,0.341486,0.513475,0.377585,0.142065,0.549184,0.578526,0.543612,0.099152,0.046715
4,0.47541,0.083634,0.3305,0.589319,0.423962,0.232198,0.372335,0.398312,0.248361,0.21658,...,0.338052,0.227636,0.491099,0.076029,0.473023,0.15467,0.015773,0.023164,0.001579,0.663168


In [15]:
# A second 100 x 100 frame drawn the same way, to be compared column by column with the first
df2 = pd.DataFrame(np.random.random((100, 100)))

In [17]:
# We create a function for testing the similarity, with a value for alpha of 0.1
def test_columns(alpha=0.1):
    # We keep track of how many differ
    num_diff=0
    for col in df1.columns:
        # We run the ttest over the two dataframes
        teststat,pval=ttest_ind(df1[col], df2[col])
        # We check the pvalue vs the alpha
        if pval<= alpha:
            print("Col {} is statistically significantly different at alpha={}, pval={}".format(col, alpha, pval))
            num_diff=num_diff+1
    # We print some summary stats
    print("Total number different was {}, which is {}%".format(num_diff, float(num_diff)/len(df1.columns)*100))

# We run the test at alpha=0.1 (the function's default significance level)
test_columns(alpha=0.1)
            

Col 4 is statistically significantly different at alpha=0.1, pval=0.0827959692286876
Col 31 is statistically significantly different at alpha=0.1, pval=0.044042416341466134
Col 51 is statistically significantly different at alpha=0.1, pval=0.007140344318679064
Col 55 is statistically significantly different at alpha=0.1, pval=0.057978837974823776
Col 58 is statistically significantly different at alpha=0.1, pval=0.02588626092257888
Col 59 is statistically significantly different at alpha=0.1, pval=0.07183284825081902
Col 62 is statistically significantly different at alpha=0.1, pval=0.06675653329457414
Col 64 is statistically significantly different at alpha=0.1, pval=0.06517318319290906
Col 79 is statistically significantly different at alpha=0.1, pval=0.03745329774985086
Col 80 is statistically significantly different at alpha=0.1, pval=0.013482391658496368
Col 83 is statistically significantly different at alpha=0.1, pval=0.00718066534687339
Col 90 is statistically significantly dif

In [18]:
# As can be observed, since our alpha value is 0.1 and both data frames were drawn from the same random distribution,
# roughly 10% of the 100 column comparisons are flagged as different purely by chance

# The same effect shows up at other significance levels, e.g. alpha=0.05
test_columns(alpha=0.05)

Col 31 is statistically significantly different at alpha=0.05, pval=0.044042416341466134
Col 51 is statistically significantly different at alpha=0.05, pval=0.007140344318679064
Col 58 is statistically significantly different at alpha=0.05, pval=0.02588626092257888
Col 79 is statistically significantly different at alpha=0.05, pval=0.03745329774985086
Col 80 is statistically significantly different at alpha=0.05, pval=0.013482391658496368
Col 83 is statistically significantly different at alpha=0.05, pval=0.00718066534687339
Col 90 is statistically significantly different at alpha=0.05, pval=0.03457473502372782
Total number different was 7, which is 7.000000000000001%


In [20]:
# We repeat the experiment after overwriting df2 with a 100 x 100 frame of chi-squared draws
# (1 degree of freedom). df1 is left as it was, so test_columns() now compares the existing
# df1 columns against chi-squared columns.
df2 = pd.DataFrame(np.random.chisquare(df=1, size=(100, 100)))
test_columns(alpha=0.1)

Col 0 is statistically significantly different at alpha=0.1, pval=0.03951201661024284
Col 1 is statistically significantly different at alpha=0.1, pval=0.004389353195341004
Col 3 is statistically significantly different at alpha=0.1, pval=0.0011724395290818206
Col 4 is statistically significantly different at alpha=0.1, pval=2.4282066806883463e-05
Col 5 is statistically significantly different at alpha=0.1, pval=0.022138722403428672
Col 6 is statistically significantly different at alpha=0.1, pval=0.015078032479012701
Col 7 is statistically significantly different at alpha=0.1, pval=8.581330107553512e-05
Col 8 is statistically significantly different at alpha=0.1, pval=4.1075977093934194e-05
Col 9 is statistically significantly different at alpha=0.1, pval=5.231271870680809e-05
Col 10 is statistically significantly different at alpha=0.1, pval=3.670651850951761e-06
Col 11 is statistically significantly different at alpha=0.1, pval=0.00016922736321794304
Col 12 is statistically signific