In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import math

## One Sample t Test

#### Test whether college students get 7.2 hours of sleep on average, based on a sample of students (alpha = 0.05)

In [2]:
# Load the student survey sample; its 'Sleep' column is tested in the cells below.
df = pd.read_csv('students.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,ID,Gender,Classification,Height,Shoe Size,Phone Time,# of Shoes,Birth order,Pets,Happy,...,Exercise,Stat Pre,Stat Post,Phone Type,Sleep,Social Media,Impact of SocNetworking,Political,Animal,Superhero
0,1,male,senior,67.75,7.0,12.0,12.0,youngest,5.0,0.8,...,360,3.0,,iPhone,7.0,180.0,worse,Democrat,Dog person,Batman
1,2,male,freshman,71.0,7.5,1.5,5.0,middle,4.0,0.75,...,200,9.0,,Android smartphone,7.0,20.0,better,Democrat,Dog person,Batman
2,3,female,freshman,64.0,6.0,25.0,15.0,oldest,8.0,0.9,...,30,7.0,5.0,Android smartphone,8.0,60.0,better,Republican,Dog person,Batman
3,4,female,freshman,63.0,6.5,30.0,30.0,middle,12.0,0.98,...,180,6.0,7.0,iPhone,6.0,60.0,better,Republican,Both,Superman
4,5,male,senior,69.0,6.5,23.0,8.0,oldest,4.0,0.75,...,180,4.0,7.0,iPhone,5.5,60.0,worse,Independent,Dog person,Superman


In [4]:
# H0: mu = 7.2   (mean hours of sleep equals 7.2)
# Ha: mu != 7.2  (mean hours of sleep differs from 7.2)
# This is a two-tailed test.

In [5]:
# One-sample t test of the 'Sleep' column against the hypothesized mean of 7.2.
sleep_hours = df['Sleep']
hypothesized_mean = 7.2
onesample = stats.ttest_1samp(sleep_hours, hypothesized_mean)

In [6]:
# Show the t statistic of the one-sample test.
onesample.statistic

-1.92552134000487

In [7]:
# Two-sided p-value; it is higher than alpha (0.05), so we fail to reject H0.
onesample.pvalue

0.05795525591903326

In [8]:
# Compare the two-sided p-value with the significance level and report the decision.
alpha = 0.05
p_value = onesample.pvalue
reject_h0 = p_value < alpha
verdict = (
    'At {} level of significance, we can reject the null hypothesis in the favor of Ha.'
    if reject_h0
    else 'At {} level of significance, we fail to reject the null hypothesis.'
)
print(verdict.format(alpha))

At 0.05 level of significance, we fail to reject the null hypothesis.


#### The principal of the school thinks that the average hours of sleep is less than 7.2, alpha = 0.05

In [9]:
# H0: mu = 7.2  (boundary value of mu >= 7.2)
# Ha: mu < 7.2
# This is a one-tailed (left-tailed) test.

In [10]:
# Recompute the two-sided one-sample t test; the one-tailed p-value is derived from it below.
onesample = stats.ttest_1samp(a=df['Sleep'], popmean=7.2)

In [11]:
# One-sided (left-tailed) p-value for Ha: mu < 7.2.
# Halving the two-sided p-value is only valid when the t statistic falls on the
# side of the alternative (t < 0); otherwise the left-tail area is 1 - p/2.
# Reject H0 because this p-value is lower than alpha (significance level).
onesample.pvalue / 2 if onesample.statistic < 0 else 1 - onesample.pvalue / 2

0.02897762795951663

In [12]:
# Decision for the left-tailed test (Ha: mu < 7.2).
alpha = 0.05
two_sided_p = onesample.pvalue
# Half of the two-sided p-value is the left-tail area only when t < 0; for a
# non-negative t the left-tail area is 1 - p/2, so blindly halving would
# wrongly favor Ha.
p_value = two_sided_p / 2 if onesample.statistic < 0 else 1 - two_sided_p / 2
if p_value < alpha:
    print('At {} level of significance, we can reject the null hypothesis in the favor of Ha.'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.05 level of significance, we can reject the null hypothesis in the favor of Ha.


## Independent Samples T Test (variances unknown and equal)

In [13]:
# H0 : mu1 = mu2   (the two catalysts have the same mean)
# Ha : mu1 != mu2  (the means differ; two-tailed test)

In [14]:
# Load the catalyst data for the two-sample test.
# Note: this rebinds `df`, so the students data is no longer reachable under that name.
df = pd.read_csv('catalysts.csv')

In [15]:
# Display the catalyst table (one row per observation, one column per catalyst).
df

Unnamed: 0,Observation Number,Catalyst1,Catalyst2
0,1,91.5,89.19
1,2,94.18,90.95
2,3,92.18,90.46
3,4,95.39,93.21
4,5,91.79,97.19
5,6,89.07,97.04
6,7,94.72,91.07
7,8,89.21,92.75


In [16]:
# Sample mean and sample standard deviation of each catalyst column.
catalyst1 = df['Catalyst1']
catalyst2 = df['Catalyst2']

xbar1, s1 = catalyst1.mean(), catalyst1.std()
xbar2, s2 = catalyst2.mean(), catalyst2.std()

In [17]:
# Sample mean of Catalyst1.
xbar1

92.255

In [18]:
# Sample mean of Catalyst2.
xbar2

92.73249999999999

In [19]:
# Sample standard deviation of Catalyst1.
s1

2.385018718105646

In [20]:
# Sample standard deviation of Catalyst2.
s2

2.9834531766298715

In [21]:
# Pooled standard deviation: square root of the variance average weighted by (n - 1).
n1 = len(df['Catalyst1'])
n2 = len(df['Catalyst2'])
weighted_var_sum = (n1 - 1) * (s1 ** 2) + (n2 - 1) * (s2 ** 2)
pooled_dof = n1 - 1 + n2 - 1
s_pooled = math.sqrt(weighted_var_sum / pooled_dof)
print('spooled = {:.3f}'.format(s_pooled))

spooled = 2.701


In [22]:
# Pooled-variance t statistic: difference in means divided by its standard error.
n1, n2 = len(df['Catalyst1']), len(df['Catalyst2'])
mean_diff = xbar1 - xbar2
std_error = s_pooled * math.sqrt(1 / n1 + 1 / n2)
t_statistic = mean_diff / std_error
print('t_statistic = {:.3f}'.format(t_statistic))

t_statistic = -0.354


In [23]:
# degrees_of_freedom = n1 + n2 - 2 = 8 + 8 - 2 = 14 for the pooled-variance t test

In [24]:
# Two-sided p-value of the pooled-variance t test.
# Use the upper-tail area of |t| so the result is correct for either sign of
# the statistic (2 * cdf(t) would exceed 1 for a positive t), and derive the
# degrees of freedom (n1 + n2 - 2) from the data instead of hard-coding 14.
dof = len(df['Catalyst1']) + len(df['Catalyst2']) - 2
p_value = 2 * stats.t.sf(abs(t_statistic), dof)

In [25]:
# Two-sided p-value computed by hand from the t distribution.
p_value

0.7289136186068217

In [26]:
# Report the test decision for the catalyst comparison at the chosen significance level.
alpha = 0.05

outcomes = {
    True: 'At {} level of significance, we can reject the null hypothesis in the favor of Ha.',
    False: 'At {} level of significance, we fail to reject the null hypothesis.',
}
print(outcomes[bool(p_value < alpha)].format(alpha))

At 0.05 level of significance, we fail to reject the null hypothesis.


### scipy.stats.ttest for 2 groups

In [27]:
# Same pooled-variance (equal_var=True) independent t test, computed by scipy.
catalyst_samples = (df['Catalyst1'], df['Catalyst2'])
ind_test_w_2gr = stats.ttest_ind(*catalyst_samples, equal_var=True)

In [28]:
# t statistic from scipy; compare with the hand-computed t_statistic.
ind_test_w_2gr.statistic

-0.3535908643461798

In [29]:
# Two-sided p-value from scipy; compare with the hand-computed p_value.
ind_test_w_2gr.pvalue

0.7289136186068217

### rp.ttest for 2 groups

In [30]:
import researchpy as rp

In [31]:
# Same comparison with researchpy; the result is a pair of tables
# (per-group summary statistics, then the test results).
rp.ttest(df['Catalyst1'], df['Catalyst2'])

(    Variable     N      Mean        SD        SE  95% Conf.   Interval
 0  Catalyst1   8.0  92.25500  2.385019  0.843231  90.261074  94.248926
 1  Catalyst2   8.0  92.73250  2.983453  1.054810  90.238271  95.226729
 2   combined  16.0  92.49375  2.620905  0.655226  91.097168  93.890332,
                       Independent t-test  results
 0  Difference (Catalyst1 - Catalyst2) =   -0.4775
 1                  Degrees of freedom =   14.0000
 2                                   t =   -0.3536
 3               Two side test p value =    0.7289
 4              Difference < 0 p value =    0.3645
 5              Difference > 0 p value =    0.6355
 6                           Cohen's d =   -0.1768
 7                           Hedge's g =   -0.1672
 8                       Glass's delta =   -0.2002
 9                                   r =    0.0941)

## Arsenic concentration in public drinking water supplies is a potential health risk. An article in the Arizona Republic (May 27, 2001) reported drinking water arsenic concentrations in parts per billion (ppb) for 10 metropolitan Phoenix communities and 10 communities in rural Arizona. The data are in the CSV file `arsenic.csv`.

In [32]:
# Load the arsenic concentration data.
# Note: this rebinds `df`, replacing the catalyst data under that name.
df = pd.read_csv('arsenic.csv')

In [34]:
# Display the table: x1 holds the Metro Phoenix readings, x2 the Rural Arizona readings.
df

Unnamed: 0,Metro Phoenix,x1,Rural Arizona,x2
0,Phoenix,3,Rimrock,48
1,Chandler,7,Goodyear,44
2,Gilbert,25,New River,40
3,Glendale,10,Apache Junction,38
4,Mesa,15,Buckeye,33
5,Paradise Valley,6,Nogales,21
6,Peoria,12,Black Canyon City,20
7,Scottsdale,25,Sedona,12
8,Tempe,15,Payson,1
9,Sun City,7,Casa Grande,18


In [36]:
# Independent samples t test, assuming the population variances are unknown but equal; small samples (10 communities each)

In [37]:
# Pooled-variance independent t test: Metro Phoenix (x1) versus Rural Arizona (x2).
metro_ppb = df['x1']
rural_ppb = df['x2']
ind_test_w_2gr = stats.ttest_ind(metro_ppb, rural_ppb, equal_var=True)

In [38]:
# t statistic of the arsenic comparison (x1 minus x2).
ind_test_w_2gr.statistic

-2.7669395785560553

In [39]:
# Keep the two-sided p-value for the decision cell that follows.
p_value = ind_test_w_2gr.pvalue

In [40]:
# Report the test decision for the arsenic comparison at the chosen significance level.
alpha = 0.05

if not (p_value < alpha):
    message = 'At {} level of significance, we fail to reject the null hypothesis.'
else:
    message = 'At {} level of significance, we can reject the null hypothesis in the favor of Ha.'
print(message.format(alpha))

At 0.05 level of significance, we can reject the null hypothesis in the favor of Ha.


At the 0.05 significance level, the mean drinking water arsenic concentrations (ppb) differ between rural Arizona and metropolitan Phoenix.

## Paired Sample Test

In [41]:
# Load the before/after mood scores for the paired test.
# Note: this rebinds `df`, replacing the arsenic data under that name.
df = pd.read_csv('prozac.csv')

In [43]:
# Display the paired observations (moodpre, moodpost and their difference).
df

Unnamed: 0,moodpre,moodpost,difference
0,3,5,2
1,0,1,1
2,6,5,-1
3,7,7,0
4,4,10,6
5,3,9,6
6,2,7,5
7,1,11,10
8,4,8,4


In [46]:
# Paired-samples t test on the before/after mood scores (moodpre minus moodpost).
mood_before, mood_after = df['moodpre'], df['moodpost']
paired_test = stats.ttest_rel(mood_before, mood_after)

In [48]:
# Two-sided p-value of the paired test.
paired_test.pvalue

0.013745824394788489

In [49]:
# One-sided p-value for Ha: mean(moodpre - moodpost) < 0.
# Halving the two-sided p-value is only valid when the t statistic is on the
# side of the alternative (t < 0); otherwise the left-tail area is 1 - p/2.
paired_test.pvalue / 2 if paired_test.statistic < 0 else 1 - paired_test.pvalue / 2

0.006872912197394244

In [52]:
# Same paired test with researchpy (paired = True); the result is a pair of
# tables (per-variable summary statistics, then the test results).
rp.ttest(df['moodpre'], df['moodpost'], paired = True)

(   Variable    N      Mean        SD        SE  95% Conf.  Interval
 0   moodpre  9.0  3.333333  2.236068  0.745356   1.614539  5.052127
 1  moodpost  9.0  7.000000  3.041381  1.013794   4.662187  9.337813
 2      diff  9.0 -3.666667  3.500000  1.166667  -6.357005 -0.976329,
                 Paired samples t-test  results
 0  Difference (moodpre - moodpost) =   -3.6667
 1               Degrees of freedom =    8.0000
 2                                t =   -3.1429
 3            Two side test p value =    0.0137
 4           Difference < 0 p value =    0.0069
 5           Difference > 0 p value =    0.9931
 6                        Cohen's d =   -1.0476
 7                        Hedge's g =   -0.9977
 8                    Glass's delta =   -1.6398
 9                                r =    0.7433)