In [1]:
import numpy as np
import scipy.stats as scs
import matplotlib.pyplot as plt
%matplotlib inline

**Note:** 
- The tests give p-value (be careful of scientific notation), we can compare it with our alpha of choice to decide whether to accept or reject the null hypothesis.
- This test underestimates differences in variance, only cares about the mean value.
- For 3 or more samples, use ANOVA

# 1-Sample Mean
Testing if the mean of a sample is not different from a hypothetical value

In [13]:
# Generate a sample
x = np.random.random_sample(size=100)

# Test if the mean of the sample is 0.5
print x.mean()
scs.ttest_1samp(x, 0.5)

0.511280394017


Ttest_1sampResult(statistic=0.38397564200844991, pvalue=0.70181995123630081)

# 1-Sample Proportion
This applies when we have a Bernoulli distribution

In [48]:
# Generate a Bernoulli distribution
x = [item for sublist in [[0]*80 + [1]*28] for item in sublist]

# Testing if the proportion is 0.25
print str(scs.ttest_1samp(x, 0.25))
print 'p-value large, accept null'


# Another test
x = [item for sublist in [[0]*70 + [1]*40] for item in sublist]
print str(scs.ttest_1samp(x, 0.25))
print 'p-value small reject null'

Ttest_1sampResult(statistic=0.21855858972563169, pvalue=0.82741023156164006)
p-value large, accept null
Ttest_1sampResult(statistic=2.466290592309488, pvalue=0.015210081667458115)
p-value small reject null


# 2-Sample Means
Compare to see if the means of 2 samples are not different, i.e., whether 2 samples are from the same population

In [49]:
# Generate 2 samples with similar means
x1 = np.random.random_sample(size=500)
x2 = np.random.random_sample(size=500)
print x1.mean(), x2.mean()
print scs.ttest_ind(x1, x2)
print 'p-value large, accept null'
print '---------------'

# Generate 2 samples with different means
y1 = np.random.random_sample(size=100)*2
y2 = np.random.random_sample(size=100)
print y1.mean(), y2.mean()
print scs.ttest_ind(y1, y2)
print 'p-value small, reject null'

0.512352844858 0.48942419107
Ttest_indResult(statistic=1.2553096703645392, pvalue=0.20966020908622976)
p-value large, accept null
---------------
1.09867951049 0.5169690375
Ttest_indResult(statistic=9.1442879064494669, pvalue=7.3318414569629565e-17)
p-value small, reject null


# 2-Sample Proportions

In [56]:
# Generate Bernoulli distributions with similar proportions
z1 = [item for sublist in [[0]*30 + [1]*70] for item in sublist]
z2 = [item for sublist in [[0]*25 + [1]*75] for item in sublist]
print scs.ttest_ind(z1, z2)
print 'p-value large, accept null'
print '---------------------'

# Generate Bernoulli distributions with different proportions
z1 = [item for sublist in [[0]*30 + [1]*70] for item in sublist]
z2 = [item for sublist in [[0]*20 + [1]*90] for item in sublist]
print scs.ttest_ind(z1, z2)
print 'p-value large, accept null'

Ttest_indResult(statistic=-0.78907636476703791, pvalue=0.43101120573602236)
p-value large, accept null
---------------------
Ttest_indResult(statistic=-2.0181082671329791, pvalue=0.04486433017706145)
p-value large, accept null


# ANOVA f-test

In [60]:
# Generate 3 samples with similar means
a1 = np.random.random_sample(size=500)
a2 = np.random.random_sample(size=500)
a3 = np.random.random_sample(size=500)
print a1.mean(), a2.mean(), a3.mean()
print scs.f_oneway(a1, a2, a3)
print 'p-value large, accept null'
print '---------------'

# Generate 3 samples with different means
b1 = np.random.random_sample(size=500)
b2 = np.random.random_sample(size=500)*1.3
b3 = np.random.random_sample(size=500)
print b1.mean(), b2.mean(), b3.mean()
print scs.f_oneway(b1, b2, b3)
print 'p-value large, accept null'

0.469202139503 0.498124907517 0.505961771149
F_onewayResult(statistic=2.2198395801574713, pvalue=0.10898398160447441)
p-value large, accept null
---------------
0.488278779843 0.634099105606 0.506311652019
F_onewayResult(statistic=31.290637441537974, pvalue=4.8643009112395043e-14)
p-value large, accept null
