In [162]:
import numpy as np
import scipy.stats as stats

In [163]:
# 95th percentile of the standard normal distribution (inverse CDF at q = 0.95).
stats.norm.ppf(q=0.95)

1.6448536269514722

In [161]:
def print_test_result(test_title, statistic_value, p_value, significance_level, accept_message, reject_message):
    """Print a hypothesis-test summary and the decision at a given significance level.

    Three lines are written to stdout: the title, the statistic and p-value
    (three decimals each), and the verdict.

    Parameters
    ----------
    test_title : str
        Heading printed on the first line.
    statistic_value : float
        Value of the test statistic.
    p_value : float
        p-value of the test.
    significance_level : float
        Threshold alpha; the null hypothesis is rejected when
        ``p_value <= significance_level``.
    accept_message : str
        Text appended to the verdict when the null hypothesis is not rejected.
    reject_message : str
        Text appended to the verdict when the null hypothesis is rejected.
    """
    import math  # local import keeps this cell runnable on its own

    print(test_title)
    print("statistic_value= %.3f p_value= %.3f" % (statistic_value, p_value))
    if math.isnan(p_value):
        # NaN compares False against any threshold, so it needs its own
        # branch to produce a verdict line.
        print("Test is inconclusive: p_value is NaN")
    elif p_value > significance_level:
        print("Fail to reject null hypothesis: " + accept_message)
    else:
        print("Reject null hypothesis: " + reject_message)

# Student's t-test
### Assumptions
- each sample is i.i.d., the samples have equal variance, and each is normally distributed

### Interpretation
- H0: $\mu_1$ = $\mu_2$
- H1: $\mu_1$ $\neq$ $\mu_2$

In [64]:
# Two independent samples from the same normal distribution, so H0 (equal means) holds.
n = 10
sigma = 1
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same samples and verdict;
# change the seed to draw a different sample.
rng = np.random.default_rng(seed=0)
data1 = rng.normal(loc=0, scale=sigma, size=n)
data2 = rng.normal(loc=0, scale=sigma, size=n)
# alternative may be "two-sided", "less" or "greater"
stat, p = stats.ttest_ind(data1, data2, alternative="two-sided")
print_test_result("student's t-test", stat, p, significance_level, "probably same distribution", "probably different distribution")

student's t-test
statistic_value= 0.626 p_value= 0.539
probably same distribution


# ANOVA(One Way)
### Assumptions
- each sample is i.i.d., the samples have equal variance, and each is normally distributed

### Interpretation
- H0: $\mu_1 = \mu_2 = \dots = \mu_s$
- H1: at least one of the sample means differs from the others

In [26]:
# Three groups drawn from the same normal distribution, so H0 (all means equal) holds.
n = 50
sigma = 1
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same samples and verdict.
rng = np.random.default_rng(seed=0)
data1 = rng.normal(loc=0, scale=sigma, size=n)
data2 = rng.normal(loc=0, scale=sigma, size=n)
data3 = rng.normal(loc=0, scale=sigma, size=n)
stat, p = stats.f_oneway(data1, data2, data3)
print_test_result("one way ANOVA", stat, p, significance_level, "probably same distribution", "probably different distribution")

one way ANOVA
statistic_value= 0.351 p_value= 0.704
probably same distribution


In [27]:
# Three groups where the second one has its mean shifted by +1, so H0 (all means equal) is false.
n = 50
sigma = 1
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same samples and verdict.
rng = np.random.default_rng(seed=0)
data1 = rng.normal(loc=0, scale=sigma, size=n)
data2 = rng.normal(loc=1, scale=sigma, size=n)
data3 = rng.normal(loc=0, scale=sigma, size=n)
stat, p = stats.f_oneway(data1, data2, data3)
print_test_result("one way ANOVA", stat, p, significance_level, "probably same distribution", "probably different distribution")

one way ANOVA
statistic_value= 15.498 p_value= 0.000
probably different distribution


# Shapiro-Wilk Test (Normality Test)
### Assumptions
- the observations in the sample are i.i.d.

### Interpretation
- H0: the sample has a normal distribution
- H1: the sample doesn't have a normal distribution

In [40]:
# Normally distributed sample, so H0 (the data are normal) holds.
n = 30
mean = 3
sigma = 1
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same sample and verdict.
rng = np.random.default_rng(seed=0)
data = rng.normal(loc=mean, scale=sigma, size=n)
stat, p = stats.shapiro(data)
print_test_result("Shapiro-Wilk test", stat, p, significance_level, "probably normal distribution", "probably not normal distribution")

Sharpiro test
statistic_value= 0.979 p_value= 0.808
probably normal distribution


In [41]:
# Uniform sample on [0, 1): H0 (the data are normal) is false here, but with
# only n = 30 observations the test may still fail to reject it.
n = 30
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same sample and verdict.
rng = np.random.default_rng(seed=0)
data = rng.random(n)
stat, p = stats.shapiro(data)
print_test_result("Shapiro-Wilk test", stat, p, significance_level, "probably normal distribution", "probably not normal distribution")

Sharpiro test
statistic_value= 0.943 p_value= 0.108
probably normal distribution


# Kolmogorov-Smirnov Test(Continuous Distribution Goodness of Fit test)
### Interpretation
- H0: the sample comes from specified distribution
- H1: the sample doesn't come from the specified distribution

In [148]:
# Sample drawn from N(mu, sigma) and tested against that same normal CDF, so H0 holds.
n = 1000
mu = 4
sigma = 1
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same sample and verdict.
rng = np.random.default_rng(seed=0)
data = rng.normal(loc=mu, scale=sigma, size=n)
# args are forwarded to the CDF as (loc, scale); alternative may be "two-sided", "less" or "greater"
stat, p = stats.ks_1samp(data, stats.norm.cdf, args=(mu, sigma))
# The KS test compares the whole empirical CDF with the reference CDF, so a
# rejection is a statement about the sample as a whole, not about single values.
print_test_result("Kolmogorov-Smirnov Test", stat, p, significance_level, "probably the data comes from the specified distribution", "probably the data does not come from the specified distribution")

Kolmogorov-Smirnov Test
statistic_value= 0.027 p_value= 0.464
probably the data comes from the specified distribution


In [157]:
# Sample drawn from an exponential distribution with scale 1/mu and tested
# against that same exponential CDF, so H0 holds.
n = 1000
mu = 4
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same sample and verdict.
rng = np.random.default_rng(seed=0)
data = stats.expon.rvs(loc=0, scale=1/mu, size=n, random_state=rng)
# args are forwarded to the CDF as (loc, scale); alternative may be "two-sided", "less" or "greater"
stat, p = stats.ks_1samp(data, stats.expon.cdf, args=(0, 1/mu))
# The KS test compares the whole empirical CDF with the reference CDF, so a
# rejection is a statement about the sample as a whole, not about single values.
print_test_result("Kolmogorov-Smirnov Test", stat, p, significance_level, "probably the data comes from the specified distribution", "probably the data does not come from the specified distribution")

Kolmogorov-Smirnov Test
statistic_value= 0.036 p_value= 0.139
probably the data comes from the specified distribution


In [138]:
# Two-sample KS test on two samples drawn from the same exponential distribution, so H0 holds.
# Parameters are set here so the cell does not depend on values left behind by other cells.
n = 1000
mu = 4
significance_level = 0.05
# Fixed seed so that re-running the cell reproduces the same samples and verdict.
rng = np.random.default_rng(seed=0)
data1 = rng.exponential(scale=1/mu, size=n)
data2 = rng.exponential(scale=1/mu, size=n)
stat, p = stats.ks_2samp(data1, data2)
print_test_result("Kolmogorov-Smirnov Test", stat, p, significance_level, "same distribution", "different distribution")

Kolmogorov-Smirnov Test
statistic_value= 0.080 p_value= 0.908
same distribution
