In [1]:
%load_ext autoreload
%autoreload 2

import sys
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
import statsmodels.api as sm
import statsmodels.stats as smstats
from IPython.display import HTML, Markdown, Latex, Math
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.precision', 4)
%precision 4
%matplotlib inline

random_seed = 42
h3 = r"<h3>{text}</h3>"
h4_centered = r"<h4><center>{text}</center></h4>"

# Normality test
Test whether samples are drawn from a normal distribution
- Could use histogram or Q-Q plot for qualititive checking
- Statistical test methods
 - Shapiro-Wilk test:  $$stat = \frac{\left(\sum_{i=1}^n a_ix(i)\right)^2}{\sum_{i=1}^n(x-\bar x)^2}$$
 - D’Agostino’s K^2 Test: test_stat = skewtest_stat^2+kurtosistest^2
 - Anderson-Darling Test: 

In [58]:
from statsmodels.graphics.gofplots import qqplot
N = 100
normal_data = np.random.randn(N)*2+5
uniform_data = np.random.randint(0, 10, N)
rayleigh_data = stats.rayleigh(loc=0, scale=2).rvs(N)

#descriptive
#stats.stats.describe(normal_data)
#stats.stats.describe(rayleigh_data)
stats.stats.moment(normal_data, moment=[1,2,3,4])
stats.stats.moment(rayleigh_data, moment=[1,2,3,4])

#qqplot(normal_data, line='s')
#qqplot(rayleigh_data, line='s')

# Shapiro-Wilk test
HTML(h3.format(text='Shapiro-Wilk test'))
stat, p = stats.shapiro(normal_data)
'Normal data', stat, p
stat, p = stats.shapiro(uniform_data)
'Uniform data', stat, p
stat, p = stats.shapiro(rayleigh_data)
'Rayleigh data', stat, p
# D’Agostino’s K^2 Test
HTML(h3.format(text='D’Agostino’s K^2 Test'))
stat, p = stats.normaltest(normal_data)
'Normal data', stat, p
stat, p = stats.normaltest(uniform_data)
'Uniform data', stat, p
stat, p = stats.normaltest(rayleigh_data)
'Rayleigh data', stat, p
# Anderson-Darling Test
# can check with norm/expon/logistic, default dist='norm', 
HTML(h3.format(text='Anderson-Darling Test'))
stat, p, level = stats.anderson(normal_data, dist='norm')
'Normal data', stat, p, level
stat, p, level = stats.anderson(uniform_data, dist='norm')
'Uniform data', stat, p, level
stat, p, level = stats.anderson(rayleigh_data, dist='expon')
'Rayleigh data', stat, p, level


array([ 0.    ,  2.6437,  0.7492, 21.1387])

array([0.    , 1.6857, 0.3992, 6.3357])

('Normal data', 0.9900, 0.6633)

('Uniform data', 0.9356, 0.0001)

('Rayleigh data', 0.9779, 0.0914)

('Normal data', 0.7072155341407046, 0.7021503198277096)

('Uniform data', 26.369416587112287, 1.8791175238075755e-06)

('Rayleigh data', 5.712157324042067, 0.05749377131395891)

('Normal data',
 0.40060129693965507,
 array([0.555, 0.632, 0.759, 0.885, 1.053]),
 array([15. , 10. ,  5. ,  2.5,  1. ]))

('Uniform data',
 1.711796224702539,
 array([0.555, 0.632, 0.759, 0.885, 1.053]),
 array([15. , 10. ,  5. ,  2.5,  1. ]))

('Rayleigh data',
 8.084964132961986,
 array([0.917, 1.072, 1.333, 1.596, 1.945]),
 array([15. , 10. ,  5. ,  2.5,  1. ]))

## Correlation

### Correlation between two variables
- Pearson corrleation
 - assume two samples are both from normal distribution
 - test for linear relationship
 - pearson correlation coefficient = $\frac{\mathrm{cov}(X,\ Y)}{\mathrm{std}(X)*\mathrm{std}(Y)}$
 - H_0: two samples are uncorrelated
- Spearman correlation
 - non-Gaussian, non-linear relationship
 - Spearman's correlation coefficient = $\frac{\mathrm{cov}(rank(X),\ rank(Y))}{\mathrm{std}(rank(X))*\mathrm{std}(rank(Y))}$
 - H_0: two samples are uncorrelated

In [78]:
N=100
x1 = 10 * np.random.randn(N) + 10
x2 = 0.3*x1 + (10 * np.random.randn(N) + 5)
# pearson correlation coefficient
stats.pearsonr(x1, x2)
# spearman rank correlation
stats.spearmanr(x1, x2)
# kendall's tau rank correlation
stats.kendalltau(x1, x2)

(0.3385933968861355, 0.0005695147113265803)

SpearmanrResult(correlation=0.31227122712271227, pvalue=0.0015618350238907378)

KendalltauResult(correlation=0.21131313131313134, pvalue=0.001838762250921649)

## Distribution test
Test whether two or more samples distribution are equal
 - Mann-Whitney U test: two independent samples
 - Wilcoxon Signed-Rank Test: two paired or dependent samples
 - Kruskal-Wallis H Test: two or more independent samples
 - Friedman Test: two or more dependent samples

In [12]:
N = 100
x1 = np.random.randn(N)*2+4
x2 = np.random.randn(N)*2+3
x3 = np.random.randint(-3, 3, N)
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]

# Mann-Whitney U test
HTML(h3.format(text='Mann-Whitney U test'))
stat, p = stats.mannwhitneyu(data1, data2)
stat, p
stat, p = stats.mannwhitneyu(x1, x2)
stat, p

# Wilcoxon Signed-Rank Test
HTML(h3.format(text='Wilcoxon Signed-Rank Test'))
stat, p = stats.wilcoxon(data1, data2)
stat, p
stat, p = stats.wilcoxon(x1, x2)
stat, p

# Kruskal-Wallis H Test
HTML(h3.format(text='Kruskal-Wallis H Test'))
stat, p = stats.kruskal(data1, data2)
stat, p
stat, p = stats.kruskal(x1, x2)
stat, p

# Friedman Test
HTML(h3.format(text='Friedman Test'))
stat, p = stats.friedmanchisquare(x1, x2, x3)
stat, p

(40.0, 0.23633779675579358)

(3679.0, 0.0006265820516807091)

(21.0, 0.5076243443095237)

(1516.0, 0.0005218751483653687)

(0.5714285714285694, 0.4496917979688917)

(10.418155223880603, 0.0012478247895852392)

(114.66000000000008, 1.2644374564977978e-25)

## Determine sample size

## Sampling
