# Statistical hypothesis test
![image-6.png](attachment:image-6.png)

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

%precision 3

'%.3f'

In [2]:
df = pd.read_excel('data/ch11_potato.xlsx')
df

Unnamed: 0,жин
0,122.02
1,131.73
2,130.6
3,131.82
4,132.05
5,126.12
6,124.43
7,132.89
8,122.79
9,129.95


In [3]:
sample = np.array(df['жин'])
sample.shape

(14,)

In [4]:
s_mean = np.mean(sample)
s_mean

128.4507142857143

## Статистик таамаглалын тест
![image-6.png](attachment:image-6.png)

In [5]:
rv = stats.norm(130, np.sqrt(9/14))
rv.isf(0.95)

128.68118313069039

![image-9.png](attachment:image-9.png)

![image-5.png](attachment:image-5.png)

![image-2.png](attachment:image-2.png)

In [6]:
#test statistic  
z = (s_mean - 130) / np.sqrt(9/14)
z

-1.932298779026813

In [7]:
#critical value 
rv = stats.norm()
rv.isf(0.95)

-1.6448536269514722

In [8]:
#cumulative distribution function
rv.cdf(z)

0.026661319523126635

![image.png](attachment:image.png)

### one-sided test or two-sided test
![image-3.png](attachment:image-3.png)

In [9]:
#test statistic  
z = (s_mean - 130) / np.sqrt(9/14)
z

-1.932298779026813

In [10]:
#critical value 
rv = stats.norm()
rv.interval(0.95)

(-1.959963984540054, 1.959963984540054)

In [11]:
#p-value
rv.cdf(z) * 2

0.05332263904625327

### Таамаглалыг шалгахад гарах хоёр төрлийн алдаа
![image-2.png](attachment:image-2.png)

In [12]:
rv = stats.norm(130, 3)

In [13]:
c = stats.norm().isf(0.95)
n_samples = 10000
cnt = 0
for _ in range(n_samples):
    sample_ = np.round(rv.rvs(14), 2)
    s_mean_ = np.mean(sample_)
    z = (s_mean_ - 130) / np.sqrt(9/14)
    if z < c:
        cnt += 1
cnt / n_samples

0.052

![image.png](attachment:image.png)

In [14]:
rv = stats.norm(128, 3)

In [15]:
c = stats.norm().isf(0.95)
n_samples = 10000
cnt = 0
for _ in range(n_samples):
    sample_ = np.round(rv.rvs(14), 2)
    s_mean_ = np.mean(sample_)
    z = (s_mean_ - 130) / np.sqrt(9/14)
    if z >= c:
        cnt += 1
        
cnt / n_samples

0.196

![image.png](attachment:image.png)

## таамаглалыг шалгах


### Хэвийн тархалтай эх олонлогийн дунджийн хувьд шалгах(эх олонлогийн вариацыг мэдэгдэж буй үед)
![image-2.png](attachment:image-2.png)

In [16]:
def pmean_test(sample, mean0, p_var, alpha=0.05):
    s_mean = np.mean(sample)
    n = len(sample)
    rv = stats.norm()
    interval = rv.interval(1-alpha)

    z = (s_mean - mean0) / np.sqrt(p_var/n)
    if interval[0] <= z <= interval[1]:
        print('тэг таамаглалыг батална')
    else:
        print('тэг таамаглалыг  няцаана')

    if z < 0:
        p = rv.cdf(z) * 2
    else:
        p = (1 - rv.cdf(z)) * 2
    print(f'p value {p:.3f}')

In [17]:
pmean_test(sample, 130, 9)

тэг таамаглалыг батална
p value 0.053


### хэвийн тархалттай эх олонлогийн вариацын хувьд шалгах
![image.png](attachment:image.png)

In [18]:
def pvar_test(sample, var0, alpha=0.05):
    u_var = np.var(sample, ddof=1)
    n = len(sample)
    rv = stats.chi2(df=n-1)
    interval = rv.interval(1-alpha)
    
    y = (n-1) * u_var / var0
    if interval[0] <= y <= interval[1]:
        print('тэг таамаглалыг  батлана')
    else:
        print('тэг таамаглалыг  няцаана')

    if y < rv.isf(0.5):
        p = rv.cdf(y) * 2
    else:
        p = (1 - rv.cdf(y)) * 2
    print(f'pvalue {p:.3f}')

In [19]:
pvar_test(sample, 9)

тэг таамаглалыг  батлана
pvalue 0.085


### Хэвийн тархалтай эх олонлогийн дунджийн хувьд шалгах(эх олонлогийн вариацыг мэдэхгүй үед)
![image-2.png](attachment:image-2.png)

In [20]:
def pmean_test(sample, mean0, alpha=0.05):
    s_mean = np.mean(sample)
    u_var = np.var(sample, ddof=1)
    n = len(sample)
    rv = stats.t(df=n-1)
    interval = rv.interval(1-alpha)

    t = (s_mean - mean0) / np.sqrt(u_var/n)
    if interval[0] <= t <= interval[1]:
        print('тэг таамаглалыг  батлана')
    else:
        print('тэг таамаглалыг  няцаана')

    if t < 0:
        p = rv.cdf(t) * 2
    else:
        p = (1 - rv.cdf(t)) * 2
    print(f'pvalue {p:.3f}')

In [21]:
pmean_test(sample, 130)

тэг таамаглалыг  батлана
pvalue 0.169


In [22]:
t, p = stats.ttest_1samp(sample, 130)
t, p

(-1.4551960206404198, 0.16933464230414275)

## Two – sample problem
![image-2.png](attachment:image-2.png)

### paired t - test
![image-4.png](attachment:image-4.png)
![image-2.png](attachment:image-2.png)

In [23]:
training_rel = pd.read_excel('data/ch11_training_rel.xlsx')
print(training_rel.shape)
training_rel.head()

(20, 2)


Unnamed: 0,өмнө,хойно
0,59,41
1,52,63
2,55,68
3,61,59
4,59,84


![image-2.png](attachment:image-2.png)

In [24]:
training_rel['зөрүү'] = training_rel['хойно'] - training_rel['өмнө']
training_rel.head()

Unnamed: 0,өмнө,хойно,зөрүү
0,59,41,-18
1,52,63,11
2,55,68,13
3,61,59,-2
4,59,84,25


![image.png](attachment:image.png)

In [25]:
t, p = stats.ttest_1samp(training_rel['зөрүү'], 0)
p

0.04004419061842953

In [26]:
t, p = stats.ttest_rel(training_rel['хойно'], training_rel['өмнө'])
p

0.04004419061842953

### Independent t- test
![image-2.png](attachment:image-2.png)
![image-4.png](attachment:image-4.png)

In [27]:
training_ind = pd.read_csv('data/ch11_training_ind.csv')
print(training_ind.shape)
training_ind.head()

(20, 2)


Unnamed: 0,A,B
0,47,49
1,50,52
2,37,54
3,60,48
4,39,51


![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)

In [28]:
t, p = stats.ttest_ind(training_ind['A'], training_ind['B'],
                       equal_var=False)
p

0.08695731107259361

### Wilcoxon signed – rank test
![image-2.png](attachment:image-2.png)

In [33]:
training_rel = pd.read_excel('data/ch11_training_rel.xlsx')
toy_df = training_rel[:6].copy()
toy_df

Unnamed: 0,өмнө,хойно
0,59,41
1,52,63
2,55,68
3,61,59
4,59,84
5,45,37


In [34]:
diff = toy_df['хойно'] - toy_df['өмнө']
toy_df['зөрүү'] = diff
toy_df

Unnamed: 0,өмнө,хойно,зөрүү
0,59,41,-18
1,52,63,11
2,55,68,13
3,61,59,-2
4,59,84,25
5,45,37,-8


In [35]:
rank = stats.rankdata(abs(diff)).astype(int)
toy_df['зэрэглэл'] = rank
toy_df

Unnamed: 0,өмнө,хойно,зөрүү,зэрэглэл
0,59,41,-18,5
1,52,63,11,3
2,55,68,13,4
3,61,59,-2,1
4,59,84,25,6
5,45,37,-8,2


In [36]:
r_minus = np.sum((diff < 0) * rank)
r_plus = np.sum((diff > 0) * rank)

r_minus, r_plus

(8, 13)

In [37]:
toy_df['хойно'] = toy_df['өмнө'] + np.arange(1, 7)
diff = toy_df['хойно'] - toy_df['өмнө']
rank = stats.rankdata(abs(diff)).astype(int)
toy_df['зөрүү'] = diff
toy_df['зэрэглэл'] = rank
toy_df

Unnamed: 0,өмнө,хойно,зөрүү,зэрэглэл
0,59,60,1,1
1,52,54,2,2
2,55,58,3,3
3,61,65,4,4
4,59,64,5,5
5,45,51,6,6


In [38]:
r_minus = np.sum((diff < 0) * rank)
r_plus = np.sum((diff > 0) * rank)

r_minus, r_plus

(0, 21)

In [39]:
toy_df['хойно'] = toy_df['өмнө'] + [1, -2, -3, 4, 5, -6]
diff = toy_df['хойно'] - toy_df['өмнө']
rank = stats.rankdata(abs(diff)).astype(int)
toy_df['зөрүү'] = diff
toy_df['зэрэглэл'] = rank
toy_df

Unnamed: 0,өмнө,хойно,зөрүү,зэрэглэл
0,59,60,1,1
1,52,50,-2,2
2,55,52,-3,3
3,61,65,4,4
4,59,64,5,5
5,45,39,-6,6


In [40]:
r_minus = np.sum((diff < 0) * rank)
r_plus = np.sum((diff > 0) * rank)

r_minus, r_plus

(11, 10)

![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)

In [41]:
T, p = stats.wilcoxon(training_rel['өмнө'], training_rel['хойно'])
p

0.03623390197753906

In [42]:
n = 10000
diffs = np.round(stats.norm(3, 4).rvs(size=(n, 20)))

In [43]:
cnt = 0
alpha = 0.05
for diff in diffs:
    t, p = stats.ttest_1samp(diff, 0)
    if p < alpha:
        cnt += 1
cnt / n

0.893

In [44]:
cnt = 0
alpha = 0.05
for diff in diffs:
    T, p = stats.wilcoxon(diff)
    if p < alpha:
        cnt += 1
cnt / n



0.877

### Mann –Whitney U test(Wilcoxon Rank Sum Test)
![image-2.png](attachment:image-2.png)

In [45]:
training_ind = pd.read_csv('data/ch11_training_ind.csv')
toy_df = training_ind[:5].copy()
toy_df

Unnamed: 0,A,B
0,47,49
1,50,52
2,37,54
3,60,48
4,39,51


In [46]:
rank = stats.rankdata(np.concatenate([toy_df['A'],
                                      toy_df['B']]))
rank_df = pd.DataFrame({'A': rank[:5],
                        'B': rank[5:10]}).astype(int)
rank_df

Unnamed: 0,A,B
0,3,5
1,6,8
2,1,9
3,10,4
4,2,7


![image-3.png](attachment:image-3.png)

In [47]:
n1 = len(rank_df['A'])
u = rank_df['A'].sum() - (n1*(n1+1))/2
u

7.0

In [48]:
rank_df = pd.DataFrame(np.arange(1, 11).reshape(2, 5).T,
                       columns=['A', 'B'])
rank_df

Unnamed: 0,A,B
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [49]:
u = rank_df['A'].sum() - (n1*(n1+1))/2
u

0.0

In [50]:
rank_df = pd.DataFrame(np.arange(1, 11).reshape(2, 5)[::-1].T,
                       columns=['A', 'B'])
rank_df

Unnamed: 0,A,B
0,6,1
1,7,2
2,8,3
3,9,4
4,10,5


In [51]:
u = rank_df['A'].sum() - (n1*(n1+1))/2
u

25.0

![image.png](attachment:image.png)

In [52]:
u, p = stats.mannwhitneyu(training_ind['A'], training_ind['B'],
                          alternative='two-sided')
p, u

(0.05948611166127324, 130.0)

### Chi- square test
![image-7.png](attachment:image-7.png)

In [53]:
ad_df = pd.read_excel('data/ch11_ad.xlsx')
n = len(ad_df)
print(n)
ad_df

1000


Unnamed: 0,зар/сурталчилгаа,худалдан авалт
0,B,хийгээгүй
1,B,хийгээгүй
2,A,хийсэн
3,A,хийсэн
4,B,хийгээгүй
...,...,...
995,B,хийгээгүй
996,B,хийгээгүй
997,B,хийгээгүй
998,B,хийгээгүй


In [54]:
ad_cross = pd.crosstab(ad_df['зар/сурталчилгаа'], ad_df['худалдан авалт'])
ad_cross

худалдан авалт,хийгээгүй,хийсэн
зар/сурталчилгаа,Unnamed: 1_level_1,Unnamed: 2_level_1
A,351,49
B,549,51


In [55]:
ad_cross['хийсэн'] / (ad_cross['хийсэн'] + ad_cross['хийгээгүй'])

зар/сурталчилгаа
A    0.1225
B    0.0850
dtype: float64

In [56]:
#худалдаж аваагүй, хувалдан авсан хүний тоо
n_not, n_yes = ad_cross.sum()
n_not, n_yes

(900, 100)

In [57]:
n_adA, n_adB = ad_cross.sum(axis=1)
n_adA, n_adB

(400, 600)

![image.png](attachment:image.png)

In [58]:
#expected frequency 
ad_ef = pd.DataFrame({'хийсэн': [n_adA * n_yes / n,
                              n_adB * n_yes / n],
                      'хийгээгүй': [n_adA * n_not / n,
                                   n_adB * n_not / n]},
                      index=['A', 'B'])
ad_ef

Unnamed: 0,хийсэн,хийгээгүй
A,40.0,360.0
B,60.0,540.0


![image-2.png](attachment:image-2.png)

In [59]:
y = ((ad_cross - ad_ef) ** 2 / ad_ef).sum().sum()
y

3.75

In [60]:
rv = stats.chi2(1)
1 - rv.cdf(y)

0.052807511416113395

In [61]:
chi2, p, dof, ef = stats.chi2_contingency(ad_cross,
                                          correction=False)
chi2, p, dof

(3.75, 0.052807511416113395, 1)

In [62]:
ef

array([[360.,  40.],
       [540.,  60.]])