In [1]:
import pandas as pd
import numpy as np

from scipy import stats
import statsmodels.formula.api as sm
from statsmodels import regression
import statsmodels

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc, rcParams

# 15. 범주형 자료분석
## 15.1 서론
- 범주형 자료 : 관측치들이 몇 개의 범주로 분류되고 각 범주의 도수로 자료가 주어지는 것

In [24]:
# example 9 
obs = [18,55,27]
pr = [0.25, 0.5, 0.25]
n = np.sum(obs)
exp = np.multiply(n, pr)
df = len(obs) - 1

stats.chisquare(obs, exp) # 적합도 검정

Power_divergenceResult(statistic=2.62, pvalue=0.26982005638468687)

In [25]:
# example 10
data = pd.DataFrame([['a', 'ok', 37], ['a', 'mid', 24], ['a', 'ng', 19], ['b', 'ok', 17], ['b', 'mid', 33], ['b', 'ng', 20]])
diet = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')#, margins=True)

stats.chi2_contingency(diet) # 동질성 검정

(8.22398544438018,
 0.01637511094543408,
 2,
 array([[30.4, 20.8, 28.8],
        [26.6, 18.2, 25.2]]))

In [2]:
# example 1
a = 18 ; b = 55 ; c = 27
all_sum = 100
pa = a / all_sum
pb = b / all_sum
pc = c / all_sum
print(pa, pb, pc)

0.18 0.55 0.27


In [4]:
# example 2
data = pd.DataFrame([['a', 'ok', 37], ['a', 'mid', 24], ['a', 'ng', 19], ['b', 'ok', 17], ['b', 'mid', 33], ['b', 'ng', 20]])
diet = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum', margins=True)

diet

1,mid,ng,ok,All
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,24,19,37,80
b,33,20,17,70
All,57,39,54,150


In [13]:
diet_ratio = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum', margins=True, normalize='columns')
diet_ratio
"동질성 검정의 귀무가설은 p_{Ai} = p_{Bi}"

'동질성 검정의 귀무가설은 p_{Ai} = p_{Bi}'

In [15]:
# example 3
data = pd.DataFrame([['male', 'much', 378], ['male', 'ok', 237], ['male', 'less', 26], ['female', 'much', 388], ['female', 'ok', 196], ['female', 'less', 25]])
watch = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum', margins=True)

print(watch)
"독립성 검정 : 방영에 대한 의견은 개인의 성별과 무관하다는 귀무가설 검정"

1       less  much   ok   All
0                            
female    25   388  196   609
male      26   378  237   641
All       51   766  433  1250


'독립성 검정 : 방영에 대한 의견은 개인의 성별과 무관하다는 귀무가설 검정'

## 15.2 피어슨의 $\chi^2$(카이제곱) 적합도 검정
- 귀무가설에서 제시한 각 범주의 비율이 실제 자료에 적합한가?
$$H_0:p_1=p_{10},\dots,p_k=p_{k0}$$
$$Observed=n_i\quad Expected=n\times p_{i0] \\ \chi^2=\sum\frac{(O-E)^2}{E}=\sum^k_{i=1}\frac{(n_i-np_{i0})^2}{np_{i0}} \ quad df=(범주의\ 갯수)-1$$

> **피어슨의 $\chi^2$ 적합도 검정**
>
> $$H0:p_1=p_{10}, \dots, p_k=p_{k0}$$
> $$\chi^2=\sum\frac{(O-E)^2}{E}=\sum^k_{i=1}\frac{(n_i-np_{i0})^2}{np_{i0}} \quad df=(범주의\ 갯수)-1$$
> 유의수준 $\alpha$에 대한 기각역은 $R:\chi^2\geq\chi^2_\alpha(k-1)$

> **$\chi^2$ 분포의 특성**
>
> 1) 독립인 표본으로부터 계산된 $\chi^2$통계량들을 더하면, 그 합도 $\chi^2$ 분포를 따르며, 자유도는 각 자유도의 합과 같다 $$\chi^2(k_1)+\dots+\chi^2(k_r)\sim \chi^2(k_1+\dots+k_r)$$
> 2) $\chi^2$통계량을 계산하는데 만약 모수의 추정치를 사용하였다면, 통계량 분포의 자유도는 추정한 모수의 개수만큼 감소하게 된다 $$\chi^2의 자유도=(모수를\ 알고\ 있을\ 경우의\ 자유도)-(추정된\ 모수의\ 수)$$

- 피어슨 $\chi^2$검정은 표본의 크기가 큰 경우에만 적합 : 각 칸의 기대도수가 5 이상이면 가능
    - 기대도수가 5보다 작은 칸이 많은 경우에는, 여러 칸을 합하여 기대도수를 크게 하거나, $2\times 2$ 분할표에서는 피셔의 정확검정법 사용

In [33]:
# example 4
obs = [18, 55, 27]
exp = [25, 50 ,25]
chi2 = np.sum(np.divide(np.subtract(obs, exp)**2, exp)); print(chi2)
stats.chi2.ppf(1-0.05/2, df=len(obs)-1)

2.62


7.377758908227871

In [34]:
# prob 2.2
obs = [38, 43, 10, 5]
exp = [96/4, 96/4, 96/4, 96/4]

alpha = 0.05
chi2 = np.sum(np.divide(np.subtract(obs, exp)**2, exp)); print(chi2)
stats.chi2.ppf(1-alpha/2, df=len(obs)-1)

46.416666666666664


9.348403604496148

In [36]:
# prob 2.4
obs = [61, 55, 41, 41]
exp = [198/4, 198/4, 198/4, 198/4]

alpha = 0.05
chi2 = np.sum(np.divide(np.subtract(obs, exp)**2, exp)); print(chi2)
stats.chi2.ppf(1-alpha/2, df=len(obs)-1)

6.2020202020202015


9.348403604496148

In [41]:
# prob 2.6
obs = [141, 291, 132]
exp = np.multiply(np.sum(obs), [1/4, 1/2, 1/4])

alpha = 0.05
chi2 = np.sum(np.divide(np.subtract(obs, exp)**2, exp)); print(chi2)
stats.chi2.ppf(1-alpha/2, df=len(obs)-1)

0.8617021276595745


7.377758908227871

In [42]:
# prob 2.8
obs = [462, 171, 76, 57, 92, 74]
exp = np.multiply(np.sum(obs), [0.59, 0.15, 0.06, 0.06, 0.07, 0.07])

alpha = 0.01
chi2 = np.sum(np.divide(np.subtract(obs, exp)**2, exp)); print(chi2)
stats.chi2.ppf(1-alpha/2, df=len(obs)-1)

40.39164735509392


16.74960234363904

In [52]:
# prob 2.10
# 1
print(
    stats.binom.pmf(k=0,n=3,p=0.5),
    stats.binom.pmf(k=1,n=3,p=0.5),
    stats.binom.pmf(k=2,n=3,p=0.5),
    stats.binom.pmf(k=3,n=3,p=0.5),
)

0.125 0.3750000000000001 0.3750000000000001 0.125


In [53]:
# 2
obs = [31, 40, 16, 13]
p = [stats.binom.pmf(k=0,n=3,p=0.5),
    stats.binom.pmf(k=1,n=3,p=0.5),
    stats.binom.pmf(k=2,n=3,p=0.5),
    stats.binom.pmf(k=3,n=3,p=0.5)]
exp = np.multiply(np.sum(obs), p)

alpha = 0.05
chi2 = np.sum(np.divide(np.subtract(obs, exp)**2, exp)); print(chi2)
stats.chi2.ppf(1-alpha/2, df=len(obs)-1)

39.893333333333345


9.348403604496148

## 15.3 동질성 검정
- 모집단을 분류된 범주에 따라 2원 분할표로 표현 후, 모집단들이 각 범주에 대하여 같은 반응을 보이는지 검정
- 각 반응범주에서 관측된 비율을 가지고 모집단 별로 같은지 조사
$$H_0: p_{A1}=p_{B1},\  p_{A2}=p_{B2},\ p_{A3}=p_{B3},$$
$$칸의\ 추정기대도수=\frac{칸이\ 속한\ 행의\ 합계\times 칸이\ 속한\ 열의\ 합계}{전체\ 합계}$$

> **분할표에서의 $\chi^2$ 동질성 검정**
>
> - r개의 모집단에서 독립적으로 추출한 표본을 c개의 반응범주로 분류
>    - 각 칸의 추정기대도수 : $\frac{(행의\ 합 \times 열의 합)}{(전체 합}$
>    - $\chi^2$의 자유도 : r(c-1) - (c-1) = (r-1)(c-1)
>        - 각 r개의 행이 독립인 모집단, c개의 범주 
>        => 성질 1) 모수가 알려져 있다면 자유도 r(c-1)
>        => 성질 2) 각 범주 공통비율 $p_k$를 추정하면, 자유도는 추정한 모수의 개수만큼 감소, $\sum p_k=1$로부터 추정 모수의 개수는 (c-1)
>    - $\chi^2 = \sum\frac{(O-E)^2}{E}$
>    - 기각역$ R: \chi^2 \geq \chi^2_\alpha$

In [233]:
# example 5
data = pd.DataFrame([['a', 'ok', 37], ['a', 'mid', 24], ['a', 'ng', 19], ['b', 'ok', 17], ['b', 'mid', 33], ['b', 'ng', 20]])
diet = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')#, margins=True)

diet

1,mid,ng,ok
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,24,19,37
b,33,20,17


In [269]:
# expectation
diet_exp = pd.DataFrame(index=diet.index, columns=diet.columns)
all_sum = np.sum(diet.sum())
row_sum = col_sum = 0

for row in list(diet_exp.index):
    row_sum = np.sum(diet.loc[row, :])
    for col in list(diet_exp.columns):
        col_sum = np.sum(diet.loc[:, col])
        diet_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
diet_chi = pd.DataFrame(index=diet.index, columns=diet.columns)
for row in list(diet_exp.index):
    for col in list(diet_exp.columns):
        diet_chi.loc[row,col] = (diet.loc[row,col]-diet_exp.loc[row,col])**2/diet_exp.loc[row,col]

f_val = np.sum(diet_chi.sum())

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(diet.index)-1) * (len(diet.columns)-1)))

8.22398544438018 5.991464547107979


In [263]:
stats.chi2.ppf(1-alpha/2, df=2)

7.377758908227871

In [213]:
np.sum(diet.loc[:, 'ok']) * np.sum(diet.loc['b',:]) / all_sum

25.2

In [264]:
# example 6
data = pd.DataFrame([['office', 'addict', 32], ['office', 'ok', 268], 
                     ['edu', 'addict', 51], ['edu', 'ok', 199], 
                     ['biz', 'addict', 67], ['biz', 'ok', 233],
                     ['merch', 'addict', 83], ['merch', 'ok', 267]])
addict = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')#, margins=True)

addict

1,addict,ok
0,Unnamed: 1_level_1,Unnamed: 2_level_1
biz,67,233
edu,51,199
merch,83,267
office,32,268


In [268]:
# expectation
addict_exp = pd.DataFrame(index=addict.index, columns=addict.columns)
all_sum = np.sum(addict.sum())
row_sum = col_sum = 0

for row in list(addict_exp.index):
    row_sum = np.sum(addict.loc[row, :])
    for col in list(addict_exp.columns):
        col_sum = np.sum(addict.loc[:, col])
        addict_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
addict_chi = pd.DataFrame(index=addict.index, columns=addict.columns)
for row in list(addict_exp.index):
    for col in list(addict_exp.columns):
        addict_chi.loc[row,col] = (addict.loc[row,col]-addict_exp.loc[row,col])**2/addict_exp.loc[row,col]

f_val = np.sum(addict_chi.sum())

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(addict.index)-1)*(len(addict.columns)-1)))

20.596749762391923 7.814727903251179


In [272]:
# example 7
data = pd.DataFrame([
    ['pro', 'sprout', 88], ['pro', 'no sprout', 12],
    ['no pro', 'sprout', 126], ['no pro', 'no sprout', 24]
])
sprout = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')
sprout

1,no sprout,sprout
0,Unnamed: 1_level_1,Unnamed: 2_level_1
no pro,24,126
pro,12,88


In [273]:
sprout_exp = pd.DataFrame(index=sprout.index, columns=sprout.columns)
all_sum = np.sum(sprout.sum())
row_sum = col_sum = 0

for row in list(sprout_exp.index):
    row_sum = np.sum(sprout.loc[row, :])
    for col in list(sprout_exp.columns):
        col_sum = np.sum(sprout.loc[:, col])
        sprout_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
sprout_chi = pd.DataFrame(index=sprout.index, columns=sprout.columns)
for row in list(sprout_exp.index):
    for col in list(sprout_exp.columns):
        sprout_chi.loc[row,col] = (sprout.loc[row,col]-sprout_exp.loc[row,col])**2/sprout_exp.loc[row,col]

f_val = np.sum(sprout_chi.sum())

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(sprout.index)-1)*(len(sprout.columns)-1)))

0.7788161993769473 3.841458820694124


In [276]:
# prob 3.2
data = pd.DataFrame([
    ['a', 'no', 36], ['a', 'listen', 55], ['a', 'buy', 109],
    ['b', 'no', 45], ['b', 'listen', 56], ['b', 'buy', 49],
    ['c', 'no', 54], ['c', 'listen', 78], ['c', 'buy', 168],    
])

market = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')

market_exp = pd.DataFrame(index=market.index, columns=market.columns)
all_sum = np.sum(market.sum())
row_sum = col_sum = 0

for row in list(market_exp.index):
    row_sum = np.sum(market.loc[row, :])
    for col in list(market_exp.columns):
        col_sum = np.sum(market.loc[:, col])
        market_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
market_chi = pd.DataFrame(index=market.index, columns=market.columns)
for row in list(market_exp.index):
    for col in list(market_exp.columns):
        market_chi.loc[row,col] = (market.loc[row,col]-market_exp.loc[row,col])**2/market_exp.loc[row,col]

f_val = np.sum(market_chi.sum())

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(market.index)-1)*(len(market.columns)-1)))

24.60796923859296 9.487729036781154


In [278]:
# prob 3.4
# 1
data = pd.DataFrame([
    ['yes','pos',23], ['yes','impos',18],
    ['no','pos',25], ['no', 'impos', 16]
])

deaf = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')

deaf_exp = pd.DataFrame(index=deaf.index, columns=deaf.columns)
all_sum = np.sum(deaf.sum())
row_sum = col_sum = 0

for row in list(deaf_exp.index):
    row_sum = np.sum(deaf.loc[row, :])
    for col in list(deaf_exp.columns):
        col_sum = np.sum(deaf.loc[:, col])
        deaf_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
deaf_chi = pd.DataFrame(index=deaf.index, columns=deaf.columns)
for row in list(deaf_exp.index):
    for col in list(deaf_exp.columns):
        deaf_chi.loc[row,col] = (deaf.loc[row,col]-deaf_exp.loc[row,col])**2/deaf_exp.loc[row,col]

f_val = np.sum(deaf_chi.sum())

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(deaf.index)-1)*(len(deaf.columns)-1)))

0.20098039215686275 3.841458820694124


In [281]:
# 2 
# 두 모비율
alpha = 0.05

p_yes = 23/48 ; p_no = 18/34
n_yes = 48 ; n_no = 34
z_alpha = stats.norm.ppf(1-alpha/2)
z_alpha * np.sqrt(((p_yes*(1-p_yes)/n_yes)+(p_no*(1-p_no)/n_no)))

0.21936532601033856

In [284]:
# prob 3.6 
# 1
data = pd.DataFrame([
    ['a','die',58], ['a','survive',57],
    ['c','die',56], ['c', 'survive', 42]
])

pesticide = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')

pesticide_exp = pd.DataFrame(index=pesticide.index, columns=pesticide.columns)
all_sum = np.sum(pesticide.sum())
row_sum = col_sum = 0

for row in list(pesticide_exp.index):
    row_sum = np.sum(pesticide.loc[row, :])
    for col in list(pesticide_exp.columns):
        col_sum = np.sum(pesticide.loc[:, col])
        pesticide_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
pesticide_chi = pd.DataFrame(index=pesticide.index, columns=pesticide.columns)
for row in list(pesticide_exp.index):
    for col in list(pesticide_exp.columns):
        pesticide_chi.loc[row,col] = (pesticide.loc[row,col]-pesticide_exp.loc[row,col])**2/pesticide_exp.loc[row,col]

f_val = np.sum(pesticide_chi.sum())

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(pesticide.index)-1)*(len(pesticide.columns)-1)))

0.9571042230081135 3.841458820694124


In [286]:
# 2
alpha = 0.05

n_a = 115 ; n_c = 98
p_a = 58/n_a ; p_c = 56/n_c

z_alpha = stats.norm.ppf(1-alpha/2)
z_alpha * np.sqrt(((p_a*(1-p_a)/n_a)+(p_c*(1-p_c)/n_c)))

0.1339776592557308

## 15.4 독립성 검정
- 한 임의표본의 각 원소에 대하여 두 가지 특성을 관찰한다면 자료들은 이 두 가지 특성에 따라 동시에 분류 가능. 행이나 열 어느 쪽의 합도 고정되지 않은 분햘표
- 두 특성이 독립이려면 두 사건의 곱사건 확률이 각 사건의 확률의 곱으로 나타나야 함 $$p_{Mi}=p_M p_i\\ H_0:각\ 칸의\ 확률은\ 그\ 칸이\ 속한\ 열의\ 합\ 확률과\ 행의\ 합\ 확률의\ 곱으로\ 표현됨$$
- 기대도수 : 표본의 수에 각 칸의 확률 곱하여 얻어짐$$총\ 도수 \times p_{Mi} = 총\ 도수 \times p_{M} \times p_{i} \\ 추정기대도수 = \frac{칸이\ 속한\ 열의\ 합\times 칸이\ 속한\ 행의\ 합}{전체\ 합} $$
- $\chi^2의\ 자유도=(칸의\ 수) - 1 - (추정한\ 모수의\ 개수)$
$$\chi^2 = \sum_n\frac{(O-E)^2}{E} \\ \chi^2의 자유도=모집단\ 사이즈-1\ 추정하는\ 모수의\ 갯수가\ 열에서\ c-1\ 행에서\ r-1 \\rc-1-(r-1)-(c-1)$$
- **독립성 검정은 동질성 검정과 검정통계량, 자유도, 기각역이 모두 동일하고, 단지 검정하고자 하는 가설이 달라서 검정결과에 대한 해석이 달라짐**

In [291]:
# example 8
data = pd.DataFrame([['male', 'much', 378], ['male', 'ok', 237], ['male', 'less', 26], ['female', 'much', 388], ['female', 'ok', 196], ['female', 'less', 25]])
watch = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum', margins=True)

watch_p = np.divide(watch, 1250)
watch_p

1,less,much,ok,All
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.02,0.3104,0.1568,0.4872
male,0.0208,0.3024,0.1896,0.5128
All,0.0408,0.6128,0.3464,1.0


In [321]:
watch_exp = pd.DataFrame(index=watch.index, columns=watch.columns)
all_sum = np.sum(watch.sum())
row_sum = col_sum = 0

for row in list(watch_exp.index):
    row_sum = np.sum(watch.loc[row, :])
    for col in list(watch_exp.columns):
        col_sum = np.sum(watch.loc[:, col])
        watch_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
watch_chi = pd.DataFrame(index=watch.index, columns=watch.columns)
for row in list(watch_exp.index):
    for col in list(watch_exp.columns):
        watch_chi.loc[row,col] = (watch.loc[row,col]-watch_exp.loc[row,col])**2/watch_exp.loc[row,col]

f_val = np.sum(watch_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(work.index)-1)*(len(work.columns)-1)))

3.2152804022430193 5.991464547107979


In [320]:
# prob 4.2
data = pd.DataFrame([['symp', 'alco', 54], ['symp', 'ok', 27], ['no symp', 'alco', 22], ['no symp', 'ok', 82]])
stress = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')
                     
stress_exp = pd.DataFrame(index=stress.index, columns=stress.columns)
all_sum = np.sum(stress.sum())
row_sum = col_sum = 0

for row in list(stress_exp.index):
    row_sum = np.sum(stress.loc[row, :])
    for col in list(stress_exp.columns):
        col_sum = np.sum(stress.loc[:, col])
        stress_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
stress_chi = pd.DataFrame(index=stress.index, columns=stress.columns)
for row in list(stress_exp.index):
    for col in list(stress_exp.columns):
        stress_chi.loc[row,col] = (stress.loc[row,col]-stress_exp.loc[row,col])**2/stress_exp.loc[row,col]

f_val = np.sum(stress_chi.sum()); 

alpha = 0.1
print(f_val, stats.chi2.ppf(1-alpha, df=(len(stress.index)-1)*(len(stress.columns)-1)))

38.968841696690546 2.705543454095404


In [319]:
# prob 4.4
data = pd.DataFrame([
    ['list', 'pro', 112], ['list', 'ok', 36], ['list', 'con', 28], 
    ['no list', 'pro', 84], ['no list', 'ok', 68], ['no list', 'con', 72],
])
budget = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')
                     
budget_exp = pd.DataFrame(index=budget.index, columns=budget.columns)
all_sum = np.sum(budget.sum())
row_sum = col_sum = 0

for row in list(budget_exp.index):
    row_sum = np.sum(budget.loc[row, :])
    for col in list(budget_exp.columns):
        col_sum = np.sum(budget.loc[:, col])
        budget_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
budget_chi = pd.DataFrame(index=budget.index, columns=budget.columns)
for row in list(budget_exp.index):
    for col in list(budget_exp.columns):
        budget_chi.loc[row,col] = (budget.loc[row,col]-budget_exp.loc[row,col])**2/budget_exp.loc[row,col]

f_val = np.sum(budget_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(budget.index)-1)*(len(budget.columns)-1)))

27.847152847152852 5.991464547107979


In [318]:
# prob 4.6

data = pd.DataFrame([
    ['great', 'work', 9], ['great', 'quit', 9], 
    ['ok', 'work', 8], ['ok', 'quit', 12], 
    ['bad', 'work', 3], ['bad', 'quit', 14],
])
work = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')
                     
work_exp = pd.DataFrame(index=work.index, columns=work.columns)
all_sum = np.sum(work.sum())
row_sum = col_sum = 0

for row in list(work_exp.index):
    row_sum = np.sum(work.loc[row, :])
    for col in list(work_exp.columns):
        col_sum = np.sum(work.loc[:, col])
        work_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
work_chi = pd.DataFrame(index=work.index, columns=work.columns)
for row in list(work_exp.index):
    for col in list(work_exp.columns):
        work_chi.loc[row,col] = (work.loc[row,col]-work_exp.loc[row,col])**2/work_exp.loc[row,col]

f_val = np.sum(work_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(work.index)-1)*(len(work.columns)-1)))

4.134243697478992 5.991464547107979


In [310]:
# prob 5.2
obs = [14, 18, 18, 26, 24]
exp = np.multiply(np.sum(obs), [0.2, 0.3, 0.2, 0.2, 0.1])

alpha = 0.05
print(stats.chisquare(obs, exp))
stats.chi2.ppf(1-alpha/2, df=len(obs)-1)

Power_divergenceResult(statistic=28.200000000000003, pvalue=1.1361214318167942e-05)


11.143286781877796

In [314]:
# prob 5.4 
obs = [3478, 3333, 3771, 3542, 3479, 3304, 3476, 3495, 3490, 3331, 3188, 3321]

alpha = 0.01
print(stats.chisquare(obs))
stats.chi2.ppf(1-alpha/2, df=len(obs)-1)

Power_divergenceResult(statistic=72.45486313337217, pvalue=4.161301079039476e-11)


26.756848916469636

In [317]:
# prob 5.6
# 1
n1 = 84+417 ; n2 = 43+357
p1 = 84/n1 ; p2 = 43/n2
alpha = 0.05

p = (p1*n1+p2*n2)/(n1+n2)
z_val = (p1-p2)/(np.sqrt(p * (1-p)) * np.sqrt(1/n1+1/n2))
z_alpha = stats.norm.ppf(1-alpha/2)
print(z_val, z_alpha)

2.5785790922964797 1.959963984540054


In [322]:
# 2 
data = pd.DataFrame([
    ['exps', 'abn', 84], ['exps', 'nor', 417], 
    ['no exps', 'abn', 43], ['no exps', 'nor', 357], 
])
rad = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')

rad_exp = pd.DataFrame(index=rad.index, columns=rad.columns)
all_sum = np.sum(rad.sum())
row_sum = col_sum = 0

for row in list(rad_exp.index):
    row_sum = np.sum(rad.loc[row, :])
    for col in list(rad_exp.columns):
        col_sum = np.sum(rad.loc[:, col])
        rad_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
rad_chi = pd.DataFrame(index=rad.index, columns=rad.columns)
for row in list(rad_exp.index):
    for col in list(rad_exp.columns):
        rad_chi.loc[row,col] = (rad.loc[row,col]-rad_exp.loc[row,col])**2/rad_exp.loc[row,col]

f_val = np.sum(rad_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(rad.index)-1)*(len(rad.columns)-1)))

6.649070135228531 3.841458820694124


In [340]:
# prob 5.8 
# 1
data = pd.DataFrame([
    ['a', 'ok', 23], ['a', 'pain', 52-23], 
    ['b', 'ok', 30], ['b', 'pain', 48-30], 
    ['c', 'ok', 19], ['c', 'pain', 50-19], 
    ['d', 'ok', 29], ['d', 'pain', 45-29], 
])
painkiller = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')

painkiller_exp = pd.DataFrame(index=painkiller.index, columns=painkiller.columns)
all_sum = np.sum(painkiller.sum())
row_sum = col_sum = 0

for row in list(painkiller_exp.index):
    row_sum = np.sum(painkiller.loc[row, :])
    for col in list(painkiller_exp.columns):
        col_sum = np.sum(painkiller.loc[:, col])
        painkiller_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
painkiller_chi = pd.DataFrame(index=painkiller.index, columns=painkiller.columns)
for row in list(painkiller_exp.index):
    for col in list(painkiller_exp.columns):
        painkiller_chi.loc[row,col] = (painkiller.loc[row,col]-painkiller_exp.loc[row,col])**2/painkiller_exp.loc[row,col]

f_val = np.sum(painkiller_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(painkiller.index)-1)*(len(painkiller.columns)-1)))

10.08958289445966 7.814727903251179


In [352]:
# 2
n1 = 52; p1 = 23/n1
n2 = 48; p2 = 30/n2
n3 = 50; p3 = 19/n3
n4 = 45; p4 = 29/n4

alpha = 0.05
z_alpha = stats.norm.ppf(1-alpha/2)
print(p1, z_alpha*np.sqrt(p1 * (1-p1)/n1))
print(p2, z_alpha*np.sqrt(p2 * (1-p2)/n2))
print(p3, z_alpha*np.sqrt(p3 * (1-p3)/n3))
print(p4, z_alpha*np.sqrt(p4 * (1-p4)/n4))

0.4423076923076923 0.1349913661443207
0.625 0.13695664696509088
0.38 0.13453978580000311
0.6444444444444445 0.13985832113105579


In [353]:
# prob 5.10
# 1 p4 - p3 > 0?
data = pd.DataFrame([
    ['c', 'no pain', 19], ['c', 'pain', 50-19], 
    ['d', 'no pain', 29], ['d', 'pain', 45-29], 
])
rad = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')

rad_exp = pd.DataFrame(index=rad.index, columns=rad.columns)
all_sum = np.sum(rad.sum())
row_sum = col_sum = 0

for row in list(rad_exp.index):
    row_sum = np.sum(rad.loc[row, :])
    for col in list(rad_exp.columns):
        col_sum = np.sum(rad.loc[:, col])
        rad_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
rad_chi = pd.DataFrame(index=rad.index, columns=rad.columns)
for row in list(rad_exp.index):
    for col in list(rad_exp.columns):
        rad_chi.loc[row,col] = (rad.loc[row,col]-rad_exp.loc[row,col])**2/rad_exp.loc[row,col]

f_val = np.sum(rad_chi.sum()); 

alpha = 0.05
1-stats.chi2.cdf(f_val,df=(len(rad.index)-1)*(len(rad.columns)-1))
# print(f_val, stats.chi2.ppf(1-alpha, df=(len(rad.index)-1)*(len(rad.columns)-1)))

0.010051405351573783

In [354]:
# 2 p4 - p3 95% ci?
alpha = 0.05
z_alpha = stats.norm.ppf(1-alpha/2)
print(p4-p3, z_alpha * np.sqrt(p3*(1-p3)/n3 + p4*(1-p4)/n4))

0.2644444444444445 0.1940652054148508


In [355]:
# prob 5.11
data = pd.DataFrame([
    ['test', 'bad', 4], ['test', 'ok', 8], ['test', 'better', 37],
    ['comp', 'bad', 8], ['comp', 'ok', 25], ['comp', 'better', 14],
])

heart = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')
heart_exp = pd.DataFrame(index=heart.index, columns=heart.columns)
all_sum = np.sum(heart.sum())
row_sum = col_sum = 0

for row in list(heart_exp.index):
    row_sum = np.sum(heart.loc[row, :])
    for col in list(heart_exp.columns):
        col_sum = np.sum(heart.loc[:, col])
        heart_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
heart_chi = pd.DataFrame(index=heart.index, columns=heart.columns)
for row in list(heart_exp.index):
    for col in list(heart_exp.columns):
        heart_chi.loc[row,col] = (heart.loc[row,col]-heart_exp.loc[row,col])**2/heart_exp.loc[row,col]

f_val = np.sum(heart_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(heart.index)-1)*(len(heart.columns)-1)))

20.430658917338693 5.991464547107979


In [356]:
# prob 5.12
# 1
n_test = 49 ; n_comp = 47
p_test = 37/49 ; p_comp = 14/47
alpha = 0.05
z_alpha = stats.norm.ppf(1-alpha/2)
print(p_test, z_alpha * np.sqrt(p_test*(1-p_test)/n_test))
print(p_comp, z_alpha * np.sqrt(p_comp*(1-p_comp)/n_comp))

0.7551020408163265 0.12040525894563049
0.2978723404255319 0.13074415780890009


In [357]:
# 2
alpha = 0.1
z_alpha = stats.norm.ppf(1-alpha/2)

print(p_test-p_comp, z_alpha * np.sqrt(p_test*(1-p_test)/n_test + p_comp*(1-p_comp)/n_comp))

0.4572297003907946 0.14916400055424656


In [358]:
# prob 5.14
data = pd.DataFrame([
    ['man', 'pro', 93], ['man', 'ok', 72], ['man', 'con', 21],
    ['woman', 'pro', 55], ['woman', 'ok', 79], ['woman', 'con', 30],
])

sex = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')
sex_exp = pd.DataFrame(index=sex.index, columns=sex.columns)
all_sum = np.sum(sex.sum())
row_sum = col_sum = 0

for row in list(sex_exp.index):
    row_sum = np.sum(sex.loc[row, :])
    for col in list(sex_exp.columns):
        col_sum = np.sum(sex.loc[:, col])
        sex_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
sex_chi = pd.DataFrame(index=sex.index, columns=sex.columns)
for row in list(sex_exp.index):
    for col in list(sex_exp.columns):
        sex_chi.loc[row,col] = (sex.loc[row,col]-sex_exp.loc[row,col])**2/sex_exp.loc[row,col]

f_val = np.sum(sex_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(sex.index)-1)*(len(sex.columns)-1)))

10.327442153990082 5.991464547107979


In [359]:
# prob 5.16
data = pd.DataFrame([
    ['have', 'one', 28], ['have', 'no', 19],
    ['no have', 'one', 35], ['no have', 'no', 50], 
])

gene = pd.crosstab(columns=data[1], index=data[0], values=data[2], aggfunc='sum')
gene_exp = pd.DataFrame(index=gene.index, columns=gene.columns)
all_sum = np.sum(gene.sum())
row_sum = col_sum = 0

for row in list(gene_exp.index):
    row_sum = np.sum(gene.loc[row, :])
    for col in list(gene_exp.columns):
        col_sum = np.sum(gene.loc[:, col])
        gene_exp.loc[row,col] = row_sum * col_sum / all_sum

# chi square calculation        
gene_chi = pd.DataFrame(index=gene.index, columns=gene.columns)
for row in list(gene_exp.index):
    for col in list(gene_exp.columns):
        gene_chi.loc[row,col] = (gene.loc[row,col]-gene_exp.loc[row,col])**2/gene_exp.loc[row,col]

f_val = np.sum(gene_chi.sum()); 

alpha = 0.05
print(f_val, stats.chi2.ppf(1-alpha, df=(len(gene.index)-1)*(len(gene.columns)-1)))

4.106219731185721 3.841458820694124
