In [1]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

# Chap3 평균 검정
## 3.1 T검정

In [2]:
t_val = (135-115)/(25/np.sqrt(20))
t_val

3.577708763999664

In [3]:
# pt(t_val, df=20-1, lower.tail=False)
stats.t.cdf(-t_val, df=20-1)

0.0010038537130763801

In [4]:
stats.t.cdf(-t_val, df=20-1)*2

0.0020077074261527603

In [5]:
stats.t.ppf(1-0.025, df=20-1)

2.093024054408263

## 3.2 일표본 평균 검정

### 평균 검정

In [37]:
cats = pd.read_csv("data/cats.csv")
cats.head()

Unnamed: 0,rownames,Sex,Bwt,Hwt
0,1,F,2.0,7.0
1,2,F,2.0,7.4
2,3,F,2.0,9.5
3,4,F,2.1,7.2
4,5,F,2.1,7.3


In [7]:
stats.ttest_1samp(cats['Bwt'], popmean=2.6)

TtestResult(statistic=3.0564867998078107, pvalue=0.0026730362561723613, df=143)

In [8]:
stats.ttest_1samp(cats['Bwt'], popmean=2.6).confidence_interval()

ConfidenceInterval(low=2.6436692906770736, high=2.8035529315451493)

In [9]:
stats.ttest_1samp(cats['Bwt'], popmean=2.6).confidence_interval(0.99)

ConfidenceInterval(low=2.618030752171321, high=2.829191470050902)

In [10]:
stats.ttest_1samp(cats['Bwt'], popmean=2.6, alternative='greater')

TtestResult(statistic=3.0564867998078107, pvalue=0.0013365181280861806, df=143)

### 비율 검정

In [11]:
#prop.test(x=18, n=30, p=0.5, alternative="greater")
table = np.array([[18, 72], [12, 72]])
stats.chi2_contingency(table).pvalue

0.4258425213937843

## 3.3 독립표본 평균검정

In [38]:
cats['Sex'].value_counts()

Sex
M    97
F    47
Name: count, dtype: int64

In [41]:
male = cats.loc[cats['Sex']=='M']['Bwt']
female = cats.loc[cats['Sex']=='F']['Bwt']
print(np.mean(male), np.mean(female))

2.8999999999999995 2.359574468085107


In [43]:
stats.levene(male, female)

LeveneResult(statistic=19.43101190877999, pvalue=2.0435285255189404e-05)

In [42]:
stats.ttest_ind(a=male, b=female, equal_var=False)

TtestResult(statistic=8.70948849909559, pvalue=8.831034455859356e-15, df=136.83788299625363)

In [56]:
patients = [86, 93, 136, 82] ; smokers = [83, 90, 129, 70]
smoker_table = np.array([smokers, np.subtract(patients, smokers)]) # 우측값이 전체 값이면 안됨
smoker_table
# stats.levene(patients, smokers)

array([[ 83,  90, 129,  70],
       [  3,   3,   7,  12]])

In [55]:
# 비율 검정
stats.chi2_contingency(smoker_table)

Chi2ContingencyResult(statistic=12.600411297127632, pvalue=0.005585476661627042, dof=3, expected_freq=array([[ 80.58438287,  87.14357683, 127.43576826,  76.83627204],
       [  5.41561713,   5.85642317,   8.56423174,   5.16372796]]))

## 3.4 대응표본 평균검정
- 독립표본 평균검정 : 두 표본이 서로 독립인 모집단에서 추출 가정
ex) 무작위로 실험테스트에 응하게 하고, 각 실험자에 대해 하나씩 테스트 점수
- 검정하려고 하는 두 개의 표본이 서로 독립이 아닌 모집단에서 추출된 경우
ex) 무작위로 실험대상자 선정하고, 각 실험자 대상으로 테스트를 두 차례 실시하여 점수를 얻음

In [58]:
sleep = pd.read_csv("data/sleep.csv")
sleep.head()

Unnamed: 0,rownames,extra,group,ID
0,1,0.7,1,1
1,2,-1.6,1,2
2,3,-0.2,1,3
3,4,-1.2,1,4
4,5,-0.1,1,5


In [60]:
sleep['group'].value_counts()

group
1    10
2    10
Name: count, dtype: int64

In [61]:
group1 = sleep.loc[sleep['group']==1]['extra']
group2 = sleep.loc[sleep['group']==2]['extra']
print(group1.shape, group2.shape)

(10,) (10,)


In [62]:
stats.levene(group1, group2)

LeveneResult(statistic=0.2482049608355091, pvalue=0.6243742854148879)

In [64]:
stats.ttest_rel(group1, group2)

TtestResult(statistic=-4.062127683382037, pvalue=0.00283289019738427, df=9)