# 통계적 가설검정

In [34]:
import numpy as np
import pandas as pd
from scipy import stats

%precision 3
np.random.seed(1111)

In [35]:
# 구글 드라이브에서 파일 가져오기
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
df = pd.read_csv('/content/drive/MyDrive/누구나 파이썬 통계분석/python_stat_sample-master/data/ch11_potato.csv')
sample = np.array(df['무게'])
sample

array([122.02, 131.73, 130.6 , 131.82, 132.05, 126.12, 124.43, 132.89,
       122.79, 129.95, 126.14, 134.45, 127.64, 125.68])

In [37]:
s_mean = np.mean(sample)
s_mean

128.451

## 통계적 가설검정이란

### 통계적 가설검정의 흐름

In [38]:
rv = stats.norm(130, np.sqrt(9/14))
rv.isf(0.95)

128.681

In [39]:
z = (s_mean - 130) / np.sqrt(9/14)
z

-1.932

In [40]:
rv = stats.norm()
rv.isf(0.95)

-1.645

In [41]:
rv.cdf(z)

0.027

### 단측검정과 양측검정

In [69]:
z = (s_mean - 130) / np.sqrt(9/14)
z

-1.932

In [70]:
rv = stats.norm()
rv.interval(0.95)

(-1.960, 1.960)

In [71]:
rv.cdf(z) * 2

0.053

In [72]:
rv.cdf(-1.932) * 2

0.053

In [73]:
rv.cdf(z) * 2


0.053

In [45]:
rv=stats.norm(130, 3)

In [46]:
rv.rvs()

126.100

In [47]:
rv = stats.norm(130, 3)

In [48]:
c = stats.norm().isf(0.95)
n_samples = 10000
cnt = 0
for _ in range(n_samples):
    sample_ = np.round(rv.rvs(14), 2)
    s_mean_ = np.mean(sample_)
    z = (s_mean_ - 130) / np.sqrt(9/14)
    if z < c:
        cnt += 1
cnt / n_samples

0.051

In [49]:
rv = stats.norm(128, 3)

In [50]:
c = stats.norm().isf(0.95)
n_samples = 10000
cnt = 0
for _ in range(n_samples):
    sample_ = np.round(rv.rvs(14), 2)
    s_mean_ = np.mean(sample_)
    z = (s_mean_ - 130) / np.sqrt(9/14)
    if z >= c:
        cnt += 1
        
cnt / n_samples

0.198

## 가설검정

### 정규분포의 모평균에 대한 검정(모분산을 알고 있음)

In [51]:
def pmean_test(sample, mean0, p_var, alpha=0.05):
    s_mean = np.mean(sample)
    n = len(sample)
    rv = stats.norm()
    interval = rv.interval(1-alpha)

    z = (s_mean - mean0) / np.sqrt(p_var/n)
    if interval[0] <= z <= interval[1]:
        print('귀무가설을 채택')
    else:
        print('귀무가설을 기각')

    if z < 0:
        p = rv.cdf(z) * 2
    else:
        p = (1 - rv.cdf(z)) * 2
    print(f'p값은 {p:.3f}')

In [52]:
pmean_test(sample, 130, 9)

귀무가설을 채택
p값은 0.053


### 정규분포의 모분산에 대한 검정

In [53]:
def pvar_test(sample, var0, alpha=0.05):
    u_var = np.var(sample, ddof=1)
    n = len(sample)
    rv = stats.chi2(df=n-1)
    interval = rv.interval(1-alpha)
    
    y = (n-1) * u_var / var0
    if interval[0] <= y <= interval[1]:
        print('귀무가설을 채택')
    else:
        print('귀무가설을 기각')

    if y < rv.isf(0.5):
        p = rv.cdf(y) * 2
    else:
        p = (1 - rv.cdf(y)) * 2
    print(f'p값은 {p:.3f}')

In [54]:
pvar_test(sample, 9)

귀무가설을 채택
p값은 0.085


### 정규분포의 모평균에 대한 검정(모분산을 알지 못함)

In [55]:
def pmean_test(sample, mean0, alpha=0.05):
    s_mean = np.mean(sample)
    u_var = np.var(sample, ddof=1)
    n = len(sample)
    rv = stats.t(df=n-1)
    interval = rv.interval(1-alpha)

    t = (s_mean - mean0) / np.sqrt(u_var/n)
    if interval[0] <= t <= interval[1]:
        print('귀무가설을 채택')
    else:
        print('귀무가설을 기각')

    if t < 0:
        p = rv.cdf(t) * 2
    else:
        p = (1 - rv.cdf(t)) * 2
    print(f'p값은 {p:.3f}')

In [56]:
pmean_test(sample, 130)

귀무가설을 채택
p값은 0.169


In [57]:
t, p = stats.ttest_1samp(sample, 130)
t, p

(-1.455, 0.169)

In [58]:
rv.cdf(0.169)

0.000

In [59]:
rv.cdf(-1.455)

0.000

## 2표본 문제에 관한 가설검정

### 대응비교 t검정

In [60]:
training_rel = pd.read_csv('/content/drive/MyDrive/누구나 파이썬 통계분석/python_stat_sample-master/data/ch11_training_rel.csv')
print(training_rel.shape)
training_rel.head()

(20, 2)


Unnamed: 0,전,후
0,59,41
1,52,63
2,55,68
3,61,59
4,59,84


In [61]:
training_rel['차'] = training_rel['후'] - training_rel['전']
training_rel.head()

Unnamed: 0,전,후,차
0,59,41,-18
1,52,63,11
2,55,68,13
3,61,59,-2
4,59,84,25


In [62]:
t, p = stats.ttest_1samp(training_rel['차'], 0)
p

0.040

In [63]:
t, p = stats.ttest_rel(training_rel['후'], training_rel['전'])
p

0.040

### 독립비교 t검정

In [64]:
training_ind = pd.read_csv('../data/ch11_training_ind.csv')
print(training_ind.shape)
training_ind.head()

FileNotFoundError: ignored

In [None]:
t, p = stats.ttest_ind(training_ind['A'], training_ind['B'],
                       equal_var=False)
p

### 윌콕슨의 부호순위검정

In [None]:
training_rel = pd.read_csv('../data/ch11_training_rel.csv')
toy_df = training_rel[:6].copy()
toy_df

In [None]:
cnt = 0


In [None]:
cnt = 0
alpha = 0.05
for diff in diffs:

cnt / n

### 만・위트니의 U검정

In [None]:
training_ind = pd.read_csv('../data/ch11_training_ind.csv')
toy_df = training_ind[:5].copy()
toy_df

In [None]:
rank = stats.rankdata(np.concatenate([toy_df['A'],


### 카이제곱검정

In [None]:
ad_df = pd.read_csv('../data/ch11_ad.csv')
n = len(ad_df)
print(n)
ad_df.head()

In [None]:
ad_cross = pd.crosstab(ad_df['광고'], ad_df['구입'])
ad_cross

In [None]:
ad_cross['했다'] / (ad_cross['했다'] + ad_cross['하지 않았다'])

In [None]:
n_not, n_yes = ad_cross.sum()
n_not, n_yes

In [None]:
n_adA, n_adB = ad_cross.sum(axis=1)
n_adA, n_adB

In [None]:
ad_ef = pd.DataFrame({'했다': [n_adA * n_yes / n,
                              n_adB * n_yes / n],
                      '하지 않았다': [n_adA * n_not / n,
                                   n_adB * n_not / n]},
                      index=['A', 'B'])
ad_ef

In [None]:
y = ((ad_cross - ad_ef) ** 2 / ad_ef).sum().sum()
y

In [None]:
rv = stats.chi2(1)
1 - rv.cdf(y)

In [None]:
chi2, p, dof, ef = stats.chi2_contingency(ad_cross,
                                          correction=False)
chi2, p, dof

In [None]:
ef