### 단일 표본 검정

In [15]:
import pandas as pd
from scipy import stats

# 25 caffeine measurements, in mg per the column name.
df = pd.DataFrame({
    'Caffeine(mg)': [
        94.2,93.7,95.5,93.9,94.0,95.2,94.7,93.5,92.8,94.4,
        93.8,94.6,93.3,95.1,94.3,94.9,93.9,94.8,95.0,94.2,
        93.7,94.4,95.1,94.0,93.6
    ]
})

# Sample mean (printed per column; there is only one column here).
print(df.mean())

caffeine = df['Caffeine(mg)']

# Normality check (Shapiro-Wilk).
print(stats.shapiro(caffeine))

# One-sample t-test against a reference mean of 95 with alternative='less'.
# The test is run once and the same result object feeds both printouts,
# instead of recomputing it with identical arguments.
ttest_result = stats.ttest_1samp(caffeine, 95, alternative='less')
print(ttest_result)
statistic, pvalue = ttest_result
# Fixed-point formatting so the small p-value is not shown in scientific notation.
print("{:.10f}".format(pvalue))

Caffeine(mg)    94.264
dtype: float64
ShapiroResult(statistic=0.9826578166170536, pvalue=0.9322031137746971)
TtestResult(statistic=-5.501737036221897, pvalue=5.8686553916715e-06, df=24)
0.0000058687


### 독립 표본 검정

In [25]:
import pandas as pd
from scipy import stats

# Ten rows labelled 'New' followed by ten labelled 'Old', one value per row.
df = pd.DataFrame({
    '충전기': ['New'] * 10 + ['Old'] * 10,
    '충전시간': [
        1.5, 1.6, 1.4, 1.7, 1.5, 1.6, 1.7, 1.4, 1.6, 1.5,
        1.7, 1.8, 1.7, 1.9, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6,
    ],
})
print(df.head(2))

# Independent two-sample t-test.
# Boolean masks picking out each label, then the matching value columns.
new_cond = df['충전기'] == 'New'
old_cond = df['충전기'] == 'Old'
new_times = df.loc[new_cond, '충전시간']
old_times = df.loc[old_cond, '충전시간']

# alternative='less' with equal_var=True (equal variances are assumed here).
ttest_result = stats.ttest_ind(
    new_times,
    old_times,
    alternative='less',
    equal_var=True,
)
print(ttest_result)

   충전기  충전시간
0  New   1.5
1  New   1.6
TtestResult(statistic=-4.582575694955849, pvalue=0.00011546547787696304, df=18.0)


In [31]:
# Paired-samples test (matched pairs)
import pandas as pd
from scipy import stats

# One row per user (numbered 1 through 10) with a value under each of the
# two columns: '기존방법' (existing method) and '새로운방법' (new method).
df = pd.DataFrame({
    'User': [*range(1, 11)],
    '기존방법': [60.4, 60.7, 60.5, 60.3, 60.8, 60.6, 60.2, 60.5, 60.7, 60.4],
    '새로운방법': [59.8, 60.2, 60.1, 59.9, 59.7, 58.4, 57.0, 60.3, 59.6, 59.8],
})

new_method = df['새로운방법']
old_method = df['기존방법']

# Mean of the per-user difference (new minus existing), stored as 'diff'.
df['diff'] = new_method - old_method
print(df['diff'].mean())

# Paired t-test with alternative='less'.
print(stats.ttest_rel(new_method, old_method, alternative='less'))

-1.0300000000000005
TtestResult(statistic=-3.407973078114844, pvalue=0.0038872633380070652, df=9)


In [53]:
# One-way analysis of variance (ANOVA)
import pandas as pd
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

df = pd.read_csv("math.csv")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line after the report.
print(df.info())
print(df.head())

# Boolean masks, one per group label in the 'groups' column.
condA = df['groups'] == 'group_A'
condB = df['groups'] == 'group_B'
condC = df['groups'] == 'group_C'
condD = df['groups'] == 'group_D'

# The 'scores' values of each group, kept in A, B, C, D order.
group_scores = [df.loc[cond, 'scores'] for cond in (condA, condB, condC, condD)]

# Shapiro-Wilk test (normality), one result per group.
for scores in group_scores:
    print(stats.shapiro(scores))

# Levene test (homogeneity of variances) across the four groups.
print(stats.levene(*group_scores))

# Fit the model used for the one-way ANOVA.
model = ols('scores ~ groups', df).fit()

# ANOVA table
print(anova_lm(model))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   groups  40 non-null     object
 1   scores  40 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 772.0+ bytes
None
    groups  scores
0  group_A      85
1  group_A      88
2  group_A      90
3  group_A      82
4  group_A      87
ShapiroResult(statistic=0.9715896670696531, pvalue=0.9051800443853569)
ShapiroResult(statistic=0.9499422438060351, pvalue=0.6678172590861611)
ShapiroResult(statistic=0.9299424104842702, pvalue=0.44732595113862045)
ShapiroResult(statistic=0.9065684572704982, pvalue=0.25824165549017347)
LeveneResult(statistic=1.757685352622062, pvalue=0.17270284963232108)
            df  sum_sq     mean_sq          F        PR(>F)
groups     3.0   411.8  137.266667  34.174274  1.240642e-10
Residual  36.0   144.6    4.016667        NaN           NaN
