# 단일표본 t-검정

In [2]:
import pandas as pd

import scipy.stats as stats
from math import sqrt

# Load the trees dataset and show it as the cell result.
a = pd.read_csv('data/trees.csv')
a

Unnamed: 0,Girth,Height,Volume
0,8.3,70,10.3
1,8.6,65,10.3
2,8.8,63,10.2
3,10.5,72,16.4
4,10.7,81,18.8
5,10.8,83,19.7
6,11.0,66,15.6
7,11.0,75,18.2
8,11.1,80,22.6
9,11.2,75,19.9


In [4]:
# Mean of the Height column, rounded to 2 decimals.
round(a['Height'].mean(),2) # sample mean

76.0

In [6]:
# One-sample t-test of H0: population mean of Height equals 75.
# ttest_1samp is two-sided unless `alternative` is given.
t_score, p_value = stats.ttest_1samp(a['Height'],75)

In [9]:
print(round(t_score,2)) # test statistic

0.87


In [12]:
# Report the p-value, then the decision at the 0.05 significance level:
# '채택' (accept H0) when p >= 0.05, otherwise '기각' (reject H0).
print(round(p_value, 4))

print('채택' if p_value >= 0.05 else '기각')

0.3892
채택


# 독립표본 t-검정

In [1]:
import pandas as pd

import scipy.stats as stats
from math import sqrt

# Load the tooth-growth dataset and preview its first rows.
a = pd.read_csv('data/toothgrowth.csv')
a.head()

Unnamed: 0,len,supp,dose
0,4.2,VC,0.5
1,11.5,VC,0.5
2,7.3,VC,0.5
3,5.8,VC,0.5
4,6.4,VC,0.5


In [2]:
# Levene's test: check whether `len` has equal variance in the VC and OJ
# groups (the equal-variance assumption of the pooled t-test).
stats.levene(
    a.loc[a['supp'] == 'VC', 'len'],
    a.loc[a['supp'] == 'OJ', 'len'],
)

LeveneResult(statistic=1.2135720656945064, pvalue=0.2751764616144052)

In [3]:
# Split the rows by supplement type.
# NOTE: despite its name, vc_b holds the OJ group, not a second VC group.
vc_a = a.loc[a['supp'] == 'VC']
vc_b = a.loc[a['supp'] == 'OJ']

In [4]:
# Sample mean of `len` for each group: vc_a (supp == 'VC') first,
# then vc_b (supp == 'OJ').
print(round(vc_a['len'].mean(),2))
print(round(vc_b['len'].mean(),2)) # sample mean

16.96
20.66


In [6]:
# Independent two-sample t-test on `len` between the VC and OJ groups.
# equal_var=True selects the pooled-variance (Student) form of the test.
t_score, p_value = stats.ttest_ind(vc_a['len'], vc_b['len'], equal_var = True)

In [8]:
# Test statistic, rounded to 2 decimals.
print(round(t_score,2))

-1.92


In [10]:
# Report the p-value, then the decision at the 0.05 significance level:
# '채택' (accept H0) when p >= 0.05, otherwise '기각' (reject H0).
print(round(p_value, 4))

print('채택' if p_value >= 0.05 else '기각')

0.0604
채택


# 쌍체 표본 t-검정

In [11]:
# Load the insect-spray dataset and preview its first rows.
a = pd.read_csv('data/insectsprays.csv')
a.head()

Unnamed: 0,before_spr,after_spr
0,10,0
1,7,1
2,20,7
3,14,2
4,14,3


In [15]:
# Per-row change, computed as after minus before (negative means a decrease).
diff = a['after_spr']-a['before_spr']
round(diff.mean(),2) # sample mean

-12.0

In [19]:
# Paired t-test on before_spr vs after_spr. alternative='greater' makes it
# one-sided: H1 is that the mean of (before_spr - after_spr) is above zero.
# NOTE: this is before minus after, so the statistic has the opposite sign
# of `diff`, which is computed as after minus before.
t_score, p_value = stats.ttest_rel(a['before_spr'],a['after_spr'], alternative='greater')

In [17]:
# Test statistic, rounded to 2 decimals.
print(round(t_score,2))

14.89


In [20]:
# Report the p-value, then the decision at the 0.05 significance level:
# '채택' (accept H0) when p >= 0.05, otherwise '기각' (reject H0).
# The comparison is >= (not >) so that p == 0.05 is classified the same way
# as in the other hypothesis tests of this notebook.
print(round(p_value, 4))

if p_value >= 0.05:
    print('채택')
else:
    print('기각')

0.0
기각


# 일원분산분석(One-way ANOVA)

In [21]:
import pandas as pd
import scipy.stats as stats

# Load the iris dataset and preview its first rows.
data = pd.read_csv('data/iris.csv')
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [24]:
# Sample mean of sepal_width for the versicolor rows, rounded to 2 decimals.
print(
    round(
        data.loc[data['species'] == 'versicolor', 'sepal_width'].mean(),
        2,
    )
)

2.77


In [25]:
# Distinct species labels; these are the groups compared by the ANOVA.
data['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [29]:
# One sepal_width sample per species, in the order setosa, versicolor,
# virginica.
x1, x2, x3 = (
    data.loc[data['species'] == species, 'sepal_width']
    for species in ('setosa', 'versicolor', 'virginica')
)

# One-way ANOVA across the three samples. The first returned value is the
# F statistic, even though it is stored under the name t_score.
t_score, p_value = stats.f_oneway(x1, x2, x3)

In [30]:
# t_score was assigned from stats.f_oneway, so this prints the F statistic.
print(round(t_score,2))

49.16


In [31]:
# Report the p-value, then the decision at the 0.05 significance level:
# '채택' (accept H0) when p >= 0.05, otherwise '기각' (reject H0).
print(round(p_value, 4))

print('채택' if p_value >= 0.05 else '기각')

0.0
기각


# 카이제곱검정(적합도)

In [1]:
import pandas as pd
import scipy.stats as stats

# Load the cellphone survey dataset and preview its first rows.
data = pd.read_csv('data/cellphone.csv')
data.head()

Unnamed: 0,id,제조사
0,1,삼성
1,2,애플
2,3,삼성
3,4,애플
4,5,애플


In [13]:
# Count the non-null values of the remaining columns per manufacturer
# ('제조사'). The result is indexed by manufacturer, sorted by label.
count_df = data.groupby('제조사').count()
count_df

Unnamed: 0_level_0,id
제조사,Unnamed: 1_level_1
기타,5
삼성,117
애플,78


In [15]:
# Difference between the '삼성' (Samsung) and '애플' (Apple) counts.
count_df.loc['삼성','id'] - count_df.loc['애플','id']

39

In [16]:
# Expected counts under the hypothesised shares of 10% / 60% / 30%.
# The order has to line up with the rows of count_df, which groupby returns
# sorted by manufacturer label: 기타, 삼성, 애플.
# The products are kept as floats: truncating them with int() makes the
# expected counts sum to less than the observed total whenever a product is
# not a whole number, and stats.chisquare needs both totals to agree.
total_sum = len(data)
expected = [total_sum * share for share in (0.1, 0.6, 0.3)]

print(expected)

[20.0, 120.0, 60.0]


In [17]:
# Observed counts per manufacturer, in count_df's row order.
observed = count_df['id'].values
# Chi-square goodness-of-fit test of the observed counts against `expected`
# (which must be listed in the same order). The first returned value is the
# chi-square statistic, stored under the name t_score.
t_score, p_value = stats.chisquare(observed, f_exp = expected)

In [18]:
# t_score was assigned from stats.chisquare, so this prints the chi-square
# statistic.
print(round(t_score,2))

16.73


In [19]:
# Report the p-value, then the decision at the 0.05 significance level:
# '채택' (accept H0) when p >= 0.05, otherwise '기각' (reject H0).
print(round(p_value, 4))

print('채택' if p_value >= 0.05 else '기각')

0.0002
기각


# 카이제곱검정(독립성 검정)

In [20]:
import pandas as pd
import scipy.stats as stats

# Load the student dataset and preview its first rows.
a = pd.read_csv('data/student.csv')
a.head()

Unnamed: 0,ID,단과대학,수강 과목 수
0,1,경영대학,4개 이하
1,2,경영대학,4개 이하
2,3,경영대학,4개 이하
3,4,경영대학,4개 이하
4,5,경영대학,4개 이하


In [25]:
# Contingency table of counts: one row per '단과대학' (college) value and
# one column per '수강 과목 수' (number of courses taken) value.
table = pd.crosstab(a['단과대학'],a['수강 과목 수'])
table

수강 과목 수,4개 이하,5개,6개 이상
단과대학,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
경영대학,12,78,30
공과대학,30,42,28
자연대학,30,42,8


In [29]:
# College (row label) with the largest count in the '6개 이상' column.
# idxmax returns the label of the first occurrence of the maximum.
table['6개 이상'].idxmax()

'경영대학'

In [37]:
# One row of observed counts per college, in the order 경영대학, 공과대학,
# 자연대학.
x1, x2, x3 = (
    table.loc[college, :]
    for college in ('경영대학', '공과대학', '자연대학')
)

# Chi-square test of independence on the 3-row contingency table.
test_result = stats.chi2_contingency([x1, x2, x3])

In [39]:
# Chi-square statistic of the independence test, rounded to 2 decimals.
print(round(test_result.statistic,2))

30.13


In [44]:
# Report the p-value, then the decision at the 0.05 significance level:
# '채택' (accept H0) when p >= 0.05, otherwise '기각' (reject H0).
print(round(test_result.pvalue, 4))

print('채택' if test_result.pvalue >= 0.05 else '기각')

0.0
기각
