In [1]:

from pandas import read_excel
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy.stats import shapiro
from scipy.stats import levene
from scipy.stats import bartlett
from scipy.stats import ttest_ind

from statsmodels.sandbox.stats.multicomp import MultiComparison
from statsmodels.stats.multicomp import pairwise_tukeyhsd


In [2]:
# Remote workbook holding one row per observation: a numeric `weight`
# and the `group` label the observation belongs to.
DATA_URL = 'http://itpaper.co.kr/data/group_weight.xlsx'

# The .xlsx reader is requested explicitly via the openpyxl engine.
df = read_excel(io=DATA_URL, engine='openpyxl')

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,weight,group
0,4.17,ctrl
1,5.58,ctrl
2,5.18,ctrl
3,6.11,ctrl
4,4.5,ctrl


In [3]:
# Collect the distinct labels found in the `group` column and display them.
unique = df.loc[:, "group"].unique()
unique

array(['ctrl', 'trt1', 'trt2'], dtype=object)

In [4]:
# Numeric codes that replace the text labels of the three groups.
group_codes = {'ctrl': 1, 'trt1': 2, 'trt2': 3}

# Convert the column to a categorical first, then relabel its categories.
group_as_category = df['group'].astype('category')
df['group'] = group_as_category.cat.rename_categories(group_codes)

# Refresh the distinct levels so they reflect the new numeric codes.
unique = df['group'].unique()
unique

[1, 2, 3]
Categories (3, int64): [1, 2, 3]

In [5]:
# One-way ANOVA: `weight` modelled on `group`, with the group column
# wrapped in C(...) so the formula treats it as a categorical factor.
model = ols(formula='weight ~ C(group)', data=df)
fit = model.fit()

# ANOVA table (df, sums of squares, F, p-value) for the fitted model.
result = anova_lm(fit)
result

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(group),2.0,3.76634,1.88317,4.846088,0.01591
Residual,27.0,10.49209,0.388596,,


In [6]:

# Shapiro-Wilk normality test on the weights of each group level in turn.
for u in unique:
    # Boolean mask selecting the rows that belong to the current level.
    in_group = df['group'] == u
    s = shapiro(df.loc[in_group, 'weight'])
    # Raw result object first, then a rounded one-line summary per level.
    print(s)
    print("%s 수준의 검정통계량: %0.2f, p-value: %0.2f\n" % (u, s.statistic, s.pvalue))


ShapiroResult(statistic=0.9566815495491028, pvalue=0.7474744915962219)
1 수준의 검정통계량: 0.96, p-value: 0.75

ShapiroResult(statistic=0.9304108619689941, pvalue=0.451945960521698)
2 수준의 검정통계량: 0.93, p-value: 0.45

ShapiroResult(statistic=0.9410051107406616, pvalue=0.5642509460449219)
3 수준의 검정통계량: 0.94, p-value: 0.56



In [7]:
# Levene's test for equality of variances across the groups.
# The samples are built from the labels actually present in `group` instead of
# hard-coding 1, 2, 3, so the cell keeps working if the group coding or the
# number of groups changes. Missing labels are dropped so they cannot produce
# an empty sample.
levene_samples = [
    df.loc[df['group'] == label, 'weight']
    for label in df['group'].dropna().unique()
]
levene(*levene_samples)

LeveneResult(statistic=1.1191856948703909, pvalue=0.3412266241254737)

In [8]:
# Bartlett's test for equality of variances across the groups.
# The samples are built from the labels actually present in `group` instead of
# hard-coding 1, 2, 3, so the cell keeps working if the group coding or the
# number of groups changes. Missing labels are dropped so they cannot produce
# an empty sample.
bartlett_samples = [
    df.loc[df['group'] == label, 'weight']
    for label in df['group'].dropna().unique()
]
bartlett(*bartlett_samples)


BartlettResult(statistic=2.8785737872360935, pvalue=0.23709677363455822)

In [9]:
# Pairwise two-sample t-tests between every pair of groups.
comp = MultiComparison(data=df["weight"], groups=df["group"])
result = comp.allpairtest(ttest_ind)

# The first element of the returned value is the printable summary table.
pairwise_table = result[0]
print(pairwise_table)

Test Multiple Comparison ttest_ind 
FWER=0.05 method=bonf
alphacSidak=0.02, alphacBonf=0.017
group1 group2   stat   pval  pval_corr reject
---------------------------------------------
     1      2  1.1913  0.249    0.7471  False
     1      3  -2.134 0.0469    0.1406  False
     2      3 -3.0101 0.0075    0.0226   True
---------------------------------------------


In [10]:
# Tukey HSD post-hoc comparison of mean weight between every pair of groups.
hsd = pairwise_tukeyhsd(endog=df['weight'], groups=df['group'])

# Render the comparison as a text table.
tukey_table = hsd.summary()
print(tukey_table)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2   -0.371 0.3921 -1.0621 0.3201  False
     1      3    0.494  0.198 -0.1971 1.1851  False
     2      3    0.865  0.012  0.1739 1.5561   True
---------------------------------------------------


NameError: name 'DataFrame' is not defined