## 일원배치 분산분석(one-way ANOVA)

### 기본 가정
#### 1. 각 변수는 정규분포를 따라야함. (검정방법 : Shapiro test) 
* 샤피로 검정은 p-value구하기 까다롭기 때문에 왜도 절댓값이 2 미만으로 정규성 확인
* H0 : 변수는 정규분포를 따를 것이다.
* H1 : 변수는 정규분포를 따르지 않을 것이다.

#### 2. 각 변수는 동일한 수준의 분산을 가져야함.
* Barlett 검정, levene 방법 사용
* H0 : 변수 간 분산에 유의미한 차이가 없을 것이다.(비슷 할 것이다)
* H1 : 변수 간 분산에 유의미한 차이가 있을 것이다.

#### 3. 각 변수는 독립적이다. (변수 사이에 영향을 주지 않는다.)

In [2]:
import pandas as pd
import numpy as np

pd.options.display.float_format = '{:.3f}'.format

In [5]:
data = pd.DataFrame({'feed' : [1,1,1,1,1,1,1,1,1,
                               2,2,2,2,2,2,2,2,2,
                               3,3,3,3,3,3,3,3,3,
                               4,4,4,4,4,4,4,4,4],
                     'kg' : [30.3, 33.7, 32.3, 31.1, 32.2, 32.3, 34.2, 35.2, 32.3,
                             32.6, 37.4, 32.2, 35.3, 36.2, 32.8, 35.3, 33.8, 34.2,
                             31.3, 29.3, 35.3, 31.1, 30.0, 33.2, 31.3, 36.3, 32.4,
                             32.4, 30.5, 31.2, 33.1, 34.1, 32.2, 33.1, 32.3, 31.5]})
data.head(10)

Unnamed: 0,feed,kg
0,1,30.3
1,1,33.7
2,1,32.3
3,1,31.1
4,1,32.2
5,1,32.3
6,1,34.2
7,1,35.2
8,1,32.3
9,2,32.6


In [54]:
data['feed'] = data['feed'].astype(object)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   feed    36 non-null     object 
 1   kg      36 non-null     float64
dtypes: float64(1), object(1)
memory usage: 704.0+ bytes


In [55]:
x1 = data[0:9]
x2 = data[9:18]
x3 = data[18:27]
x4 = data[27:36]
print(len(x1),len(x2),len(x3),len(x4))

9 9 9 9


In [56]:
from scipy.stats import shapiro

In [57]:
p_value_feed1 = shapiro(x1)
print('사료 1 :', p_value_feed1)

p_value_feed2 = shapiro(x2)
print('사료 2 :', p_value_feed2)

p_value_feed3 = shapiro(x3)
print('사료 3 :', p_value_feed3)

p_value_feed4 = shapiro(x4)
print('사료 4 :', p_value_feed4)

사료 1 : ShapiroResult(statistic=0.6818474531173706, pvalue=4.949344656779431e-05)
사료 2 : ShapiroResult(statistic=0.6876568794250488, pvalue=5.758753832196817e-05)
사료 3 : ShapiroResult(statistic=0.7068629860877991, pvalue=9.60638135438785e-05)
사료 4 : ShapiroResult(statistic=0.6745582818984985, pvalue=4.101284866919741e-05)


In [58]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm, anova_single

model = ols('kg ~ C(feed)', data).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(feed),3.0,29.022,9.674,3.207,0.036
Residual,32.0,96.533,3.017,,


In [87]:
data = pd.DataFrame({'country' : [1,1,1,1,
                                  2,2,2,2,
                                  3,3,3,3],
                     'temp' : [180,190,200,210,
                               180,190,200,210,
                               180,190,200,210],
                     'value' : [97.6, 98.6, 99.0, 98.0,
                                97.3, 98.2, 98.0, 97.7,
                                96.7, 96.9, 97.9, 96.5]})
data

Unnamed: 0,country,temp,value
0,1,180,97.6
1,1,190,98.6
2,1,200,99.0
3,1,210,98.0
4,2,180,97.3
5,2,190,98.2
6,2,200,98.0
7,2,210,97.7
8,3,180,96.7
9,3,190,96.9


In [88]:
model = ols('value ~ C(temp) * C(country)', data).fit()
anova_lm(model)

  (model.ssr / model.df_resid))


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(temp),3.0,2.22,0.74,0.0,
C(country),2.0,3.44,1.72,0.0,
C(temp):C(country),6.0,0.56,0.093,0.0,
Residual,0.0,0.0,inf,,
