## Statsmodels 활용 선형회귀 / 분산 분석

In [63]:
from scipy.stats import norm
from statsmodels.api import stats
from statsmodels.formula.api import ols
import pandas as pd
# Toy dataset: x runs from 1 to 10 and y holds the first ten primes.
data = pd.DataFrame(
    {
        'x': list(range(1, 11)),
        'y': [2, 3, 5, 7, 11, 13, 17, 19, 23, 29],
    }
)

# Ordinary least squares fit of y on x; the formula adds an intercept term.
lm = ols(formula='y ~ x', data=data).fit()
# Alternative model: drop the constant and add a quadratic term.
# lm = ols('y ~ x + I(x**2) - 1', data=data).fit()
lm.summary()





  res = hypotest_fun_out(*samples, **kwds)


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.998
Model:,OLS,Adj. R-squared (uncentered):,0.998
Method:,Least Squares,F-statistic:,2226.0
Date:,"Wed, 09 Oct 2024",Prob (F-statistic):,1.03e-11
Time:,23:04:02,Log-Likelihood:,-9.9681
No. Observations:,10,AIC:,23.94
Df Residuals:,8,BIC:,24.54
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x,1.2378,0.150,8.235,0.000,0.891,1.584
I(x ** 2),0.1572,0.019,8.483,0.000,0.114,0.200

0,1,2,3
Omnibus:,1.585,Durbin-Watson:,2.231
Prob(Omnibus):,0.453,Jarque-Bera (JB):,0.761
Skew:,-0.1,Prob(JB):,0.684
Kurtosis:,1.664,Cond. No.,33.1


In [39]:
# Headline statistics of the fitted model, shown together as one tuple.
(
    lm.ess,       # explained (model) sum of squares
    lm.ssr,       # sum of squared residuals
    lm.fvalue,    # F statistic of the overall regression
    lm.f_pvalue,  # p-value of that F statistic
    lm.tvalues,   # t statistic of each coefficient
)

(712.8030303030303,
 20.09696969696969,
 283.745476477684,
 1.563224914620566e-07,
 Intercept    -3.017040
 x            16.844746
 dtype: float64)

In [40]:
from scipy.stats import norm

# Two synthetic one-factor datasets with levels A/B/C, identical spread and
# identical group sizes; they differ only in the group means.
# NOTE(review): every draw uses random_state=123, so each group appears to
# reuse the same noise sequence (only shifted by its mean) -- confirm that
# this is intended, since the groups are then not independent samples.
sig = 0.2
n = {'A': 30, 'B': 25, 'C': 35}

# Case 1: all levels share the same mean, so X does not depend on factor.
mu = {'A': 0.3, 'B': 0.3, 'C': 0.3}
df_ind = pd.concat([
    pd.DataFrame(
        {'X': norm.rvs(loc=center, scale=sig, size=n[level], random_state=123)}
    ).assign(factor=level)
    for level, center in mu.items()
])

# Case 2: the mean increases from A to C, so X depends on factor.
mu = {'A': 0.3, 'B': 0.5, 'C': 0.7}
df_dep = pd.concat([
    pd.DataFrame(
        {'X': norm.rvs(loc=center, scale=sig, size=n[level], random_state=123)}
    ).assign(factor=level)
    for level, center in mu.items()
])

In [41]:
# Preview the equal-means dataset (the notebook display truncates the middle rows).
df_ind

Unnamed: 0,X,factor
0,0.082874,A
1,0.499469,A
2,0.356596,A
3,-0.001259,A
4,0.184280,A
...,...,...
30,0.248876,C
31,-0.259718,C
32,-0.054307,C
33,0.160025,C


In [42]:
# Preview the shifted-means dataset (the notebook display truncates the middle rows).
df_dep

Unnamed: 0,X,factor
0,0.082874,A
1,0.499469,A
2,0.356596,A
3,-0.001259,A
4,0.184280,A
...,...,...
30,0.648876,C
31,0.140282,C
32,0.345693,C
33,0.560025,C


In [43]:
# One-way ANOVA of X across the levels of `factor` in df_ind.

lm = ols(formula='X ~ C(factor)', data=df_ind).fit()
# typ=1 (sequential sums of squares) is anova_lm's default, spelled out here.
df_anova = stats.anova_lm(lm, typ=1)
df_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(factor),2.0,0.03303,0.016515,0.275669,0.759725
Residual,87.0,5.212032,0.059908,,


**[Ex.2]**

<div style="border: 1px solid #ddd; padding: 12px; margin-top: 10px">

df_dep에서 factor에 대한 X의 모집단에 대한 분산분석을 합니다.

분산 분석에 앞서 factor의 범주에 따라 모집단이 정규분포를 따르는지 Kolmogorov-Smirnov 검정으로 확인하고,

Bartlett 검정을 통해 서로 등분산인지 확인합니다.
    
</div>

In [55]:
from scipy.stats import kstest, bartlett, zscore

# One array of X values per factor level, in the order A, B, C.
test_list = [
    df_dep.loc[df_dep['factor'] == level, 'X'].values
    for level in ('A', 'B', 'C')
]

# Incorrect usage: kstest(*test_list) would check whether the samples follow
# the same distribution as each other, not whether each one is normal.

# Standardize each group (sample std, ddof=1) and run a one-sample KS test
# against the standard normal CDF, one result per group.
# NOTE(review): mean and std are estimated from the same sample, so the plain
# KS p-values are presumably optimistic; a Lilliefors-type correction may be
# more appropriate -- confirm.
tuple(kstest(zscore(sample, ddof=1), norm.cdf) for sample in test_list)

(KstestResult(statistic=0.15503278091207373, pvalue=0.4237383487671724, statistic_location=-0.39894403974597836, statistic_sign=1),
 KstestResult(statistic=0.15795015932240858, pvalue=0.510746173338965, statistic_location=-0.46197439644523863, statistic_sign=1),
 KstestResult(statistic=0.12030734423089573, pvalue=0.647729582069612, statistic_location=-0.26896483929049464, statistic_sign=1))

In [54]:
# Bartlett's test of equal variances across the three per-level samples.
bartlett(*test_list)

BartlettResult(statistic=0.08114650401701767, pvalue=0.9602388225127009)

In [56]:
# One-way ANOVA of X across the levels of `factor` in df_dep.
lm = ols(formula='X ~ C(factor)', data=df_dep).fit()
# typ=1 (sequential sums of squares) is anova_lm's default, spelled out here.
df_anova = stats.anova_lm(lm, typ=1)
df_anova


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(factor),2.0,2.247012,1.123506,18.753726,1.690791e-07
Residual,87.0,5.212032,0.059908,,


In [57]:
import itertools

# Synthetic two-factor dataset: factor_1 in {A, B} crossed with factor_2 in
# {1, 2, 3}. Each cell has its own mean and sample size; the spread is shared.
# NOTE(review): every cell is drawn with random_state=123, so the cells appear
# to reuse the same noise sequence -- confirm that this is intended.
sig = 0.2
mu = {
    ('A', '1'): 0.2, ('A', '2'): 0.3, ('A', '3'): 0.4,
    ('B', '1'): 0.3, ('B', '2'): 0.4, ('B', '3'): 0.2,
}
n = {
    ('A', '1'): 15, ('A', '2'): 25, ('A', '3'): 20,
    ('B', '1'): 20, ('B', '2'): 30, ('B', '3'): 25,
}

df_two = pd.concat([
    pd.DataFrame(
        {'X': norm.rvs(loc=mu[cell], scale=sig, size=n[cell], random_state=123)}
    ).assign(factor_1=cell[0], factor_2=cell[1])
    for cell in itertools.product('AB', '123')
])

In [58]:
# Preview the two-factor dataset (the notebook display truncates the middle rows).
df_two

Unnamed: 0,X,factor_1,factor_2
0,-0.017126,A,1
1,0.399469,A,1
2,0.256596,A,1
3,-0.101259,A,1
4,0.084280,A,1
...,...,...,...
20,0.347474,B,3
21,0.498146,B,3
22,0.012833,B,3
23,0.435166,B,3


In [72]:
# Two-way ANOVA with both main effects and their interaction written out.
lm = ols(
    formula='X ~ C(factor_1) + C(factor_2) + C(factor_1):C(factor_2)',
    data=df_two,
).fit()
# typ=1 (sequential sums of squares) is anova_lm's default, spelled out here.
df_anova = stats.anova_lm(lm, typ=1)
df_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(factor_1),1.0,0.002269,0.002269,0.038022,0.845706
C(factor_2),2.0,0.309988,0.154994,2.59712,0.078379
C(factor_1):C(factor_2),2.0,0.737363,0.368682,6.177722,0.002741
Residual,129.0,7.698618,0.059679,,


In [73]:
# Same two-way model using the `*` shorthand (main effects plus interaction);
# the resulting table matches the explicitly written formula.
lm = ols(formula='X ~ C(factor_1)*C(factor_2)', data=df_two).fit()
# typ=1 (sequential sums of squares) is anova_lm's default, spelled out here.
df_anova = stats.anova_lm(lm, typ=1)
df_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(factor_1),1.0,0.002269,0.002269,0.038022,0.845706
C(factor_2),2.0,0.309988,0.154994,2.59712,0.078379
C(factor_1):C(factor_2),2.0,0.737363,0.368682,6.177722,0.002741
Residual,129.0,7.698618,0.059679,,


In [62]:
# Interaction term only: with no main effects in the formula, all 5 model
# degrees of freedom are attributed to the factor_1:factor_2 term.
lm = ols(formula='X ~ C(factor_1):C(factor_2)', data=df_two).fit()
# typ=1 (sequential sums of squares) is anova_lm's default, spelled out here.
df_anova = stats.anova_lm(lm, typ=1)
df_anova

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(factor_1):C(factor_2),5.0,1.04962,0.209924,3.517541,0.005156
Residual,129.0,7.698618,0.059679,,
