# 04.04 범주형 독립변수

## 1. 월 평균 기온 데이터 예제

In [1]:
import datetime
from calendar import isleap

def convert_partial_year(number):
    """Convert a fractional year number into a ``datetime.datetime``.

    The integer part of ``number`` selects the calendar year. The fractional
    part is scaled by the length of that year (366 days in a leap year,
    otherwise 365) and added as an offset to January 1st of that year.
    """
    whole_year = int(number)
    # isleap() returns a bool, which adds as 0 or 1 to the base 365 days.
    days_in_year = 365 + isleap(whole_year)
    elapsed = datetime.timedelta(days=(number - whole_year) * days_in_year)
    return elapsed + datetime.datetime(whole_year, 1, 1)

# Load the "nottem" R dataset and derive a categorical month label per row.
df_nottem = sm.datasets.get_rdataset("nottem").data
# Series.map applies the converter element-wise to the "time" column;
# DataFrame.applymap is deprecated in pandas >= 2.1.
df_nottem["date0"] = df_nottem["time"].map(convert_partial_year)
# Round to the hour and shift forward by one day before taking the month.
# NOTE(review): the one-day shift presumably keeps fractional-year values that
# land just before a month boundary in the intended month — confirm.
df_nottem["date"] = pd.DatetimeIndex(df_nottem["date0"]).round('60min') + datetime.timedelta(seconds=3600*24)
# Two-digit month string ("01".."12") stored as a categorical column.
df_nottem["month"] = df_nottem["date"].dt.strftime("%m").astype('category')
# The intermediate date columns are only needed to compute "month".
del df_nottem["date0"], df_nottem["date"]
df_nottem.tail()

Unnamed: 0,time,value,month
235,1939.583333,61.8,8
236,1939.666667,58.2,9
237,1939.75,46.7,10
238,1939.833333,46.6,11
239,1939.916667,37.8,12


## 1) OLS 회귀분석

- 범주형 데이터

**풀랭크 더미변수화**

In [3]:
# Full-rank dummy coding: "+ 0" drops the intercept, so every level of
# C(month) gets its own coefficient.
model = sm.OLS.from_formula(
    formula="value ~ C(month) + 0",
    data=df_nottem,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.930
Model:                            OLS   Adj. R-squared:                  0.927
Method:                 Least Squares   F-statistic:                     277.3
Date:                Sun, 17 May 2020   Prob (F-statistic):          2.96e-125
Time:                        19:28:10   Log-Likelihood:                -535.82
No. Observations:                 240   AIC:                             1096.
Df Residuals:                     228   BIC:                             1137.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
C(month)[01]    39.6950      0.518     76.691   


**축소랭크 더미변수화**

In [4]:
model = sm.OLS.from_formula("value ~ C(month)",df_nottem)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  value   R-squared:                       0.930
Model:                            OLS   Adj. R-squared:                  0.927
Method:                 Least Squares   F-statistic:                     277.3
Date:                Sun, 17 May 2020   Prob (F-statistic):          2.96e-125
Time:                        19:29:36   Log-Likelihood:                -535.82
No. Observations:                 240   AIC:                             1096.
Df Residuals:                     228   BIC:                             1137.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         39.6950      0.518     76.

## 2. 보스턴 집값 데이터 예제

In [12]:
from sklearn.datasets import load_boston

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still provides it.
boston = load_boston()

# Wrap the target and the feature matrix in DataFrames.
dfy = pd.DataFrame(boston.target, columns=["MEDV"])
dfx = pd.DataFrame(boston.data, columns=boston.feature_names)

# In the Boston data, CHAS is a categorical variable with two classes.
dfx["CHAS"].unique()

array([0., 1.])

## 1) OLS 회귀분석
- 이진 범주형 변수 존재 ("CHAS")
    
    

**풀랭크 더미변수화**

In [16]:
# Place the feature columns and the MEDV target side by side in one DataFrame.
df = pd.concat([dfx, dfy], axis="columns")

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [27]:
# 풀랭크 방식 => 별도의 상수항 만들지 않음. 따라서, 범주형 변수를 명확히 범주형으로 지정해줘야 함 C연산자

feature_names_full = [name for name in boston.feature_names]
feature_names_full.remove('CHAS')
feature_names_full = [name for name in boston.feature_names] + ['C(CHAS)']

model2 = sm.OLS.from_formula('MEDV ~ 0 +' + "+".join(feature_names_full),data=df)
result2 = model2.fit()
print(result2.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sun, 17 May 2020   Prob (F-statistic):          6.72e-135
Time:                        19:51:48   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
C(CHAS)[0.0]    36.4595      5.103      7.144   

**축소랭크 더미변수화**

=> 기준값(기준 범주)을 두는 방식. formula 문자열에서 `+ 0`을 제외하면 축소랭크 방식으로 더미변수를 만든다.

In [31]:
# Reduced-rank variant: the formula keeps the intercept (no "0 +").
# NOTE(review): CHAS enters as a plain numeric column here rather than via
# C(CHAS) — confirm whether explicit categorical coding is intended.
model1 = sm.OLS.from_formula(
    formula="MEDV ~ " + "+".join(boston.feature_names),
    data=df,
)
result1 = model1.fit()
print(result1.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Sun, 17 May 2020   Prob (F-statistic):          6.72e-135
Time:                        19:57:58   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     36.4595      5.103      7.144      0.0