# 1. 메모리 변수 제거

In [1]:
# 메모리 변수 모두 제거
all = [var for var in globals() if var[0] != "_"]
for var in all:
    del globals()[var]

# 2. 사용 패키지

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# 3. 데이터 로드

In [3]:
train = pd.read_csv('data/train_pre.csv')

# 4. 데이터 정보

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   일자              1205 non-null   object 
 1   년               1205 non-null   int64  
 2   월               1205 non-null   int64  
 3   일               1205 non-null   int64  
 4   요일              1205 non-null   int64  
 5   휴일전날            1205 non-null   int64  
 6   본사정원수           1205 non-null   int64  
 7   본사휴가자수          1205 non-null   int64  
 8   본사출장자수          1205 non-null   int64  
 9   본사시간외근무명령서승인건수  1205 non-null   int64  
 10  현본사소속재택근무자수     1205 non-null   int64  
 11  식사가능자수          1205 non-null   int64  
 12  중식계             1205 non-null   int64  
 13  석식계             1205 non-null   int64  
 14  중식참여율           1205 non-null   float64
 15  석식참여율           1205 non-null   float64
 16  체감온도(중식)        1205 non-null   float64
 17  불쾌지수(중식)        1205 non-null   i

# 5. 모델 선택

## 1) 데이터 분할

In [5]:
from sklearn.model_selection import train_test_split
X = train.drop(['일자', '중식계', '석식계', '중식참여율', '석식참여율'], axis = 1)
y = train[['중식계', '석식계']]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = 0.3, random_state = 2021)

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((843, 19), (362, 19), (843, 2), (362, 2))

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 843 entries, 798 to 1140
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   년               843 non-null    int64  
 1   월               843 non-null    int64  
 2   일               843 non-null    int64  
 3   요일              843 non-null    int64  
 4   휴일전날            843 non-null    int64  
 5   본사정원수           843 non-null    int64  
 6   본사휴가자수          843 non-null    int64  
 7   본사출장자수          843 non-null    int64  
 8   본사시간외근무명령서승인건수  843 non-null    int64  
 9   현본사소속재택근무자수     843 non-null    int64  
 10  식사가능자수          843 non-null    int64  
 11  체감온도(중식)        843 non-null    float64
 12  불쾌지수(중식)        843 non-null    int64  
 13  우산(중식)          843 non-null    int64  
 14  체감온도(석식)        843 non-null    float64
 15  불쾌지수(석식)        843 non-null    int64  
 16  우산(석식)          843 non-null    int64  
 17  코로나신규확진자        843 non-null    

## 2) Standard Scaler

In [8]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)

## 3) Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression
# Linear Regression
lr = LinearRegression()
# 모델 학습
lr.fit(X_train, y_train)
# 예측값 생성
pred_lr = lr.predict(X_test)

## 4) Lasso

In [10]:
from sklearn.linear_model import Lasso
# Lasso
ls = Lasso()
# 모델 학습
ls.fit(X_train, y_train)
# 예측값 생성
pred_ls = ls.predict(X_test)

## 5) Ridge

In [11]:
from sklearn.linear_model import Ridge
# Ridge
rg = Ridge()
# 모델 학습
rg.fit(X_train, y_train)
# 예측값 생성
pred_rg = rg.predict(X_test)

## 6) Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor
# Random Forest Regressor
rfr = RandomForestRegressor()
# 모델 학습
rfr.fit(X_train, y_train)
# 예측값 생성
pred_rfr = rfr.predict(X_test)

## 7) 예측 비교
- R2 : 분산 기반, 1에 가까울수록 예측 정확도 높음
- MSE : (실제값-예측값)² 평균
- RMSE : √(MSE)
- MAE : |실제값-예측값|

In [13]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# Linear Regression
lr_R2 = r2_score(y_test, pred_lr)
lr_MSE = mean_squared_error(y_test, pred_lr)
lr_MAE = mean_absolute_error(y_test, pred_lr)
# Lasso
ls_R2 = r2_score(y_test, pred_ls)
ls_MSE = mean_squared_error(y_test, pred_ls)
ls_MAE = mean_absolute_error(y_test, pred_ls)
# Ridge
rg_R2 = r2_score(y_test, pred_rg)
rg_MSE = mean_squared_error(y_test, pred_rg)
rg_MAE = mean_absolute_error(y_test, pred_rg)
# Random Forest Regression
rfr_R2 = r2_score(y_test, pred_rfr)
rfr_MSE = mean_squared_error(y_test, pred_rfr)
rfr_MAE = mean_absolute_error(y_test, pred_rfr)
# Print
print('R2 Score - Linear Reg : %.2f, Lasso : %.2f, Ridge : %.2f, Random Forest Reg : %.2f' %(lr_R2, ls_R2, rg_R2, rfr_R2))
print('MSE - Linear Reg : %.2f, Lasso : %.2f, Ridge : %.2f, Random Forest Reg : %.2f' %(lr_MSE, ls_MSE, rg_MSE, rfr_MSE))
print('MAE - Linear Reg : %.2f, Lasso : %.2f, Ridge : %.2f, Random Forest Reg : %.2f' %(lr_MAE, ls_MAE, rg_MAE, rfr_MAE))

R2 Score - Linear Reg : 0.58, Lasso : 0.59, Ridge : 0.58, Random Forest Reg : 0.69
MSE - Linear Reg : 11236.88, Lasso : 11185.03, Ridge : 11230.47, Random Forest Reg : 8550.35
MAE - Linear Reg : 78.71, Lasso : 78.47, Ridge : 78.68, Random Forest Reg : 66.78


# 6. 모델 선택 결과
- R2 Score : Random Forest Regression
- MSE : Random Forest Regression
- MAE : Random Forest Regression

# 7. 모델 평가

## 1) About Random Forest
- 여러 개의 결정 트리(Decision Tree)를 배깅(Bagging)
- 배깅(Bagging) : 동일 알고리즘 여러 개의 분류기 생성, 각 분류기는 부트스트래핑(Bootstrapping)으로 학습, 보팅(Voting)을 통해 예측 결정
- 부트스트래핑(Bootstrapping) : 전체 데이터에서 일부 데이터의 중첩 허용
- 장점 : 쉬움, 직관적, 앙상블 알고리즘 중에서는 빠름, 좋은 성능
- 단점 : 하이퍼 파라미터가 많아 튜닝을 많이 해야 함

## 2) Random Forest Regressor - Hyper Parameter
- n_estimators : int, default = 100, 트리 수
- criterion : {'squared_error', 'absolute_error', 'poisson'}, Default = 'squared_error', 분할 품질 측정
- (1) 'squared_error' : 평균 제곱 오차
- (2) 'absolute_error' : 평균 절대값 오차
- (3) 'poisson' : 포아송 이탈도 감소
- max_depth : int, default = None, 트리 최대 깊이
- min_samples_split : int/float, default = 2, 내부 노드 분할 위해 필요 최소 샘플 수
- min_samples_leaf : int/float, default = 1, 리프 노드에 있어야 할 최소 샘플 수
- min_weight_fraction_leaf : float, default = 0.0, 가중치 합계의 최소 가중비
- max_features : {'auto', 'sqrt', 'log2'}, int/float, default = auto, Feature 수
- (1) 'auto', max_features = n_features
- (2) 'sqrt', max_features = sqrt(n_features)
- (3) 'log2', max_features = log2(n_features)
- (4) None, max_features = n_features
- max_leaf_nodes : int, default = None, 리프 노드의 최대 개수

## 3) Randomized Search CV

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Parameters
parameters = {
    'n_estimators' : [700, 800, 900],
    'max_depth' : [10, 20, 30],
    'bootstrap' : [True, False]
}
# Random Forest Regression
rfr = RandomForestRegressor(n_jobs = -1, random_state = 2021)
# Randomized Search CV
rfr_random = RandomizedSearchCV(
    estimator = rfr,
    param_distributions = parameters,
    n_iter = 100, cv = 3, verbose = 2,
    n_jobs = -1, random_state = 2021
)
# Fit Model
rfr_random.fit(X_train, y_train)
# Print
print(rfr_random.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END .....bootstrap=True, max_depth=10, n_estimators=700; total time=   9.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=700; total time=   9.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=700; total time=   9.3s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=800; total time=  10.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=800; total time=  10.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=800; total time=  10.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=900; total time=  11.1s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=900; total time=  11.0s
[CV] END .....bootstrap=True, max_depth=20, n_estimators=700; total time=   8.4s
[CV] END .....bootstrap=True, max_depth=20, n_estimators=700; total time=   8.6s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=900; total time=  10.4s
[CV] END .....bootstrap=True, max_depth=20, n_es

## 4) Grid Search CV

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Parameters
parameters = {
    'n_estimators' : [700, 800, 900],
    'max_depth' : [10, 20, 30],
    'bootstrap' : [True, False]
}
# Random Forest Regression
rfr = RandomForestRegressor(n_jobs = -1, random_state = 2021)
# Grid Search CV
rfr_grid = GridSearchCV(
    estimator = rfr,
    param_grid = parameters,
    cv = 3, verbose = 2,
    n_jobs = -1
)
# Fit Model
rfr_grid.fit(X_train, y_train)
# Print
print(rfr_grid.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END .....bootstrap=True, max_depth=10, n_estimators=700; total time=   6.9s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=700; total time=   6.9s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=700; total time=   6.9s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=800; total time=   7.8s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=800; total time=   7.2s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=800; total time=   7.2s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=900; total time=   8.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=900; total time=   9.0s
[CV] END .....bootstrap=True, max_depth=20, n_estimators=700; total time=   7.5s
[CV] END .....bootstrap=True, max_depth=20, n_estimators=700; total time=   8.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=900; total time=   9.2s
[CV] END .....bootstrap=True, max_depth=20, n_es

## 5) 기본 설정값 vs 최적값
- n_estimators = 800
- max_depth = 20
- bootstrap = True

In [16]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
pred_default = rfr.predict(X_test)

In [17]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(
    n_estimators = 800, max_depth = 20, bootstrap = True
)
rfr.fit(X_train, y_train)
pred_set = rfr.predict(X_test)

## 6) 결과 비교
- R2 : 분산 기반, 1에 가까울수록 예측 정확도 높음
- MSE : (실제값-예측값)² 평균
- RMSE : √(MSE)
- MAE : |실제값-예측값|

In [18]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# 기본 설정값
default_R2 = r2_score(y_test, pred_default)
default_MSE = mean_squared_error(y_test, pred_default)
default_MAE = mean_absolute_error(y_test, pred_default)
# 최적값
set_R2 = r2_score(y_test, pred_set)
set_MSE = mean_squared_error(y_test, pred_set)
set_MAE = mean_absolute_error(y_test, pred_set)
print('R2 Score - Default : %.2f, Set : %.2f' %(default_R2, set_R2))
print('MSE - Default : %.2f, Set : %.2f' %(default_MSE, set_MSE))
print('MAE - Default : %.2f, Set : %.2f' %(default_MAE, set_MAE))

R2 Score - Default : 0.69, Set : 0.69
MSE - Default : 8552.27, Set : 8429.55
MAE - Default : 66.85, Set : 66.24
