# 예측

## 메모리 변수 제거

In [1]:
# 메모리 변수 모두 제거
all = [var for var in globals() if var[0] != "_"]
for var in all:
    del globals()[var]

## 사용 패키지

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# plt.rc('font', family = 'Malgun Gothic') # WINDOWS
plt.rc('font', family = 'AppleGothic') # MAC
plt.rc('axes', unicode_minus = False)
import warnings
warnings.filterwarnings('ignore')

## 데이터 로드
- 전처리 완료된 데이터

In [3]:
train = pd.read_csv('/data/train_pre.csv')
test = pd.read_csv('/data/test_pre.csv')
submission = pd.read_csv('/data/sample_submission.csv')

### Train
- 년 : 
- 월 : 
- 일 : 
- 요일 : 월, 화, 수, 목, 금 -> 0, 1, 2, 3, 4
- 휴일전날 : True or False
- 식사가능자수 : 총 인원 중 휴가자, 재택근무자를 제외한 식사 가능한 인원 수
- 중식계 : 점심 식사 이용한 인원
- 석식계 : 저녁 식사 이용한 인원
- 중식참여율 : (중식계) / (식사가능자수)
- 석식참여율 : (석식계) / (식사가능자수)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   년       1205 non-null   int64  
 1   월       1205 non-null   int64  
 2   일       1205 non-null   int64  
 3   요일      1205 non-null   int64  
 4   휴일전날    1205 non-null   bool   
 5   식사가능자수  1205 non-null   int64  
 6   중식계     1205 non-null   int64  
 7   석식계     1205 non-null   int64  
 8   중식참여율   1205 non-null   float64
 9   석식참여율   1205 non-null   float64
dtypes: bool(1), float64(2), int64(7)
memory usage: 86.0 KB


### Test
- 년 : 
- 월 : 
- 일 : 
- 요일 : 월, 화, 수, 목, 금 -> 0, 1, 2, 3, 4
- 휴일전날 : True or False
- 식사가능자수 : 총 인원 중 휴가자, 재택근무자를 제외한 식사 가능한 인원 수

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   년       50 non-null     int64
 1   월       50 non-null     int64
 2   일       50 non-null     int64
 3   요일      50 non-null     int64
 4   휴일전날    50 non-null     bool 
 5   식사가능자수  50 non-null     int64
dtypes: bool(1), int64(5)
memory usage: 2.1 KB


### Submission
- 일자 : 년-월-일
- 중식계 : 점심 식사 이용한 인원
- 석식계 : 저녁 식사 이용한 인원

In [6]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   일자      50 non-null     object
 1   중식계     50 non-null     int64 
 2   석식계     50 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.3+ KB


## 상관계수

In [7]:
train.corr()[['중식계', '석식계']]

Unnamed: 0,중식계,석식계
년,-0.078804,-0.194792
월,-0.154664,-0.127142
일,-0.097392,-0.185565
요일,-0.734273,-0.31324
휴일전날,-0.170249,-0.193838
식사가능자수,0.151029,0.118615
중식계,1.0,0.508287
석식계,0.508287,1.0
중식참여율,0.957156,0.47536
석식참여율,0.479768,0.973722


## 머신러닝 모델

### Train, Test 분할

In [8]:
X_train = train[['요일', '휴일전날', '식사가능자수']]
X_test = test[['요일', '휴일전날', '식사가능자수']]
y1_train = train['중식계']
y2_train = train['석식계']

### Light GBM

In [9]:
import lightgbm as lgb
# LGBM Regressor
lgbm = lgb.LGBMRegressor(random_state = 2021)
# 중식 모델 학습
lgbm.fit(X_train, y1_train)
# 중식 인원 예측값 생성
y1_test = lgbm.predict(X_test)
# 석식 모델 학습
lgbm.fit(X_train, y2_train)
# 석식 인원 예측값 생성
y2_test = lgbm.predict(X_test)

In [10]:
y1_test # 중식 인원 예측 결과 - Light GBM

array([ 876.62980857,  842.18917382,  587.5614592 , 1120.80494413,
        898.16452871,  927.11971143,  835.48497449,  751.54679846,
       1189.63233243, 1056.44764811,  661.08494489, 1126.12404872,
       1013.14523696,  933.61577705,  867.89907569,  711.07505955,
       1243.06910011,  931.74733292,  941.13051857,  844.4154167 ,
        660.05889782,  992.86025242,  943.29996006,  795.79718564,
        621.16004217, 1173.57204875,  936.43534399,  943.29996006,
        936.62717419,  650.86413025, 1160.1719282 , 1056.44764811,
        951.23801879,  811.01081635,  723.7293946 , 1120.80494413,
       1000.20961694,  969.87519508,  852.00339019,  756.04208399,
       1142.05907661,  952.28782091,  985.67749228,  876.97814735,
        657.58674838, 1195.9788161 ,  978.395935  ,  940.26750845,
        851.79417932,  668.33441731])

In [11]:
y2_test # 석식 인원 예측 결과 - Light GBM

array([363.53769457, 459.83893671, 345.06448152, 463.02192033,
       507.19055675, 433.03053364, 472.3600569 , 406.65018401,
       523.2062813 , 483.40059913, 230.49218599, 443.09226774,
       617.07745928, 323.50064495, 437.69919543, 440.2671964 ,
       589.86919895, 513.34450515, 419.69262529, 523.59293467,
       435.02223044, 559.160531  , 450.64614457, 468.24684945,
       429.02284047, 584.05520712, 486.24997061, 450.64614457,
       453.38006835, 436.64387756, 553.22065769, 483.40059913,
       480.75393057, 475.10536235, 486.78303275, 524.97204241,
       571.02324216, 358.94301011, 467.06394818, 419.89570711,
       550.80623607, 524.77071856, 355.68174752, 486.9346022 ,
       441.56815955, 596.95360156, 530.77903826, 394.52936644,
       472.99430408, 479.51627867])

### Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor
# Random Forest Regressor
rfr = RandomForestRegressor(random_state = 2021)
# 중식 모델 학습
rfr.fit(X_train, y1_train)
# 중식 인원 예측값 생성
y3_test = rfr.predict(X_test)
# 석식 모델 학습
rfr.fit(X_train, y2_train)
# 석식 인원 예측값 생성
y4_test = rfr.predict(X_test)

In [13]:
y3_test # 중식 인원 예측 결과 - Random Forest

array([ 831.33      ,  881.91      ,  559.        , 1103.21      ,
        931.62      ,  925.43766667,  859.60916667,  753.68233333,
       1253.54183333,  938.02      ,  784.6       , 1054.38      ,
       1052.29      ,  943.01      ,  892.20866667,  755.1125    ,
       1206.579     ,  851.98      , 1002.71      ,  820.25933333,
        654.682     , 1072.13614683,  951.45390476,  821.31833333,
        706.20083333, 1156.32      ,  937.54      ,  951.45390476,
        940.285     ,  733.84033333, 1197.30670094,  986.53      ,
       1042.05      ,  773.00409524,  702.51333333, 1103.21      ,
        974.18      ,  982.02964286,  835.21066667,  572.01228571,
       1172.91583189,  960.3825    , 1021.86815476,  837.84338492,
        605.73333333, 1185.5105    ,  953.26114286,  982.01      ,
        853.91      ,  652.406     ])

In [14]:
y4_test # 석식 인원 예측 결과 - Random Forest


array([244.89      , 565.89      , 420.07      , 512.36      ,
       486.81166667, 485.89316667, 455.46916667, 462.374     ,
       510.92583333, 459.35      , 213.06      , 434.47      ,
       610.95      , 308.9       , 463.49466667, 480.58511905,
       583.85633333, 493.02      , 514.95      , 532.86666667,
       414.41130952, 599.01370635, 458.68152381, 475.34      ,
       524.93      , 501.22      , 527.88      , 458.68152381,
       542.595     , 484.13630952, 578.59647222, 459.72      ,
       410.22      , 465.08771429, 544.1125    , 517.68      ,
       628.8       , 318.63778571, 463.438     , 370.1772381 ,
       563.67237698, 507.92333333, 267.46416667, 475.16136905,
       456.61      , 617.9335    , 484.26152381, 533.53      ,
       451.36      , 414.986     ])

## 중식 인원 예측 랜덤포레스트 (그리드서치 이용)



In [15]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [None, 6, 9, 12],
    'min_samples_split': [0.01, 0.05, 0.1],
    'max_features': ['auto', 'sqrt'],
}


In [16]:
estimator = RandomForestRegressor()

In [17]:
from sklearn.model_selection import KFold

kf = KFold(random_state=30,
           n_splits=10,
           shuffle=True,
          )

In [18]:
grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           cv=kf, 
                           n_jobs=-1, 
                           verbose=2
                          )
grid_search.fit(X_train, y1_train)


Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  4.1min finished


GridSearchCV(cv=KFold(n_splits=10, random_state=30, shuffle=True),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verb

In [19]:
grid_search.best_params_

{'max_depth': 6,
 'max_features': 'auto',
 'min_samples_split': 0.05,
 'n_estimators': 250}

In [20]:
estimator=grid_search.best_estimator_

In [None]:
best_model = grid_search.best_estimator_
y_predict = best_model.predict(x_test)


In [46]:
y_predict =estimator.predict(X_test)
y_predict1=y_predict.astype("int")
y_predict1

array([ 943,  846,  646, 1167,  950,  942,  846,  689, 1174,  952,  765,
       1069,  951,  943,  845,  686, 1212,  950,  943,  849,  687,  949,
        940,  836,  622, 1206,  950,  940,  886,  688, 1174,  952,  945,
        844,  686, 1167,  950,  942,  846,  669, 1174,  950,  943,  845,
        686, 1174,  950,  943,  845,  688])

In [21]:
print("최고 평균 정확도: {0:.4f}".format(grid_search.best_score_))
print("최고의 파라미터 :",grid_search.best_params_)

최고 평균 정확도: 0.6592
최고의 파라미터 : {'max_depth': 6, 'max_features': 'auto', 'min_samples_split': 0.05, 'n_estimators': 250}


## 석식 인원 예측 랜덤포레스트(그리드서치 이용)




In [47]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [None, 6, 9, 12],
    'min_samples_split': [0.01, 0.05, 0.1],
    'max_features': ['auto', 'sqrt'],
}


In [48]:
estimator = RandomForestRegressor()

In [49]:
from sklearn.model_selection import KFold

kf = KFold(random_state=30,
           n_splits=10,
           shuffle=True,
          )

In [51]:
grid_search2 = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           cv=kf, 
                           n_jobs=-1, 
                           verbose=2
                          )
grid_search2.fit(X_train, y2_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  4.1min finished


GridSearchCV(cv=KFold(n_splits=10, random_state=30, shuffle=True),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verb

In [54]:
grid_search2.best_params_

{'max_depth': 9,
 'max_features': 'auto',
 'min_samples_split': 0.1,
 'n_estimators': 150}

In [55]:
estimator=grid_search2.best_estimator_

In [57]:
best_model = grid_search2.best_estimator_
y_predict = best_model.predict(X_test)


In [58]:
y_predict =estimator.predict(X_test)
y_predict2=y_predict.astype("int")
y_predict2

array([395, 480, 352, 539, 526, 397, 488, 438, 545, 534, 351, 520, 560,
       389, 471, 439, 561, 538, 393, 491, 434, 538, 390, 491, 402, 566,
       538, 390, 491, 439, 558, 534, 398, 471, 443, 539, 539, 396, 485,
       431, 557, 537, 396, 488, 439, 565, 541, 396, 477, 439])

In [59]:
print("최고 평균 정확도: {0:.4f}".format(grid_search2.best_score_))
print("최고의 파라미터 :",grid_search2.best_params_)

최고 평균 정확도: 0.2746
최고의 파라미터 : {'max_depth': 9, 'max_features': 'auto', 'min_samples_split': 0.1, 'n_estimators': 150}
