# 1. 메모리 변수 제거

In [1]:
# Remove every user-defined global so the notebook starts from a clean namespace.
# Names that begin with "_" are left alone, and so are the handles IPython itself
# injects into the interactive namespace (In, Out, get_ipython, exit, quit).
# Scratch names are underscore-prefixed and removed afterwards, so nothing is
# left behind -- in particular the built-in all() is no longer shadowed by a list.
# (The `%reset -f` magic is IPython's built-in way to do the same thing.)
_ipython_names = {"In", "Out", "get_ipython", "exit", "quit"}
for _name in [n for n in globals() if not n.startswith("_") and n not in _ipython_names]:
    del globals()[_name]
# pop() rather than del: _name is never bound when there was nothing to remove.
globals().pop("_name", None)
del _ipython_names

# 2. 사용 패키지

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Choose a font that has Korean glyphs based on the operating system, instead of
# commenting lines in and out by hand when the notebook moves between machines.
import platform

_korean_fonts = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
# NOTE(review): 'NanumGothic' is assumed for any other system (e.g. Linux) -- confirm it is installed there.
plt.rc('font', family = _korean_fonts.get(platform.system(), 'NanumGothic'))
# Draw minus signs as ASCII hyphens; presumably needed because the fonts above
# lack the Unicode minus glyph -- verify if tick labels render incorrectly.
plt.rc('axes', unicode_minus = False)
import warnings
warnings.filterwarnings('ignore')

# 3. 데이터 로드

In [3]:
# Read the preprocessed training set, the preprocessed test set and the sample
# submission file, in that order.
train, test, submission = map(
    pd.read_csv,
    ('data/train_pre.csv', 'data/test_pre.csv', 'data/sample_submission.csv'),
)

In [4]:
# Four independent frames read from the sample submission file; each one later
# receives the predictions of one model.
sub1, sub2, sub3, sub4 = (
    pd.read_csv('data/sample_submission.csv') for _ in range(4)
)

# 4. 데이터 정보

In [5]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   일자              1205 non-null   object 
 1   년               1205 non-null   int64  
 2   월               1205 non-null   int64  
 3   일               1205 non-null   int64  
 4   요일              1205 non-null   int64  
 5   휴일전날            1205 non-null   int64  
 6   본사정원수           1205 non-null   int64  
 7   본사휴가자수          1205 non-null   int64  
 8   본사출장자수          1205 non-null   int64  
 9   본사시간외근무명령서승인건수  1205 non-null   int64  
 10  현본사소속재택근무자수     1205 non-null   int64  
 11  식사가능자수          1205 non-null   int64  
 12  중식계             1205 non-null   int64  
 13  석식계             1205 non-null   int64  
 14  중식참여율           1205 non-null   float64
 15  석식참여율           1205 non-null   float64
 16  체감온도(중식)        1205 non-null   float64
 17  불쾌지수(중식)        1205 non-null   i

In [6]:
# Print the column names, non-null counts and dtypes of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   일자              50 non-null     object 
 1   년               50 non-null     int64  
 2   월               50 non-null     int64  
 3   일               50 non-null     int64  
 4   요일              50 non-null     int64  
 5   휴일전날            50 non-null     int64  
 6   본사정원수           50 non-null     int64  
 7   본사휴가자수          50 non-null     int64  
 8   본사출장자수          50 non-null     int64  
 9   본사시간외근무명령서승인건수  50 non-null     int64  
 10  현본사소속재택근무자수     50 non-null     int64  
 11  식사가능자수          50 non-null     int64  
 12  체감온도(중식)        50 non-null     float64
 13  불쾌지수(중식)        50 non-null     int64  
 14  우산(중식)          50 non-null     int64  
 15  체감온도(석식)        50 non-null     float64
 16  불쾌지수(석식)        50 non-null     int64  
 17  우산(석식)          50 non-null     int64

# 5. 인원 수 예측

## 1) 데이터 분할

In [7]:
# Features: the date column is dropped from both frames; the training frame also
# loses the two target columns and the two rate columns.
X_test = test.drop(columns = ['일자'])
X_train = train.drop(columns = ['일자', '중식계', '석식계', '중식참여율', '석식참여율'])
# Targets: both meal counts are kept together as a two-column frame.
y_train = train.loc[:, ['중식계', '석식계']]
# NOTE(review): y_test is read from the sample submission file, so it is
# presumably a placeholder rather than true labels -- confirm before scoring with it.
y_test = submission.loc[:, ['중식계', '석식계']]

In [8]:
# Display the shapes of the four frames so the train/test feature counts can be compared.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1205, 19), (50, 19), (1205, 2), (50, 2))

## 2) 4가지 모델

### (1)
- default

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Model 1: library-default hyperparameters, only the seed is pinned.
# fit() returns the estimator itself, so construction and training are chained.
rfr = RandomForestRegressor(random_state = 2021).fit(X_train, y_train)
pred_1 = rfr.predict(X_test)
# Store both predicted targets in the first submission copy.
sub1[['중식계', '석식계']] = pred_1

### (2)
- max_features = 11, min_samples_leaf = 2, n_estimators = 200

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Model 2: 200 trees, 11 candidate features per split, at least 2 samples per leaf.
rfr = RandomForestRegressor(
    n_estimators = 200,
    max_features = 11,
    min_samples_leaf = 2,
    random_state = 2021,
).fit(X_train, y_train)
pred_2 = rfr.predict(X_test)
# Store both predicted targets in the second submission copy.
sub2[['중식계', '석식계']] = pred_2

### (3)
- max_features = 8, min_samples_leaf = 1, n_estimators = 418

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Model 3: 418 trees, 8 candidate features per split, leaves may hold a single sample.
rfr = RandomForestRegressor(
    n_estimators = 418,
    max_features = 8,
    min_samples_leaf = 1,
    random_state = 2021,
).fit(X_train, y_train)
pred_3 = rfr.predict(X_test)
# Store both predicted targets in the third submission copy.
sub3[['중식계', '석식계']] = pred_3

### (4)
- n_estimators = 800, max_depth = 20, bootstrap = True

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Model 4: 800 bootstrapped trees, each limited to a depth of 20.
rfr = RandomForestRegressor(
    bootstrap = True,
    max_depth = 20,
    n_estimators = 800,
    random_state = 2021,
).fit(X_train, y_train)
pred_4 = rfr.predict(X_test)
# Store both predicted targets in the fourth submission copy.
sub4[['중식계', '석식계']] = pred_4

## 3) 4가지 모델 예측값의 평균

In [13]:
# Give each model's two prediction columns a numeric suffix (renamed in place),
# so the four frames can be joined without column-name clashes.
for _frame, _lunch_name, _dinner_name in (
    (sub1, '중식계1', '석식계1'),
    (sub2, '중식계2', '석식계2'),
    (sub3, '중식계3', '석식계3'),
    (sub4, '중식계4', '석식계4'),
):
    _frame.rename(columns = {'중식계' : _lunch_name, '석식계' : _dinner_name}, inplace = True)

# Left-join the frames one after another on the date column, then discard the
# date so that only the eight prediction columns remain.
sub = sub1
for _other in (sub2, sub3, sub4):
    sub = pd.merge(sub, _other, on = '일자', how = 'left')
sub = sub.drop(columns = ['일자'])

In [14]:
# Split the merged predictions by target: each frame keeps the four model
# columns of one target by dropping the four columns of the other.
pred_dinner = sub.drop(columns = ['중식계1', '중식계2', '중식계3', '중식계4'])
pred_lunch = sub.drop(columns = ['석식계1', '석식계2', '석식계3', '석식계4'])

In [15]:
# Show the first row of the four lunch predictions.
pred_lunch.head(1)

Unnamed: 0,중식계1,중식계2,중식계3,중식계4
0,978.06,976.130581,993.444976,1000.4675


In [16]:
# Show the first row of the four dinner predictions.
pred_dinner.head(1)

Unnamed: 0,석식계1,석식계2,석식계3,석식계4
0,204.24,208.829968,212.672249,176.37125


In [17]:
# Average the four models' predictions row by row and round to the nearest whole
# number before converting to int. A bare astype(int) truncates toward zero,
# which would push every averaged prediction down by up to one.
submission['중식계'] = pred_lunch.mean(axis = 1).round().astype(int)
submission['석식계'] = pred_dinner.mean(axis = 1).round().astype(int)

In [18]:
# Show the last row of the final submission frame.
submission.tail(1)

Unnamed: 0,일자,중식계,석식계
49,2021-04-09,588,284


In [19]:
# Write the averaged predictions to disk, leaving out the row index.
submission.to_csv(path_or_buf = 'data/submission_complete.csv', index = False)