In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../cli_train.tsv',sep='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20010 entries, 0 to 20009
Data columns (total 11 columns):
일자             20010 non-null int64
time           20010 non-null object
식사내용           20010 non-null object
매출일자           20010 non-null float64
수량             20010 non-null float64
일시             20010 non-null object
평균기온(°C)       20010 non-null float64
최저기온(°C)       20010 non-null float64
최고기온(°C)       20010 non-null float64
강수 계속시간(hr)    20010 non-null float64
일강수량(mm)       20010 non-null float64
dtypes: float64(7), int64(1), object(3)
memory usage: 1.7+ MB


In [3]:
#df[df['일자']== 20030722]

In [4]:
#df.drop([428],axis=0,inplace=True)

In [5]:
df['year'] = (df['일자']/10000).astype(int)
df['month'] = (df['일자'] % 10000 /100).astype(int)
df['day'] = (df['일자'] % 100).astype(int)

In [6]:
df['weekday'] = pd.to_datetime(df['일자'], format='%Y%m%d').dt.dayofweek
df.drop(['일자'],axis=1,inplace=True)

In [7]:
df['year'] = (df['year']-min(df['year'])) / (max(df['year']) - min(df['year']))
df['month_sin'] = [np.sin(x*2*np.pi/12) for x in df['month']]
df['month_cos'] = [np.cos(x*2*np.pi/12) for x in df['month']]
df['day_sin'] = [np.sin(x*2*np.pi/31) for x in df['day']]
df['day_cos'] = [np.cos(x*2*np.pi/31) for x in df['day']]
df['weekday_sin'] = [np.sin(x*2*np.pi/7) for x in df['weekday']]
df['weekday_cos'] = [np.cos(x*2*np.pi/7) for x in df['weekday']]
df.drop(['month','day','weekday'], axis=1,inplace=True)

In [8]:
df = df.join(pd.get_dummies(df['time'], prefix='식사명'))
df.drop(['time'],axis=1, inplace=True)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
def tokenize(text):
    return text.split(',')
vectorizer = CountVectorizer(tokenizer=tokenize)
bow = vectorizer.fit_transform(df['식사내용']).toarray()
df = df.join(pd.DataFrame(bow,columns=vectorizer.get_feature_names()))
df.drop(['식사내용'], axis=1, inplace=True)
df.head()

Unnamed: 0,매출일자,수량,일시,평균기온(°C),최저기온(°C),최고기온(°C),강수 계속시간(hr),일강수량(mm),year,month_sin,...,휘),흑미밥,흑미밥(현장),흑임자밥,흑임자죽,흑콩견과류조림,흑콩밥,흑콩조림,흰죽,흰콩곤약조림
0,20030301.0,37.472924,2003-03-01,6.1,4.5,8.0,9.25,8.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,20030301.0,19.566787,2003-03-01,6.1,4.5,8.0,9.25,8.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,20030301.0,31.191336,2003-03-01,6.1,4.5,8.0,9.25,8.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,20030302.0,36.101083,2003-03-02,9.5,3.7,15.1,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,20030302.0,21.949458,2003-03-02,9.5,3.7,15.1,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
%%time
coe_list = []
for feature in vectorizer.get_feature_names():
    corr_df = df[['수량',feature]].corr()
    coe_list.append(corr_df.iloc[0,1])

CPU times: user 2.24 s, sys: 13.2 ms, total: 2.25 s
Wall time: 2.25 s


In [11]:
corr_df = pd.DataFrame({'feature':vectorizer.get_feature_names(),'coe':coe_list})
corr_df.info()
corr_df.sort_values('coe',ascending=False)
drop_feature_list = corr_df[(corr_df['coe'] < 0.01) & (corr_df['coe']  > -0.01)].feature.tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1875 entries, 0 to 1874
Data columns (total 2 columns):
coe        1874 non-null float64
feature    1875 non-null object
dtypes: float64(1), object(1)
memory usage: 29.4+ KB


In [12]:
df.drop(drop_feature_list,axis=1,inplace=True)

In [13]:
df.drop(['매출일자','일시'],axis=1,inplace=True)

In [14]:
df.drop(['평균기온(°C)', '최저기온(°C)', '최고기온(°C)', '강수 계속시간(hr)', '일강수량(mm)'],axis=1,inplace=True)

In [15]:
df.head()

Unnamed: 0,수량,year,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos,식사명_아침,식사명_저녁,...,후랑크구이(2개씩),후루츠탕수육,후르츠펀치,훈제오리샐러드,휘),흑미밥,흑임자죽,흑콩밥,흑콩조림,흰죽
0,37.472924,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,-0.974928,-0.222521,1,0,...,0,0,0,0,0,0,0,0,0,0
1,19.566787,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,-0.974928,-0.222521,0,1,...,0,0,0,0,0,0,0,0,0,0
2,31.191336,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,-0.974928,-0.222521,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36.101083,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,-0.781831,0.62349,1,0,...,0,0,1,0,0,0,0,0,0,0
4,21.949458,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,-0.781831,0.62349,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.model_selection import train_test_split
train_df, dev_df = train_test_split(df,test_size=0.2, random_state=10)
print(train_df.shape)
print(dev_df.shape)

(16008, 932)
(4002, 932)


In [17]:
train_y = train_df['수량']
train_x = train_df.drop(['수량'],axis=1)
dev_y = dev_df['수량']
dev_x = dev_df.drop(['수량'],axis=1)

In [18]:
from sklearn.metrics import mean_squared_error
import math

def train_and_predict(model,train_x,train_y,dev_x):
    model.fit(train_x, train_y)
    return model.predict(dev_x)

def evaluate(predict_y, actual_y):
    rmse = math.sqrt(mean_squared_error(actual_y, predict_y))
    print('RMSE:', round(np.mean(rmse), 4))

In [19]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

### main_model

In [23]:
%%time
model = xgb.XGBRegressor(n_estimators=150, 
                         learning_rate=0.05, 
                         max_depth=12,
                         colsample_bytree=0.9,
                         gamma=0.4,
                         min_child_weight=7,
                         subsample=0.9,
                         reg_alpha=0.1,
                         seed=10)
pred = train_and_predict(model, train_x,train_y,dev_x)
evaluate(pred,dev_y)

RMSE: 4.695
CPU times: user 3min 58s, sys: 1.38 s, total: 3min 59s
Wall time: 4min


## sub_sample_model
### 아침

In [24]:
time_features = ['식사명_아침', '식사명_점심', '식사명_점심2', '식사명_저녁']

In [25]:
mor_train_df = train_df[train_df['식사명_아침'] == 1].drop(time_features,axis=1)
mor_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4147 entries, 4060 to 17728
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 29.4 MB


In [26]:
mor_dev_df = dev_df[dev_df['식사명_아침'] == 1].drop(time_features,axis=1)
mor_dev_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1049 entries, 10185 to 11232
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 7.4 MB


In [27]:
mor_train_y = mor_train_df['수량']
mor_train_x = mor_train_df.drop(['수량'],axis=1)
mor_dev_y = mor_dev_df['수량']
mor_dev_x = mor_dev_df.drop(['수량'],axis=1)

In [28]:
%%time
mor_model = xgb.XGBRegressor(n_estimators=150, 
                         learning_rate=0.05, 
                         max_depth=11,
                         colsample_bytree=0.9,
                         gamma=0.4,
                         min_child_weight=7,
                         subsample=0.9,
                         reg_alpha=0.1,
                         seed=10)
pred = train_and_predict(mor_model, mor_train_x,mor_train_y,mor_dev_x)
evaluate(pred,mor_dev_y)

RMSE: 4.7378
CPU times: user 53.2 s, sys: 328 ms, total: 53.5 s
Wall time: 53.8 s


In [29]:
pred = model.predict(dev_x[dev_x['식사명_아침']==1])
evaluate(pred,mor_dev_y)

RMSE: 4.7081


### 점심 

In [30]:
lun_train_df = train_df[train_df['식사명_점심'] == 1].drop(time_features,axis=1)
lun_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4192 entries, 3543 to 9372
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 29.7 MB


In [31]:
lun_dev_df = dev_df[dev_df['식사명_점심'] == 1].drop(time_features,axis=1)
lun_dev_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1001 entries, 7252 to 13911
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 7.1 MB


In [32]:
lun_train_y = lun_train_df['수량']
lun_train_x = lun_train_df.drop(['수량'],axis=1)
lun_dev_y = lun_dev_df['수량']
lun_dev_x = lun_dev_df.drop(['수량'],axis=1)

In [33]:
%%time
lun_model = xgb.XGBRegressor(n_estimators=150, 
                         learning_rate=0.05, 
                         max_depth=11,
                         colsample_bytree=0.9,
                         gamma=0.4,
                         min_child_weight=7,
                         subsample=0.9,
                         reg_alpha=0.1,
                         seed=10)
pred = train_and_predict(lun_model, lun_train_x,lun_train_y,lun_dev_x)
evaluate(pred,lun_dev_y)

RMSE: 6.122
CPU times: user 56.8 s, sys: 427 ms, total: 57.2 s
Wall time: 57.5 s


In [34]:
pred = model.predict(dev_x[dev_x['식사명_점심']==1])
evaluate(pred,lun_dev_y)

RMSE: 6.0308


### 점심-양식

In [35]:
lun2_train_df = train_df[train_df['식사명_점심2'] == 1].drop(time_features,axis=1)
lun2_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3489 entries, 19526 to 7293
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 24.7 MB


In [36]:
lun2_dev_df = dev_df[dev_df['식사명_점심2'] == 1].drop(time_features,axis=1)
lun2_dev_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 936 entries, 19247 to 15861
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 6.6 MB


In [37]:
lun2_train_y = lun2_train_df['수량']
lun2_train_x = lun2_train_df.drop(['수량'],axis=1)
lun2_dev_y = lun2_dev_df['수량']
lun2_dev_x = lun2_dev_df.drop(['수량'],axis=1)

In [38]:
%%time
lun2_model = xgb.XGBRegressor(n_estimators=150, 
                         learning_rate=0.05, 
                         max_depth=11,
                         colsample_bytree=0.9,
                         gamma=0.4,
                         min_child_weight=7,
                         subsample=0.9,
                         reg_alpha=0.1,
                         seed=10)
pred = train_and_predict(lun2_model, lun2_train_x,lun2_train_y,lun2_dev_x)
evaluate(pred,lun2_dev_y)

RMSE: 4.0105
CPU times: user 48.1 s, sys: 354 ms, total: 48.4 s
Wall time: 48.6 s


In [39]:
pred = model.predict(dev_x[dev_x['식사명_점심2']==1])
evaluate(pred,lun2_dev_y)

RMSE: 3.9856


### 저녁

In [40]:
din_train_df = train_df[train_df['식사명_저녁'] == 1].drop(time_features,axis=1)
din_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4180 entries, 14273 to 17673
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 29.6 MB


In [41]:
din_dev_df = dev_df[dev_df['식사명_저녁'] == 1].drop(time_features,axis=1)
din_dev_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1016 entries, 1876 to 16106
Columns: 928 entries, 수량 to 흰죽
dtypes: float64(8), int64(920)
memory usage: 7.2 MB


In [42]:
din_train_y = din_train_df['수량']
din_train_x = din_train_df.drop(['수량'],axis=1)
din_dev_y = din_dev_df['수량']
din_dev_x = din_dev_df.drop(['수량'],axis=1)

In [43]:
%%time
din_model = xgb.XGBRegressor(n_estimators=150, 
                         learning_rate=0.05, 
                         max_depth=11,
                         colsample_bytree=0.9,
                         gamma=0.4,
                         min_child_weight=7,
                         subsample=0.9,
                         reg_alpha=0.1,
                         seed=10)
pred = train_and_predict(din_model, din_train_x,din_train_y,din_dev_x)
evaluate(pred,din_dev_y)

RMSE: 3.7191
CPU times: user 52.5 s, sys: 159 ms, total: 52.7 s
Wall time: 52.7 s


In [46]:
pred = model.predict(dev_x[dev_x['식사명_저녁']==1])
evaluate(pred,din_dev_y)

RMSE: 3.6706
