In [147]:
import pandas as pd
import numpy as np

## 데이터 전처리 (data_preprocessing.ipynb)
* Date Split
* Weekday
* Lunar Date
* Date Normalization
* 식사명 -> one-hot
* 식사내용 -> bag-of-word

In [148]:
# Read Data
train_df = pd.read_excel("data/train.xlsx")
test_df = pd.read_excel("data/test.xlsx")

In [149]:
train_df.head()

Unnamed: 0,일자,식사명,식사내용,수량
0,20030301,아침,"과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...",37.472924
1,20030301,저녁,"감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프",19.566787
2,20030301,점심(일반),"골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수",31.191336
3,20030302,아침,"계란죽,곤약멸치조림,김치국,마카로니샐러드,쌀밥,오징어회무침,딸기잼(중),삶은계란,야...",36.101083
4,20030302,저녁,"계란탕,단무지잔파무침,자장소스,잡채밥,탕수만두",21.949458


In [150]:
test_df.head()

Unnamed: 0,일자,식사명,식사내용
0,20100713,아침,"누룽지,닭살찜닭소스조림,두유,멸치볶음,배추김치,쌀밥,열무된장무침,콩나물국,딸기잼(1..."
1,20100713,저녁,"배추김치,손만두국,쌀밥,애느타리볶음,오징어잔파무침,치커리사과생채"
2,20100713,점심(양식),"단무지,배추김치,쌀밥,야채샐러드,야채스프,치즈함박스테이크&데미"
3,20100713,점심(일반),"깍두기(손칼),돈등뼈감자탕,두부달걀전,쌀밥,쫄면야채무침,토마토화채,풋고추된장무침"
4,20100714,아침,"배추김치,쇠고기가지볶음,시금치된장국,쌀밥,야채죽,진미도라지무침,팽이계란전,화인쿨,딸..."


In [151]:
df = pd.concat([train_df, test_df])
df = df.sort_values(by=['일자']).reset_index(drop=True)
print(df[:3])
print(df[-3:])

          수량                                               식사내용     식사명  \
0  37.472924  과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...      아침   
1  19.566787                        감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프      저녁   
2  31.191336                   골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수  점심(일반)   

         일자  
0  20030301  
1  20030301  
2  20030301  
              수량                                           식사내용     식사명  \
20603  11.046931                  깍두기(손칼),닭살떡국,미트볼고추장조림,부추장떡,쌀밥      저녁   
20604   3.610108        깍두기(손칼),식빵&딸기잼,쌀밥,야채샐러드,옥수수스프,피클,함박스테이크  점심(양식)   
20605   8.158845  계란파국,깍두기(손칼),날치알김치덮밥,생선까스&타르,쌀밥,오이생채,케이준치킨샐러드  점심(일반)   

             일자  
20603  20171231  
20604  20171231  
20605  20171231  


In [152]:
# 식사명 변환 (one-hot)
def convert_ont_hot(df):
    df = df.join(pd.get_dummies(df['식사명'], prefix='식사명'))
    df.drop(['식사명'], axis=1, inplace=True)
    return df

In [153]:
def moving_average(df, window_size):
    morning = df['수량'][df['식사명_아침']==1].rolling(window_size, min_periods=1).mean().shift(3)
    lunch = df['수량'][df['식사명_점심(일반)']==1].rolling(window_size, min_periods=1).mean().shift(3)
    lunch_west = df['수량'][df['식사명_점심(양식)']==1].rolling(window_size, min_periods=1).mean().shift(3)
    dinner = df['수량'][df['식사명_저녁']==1].rolling(window_size, min_periods=1).mean().shift(3)
    
    return pd.concat([morning, lunch, lunch_west, dinner]).sort_index()

In [154]:
# 식사내용 변환 (Bag-of-Word)
from sklearn.feature_extraction.text import CountVectorizer
def tokenize(text):
        return text.split(',')
def convert_bow(df):
    vectorizer = CountVectorizer(tokenizer=tokenize)
    bow = vectorizer.fit_transform(df['식사내용']).toarray()
    df = df.join(pd.DataFrame(bow, columns=vectorizer.get_feature_names()))
    df.drop(['식사내용'], axis=1, inplace=True)
    return df

In [155]:
# 년/월/일 분리(split) + 요일(Weekday) 추가
def split_date(df):
    # Normalize Date
    df['year'] = (df['일자'] / 10000).astype(int)
    df['month'] = (df['일자'] % 10000 / 100).astype(int)
    df['day'] = (df['일자'] % 100).astype(int)
    df['weekday'] = pd.to_datetime(df['일자'], format = '%Y%m%d').dt.dayofweek
    # df.drop(['일자'], axis=1, inplace=True)

In [156]:
# 음력 추가
from korean_lunar_calendar import KoreanLunarCalendar
from datetime import datetime

def add_lunar_date(df):
    calendar = KoreanLunarCalendar()
    
    lunar_y = []
    lunar_m = []
    lunar_d = []
    for y, m, d in zip (df['year'], df['month'], df['day']):
        calendar.setSolarDate(y, m, d)
        lunar_date = calendar.LunarIsoFormat()
        lunar_y.append(int(lunar_date[:4]))
        lunar_m.append(int(lunar_date[5:7]))
        lunar_d.append(int(lunar_date[8:10]))
        
    df['lunar_year'], df['lunar_month'], df['lunar_day'] = lunar_y, lunar_m, lunar_d


In [157]:
# 년/월/일 변환
def year_norm(df):
    df['year'] = (df['year']-min(df['year'])) / (max(df['year'])-min(df['year']))
    df['lunar_year'] = (df['lunar_year']-min(df['lunar_year'])) / (max(df['lunar_year'])-min(df['lunar_year']))
def month_norm(df):
    df['month_sin'] = [np.sin(x*2*np.pi/12) for x in df['month']]
    df['month_cos'] = [np.cos(x*2*np.pi/12) for x in df['month']]
    df['lunar_month_sin'] = [np.sin(x*2*np.pi/12) for x in df['lunar_month']]
    df['lunar_month_cos'] = [np.cos(x*2*np.pi/12) for x in df['lunar_month']]
    df.drop(['month', 'lunar_month'], axis=1, inplace=True)
def day_norm(df):
    df['day_sin'] = [np.sin(x*2*np.pi/31) for x in df['day']]
    df['day_cos'] = [np.cos(x*2*np.pi/31) for x in df['day']]
    df['lunar_ay_sin'] = [np.sin(x*2*np.pi/31) for x in df['lunar_day']]
    df['lunar_day_cos'] = [np.cos(x*2*np.pi/31) for x in df['lunar_day']]
    df.drop(['day', 'lunar_day'], axis=1, inplace=True)
def weekday_norm(df):
    df['weekday_sin'] = [np.sin(x*2*np.pi/7) for x in df['weekday']]
    df['weekday_cos'] = [np.cos(x*2*np.pi/7) for x in df['weekday']]

In [158]:
# convert 식사명 to one-hot
df = convert_ont_hot(df)

# Moving Average of 수량
df['MA_week'] = moving_average(df, 7)
df['MA_month'] = moving_average(df, 30)
df['MA_half_year'] = moving_average(df, 180)
df['MA_year'] = moving_average(df, 365)
df.drop(df[df.일자 < 20040326].index, inplace=True)
df.drop(df[(df.일자 > 20050109) & (df.일자 < 20060331) & (df['식사명_점심(양식)']==1)].index, inplace=True)
df.reset_index(drop=True, inplace=True)

# convert 식사내용 to Bag-of-Word Vector
df = convert_bow(df)

# Date
split_date(df)
add_lunar_date(df)

# Date Normalization
year_norm(df)
month_norm(df)
day_norm(df)
weekday_norm(df)

In [159]:
print("Number of Columns =", len(df.columns))
df.head()

Number of Columns = 1816


Unnamed: 0,수량,일자,식사명_아침,식사명_저녁,식사명_점심(양식),식사명_점심(일반),MA_week,MA_month,MA_half_year,MA_year,...,month_sin,month_cos,lunar_month_sin,lunar_month_cos,day_sin,day_cos,lunar_ay_sin,lunar_day_cos,weekday_sin,weekday_cos
0,41.877256,20040326,1,0,0,0,44.693141,43.586041,42.294826,44.090797,...,1.0,6.123234000000001e-17,0.866025,0.5,-0.848644,0.528964,0.937752,0.347305,-0.433884,-0.900969
1,13.718412,20040326,0,1,0,0,21.186178,19.845969,19.837545,20.801741,...,1.0,6.123234000000001e-17,0.866025,0.5,-0.848644,0.528964,0.937752,0.347305,-0.433884,-0.900969
2,40.361011,20040326,0,0,0,1,46.168128,41.468111,40.924188,43.456605,...,1.0,6.123234000000001e-17,0.866025,0.5,-0.848644,0.528964,0.937752,0.347305,-0.433884,-0.900969
3,21.877256,20040327,0,0,0,1,45.497679,41.959085,40.86803,43.490431,...,1.0,6.123234000000001e-17,0.866025,0.5,-0.724793,0.688967,0.988468,0.151428,-0.974928,-0.222521
4,12.635379,20040327,0,1,0,0,19.938112,19.77136,19.745688,20.788487,...,1.0,6.123234000000001e-17,0.866025,0.5,-0.724793,0.688967,0.988468,0.151428,-0.974928,-0.222521


# Modeling

1. Random Forest
2. XGBoost

## 0. Prepare train & test

#### 1) Split X and Y

In [160]:
train_x = df.drop(df[df['일자'].isin(test_df['일자'].unique())].index)
train_y = train_x['수량']
train_x = train_x.drop(['수량'], axis=1)

test_x = df[df['일자'].isin(test_df['일자'].unique())]
test_x = test_x.drop(['수량'], axis=1)

#### 2) Train Model

In [161]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=200, 
                         learning_rate=0.05, 
                         max_depth=15,
                         colsample_bytree=0.9,
                         gamma=0.4,
                         min_child_weight=7,
                         subsample=0.9,
                         reg_alpha=0.1,
                         seed=10)

In [162]:
%%time
model.fit(train_x, train_y, verbose=True)

CPU times: user 12min 57s, sys: 4.6 s, total: 13min 1s
Wall time: 13min 19s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0.4, learning_rate=0.05,
       max_delta_step=0, max_depth=15, min_child_weight=7, missing=None,
       n_estimators=200, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1,
       seed=10, silent=True, subsample=0.9)

#### 3) Prediction

In [167]:
pred = model.predict(test_x.drop(['수량'], axis=1))

In [174]:
test_x['수량'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [175]:
test_x.head()

Unnamed: 0,수량,일자,식사명_아침,식사명_저녁,식사명_점심(양식),식사명_점심(일반),MA_week,MA_month,MA_half_year,MA_year,...,month_cos,lunar_month_sin,lunar_month_cos,day_sin,day_cos,lunar_ay_sin,lunar_day_cos,weekday_sin,weekday_cos,label
8250,29.398787,20100713,0,0,0,1,22.970603,24.182912,26.320497,26.90589,...,-0.866025,1.224647e-16,-1.0,0.485302,-0.874347,0.394356,0.918958,0.781831,0.62349,29.398787
8251,14.40096,20100713,0,0,1,0,17.741104,16.34657,16.261933,15.473617,...,-0.866025,1.224647e-16,-1.0,0.485302,-0.874347,0.394356,0.918958,0.781831,0.62349,14.40096
8252,48.196018,20100713,1,0,0,0,47.127385,45.947052,40.560369,37.79002,...,-0.866025,1.224647e-16,-1.0,0.485302,-0.874347,0.394356,0.918958,0.781831,0.62349,48.196018
8253,24.987238,20100713,0,1,0,0,22.578649,19.88929,19.421982,18.451115,...,-0.866025,1.224647e-16,-1.0,0.485302,-0.874347,0.394356,0.918958,0.781831,0.62349,24.987238
8254,26.772699,20100714,0,0,0,1,22.619907,24.052948,26.178901,26.901934,...,-0.866025,1.224647e-16,-1.0,0.299363,-0.954139,0.571268,0.820763,0.974928,-0.222521,26.772699


In [181]:
test_x[['일자', '수량']]

Unnamed: 0,일자,수량
8250,20100713,29.398787
8251,20100713,14.400960
8252,20100713,48.196018
8253,20100713,24.987238
8254,20100714,26.772699
8255,20100714,16.647858
8256,20100714,26.694954
8257,20100714,49.135315
8258,20100715,19.081593
8259,20100715,22.749456


In [179]:
result = test_x[['일자', '수량']].as_matrix()

In [180]:
result.reshape((-1, 4))

array([[  2.01007130e+07,   2.93987865e+01,   2.01007130e+07,
          1.44009600e+01],
       [  2.01007130e+07,   4.81960182e+01,   2.01007130e+07,
          2.49872379e+01],
       [  2.01007140e+07,   2.67726994e+01,   2.01007140e+07,
          1.66478577e+01],
       ..., 
       [  2.01711210e+07,   2.58800468e+01,   2.01711210e+07,
          2.50370770e+01],
       [  2.01711220e+07,   2.42222633e+01,   2.01711220e+07,
          2.65468864e+01],
       [  2.01711220e+07,   2.55022888e+01,   2.01711220e+07,
          1.31779699e+01]])

In [184]:
np.savetxt("submission.csv", result.reshape((-1, 5)), delimiter=",")

ValueError: cannot reshape array of size 1192 into shape (5)