In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
%matplotlib inline

In [64]:
train = pd.read_csv('train.csv')
train.index = pd.to_datetime(train['datetime'])
#train.head()

In [65]:
new_train=train["2014-01-07":].copy()

In [66]:
train=new_train.reset_index(drop=True)

In [67]:
#ある値に漸近する指数関数をフィッティングさせてトレンドを抽出
from scipy.optimize import curve_fit

def func(x, a, b, c):
    return a * np.exp(-b * x) + c

xs = train.index.values
ys = train['y'].values

popt, pcov = curve_fit(func, xs, ys)

a1, b1, c1 = popt
y_reg = a1 * np.exp(-b1 * xs) + c1

  """


In [68]:
#販売数からトレンドを引くことで純粋な売り上げがわかる
new_y = train['y'] - y_reg
train['new_y']=new_y

In [69]:
#テストデータとトレーニングデータでは月が違うので日にちのみ使う
train['day'] = train['datetime'].apply(lambda x : int(x.split("-")[2]))

In [70]:
# トレンドを無視した曜日の周期性を反映させるためにMedian encodingする
week_encoded = train.groupby('week').new_y.median()
train['week'] = train['week'].replace(week_encoded)

In [71]:
#トレンドに加え周期性も排除した'new_new_y'を作成
train['new_new_y'] = train['new_y'] - train['week']
#お楽しみメニューにラベルづけ
train['fun'] = train['remarks'].apply(lambda x: 1 if x=="お楽しみメニュー" else 0)

In [72]:
#月平均の差分を'temp'として加えた。温かかったのに突然冷え込むなどすると外に出たくなくなるので相関があると考えた
train['month'] = train['datetime'].apply(lambda x : int(x.split("-")[1]))
temp_mean = train.groupby('month').temperature.mean()
train['month'] =  train['month'].replace(temp_mean)
train['temp'] = train['temperature'] - train['month']

In [73]:
train['pork']=train['name'].apply(lambda x : 1 if x.find('(?:ポーク|豚)') >=0 else 0)
train['beef']=train['name'].apply(lambda x : 1 if x.find('(?:ビーフ|牛)') >=0 else 0)
train['chiken']=train['name'].apply(lambda x : 1 if x.find('(?:チキン|鶏)') >=0 else 0)
train['katu']=train['name'].apply(lambda x : 1 if x.find('カツ') >=0 else 0)
train['fry']=train['name'].apply(lambda x : 1 if x.find('フライ') >=0 else 0)
train['han']=train['name'].apply(lambda x : 1 if x.find('ハンバーグ') >=0 else 0)

train['all_meat'] = train['name'].apply(lambda x : 1 if x.find('(?:ポーク|豚|ビーフ|牛|チキン|鶏)') >=0 else 0)

In [74]:
#カレーがとても人気なのでラベル付け
train['curry'] = train['name'].apply(lambda x : 1 if x.find("カレー") >=0 else 0)

In [92]:
#人気メニューと不人気メニューを抽出しラベル付け
popular_menu = set(train[train['new_new_y']>15].name)
train['popular'] = train['name'].apply(lambda x : 1 if x in popular_menu else 0)
unpopular_menu = set(train[train['new_new_y']<-15].name)
train['unpopular'] = train['name'].apply(lambda x : 1 if x in unpopular_menu else 0)

In [98]:
base = ['day','temperature','temp']
week = ['week']
name = ['fun','curry','popular','unpopular']
menu = ['pork', 'beef', 'chiken', 'katu', 'fry', 'han']

In [99]:
feature_x = base + week + name + menu
feature_y = ['new_y']

In [100]:
data_x = train[feature_x]
data_y = train[feature_y]

In [106]:
from sklearn.linear_model import LinearRegression as LR #線形回帰モデル

In [107]:
model = LR()

In [108]:
#説明変数と目的変数を設定し、学習
result=model.fit(data_x, data_y)

In [82]:
#傾きの確認
model.coef_

array([[-2.46689503e-01, -1.59240696e-01, -4.82358909e-01,
         9.61883187e-01,  3.48932402e+01,  6.23636490e+00,
         1.36068801e+01, -1.52861237e+01, -4.61852778e-14,
        -6.10622664e-15,  0.00000000e+00, -1.61666236e+00,
        -7.50720482e-01,  4.79721267e+00]])

In [83]:
#切片の確認
model.intercept_

array([6.84792029])

In [84]:
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample.csv',header = None)

In [85]:
#テストデータにも同様の前処理を施す
test['week'] = test['week'].replace(week_encoded)
test['popular'] = test['name'].apply(lambda x : 1 if x in popular_menu else 0)
test['unpopular'] = test['name'].apply(lambda x : 1 if x in unpopular_menu else 0)
test['curry'] = test['name'].apply(lambda x : 1 if x.find("カレー") >=0 else 0)
test['fun'] = test['remarks'].apply(lambda x: 1 if x=="お楽しみメニュー" else 0)
test['day'] = test['datetime'].apply(lambda x : int(x.split("-")[2]))
test['month'] = test['datetime'].apply(lambda x : int(x.split("-")[1]))
temp_mean = test.groupby('month').temperature.mean()
test['month'] =  test['month'].replace(temp_mean)
test['temp'] = test['temperature'] - test['month']



test['pork']=test['name'].apply(lambda x : 1 if x.find('(?:ポーク|豚)') >=0 else 0)
test['beef']=test['name'].apply(lambda x : 1 if x.find('(?:ビーフ|牛)') >=0 else 0)
test['chiken']=test['name'].apply(lambda x : 1 if x.find('(?:チキン|鶏)') >=0 else 0)
test['katu']=test['name'].apply(lambda x : 1 if x.find('カツ') >=0 else 0)
test['fry']=test['name'].apply(lambda x : 1 if x.find('フライ') >=0 else 0)
test['han']=test['name'].apply(lambda x : 1 if x.find('ハンバーグ') >=0 else 0)
test['all_meat'] = test['name'].apply(lambda x : 1 if x.find('(?:ポーク|豚|ビーフ|牛|チキン|鶏)') >=0 else 0)


In [86]:
data_x = test
test['y'] = model.predict(data_x[feature_x])
 
test['index_new'] = test.index + train.index.max() + 1
xs = test['index_new'].values
y_reg = a1 * np.exp(-b1 * xs) + c1

In [87]:
test['y'] = test['y'] + y_reg
 
sample[1] = test['y']

In [91]:
sample.to_csv("compleat2.csv", index=None, header=None)

In [89]:
from sklearn.ensemble import RandomForestRegressor

In [96]:
rf_reg = RandomForestRegressor(n_estimators=5000)
rf_reg = rf_reg.fit(data_x, data_y)

fti = rf_reg.feature_importances_

dic_arr = {'importance':fti, 'feature':feature_x}
pd.DataFrame(dic_arr).sort_values('importance', ascending=False).reset_index(drop=True).loc[:15, :]

  


Unnamed: 0,importance,feature
0,0.227723,popular
1,0.189497,fun
2,0.125385,week
3,0.124436,unpopular
4,0.119736,temperature
5,0.085778,day
6,0.070949,temp
7,0.031608,curry
8,0.018245,katu
9,0.005502,han


In [104]:
score = model.score(data_x, data_y)
print('score:{}'.format(score))

score:0.7109093450376135
