# 前処理
---

In [1]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np 

In [2]:
train = pd.read_csv("./data/raw/train.csv")
test = pd.read_csv("./data/raw/test.csv")
train.head()

Unnamed: 0,datetime,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature
0,2013-11-18,90,月,0,厚切りイカフライ,,,,,快晴,--,19.8
1,2013-11-19,101,火,1,手作りヒレカツ,,,,,快晴,--,17.0
2,2013-11-20,118,水,0,白身魚唐揚げ野菜あん,,,,,快晴,--,15.5
3,2013-11-21,120,木,1,若鶏ピリ辛焼,,,,,快晴,--,15.2
4,2013-11-22,130,金,1,ビッグメンチカツ,,,,,快晴,--,16.1


In [3]:
def holiday(x):
    '''前日が休日か、休日明けかをフラグ化'''
    if x == "金":
        return 2 
    elif x == "月":
        return 1 
    else: 
        return 0 
    
train["holiday"] = train.week.apply(holiday)
test["holiday"] = test.week.apply(holiday)

la = LabelEncoder()
la.fit(train.week)
train["week"] = la.fit_transform(train.week)
test["week"] = la.fit_transform(test.week)
print(la.classes_)

['月' '木' '水' '火' '金']


In [4]:
train["kcal_isna"] = train.kcal.isnull().astype(int)
test["kcal_isna"] = test.kcal.isnull().astype(int)
# 曜日ごとにカロリーに差が見られたので、欠損補完する
k_mean_mon = train[train.week == 0].loc[:, ["kcal"]].mean()
k_mean_tues = train[train.week == 1].loc[:, ["kcal"]].mean()
k_mean_wed = train[train.week == 2].loc[:, ["kcal"]].mean()
k_mean_thr = train[train.week == 3].loc[:, ["kcal"]].mean()
k_mean_fri = train[train.week == 4].loc[:, ["kcal"]].mean()

mon_tr = train[train.week == 0].loc[:, ["kcal"]].fillna(k_mean_mon)
mon_te = test[test.week == 0].loc[:, ["kcal"]].fillna(k_mean_mon)
tues_tr = train[train.week == 1].loc[:, ["kcal"]].fillna(k_mean_tues)
tues_te = test[test.week == 1].loc[:, ["kcal"]].fillna(k_mean_tues)
wed_tr = train[train.week == 2].loc[:, ["kcal"]].fillna(k_mean_wed)
wed_te = test[test.week == 2].loc[:, ["kcal"]].fillna(k_mean_wed)
thr_tr = train[train.week == 3].loc[:, ["kcal"]].fillna(k_mean_thr)
thr_te = test[test.week == 3].loc[:, ["kcal"]].fillna(k_mean_thr)
fri_tr = train[train.week == 4].loc[:, ["kcal"]].fillna(k_mean_fri)
fri_te = test[test.week == 4].loc[:, ["kcal"]].fillna(k_mean_fri)

train["kcal"] = pd.concat([mon_tr, tues_tr, wed_tr, thr_tr, fri_tr])
test["kcal"] = pd.concat([mon_te, tues_te, wed_te, thr_te, fri_te])


In [5]:
carry = train[train.remarks == "お楽しみメニュー"].name.value_counts()
carry = carry.index[:3]
# remarksの存在する際にはカレーの割合が高い
train["carry"] = train.name.apply(lambda x: 1 if x in carry else 0)
test["carry"] = test.name.apply(lambda x: 1 if x in carry else 0)
# remarksの頻度が多くないので特長量から削除する
train["remarks_isna"] = train.remarks.isnull().astype(int)
test["remarks_isna"] = test.remarks.isnull().astype(int)
train.drop(["remarks"], axis=1, inplace=True)
test.drop(["remarks"], axis=1, inplace=True)

In [6]:
train["event_isna"] = train.event.isnull().astype(int)
test["event_isna"] = test.event.isnull().astype(int)
# eventへのラベルエンコーダー
la = LabelEncoder()
la.fit(train.event.fillna("none"))
train["event"] = la.transform(train.event.fillna("none"))
test["event"] = la.transform(test.event.fillna("none"))

In [7]:
weather_type = train.weather.value_counts().index 

train["weather"] = train.weather.map({weather_type[0]: 0, weather_type[1]: 1, weather_type[2]: 0, weather_type[3]: 1, 
                  weather_type[4]: 2, weather_type[5]: 2, weather_type[6]: 2})
test["weather"] = test.weather.map({weather_type[0]: 0, weather_type[1]: 1, weather_type[2]: 0, weather_type[3]: 1, 
                  weather_type[4]: 2, weather_type[5]: 2, weather_type[6]: 2})

train["weather_before1"] = train.weather.shift(-1).fillna(0).astype(int)
test["weather_before1"] = test.weather.shift(-1).fillna(train.weather.iloc[-1]).astype(int)

In [8]:
train["precipitation"] = train.precipitation.apply(lambda x: 0 if x == "--" else x)
test["precipitation"] = test.precipitation.apply(lambda x: 0 if x == "--" else x)

In [9]:
train["payday"] = train.payday.fillna(0)
train["payday"] = train.payday.astype(int)
test["payday"] = test.payday.fillna(0)
test["payday"] = test.payday.astype(int)

train["payday_before1"] = train.payday.shift(-1).fillna(0).astype(int)
train["payday_after1"] = train.payday.shift(1).fillna(0).astype(int)
test["payday_before1"] = test.payday.shift(-1).fillna(0).astype(int)
test["payday_after1"] = test.payday.shift(1).fillna(train.payday.iloc[-1]).astype(int)

In [10]:
# 時系列変換をする
train["datetime"] = pd.to_datetime(train.datetime)
test["datetime"] = pd.to_datetime(test.datetime)

train["year"] = train.datetime.dt.year
train["month"] = train.datetime.dt.month 
train["quarter"] = train.datetime.dt.quarter
train["day"] = train.datetime.dt.day
train["day"] = (train.day-1)/(31-1)

y_2013 = train.loc[train.year == 2013, "y"].mean()
y_2014 = train.loc[train.year == 2014, "y"].mean()
# 年度によって売れ値に開きがあるのでそれぞれで埋める
train["sale_before1"] = train.y.shift(-1).fillna(y_2013)
train["moving_avg"] = train.y.shift(-1).rolling(window=5).mean().fillna(y_2013)

test["year"] = test.datetime.dt.year
test["month"] = test.datetime.dt.month 
test["quarter"] = test.datetime.dt.quarter
test["day"] = test.datetime.dt.day
test["day"] = (test.day-1)/(31-1)


In [11]:
# 季節別の温度平均の割合
q_1_tem = train.loc[train.quarter == 1, "temperature"].mean()
q_2_tem = train.loc[train.quarter == 2, "temperature"].mean()
q_3_tem = train.loc[train.quarter == 3, "temperature"].mean()
q_4_tem = train.loc[train.quarter == 4, "temperature"].mean()


q_1r = train.loc[train.quarter == 1, "temperature"] / q_1_tem
q_2r = train.loc[train.quarter == 2, "temperature"] / q_2_tem
q_3r = train.loc[train.quarter == 3, "temperature"] / q_3_tem
q_4r = train.loc[train.quarter == 4, "temperature"] / q_4_tem

train["temperature_avg"] = pd.concat([q_1r, q_2r, q_3r, q_4r])
test["temperature_avg"] = test.temperature/q_4_tem

In [12]:
# 不要なカラムの削除
train.drop(["name", "datetime"], axis=1, inplace=True)
test.drop(["name", "datetime"], axis=1, inplace=True)

In [13]:
# # monthのターゲットエンコーダーをする
from src.utils.category_transform import TargetEncoder
train_ = train.copy()

tag = TargetEncoder()
tag.fit(train.drop(["y"], axis=1), train[["y"]], col="month")
train, test = tag.transform(test)
month_mean = train.loc[train.year == 2014, "month"].mean()
test["month"] = test.month.fillna(month_mean)
train["y"] = train_["y"]

In [14]:
train.corr().style.background_gradient(cmap="coolwarm")

Unnamed: 0,week,soldout,kcal,event,payday,weather,temperature,holiday,kcal_isna,carry,remarks_isna,event_isna,weather_before1,payday_before1,payday_after1,year,month,quarter,day,sale_before1,moving_avg,temperature_avg,y
week,1.0,-0.075767,0.097147,0.126604,0.030697,-0.145298,0.022823,0.36698,0.222159,0.286796,-0.341282,-0.080763,0.03846,-0.033814,0.014569,0.002735,0.002587,0.011715,-0.010075,0.066463,0.008194,-0.000697,-0.020161
soldout,-0.075767,1.0,0.0405,-0.007512,-0.158202,0.147769,-0.156752,-0.040851,-0.083348,-0.144942,0.07832,0.01121,-0.173038,0.113564,-0.067613,-0.011935,0.101653,-0.081606,-0.081096,0.141438,0.096968,-0.122252,0.098308
kcal,0.097147,0.0405,1.0,-0.05112,0.062079,0.0509,-0.038509,-0.008916,0.013613,0.011484,-0.092722,0.064814,0.115054,0.059105,-0.024224,-0.000791,0.059016,-0.06822,0.037159,0.043859,0.090419,-0.007003,0.111001
event,0.126604,-0.007512,-0.05112,1.0,-0.058089,-0.023255,-0.015137,0.148024,0.012501,0.054969,-0.24752,-0.957283,0.025043,-0.058089,-0.058089,0.036418,-0.020929,-0.017223,0.064143,0.046154,-0.051197,-0.05888,-0.037021
payday,0.030697,-0.158202,0.062079,-0.058089,1.0,0.023291,-0.025759,0.060727,0.001092,0.062445,0.001081,0.060681,-0.009066,-0.050761,-0.050761,0.023232,-0.007324,-0.018921,-0.167289,0.061933,0.03187,-0.090434,0.046546
weather,-0.145298,0.147769,0.0509,-0.023255,0.023291,1.0,-0.012504,-0.121429,-0.187122,-0.05621,0.050289,-0.005739,0.153028,0.023291,-0.009066,0.133577,-0.204597,-0.041481,-0.063019,-0.191053,-0.1993,-0.242952,-0.142137
temperature,0.022823,-0.156752,-0.038509,-0.015137,-0.025759,-0.012504,1.0,0.03003,-0.121403,0.166641,-0.213645,0.001188,0.137152,-0.030218,-0.028907,0.301331,-0.789269,0.404762,0.01846,-0.652323,-0.754016,0.428686,-0.655332
holiday,0.36698,-0.040851,-0.008916,0.148024,0.060727,-0.121429,0.03003,1.0,0.258297,0.31825,-0.354785,-0.13998,-0.060672,0.004357,0.004357,0.024158,-0.028856,-0.014851,0.018425,0.103146,-0.033199,0.043256,0.064592
kcal_isna,0.222159,-0.083348,0.013613,0.012501,0.001092,-0.187122,-0.121403,0.258297,1.0,0.369555,-0.354927,-0.01096,-0.152304,-0.055439,0.057624,-0.79582,0.502444,0.59041,0.093646,0.403945,0.512097,0.080051,0.576252
carry,0.286796,-0.144942,0.011484,0.054969,0.062445,-0.05621,0.166641,0.31825,0.369555,1.0,-0.556045,-0.036914,-0.022188,-0.048035,0.172925,0.084322,-0.156893,0.010173,0.026461,-0.113195,-0.090167,0.11805,0.185156


In [17]:
train.head()

Unnamed: 0,week,soldout,kcal,event,payday,weather,precipitation,temperature,holiday,kcal_isna,...,payday_before1,payday_after1,year,month,quarter,day,sale_before1,moving_avg,temperature_avg,y
0,0,0,398.970588,0,0,0,0,19.8,1,1,...,0,0,2013,131.666667,4,0.566667,101.0,134.321429,1.558179,90
1,3,1,406.714286,0,0,0,0,17.0,0,1,...,0,0,2013,131.666667,4,0.6,118.0,134.321429,1.33783,101
2,2,0,406.621622,0,0,0,0,15.5,0,1,...,0,0,2013,122.125,4,0.633333,120.0,134.321429,1.219786,118
3,1,1,403.540541,0,0,0,0,15.2,0,1,...,0,0,2013,131.666667,4,0.666667,130.0,134.321429,1.196178,120
4,4,1,406.782609,0,0,0,0,16.1,2,1,...,0,0,2013,120.142857,4,0.7,135.0,120.8,1.267004,130


In [18]:
train.to_csv("./data/processed/train.csv", index=False)
test.to_csv("./data/processed/test.csv", index=False)