In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

1. read_csv 並做一點處理

In [None]:
# Load columns 1-5 of train.csv, skipping data rows 1..66458908 so reading
# starts at the 2016-01-01 offset noted by the original author.
# unit_sales is converted while parsing: log1p for positive values, 0 otherwise.
df_train = pd.read_csv(
    'dataset/train.csv',
    usecols=[1, 2, 3, 4, 5],
    skiprows=range(1, 66458909),
    parse_dates=['date'],
    dtype={'onpromotion': bool, 'store_nbr': int, 'item_nbr': int},
    converters={
        'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0
    },
)

In [None]:
# Preview the first rows of the loaded training frame.
df_train.head()

In [None]:
# Per-column flag: True if the column contains at least one missing value.
df_train.isnull().any()

In [None]:
# Read the first five columns of test.csv, then index the frame by
# (store_nbr, item_nbr, date) so it can be unstacked and joined later.
df_test = pd.read_csv(
    'dataset/test.csv',
    usecols=[0, 1, 2, 3, 4],
    parse_dates=['date'],
    dtype={'onpromotion': bool, 'store_nbr': int, 'item_nbr': int},
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])
df_test.head()

In [None]:
# Item metadata, keyed by item number.
items = pd.read_csv("dataset/items.csv")
items = items.set_index("item_nbr")
items.head()

In [None]:
# Store metadata, keyed by store number.
stores = pd.read_csv("dataset/stores.csv")
stores = stores.set_index('store_nbr')

stores.head()

2. LabelEncode

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace the categorical text columns with integer codes.
# The same encoder object is refitted for every column, so after this cell
# it only remembers the classes of the last column encoded (stores['type']).
le = LabelEncoder()

items['family'] = le.fit_transform(items['family'].values)
for col in ('city', 'state', 'type'):
    stores[col] = le.fit_transform(stores[col].values)

3. 將欄位攤開, 沒有銷售紀錄的補0

In [None]:
# Keep only the rows dated 2017-01-01 or later.
# pd.Timestamp is used because the `pd.datetime` alias is no longer
# available in current pandas releases.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp("2017-01-01")]

In [None]:
# Pivot the promotion flag to wide form: one row per (store, item), one
# column per date; combinations with no record are filled with False.
promo_train_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Drop the outer "onpromotion" column level, keeping only the dates.
promo_train_2017.columns = promo_train_2017.columns.get_level_values(1)

In [None]:
# Same wide pivot for the test period, then align its rows with the
# training pivot; (store, item) pairs absent from the test set get False.
promo_test_2017 = df_test[['onpromotion']].unstack(level=-1)
promo_test_2017 = promo_test_2017.fillna(False)
promo_test_2017.columns = promo_test_2017.columns.get_level_values(1)
promo_test_2017 = (
    promo_test_2017
    .reindex(promo_train_2017.index)
    .fillna(False)
)

In [None]:
# Place the train-period and test-period promotion columns side by side
# so a single frame covers every date used by the feature builder.
promo_2017 = pd.concat([promo_train_2017,promo_test_2017],axis=1)

In [None]:
# Preview the combined promotion frame.
promo_2017.head()

In [None]:
# Pivot unit_sales to wide form (rows: store/item, columns: dates); missing
# store/item/date combinations are treated as zero sales.
# NOTE: this rebinds df_2017 from the long frame to the wide one, so the
# cell cannot be re-run without first re-running the cell that created df_2017.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
df_2017.columns = df_2017.columns.get_level_values(1)

In [None]:
# Preview the wide sales frame.
df_2017.head()

In [None]:
# Reorder (and repeat) the item metadata so it lines up row-for-row with
# the item_nbr level of the wide sales frame.
item_order = df_2017.index.get_level_values(1)
items = items.reindex(item_order)

In [None]:
# Preview the realigned item metadata.
items.head()

4. 切分train/val/test

In [None]:
from datetime import date, timedelta

In [None]:
def get_timespan(df, dt, minus, periods):
    """Select a run of consecutive daily columns from a date-columned frame.

    The window starts ``minus`` days before ``dt`` and contains ``periods``
    daily columns.

    Args:
        df: DataFrame whose columns are daily timestamps.
        dt: Anchor date.
        minus: Number of days before ``dt`` at which the window starts.
        periods: Number of daily columns to select.

    Returns:
        The sub-frame of ``df`` restricted to the window's columns.
    """
    window_start = dt - timedelta(days=minus)
    window_days = pd.date_range(window_start, periods=periods)
    return df[window_days]

In [None]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally the targets) anchored at ``t2017``.

    Reads the module-level wide frames ``df_2017`` (unit sales) and
    ``promo_2017`` (promotion flags), both with one column per day.

    Features, in column order:
      * ``mean_{n}_2017``: mean sales over the ``n`` days before the anchor,
        for n in 3, 7, 14, 30, 60, 140.
      * ``promo_{n}_2017``: promotion-day count over the ``n`` days before
        the anchor, for n in 14, 30, 60, 140.
      * ``promo_{i}``: promotion flag on day anchor+i, for i in 0..15.
      * ``diff_{n}_mean``: mean day-over-day sales difference over the ``n``
        days before the anchor, for n in 3, 7, 14, 30, 60, 140.

    Args:
        t2017: Anchor date (first day of the 16-day target window).
        is_train: When True, also return the targets.

    Returns:
        ``(X, y)`` when ``is_train`` is True, where ``y`` holds the sales of
        the 16 days starting at the anchor; otherwise just ``X``.
    """
    # Normalise to a Timestamp: newer pandas releases do not reliably accept
    # plain datetime.date keys when looking up a DatetimeIndex column.
    anchor = pd.Timestamp(t2017)
    sales_windows = (3, 7, 14, 30, 60, 140)
    promo_windows = (14, 30, 60, 140)

    features = {}
    for days in sales_windows:
        features["mean_{}_2017".format(days)] = get_timespan(
            df_2017, anchor, days, days).mean(axis=1).values
    for days in promo_windows:
        features["promo_{}_2017".format(days)] = get_timespan(
            promo_2017, anchor, days, days).sum(axis=1).values
    X = pd.DataFrame(features)

    # Promotion flags for each of the 16 days being predicted.
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            anchor + timedelta(days=i)].values.astype(np.uint8)

    # Average day-over-day change inside each trailing window.
    for days in sales_windows:
        window = get_timespan(df_2017, anchor, days, days)
        X['diff_%s_mean' % days] = window.diff(axis=1).mean(axis=1).values

    if is_train:
        y = df_2017[pd.date_range(anchor, periods=16)].values
        return X, y
    return X

In [None]:
print("Preparing dataset...")
# Author's note on the options tried: 8 weeks starting 5/31, or 4 weeks
# starting 6/21 (the setting used here).
t2017 = date(2017, 6, 21)
# One (X, y) pair per weekly-shifted anchor date.
weekly_sets = [
    prepare_dataset(t2017 + timedelta(days=7 * week)) for week in range(4)
]
X_l = [features for features, _ in weekly_sets]
y_l = [targets for _, targets in weekly_sets]
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

In [None]:
# Validation window is anchored at 2017-07-26 (features plus 16-day targets).
X_val, y_val = prepare_dataset(date(2017, 7, 26))
# Test window is anchored at 2017-08-16; no targets are built for it.
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [None]:
# Report the (rows, features) shape of each split.
for split in (X_train, X_val, X_test):
    print(split.shape)

In [None]:
import gc
# The per-period promotion pivots were merged into promo_2017 earlier;
# drop the originals and force a collection to release their memory.
del promo_test_2017, promo_train_2017
gc.collect() 

5. model

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on train + validation rows together, so
# validation statistics influence the scaling - confirm this is intended.
scaler = StandardScaler()
scaler.fit(pd.concat([X_train, X_val]))

# transform() already returns a float ndarray, so use it directly instead of
# writing the floats back into the DataFrames in place (some feature columns
# are uint8, and in-place assignment relied on implicit dtype upcasting).
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
# Append a trailing length-1 axis: (rows, features) -> (rows, features, 1),
# the 3-D layout the Conv1D input layer is declared with.
X_train = X_train[:, :, np.newaxis]
X_val = X_val[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

In [None]:
# Report the shape of each split after the reshape.
for split in (X_train, X_val, X_test):
    print(split.shape)

建立模型

In [None]:
from keras import callbacks
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers import Dense,Conv1D,MaxPooling1D,Flatten,Dropout
from keras.layers import LSTM
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from sklearn.metrics import mean_squared_error

In [None]:
# X_train is (samples, timesteps, channels) after the reshape above.
size, timesteps, data_dim = X_train.shape

In [None]:
 def build_model():
     """Assemble a VGG-style 1-D CNN regressor layer by layer with Sequential().

     The input shape is taken from the module-level ``timesteps`` and
     ``data_dim``. The network is four convolutional blocks (two same-padded
     Conv1D layers, max-pooling, batch normalisation, dropout) with 32, 64,
     128 and 256 filters, followed by a 512-unit dense block and a single
     linear output unit.

     Returns:
         The un-compiled ``Sequential`` model.
     """
     model = Sequential()

     # (filters, dropout rate) for conv blocks 1-4.
     conv_blocks = ((32, 0.2), (64, 0.1), (128, 0.05), (256, 0.05))
     for filters, drop_rate in conv_blocks:
         # Only the very first layer of the model declares the input shape.
         first_layer_kwargs = (
             {} if model.layers else {"input_shape": (timesteps, data_dim)}
         )
         model.add(Conv1D(filters, 3, padding="same", activation="relu",
                          **first_layer_kwargs))
         model.add(Conv1D(filters, 3, padding="same", activation="relu"))
         model.add(MaxPooling1D(pool_size=2))
         model.add(BatchNormalization())
         model.add(Dropout(drop_rate))

     # Dense block: Dense layers take vectors, so flatten the feature maps
     # into a 1-D vector first.
     model.add(Flatten())
     model.add(Dense(512, activation="relu"))
     model.add(BatchNormalization())
     model.add(Dropout(0.05))

     # Single linear unit: one regression value per sample.
     model.add(Dense(1))

     return model

In [None]:
N_EPOCHS = 2000

val_pred = []
test_pred = []

# Train one independent model per forecast day (16 horizons).
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    y = y_train[:, i]
    # Targets are centred on the training mean for this day; the mean is
    # added back to the predictions below.
    y_mean = y.mean()
    xv = X_val
    yv = y_val[:, i]
    model = build_model()
    opt = optimizers.Adam(lr=0.0001)
    model.compile(loss='mse', optimizer=opt, metrics=['mse'])

    # Named fit_callbacks (not `callbacks`) so the imported keras `callbacks`
    # module is not overwritten by this list.
    fit_callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min')
    ]

    model.fit(X_train, y - y_mean, batch_size = 65536, epochs = N_EPOCHS, verbose=2,
              validation_data=(xv, yv - y_mean), callbacks=fit_callbacks)

    val_pred.append(model.predict(X_val) + y_mean)
    test_pred.append(model.predict(X_test) + y_mean)

In [None]:
# Stack the 16 per-day prediction arrays into a (rows, 16) matrix and
# compare it with the validation targets.
val_matrix = np.array(val_pred).squeeze(axis=2).transpose()
print("Validation mse:", mean_squared_error(y_val, val_matrix))

In [None]:
# Weighted RMSE on the log1p scale: perishable items weigh 1.25, others 1.
weight = items["perishable"] * 0.25 + 1
sq_err = (y_val - np.array(val_pred).squeeze(axis=2).transpose()) ** 2
# Sum the squared errors over the 16 days, then weight each row.
err = sq_err.sum(axis=1) * weight
err = np.sqrt(err.sum() / weight.sum() / 16)
print('nwrmsle = {}'.format(err)) #nwrmsle

6. 輸出

In [None]:
print("Making submission...")
# (rows, 16) matrix of test predictions on the log1p scale.
y_test = np.array(test_pred).squeeze(axis=2).transpose()
forecast_days = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=forecast_days)
# Back to long form: one row per (store, item, date).
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Test rows without a prediction get 0; predictions are mapped back from
# log1p space with expm1 and clipped to [0, 1000].
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('CNN_VGGNet_1.csv', float_format='%.4f', index=None)

7. val/pred_val圖

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# val_pred
# Stack the 16 per-day validation predictions into a (rows, 16) matrix.
val_test = np.array(val_pred).squeeze(axis=2).transpose()

In [None]:
# Label the validation predictions with their dates, then go to long form.
val_days = pd.date_range("2017-07-26", periods=16)
val_preds = pd.DataFrame(val_test, index=df_2017.index, columns=val_days)
val_preds = val_preds.stack().to_frame("unit_sales")

In [None]:
# The stacked date level has no name, so reset_index() calls it 'level_2';
# use it as the new index.
pred_data=val_preds.reset_index().set_index('level_2')

In [None]:
# Total predicted unit_sales per day.
pred_data.index.names = ['date']
pred_data = pred_data.groupby("date")[["unit_sales"]].sum()

# Total actual unit_sales per day for 2017-05-21 .. 2017-08-10 (inclusive).
in_range = df_train.date.between('2017-05-21', '2017-08-10')
true_data = df_train[in_range].groupby("date")[["unit_sales"]].sum()

In [None]:
# Put actual and predicted daily totals side by side on the actual-data
# dates. `join_axes` is no longer accepted by pd.concat; reindexing the
# concatenated frame on true_data.index is the documented replacement.
df_plot = pd.concat([true_data, pred_data], axis=1).reindex(true_data.index)
df_plot.columns = ['true_unit_sales', 'pred_unit_sales']
df_plot

In [None]:
plt.figure(figsize=(15, 8))
sns.set_color_codes("pastel")
# Predicted daily totals (solid line).
g = sns.pointplot(
    data=df_plot, x=df_plot.index, y="pred_unit_sales",
    color='#CA972C', markers='p',
)
# Actual daily totals (dashed line), drawn on the same axes.
sns.pointplot(
    data=df_plot, x=df_plot.index, y="true_unit_sales",
    color='#FEBBAA', markers='p', linestyles='--',
)
plt.autoscale()
plt.show()