In [1]:
import gc
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
# import dask_xgboost as xgb
# import dask.dataframe as dd
import matplotlib.pyplot as plt
from sklearn import preprocessing, metrics

In [3]:
# メモリ使用量の削減
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
print('Reading files...')

calendar_df = pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv')
calendar_df = reduce_mem_usage(calendar_df)
print('Calendar has {} rows and {} columns\n'.format(calendar_df.shape[0], calendar_df.shape[1]))

sell_prices_df = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')
sell_prices_df = reduce_mem_usage(sell_prices_df)
print('Sell prices has {} rows and {} columns\n'.format(sell_prices_df.shape[0], sell_prices_df.shape[1]))

sales_train_validation_df = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')
print('Sales train validation has {} rows and {} columns\n'.format(sales_train_validation_df.shape[0], 
                                                                    sales_train_validation_df.shape[1]))

submission_df = pd.read_csv('../input/m5-forecasting-accuracy/sample_submission.csv')

Reading files...
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns

Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns

Sales train validation has 30490 rows and 1919 columns



In [6]:
# 商品情報を抽出
product_df = sales_train_validation_df.loc[:, "id":"state_id"]

# 列方向に連なっていたのを変形し行方向に連ねるように整理
print("[BEFORE] {} rows and {} columns".format(sales_train_validation_df.shape[0], sales_train_validation_df.shape[1]))
sales_train_validation_df = pd.melt(sales_train_validation_df, 
                                    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], 
                                    var_name = 'day', value_name = 'demand')
print('[AFTER] {} rows and {} columns'.format(sales_train_validation_df.shape[0], sales_train_validation_df.shape[1]))
sales_train_validation_df = reduce_mem_usage(sales_train_validation_df)

[BEFORE] 30490 rows and 1919 columns
    [AFTER] 58327370 rows and 8 columns
Mem. usage decreased to 3226.27 Mb (9.4% reduction)


In [7]:
# seperate test dataframes
valid_df = submission_df[submission_df["id"].str.contains("validation")]
eval_df = submission_df[submission_df["id"].str.contains("evaluation")]

# change column names
# validation data: F1 ~ F28 => d_1914 ~ d_1941
valid_df.columns = ['id', 'd_1914', 'd_1915', 'd_1916', 'd_1917', 'd_1918', 'd_1919', 'd_1920', 'd_1921', 'd_1922', 'd_1923',
                    'd_1924', 'd_1925', 'd_1926', 'd_1927', 'd_1928', 'd_1929', 'd_1930', 'd_1931', 'd_1932', 'd_1933', 
                    'd_1934', 'd_1935', 'd_1936', 'd_1937', 'd_1938', 'd_1939', 'd_1940', 'd_1941']

# evaluation data: F1 ~ F28 => d_1942 ~ d_1969
eval_df.columns = ['id', 'd_1942', 'd_1943', 'd_1944', 'd_1945', 'd_1946', 'd_1947', 'd_1948', 'd_1949', 'd_1950', 'd_1951',
                    'd_1952', 'd_1953', 'd_1954', 'd_1955', 'd_1956', 'd_1957', 'd_1958', 'd_1959', 'd_1960', 'd_1961',
                    'd_1962', 'd_1963', 'd_1964', 'd_1965', 'd_1966', 'd_1967', 'd_1968', 'd_1969']


# melt, mergeを使ってsubmission用のdataframeを上のsales_train_validationと同様の形式に変形
valid_df = valid_df.merge(product_df, how = 'left', on = 'id')
valid_df = pd.melt(valid_df, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                    var_name = 'day', value_name = 'demand')

# valid_dfと同様eval_dfとproduct_dfをmergeさせたい
# しかしidが_evaluationのままだとデータが一致せずmergeできないので一時的に_validationにidを変更
eval_df['id'] = eval_df['id'].str.replace('_evaluation','_validation')
eval_df = eval_df.merge(product_df, how = 'left', on = 'id')
eval_df['id'] = eval_df['id'].str.replace('_validation','_evaluation')
eval_df = pd.melt(eval_df, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                    var_name = 'day', value_name = 'demand')

sales_train_validation_df['part'] = 'train'
valid_df['part'] = 'valid'
eval_df['part'] = 'eval'

data_df = pd.concat([sales_train_validation_df, valid_df, eval_df], axis=0)

# 不要なdataframeの削除
del sales_train_validation_df, valid_df, eval_df

In [8]:
data_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
5,HOBBIES_1_006_CA_1_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
6,HOBBIES_1_007_CA_1_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
7,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1,12,train
8,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_1,2,train
9,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train


In [12]:
# NOTE get only a sample for fast training(切りはよくない)
some_data_df = data_df.loc[55000000:]
some_data_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part
55000000,FOODS_2_312_WI_2_validation,FOODS_2_312,FOODS_2,FOODS,WI_2,WI,d_1804,0,train
55000001,FOODS_2_313_WI_2_validation,FOODS_2_313,FOODS_2,FOODS,WI_2,WI,d_1804,1,train
55000002,FOODS_2_314_WI_2_validation,FOODS_2_314,FOODS_2,FOODS,WI_2,WI,d_1804,3,train
55000003,FOODS_2_315_WI_2_validation,FOODS_2_315,FOODS_2,FOODS,WI_2,WI,d_1804,16,train
55000004,FOODS_2_316_WI_2_validation,FOODS_2_316,FOODS_2,FOODS,WI_2,WI,d_1804,0,train
55000005,FOODS_2_317_WI_2_validation,FOODS_2_317,FOODS_2,FOODS,WI_2,WI,d_1804,0,train
55000006,FOODS_2_318_WI_2_validation,FOODS_2_318,FOODS_2,FOODS,WI_2,WI,d_1804,0,train
55000007,FOODS_2_319_WI_2_validation,FOODS_2_319,FOODS_2,FOODS,WI_2,WI,d_1804,0,train
55000008,FOODS_2_320_WI_2_validation,FOODS_2_320,FOODS_2,FOODS,WI_2,WI,d_1804,5,train
55000009,FOODS_2_321_WI_2_validation,FOODS_2_321,FOODS_2,FOODS,WI_2,WI,d_1804,0,train


In [15]:
print(data_df["part"].value_counts())
print(some_data_df["part"].value_counts())

train    58327370
valid      853720
eval       853720
Name: part, dtype: int64
train    3327370
valid     853720
eval      853720
Name: part, dtype: int64


In [17]:
# drop some calendar features TODO なぜ？
calendar_df.drop(['weekday', 'wday', 'month', 'year'], inplace = True, axis=1)
calendar_df

Unnamed: 0,date,wm_yr_wk,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,d_1,,,,,0,0,0
1,2011-01-30,11101,d_2,,,,,0,0,0
2,2011-01-31,11101,d_3,,,,,0,0,0
3,2011-02-01,11101,d_4,,,,,1,1,0
4,2011-02-02,11101,d_5,,,,,1,0,1
5,2011-02-03,11101,d_6,,,,,1,1,1
6,2011-02-04,11101,d_7,,,,,1,0,0
7,2011-02-05,11102,d_8,,,,,1,1,1
8,2011-02-06,11102,d_9,SuperBowl,Sporting,,,1,1,1
9,2011-02-07,11102,d_10,,,,,1,1,0


In [18]:
# delete eval_df for now
data_df = data_df[data_df['part'] != 'eval']

In [20]:
data_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,train


In [22]:
# notebook crash with the entire dataset (maybee use tensorflow, dask, pyspark xD)
data_df = pd.merge(data_df, calendar_df, how = 'left', left_on = ['day'], right_on = ['d'])  # day列でmerge
data_df.drop(['d', 'day'], inplace = True, axis = 1)

In [23]:
   # get the sell price data (this feature should be very importan   data_df = data_df.merge(sell_prices_df, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left   print('Our final dataset to train has {} rows and {} columns'.format(data_df.shape[0], data_df.shape[1t()

In [24]:
data_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2011-01-29,11101,,,,,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2011-01-29,11101,,,,,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2011-01-29,11101,,,,,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2011-01-29,11101,,,,,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2011-01-29,11101,,,,,0,0,0


In [25]:
# 欠損値補間とlabel encode
def transform(data_df):
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        data_df[feature].fillna('unknown', inplace=True)
        
    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data_df[feature] = encoder.fit_transform(data_df[feature])
    
    return data_df

In [42]:
data_df.tail(30)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
59181060,FOODS_3_798_WI_3_validation,FOODS_3_798,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181061,FOODS_3_799_WI_3_validation,FOODS_3_799,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181062,FOODS_3_800_WI_3_validation,FOODS_3_800,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181063,FOODS_3_801_WI_3_validation,FOODS_3_801,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181064,FOODS_3_802_WI_3_validation,FOODS_3_802,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181065,FOODS_3_803_WI_3_validation,FOODS_3_803,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181066,FOODS_3_804_WI_3_validation,FOODS_3_804,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181067,FOODS_3_805_WI_3_validation,FOODS_3_805,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181068,FOODS_3_806_WI_3_validation,FOODS_3_806,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0
59181069,FOODS_3_807_WI_3_validation,FOODS_3_807,FOODS_3,FOODS,WI_3,WI,0,valid,2016-05-22,11617,,,,,0,0,0


In [43]:
data_df.groupby(['id'])['demand'].mean()

id
FOODS_1_001_CA_1_validation        0.774343
FOODS_1_001_CA_2_validation        1.140649
FOODS_1_001_CA_3_validation        1.185987
FOODS_1_001_CA_4_validation        0.354456
FOODS_1_001_TX_1_validation        0.576507
FOODS_1_001_TX_2_validation        0.572901
FOODS_1_001_TX_3_validation        0.404431
FOODS_1_001_WI_1_validation        0.559505
FOODS_1_001_WI_2_validation        0.450799
FOODS_1_001_WI_3_validation        0.290057
FOODS_1_002_CA_1_validation        0.470891
FOODS_1_002_CA_2_validation        0.594539
FOODS_1_002_CA_3_validation        0.310665
FOODS_1_002_CA_4_validation        0.329727
FOODS_1_002_TX_1_validation        0.166409
FOODS_1_002_TX_2_validation        0.234415
FOODS_1_002_TX_3_validation        0.274601
FOODS_1_002_WI_1_validation        0.742401
FOODS_1_002_WI_2_validation        0.325605
FOODS_1_002_WI_3_validation        0.329212
FOODS_1_003_CA_1_validation        0.820196
FOODS_1_003_CA_2_validation        1.617723
FOODS_1_003_CA_3_validation  

In [39]:
data_df.transform(lambda x: x.shift(28))

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,,,


In [44]:
# 特徴量エンジニアリング
def feature_engineering(data):
    
    # rolling demand features
    data['lag_t28'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28))
    data['lag_t29'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(29))
    data['lag_t30'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(30))

    # per a week
    data['rolling_mean_t7'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).mean())
    data['rolling_std_t7'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).std())

    # per a month
    data['rolling_mean_t30'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).mean())

    # per 3 month 
    data['rolling_mean_t90'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(90).mean())

    # half year
    data['rolling_mean_t180'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(180).mean())

    # per a month
    data['rolling_std_t30'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).std())
    data['rolling_skew_t30'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).skew())
    data['rolling_kurt_t30'] = data.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).kurt())
    
    # price features
    data['lag_price_t1'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1))
    data['price_change_t1'] = (data['lag_price_t1'] - data['sell_price']) / (data['lag_price_t1'])
    data['rolling_price_max_t365'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1).rolling(365).max())
    data['price_change_t365'] = (data['rolling_price_max_t365'] - data['sell_price']) / (data['rolling_price_max_t365'])
    data['rolling_price_std_t7'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = data.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(30).std())
    data.drop(['rolling_price_max_t365', 'lag_price_t1'], inplace = True, axis = 1)
    
    # time features
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['week'] = data['date'].dt.week
    data['day'] = data['date'].dt.day
    data['dayofweek'] = data['date'].dt.dayofweek
    
    return data

In [45]:
data_df = transform(data_df)
data_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,0,train,2011-01-29,11101,30,4,4,2,0,0,0
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,0,train,2011-01-29,11101,30,4,4,2,0,0,0
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,0,train,2011-01-29,11101,30,4,4,2,0,0,0
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,0,train,2011-01-29,11101,30,4,4,2,0,0,0
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,0,train,2011-01-29,11101,30,4,4,2,0,0,0


In [46]:
data_df = feature_engineering(data_df)

KeyError: 'Column not found: sell_price'