## Preliminaries

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from itertools import product
import ipywidgets
from tqdm import tqdm_notebook
from sklearn.model_selection import KFold
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
from statsmodels.tsa.arima_model import ARIMA

In [2]:
def downcast_dtypes(df):
    """Halve the width of 64-bit numeric columns of ``df``.

    Columns whose dtype is ``float64`` are converted to ``float32`` and
    columns whose dtype is ``int64`` are converted to ``int32``. All other
    columns are left untouched.

    The frame is modified in place and the same object is returned, so the
    call can be used either as a statement or as an expression.
    """
    wide_floats = []
    wide_ints = []

    # Sort every column into the bucket (if any) that needs narrowing
    for name in df:
        kind = df[name].dtype
        if kind == "float64":
            wide_floats.append(name)
        if kind == "int64":
            wide_ints.append(name)

    # Replace the selected columns with their 32-bit counterparts
    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

## Load data subset

In [8]:
# Directory holding the competition CSV files.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of the data before running.
data_path = "/Users/juanzinser/Workspace/advance-machine-learning/cds/competition/data/"

# Training sales plus the shop / item / category lookup tables
sales = pd.read_csv(data_path + r'sales_train_v2.csv')
shops = pd.read_csv(data_path + r'shops.csv')
items = pd.read_csv(data_path + r'items.csv')
item_cats = pd.read_csv(data_path + r'item_categories.csv')

# Rows to predict: drop the submission ID and tag them as month block 34 so
# they go through the same feature pipeline as the training months
test = pd.read_csv(data_path + r'test.csv')
test = test.drop('ID',axis=1)
test['date_block_num'] = 34

# Stack the test rows under the training rows. pd.concat is used instead of
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# Any column that `test` does not have is filled with NaN for those rows.
sales = pd.concat([sales, test])

## Get a feature matrix

In [9]:
# Key columns that identify one row of the feature matrix
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, build the cross product of all shops and all items that
# appear in that month
grid = []
for block_num in sales['date_block_num'].unique():
    # Filter the month once and reuse it for both the shop and the item list
    month_rows = sales.loc[sales['date_block_num'] == block_num]
    cur_shops = month_rows['shop_id'].unique()
    cur_items = month_rows['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Monthly sums of `item_cnt_day` at three granularities, each left-joined onto
# the grid; grid rows without a matching aggregate get 0.
#
# The sum is taken with a plain column selection followed by a rename. The
# nested-dict form `.agg({'item_cnt_day': {'target': 'sum'}})` was deprecated
# in pandas 0.20 and raises SpecificationError from pandas 1.0 onwards.
all_data = grid
for group_cols, agg_name in [
    (index_cols, 'target'),                           # shop-item-month
    (['shop_id', 'date_block_num'], 'target_shop'),   # shop-month
    (['item_id', 'date_block_num'], 'target_item'),   # item-month
]:
    gb = (
        sales.groupby(group_cols, as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': agg_name})
    )
    all_data = pd.merge(all_data, gb, how='left', on=group_cols).fillna(0)

# Shrink 64-bit columns and release the intermediates
all_data = downcast_dtypes(all_data)
del grid, gb
gc.collect();

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


## Create lagged variables

In [10]:
# Every non-key column currently in all_data gets a lagged copy
cols_to_rename = list(all_data.columns.difference(index_cols))

# Lags to build, expressed in month blocks
shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    # New name for each lagged column, e.g. `target` -> `target_lag_1`
    lagged_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    # Moving the month key forward by `month_shift` makes the values of month m
    # join onto the row of month m + month_shift below
    train_shift = (
        all_data[index_cols + cols_to_rename]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month_shift)
        .rename(columns=lagged_names)
    )

    # Rows with no counterpart `month_shift` months earlier get 0
    all_data = all_data.merge(train_shift, how='left', on=index_cols).fillna(0)

del train_shift

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

In [11]:
# Keep month blocks 12 and later only (discards the old data from year 2013)
all_data = all_data.loc[all_data['date_block_num'].ge(12)]

In [12]:
# List of all lagged features: the columns whose name ends in `_lag_<n>` for
# one of the lags in shift_range. The whole suffix is matched rather than the
# last character alone, so multi-digit lags such as 12 are recognised on their
# own merit and unrelated columns that merely end in a digit are not picked up.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]
# We will drop these at fitting stage: everything that is neither a lag feature
# nor a key column, plus the month key itself
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

# Joined after to_drop_cols was computed, so item_category_id is not in the
# drop list and stays in the frame as a feature
all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

# Snapshot of the full feature frame
all_data_saved = all_data.copy()

## Cross-validation

In [13]:
# Use the first three months to predict the fourth month. Use the first four months to predict the fifth month. etc.
def tscv(data, model, target='target', group='date_block_num', starting_window=3):
    """Expanding-window time-series cross-validation.

    For every value ``val`` of ``data[group]`` from
    ``min + starting_window`` up to and including the maximum, the model is
    fitted on all rows with ``group < val`` and scored on the rows with
    ``group == val``.

    Parameters
    ----------
    data : DataFrame containing the ``target`` and ``group`` columns. Every
        column except ``target`` (including ``group``) is passed to the model
        as a feature.
    model : object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        in place on every fold.
    target : name of the label column.
    group : name of the integer column that orders the folds in time.
    starting_window : number of leading groups reserved for the first
        training set.

    Returns
    -------
    list of float : RMSE of each validation fold, in chronological order.
    """
    first_group = data[group].min()
    last_group = data[group].max()
    scores = []
    for val in range(first_group + starting_window, last_group + 1):
        print(val)  # progress: the group currently being validated
        past = data[data[group] < val]
        current = data[data[group] == val]
        # The label is read through the `target` parameter on both sides;
        # a hard-coded 'target' here would ignore the caller's choice
        model.fit(past.drop(target, axis=1), past[target])
        y_pred = model.predict(current.drop(target, axis=1))
        scores.append(sqrt(mean_squared_error(current[target], y_pred)))
    return scores

## Final train test split

In [14]:
# Month block 34 holds the rows to predict; every earlier block is training data
test_data = all_data.loc[all_data['date_block_num'].eq(34)]
all_data = all_data.loc[all_data['date_block_num'].lt(34)]

# Labels first, then the two feature matrices with the non-feature columns removed
y_train = all_data['target']
X_train = all_data.drop(labels=to_drop_cols, axis=1)
X_test = test_data.drop(labels=to_drop_cols, axis=1).iloc[:]

## Predictions

In [15]:
# LightGBM settings for a squared-error regression evaluated with RMSE
lgb_params = {
    # objective and evaluation metric
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 128,
    'bagging_fraction': 0.75,
    'learning_rate': 0.03,
    'objective': 'mse',
    'bagging_seed': 128,
    'num_leaves': 128,
    'bagging_freq': 1,
    'verbose': 0,
}

# Fit 100 boosting rounds on the training matrix, then score the test matrix
clf_lgb = lgb.train(
    params=lgb_params,
    train_set=lgb.Dataset(data=X_train, label=y_train),
    num_boost_round=100,
)
pred_lgb = clf_lgb.predict(X_test)

## Submission

In [16]:
# One row per test_data row: a positional ID (0, 1, 2, ...) and the prediction
# clipped to the range [0, 20].
# The frame is built directly instead of via reset_index + drop(..., 1, inplace=True):
# passing `axis` positionally to DataFrame.drop is rejected by pandas 2.0.
# NOTE(review): IDs are regenerated from row position, so this assumes the rows
# of test_data are in the same order as the rows of test.csv — confirm.
submission = pd.DataFrame(
    {
        'ID': np.arange(len(test_data), dtype=np.int64),
        'item_cnt_month': pred_lgb.clip(0, 20),
    },
    columns=['ID', 'item_cnt_month'],
)

In [17]:
# Write the submission to the working directory without the row index
submission.to_csv(path_or_buf='submission.csv', index=False)