# Download data

1. first you need to accept rules at
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/rules
    
2. then install the Kaggle API (https://github.com/Kaggle/kaggle-api#installation) and get your API token

3. finally, you can download any competition's data and make submissions using the Kaggle API

In [None]:
# Create data/ if it is missing and download the competition files into it.
# `-p` keeps the cell re-runnable: a plain `mkdir` fails once data/ exists,
# and `&&` would then skip the download.
!mkdir -p data && kaggle competitions download -c competitive-data-science-predict-future-sales -p data/

# Predict future sales

Here we will implement ensembling schemes: simple linear mix and stacking.

We will spend several cells to load data and create feature matrix first.

In [None]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
%matplotlib inline 

pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

from itertools import product

def downcast_dtypes(df):
    '''
        Halve the width of the 64-bit numeric columns of `df`:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        The dataframe is modified in place and the same object is returned.
        Columns of any other dtype are left as they are.
    '''

    # (wide dtype name, narrow numpy type) pairs, handled one after the other
    conversions = (("float64", np.float32), ("int64", np.int32))

    for wide, narrow in conversions:
        wide_cols = [name for name in df if df[name].dtype == wide]
        df[wide_cols] = df[wide_cols].astype(narrow)

    return df

# Load data subset

In [None]:
# List the contents of the data/ directory
!ls data

In [None]:
# Read the competition CSVs from data/ (read_csv infers the gzip compression
# from the `.gz` extension)
sales, shops, items, item_cats = (
    pd.read_csv(path)
    for path in (
        'data/sales_train.csv.gz',
        'data/shops.csv',
        'data/items.csv',
        'data/item_categories.csv',
    )
)

sample_submission, test = (
    pd.read_csv(path)
    for path in ('data/sample_submission.csv.gz', 'data/test.csv.gz')
)

# Get a feature matrix

We now need to prepare the features.

In [None]:
# Key columns that identify one row of the feature matrix
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that
# month. Block 34 (appended here) takes its shops and items from `test`.
months = list(sales['date_block_num'].unique()) + [34]
grid = []
for block_num in months:

    if block_num < 34:
        # Filter the month once and reuse the slice for both unique() calls
        month_sales = sales[sales['date_block_num'] == block_num]
        cur_shops = month_sales['shop_id'].unique()
        cur_items = month_sales['item_id'].unique()
    else:
        cur_shops = test['shop_id'].unique()
        cur_items = test['item_id'].unique()

    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Monthly sums of `item_cnt_day` at three granularities, each joined to the grid:
#   shop-item-month -> `target`, shop-month -> `target_shop`, item-month -> `target_item`
# The column is selected, summed and renamed explicitly because the nested-dict
# renamer (`agg({'col': {'new_name': 'sum'}})`) was deprecated in pandas 0.20
# and removed in pandas 1.0.
all_data = grid
for agg_keys, agg_name in [
    (index_cols, 'target'),
    (['shop_id', 'date_block_num'], 'target_shop'),
    (['item_id', 'date_block_num'], 'target_item'),
]:
    gb = (sales.groupby(agg_keys, as_index=False)['item_cnt_day']
               .sum()
               .rename(columns={'item_cnt_day': agg_name}))
    # Left join keeps every grid row; rows without a matching aggregate get 0
    all_data = pd.merge(all_data, gb, how='left', on=agg_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb
gc.collect();

After creating a grid, we can calculate some features. We will use lags from [1, 2, 3, 4, 5, 12] months ago.

In [None]:
# List of columns that we will use to create lags (everything but the keys)
cols_to_rename = list(all_data.columns.difference(index_cols))

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()

    # Moving each row `month_shift` months forward makes the merge below attach
    # the values of month (t - month_shift) to the row of month t
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lag_names)

    # Rows with no counterpart `month_shift` months earlier get 0
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]

# List of all lagged features. Match on the `_lag_` marker added above: testing
# only the last character of the name worked for lag 12 solely because '2' is
# also a lag, and would break for other shifts (e.g. 6 or 10).
fit_cols = [col for col in all_data.columns if '_lag_' in col]
# We will drop these at fitting stage (built in column order so the list is
# deterministic, unlike a set difference)
to_drop_cols = [
    col for col in all_data.columns
    if col not in fit_cols and col not in index_cols
] + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

At this point we have created the feature matrix. It is stored in the `all_data` variable. Take a look:

In [None]:
# Show the first five rows of the feature matrix
all_data.head(5)

# Train/test split

We will treat the last month with data (33) as a validation set.

In [None]:
# Switch: with `cv` on, the block just before the latest one is held out;
# with it off, the latest block itself is used
cv = True

# `date_block_num` is not used as a feature, but we keep it aside because it is
# what the dataset is split on
dates = all_data['date_block_num']

last_block = (dates.max() - 1) if cv else dates.max()

print('Test `date_block_num` is %d' % last_block)

In [None]:
def split_train_test(dates, block):
    """Split the global `all_data` by month around `block`.

    Rows whose date is strictly before `block` form the train part, rows whose
    date equals `block` form the test part. Features are `all_data` without
    the columns listed in the global `to_drop_cols`; targets are the `target`
    column as numpy arrays.

    Returns (X_train, X_test, y_train, y_test, dates_train, dates_test).
    """
    # Build each boolean mask once and reuse it for features, target and dates
    is_train = dates < block
    is_test = dates == block

    features_train = all_data.loc[is_train].drop(to_drop_cols, axis=1)
    features_test = all_data.loc[is_test].drop(to_drop_cols, axis=1)

    target_train = all_data.loc[is_train, 'target'].values
    target_test = all_data.loc[is_test, 'target'].values

    return (features_train, features_test,
            target_train, target_test,
            dates[is_train], dates[is_test])


X_train, X_test, y_train, y_test, dates_train, dates_test = split_train_test(dates, last_block)

# First level models 

You need to implement a basic stacking scheme. Note that we have a time component here. 

We always use first-level models to build two datasets: test meta-features and second-level train meta-features. Let's see how we get the test meta-features first. 


In a time-series task we usually have a fixed period of time we are asked to predict, such as a day, a week, a month, or an arbitrary period of duration T.


1. Split the train data into chunks of duration T. Select first M chunks.

2. Fit N diverse models on those M chunks and predict for the chunk M+1. Then fit those models on first M+1 chunks and predict for chunk M+2 and so on, until you hit the end. After that use all train data to fit models and get predictions for test. Now we will have meta-features for the chunks starting from number M+1 as well as meta-features for the test.

3. Now we can use meta-features from the first K chunks [M+1,M+2,..,M+K] to fit level 2 models and validate them on chunk M+K+1. Essentially we are back to step 1, with fewer chunks and with meta-features instead of features.


### Test meta-features

First, we will run *linear regression* on numeric columns and get predictions for the last month.

In [None]:
# Fit ordinary least squares on the train features (`fit` returns the
# estimator itself) and predict the held-out block
lr = LinearRegression().fit(X_train.values, y_train)
pred_lr = lr.predict(X_test.values)

print('Test R-squared for linreg is %f' % r2_score(y_test, pred_lr))

And then we run *LightGBM*.

In [None]:
# LightGBM settings used for the first-level LightGBM model
lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 4,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'learning_rate': 0.03,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**4,
    'bagging_freq': 1,
    'verbose': 0,
}

# Train for 100 boosting rounds. The hold-out Dataset has to be passed as
# `valid_sets`; `valid_names` only supplies display names for those sets, so
# passing the Dataset there meant no validation metric was ever evaluated.
# NOTE(review): `verbose_eval` was removed in LightGBM 4.0; on newer versions
# replace it with `callbacks=[lgb.log_evaluation()]`.
model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100,
                  valid_sets=[lgb.Dataset(X_test, label=y_test)],
                  valid_names=['test'], verbose_eval=True)
pred_lgb = model.predict(X_test)

print('Test R-squared for LightGBM is %f' % r2_score(y_test, pred_lgb))

Finally, concatenate test predictions to get test meta-features.

In [None]:
# One column per first-level model: [linear regression, LightGBM]
X_test_level2 = np.column_stack([pred_lr, pred_lgb])

### Train meta-features

**Now we will implement the scheme described above**. We will use duration **T** equal to one month and **M=15**.  

That is, we need to get predictions (meta-features) from *linear regression* and *LightGBM* for months 28, 29, 30, 31, 32. Use the same parameters as in above models.

In [None]:
# The last five distinct months of the train period get meta-features
months_level2 = dates_train.unique()[-5:]
print(months_level2)

# Dates of the rows that fall into those months
dates_train_level2 = dates_train.loc[dates_train.isin(months_level2)]

# Target of the 2nd level dataset: the same rows taken from `y_train`
y_train_level2 = y_train[dates_train.isin(months_level2).values]

In [None]:
# And here we create the 2nd level feature matrix, init it with zeros first.
# Column 0 holds linear-regression predictions, column 1 LightGBM predictions,
# the same order as in `X_test_level2`.
X_train_level2 = np.zeros([y_train_level2.shape[0], 2])

# Now fill `X_train_level2` with metafeatures
for cur_block_num in months_level2:

    print(cur_block_num)

    # 1. Expanding window: fit on every month strictly before the current one
    #    and predict the current one, so no prediction sees its own month.
    fit_mask = (dates_train < cur_block_num).values
    block_mask = (dates_train == cur_block_num).values
    X_fit, y_fit = X_train.loc[fit_mask], y_train[fit_mask]
    X_block = X_train.loc[block_mask]

    # 2. Linear regression. A fresh estimator is used so that `lr`, fitted on
    #    the whole train set, is not overwritten.
    block_pred_lr = LinearRegression().fit(X_fit.values, y_fit).predict(X_block.values)

    # 3. LightGBM with the same parameters and number of rounds as the
    #    first-level model
    block_model = lgb.train(lgb_params, lgb.Dataset(X_fit, label=y_fit), 100)
    block_pred_lgb = block_model.predict(X_block)

    # 4. Store both predictions in the rows of this month. `dates_train_level2`
    #    is a filtered view of `dates_train`, so row order within the month
    #    matches `X_block`.
    level2_rows = (dates_train_level2 == cur_block_num).values
    X_train_level2[level2_rows] = np.c_[block_pred_lr, block_pred_lgb]


# Sanity check against the reference column means shipped with the notebook
# NOTE(review): presumably these values only reproduce with the original data
# and library versions — confirm before relying on the assert.
if cv:
    assert np.all(np.isclose(X_train_level2.mean(axis=0), [0.31651825, 0.28959768]))

Ensembles work best when the first-level models are diverse. We can qualitatively analyze the diversity by examining a *scatter plot* of the two meta-features. Plot the scatter plot below. 

In [None]:
# Scatter of the two train meta-features: one point per level-2 train row,
# x = linear-regression prediction, y = LightGBM prediction. Points far from
# the diagonal are rows where the two models disagree.
plt.figure(figsize=(6, 6))
plt.scatter(X_train_level2[:, 0], X_train_level2[:, 1], s=2, alpha=0.3)
plt.xlabel('linreg prediction')
plt.ylabel('LightGBM prediction')
plt.show()

# Ensembling

Now, when the meta-features are created, we can ensemble our first level models.

### Simple convex mix

Let's start with simple linear convex mix:

$$
mix= \alpha\cdot\text{linreg_prediction}+(1-\alpha)\cdot\text{lgb_prediction}
$$

We need to find an optimal $\alpha$. This is easy because there is only one parameter, so a grid search is feasible. Next, find the optimal $\alpha$ among the values in the `alphas_to_try` array. Remember that you need to use train meta-features (not test) when searching for $\alpha$. 

In [None]:
alphas_to_try = np.linspace(0, 1, 1001)

# Grid search on the train meta-features. Column 0 is the linear-regression
# prediction and column 1 the LightGBM prediction, so the dot product with
# [alpha, 1 - alpha] is alpha * linreg + (1 - alpha) * lgb.
r2_by_alpha = [
    r2_score(y_train_level2, X_train_level2.dot(np.array([alpha, 1 - alpha])))
    for alpha in alphas_to_try
]

# First alpha that reaches the highest train R-squared, and that score
best_alpha = alphas_to_try[np.argmax(r2_by_alpha)]
r2_train_simple_mix = max(r2_by_alpha)

print('Best alpha: %f; Corresponding r2 score on train: %f' % (best_alpha, r2_train_simple_mix))

Now use the $\alpha$ you've found to compute predictions for the test set 

In [None]:
# Apply the convex mix to the test meta-features (column 0 = linear
# regression, column 1 = LightGBM): alpha * linreg + (1 - alpha) * lgb
test_preds = X_test_level2.dot(np.array([best_alpha, 1 - best_alpha]))
r2_test_simple_mix = r2_score(y_test, test_preds)

print('Test R-squared for simple mix is %f' % r2_test_simple_mix)

In [None]:
submit = all_data.loc[dates_test.index, ['shop_id', 'item_id']]
submit['item_cnt_month'] = test_preds

submit = test.merge(submit, how='left', on=['shop_id', 'item_id']).fillna(0)
submit = submit[['ID', 'item_cnt_month']]

!mkdir submissions
submit.to_csv('submissions/linear_mix.csv', index=False)
submit.head()

In [None]:
# Upload submissions/linear_mix.csv to the competition with the message "linear mix"
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submissions/linear_mix.csv -m "linear mix"

### Stacking

Now, we will try a more advanced ensembling technique. Fit a linear regression model to the meta-features. Use the same parameters as in the model above.

In [None]:
# YOUR CODE GOES HERE

Compute R-squared on the train and test sets.

In [None]:
# Second-level model: linear regression on the two meta-features. It is fitted
# here so that this cell does not depend on a variable name chosen elsewhere.
stacking_lr = LinearRegression().fit(X_train_level2, y_train_level2)

train_preds = stacking_lr.predict(X_train_level2)
r2_train_stacking = r2_score(y_train_level2, train_preds)

test_preds = stacking_lr.predict(X_test_level2)
r2_test_stacking = r2_score(y_test, test_preds)

print('Train R-squared for stacking is %f' % r2_train_stacking)
print('Test  R-squared for stacking is %f' % r2_test_stacking)

In [None]:
submit = all_data.loc[dates_test.index, ['shop_id', 'item_id']]
submit['item_cnt_month'] = test_preds

submit = test.merge(submit, how='left', on=['shop_id', 'item_id']).fillna(0)
submit = submit[['ID', 'item_cnt_month']]

submit.to_csv('submissions/linear_stacking.csv', index=False)
submit.head()

In [None]:
# Upload submissions/linear_stacking.csv to the competition with the message "stacking with a linear model"
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submissions/linear_stacking.csv -m "stacking with a linear model"