In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
from tqdm import tqdm_notebook
from itertools import product, combinations
%matplotlib inline

In [2]:
datadir = '../data/raw'
item_categories = pd.read_csv(os.path.join(datadir, 'item_categories.csv'))
item = pd.read_csv(os.path.join(datadir, 'items.csv'))
train = pd.read_csv(os.path.join(datadir, 'sales_train_v2.csv'))
shops = pd.read_csv(os.path.join(datadir, 'shops.csv'))
test = pd.read_csv(os.path.join(datadir, 'test.csv'))

In [3]:
all_data = pd.read_csv('../data/processed/all_data.csv')

In [4]:
all_data.head()

Unnamed: 0.1,Unnamed: 0,shop_id,item_id,date_block_num,city_code,item_category_id,item_category_name,category_type,category_subtype,target,...,target_item_lag_3,target_category_lag_3,target_subcategory_lag_3,target_shop_category_lag_3,target_lag_6,target_shop_lag_6,target_item_lag_6,target_category_lag_6,target_subcategory_lag_6,target_shop_category_lag_6
0,0,59,22154,0,31,37,Кино - Blu-Ray,11,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,59,2574,0,31,55,Музыка - CD локального производства,13,4,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,59,2607,0,31,55,Музыка - CD локального производства,13,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,59,2614,0,31,55,Музыка - CD локального производства,13,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,59,2808,0,31,30,Игры PC - Стандартные издания,8,57,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
import lightgbm as lgb

## Feature Selection
Given large number of columns - restricting memory - we have to do feature selection

In [6]:
def select_features(X_train, y_train, num_features):
    lgb_params = {
        'feature_fraction': 0.75,
        'metric': 'rmse',
        'nthread': 4,
        'min_data_in_leaf': 2**7,
        'bagging_fraction': 0.75,
        'learning_rate': 0.03,
        'objective': 'mse',
        'bagging_seed': 2**7,
        'num_leaves': 2**7,
        'bagging_freq': 1,
        'verbose': 1
    }
    
    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
    print('Plot feature importances...')
    fig = plt.figure(figsize=(10, 14))
    ax = lgb.plot_importance(lgb_model, figsize=(10, 30))
    plt.show()
    
    feature_importances = list(zip(lgb_model.feature_name(), lgb_model.feature_importances()))
    selected_features = [x[0] for x in sorted(feature_importances, key=lambda x: x[1], reverse=True)[:num_features]]
    
    del lgb_model
    return selected_features

### Interactions
The interactions between lagged features might be important for trend detection

In [11]:
shift_range = [1, 2, 3, 6]
lag_cols = ['target','target_shop','target_item','target_category', 'target_subcategory','target_shop_category']
interactions = pd.DataFrame(all_data[['date_block_num', 'target']])
eps = 1e-5
all_lag_cols = []
for col1, col2 in combinations(lag_cols[1:],2):
    for lag in shift_range:
        lag_col1 = '{0}_lag_{1}'.format(col1, lag)
        lag_col2 = '{0}_lag_{1}'.format(col2, lag)
        
        diff_col = lag_col1 + '_diff_' + lag_col2
        ratio_col = lag_col1 + '_ratio_' + lag_col2
        
        interactions[diff_col] = all_data[lag_col1] - all_data[lag_col2]
        interactions[ratio_col] = all_data[lag_col1]/(all_data[lag_col2]+eps)
        
        all_lag_cols.append(diff_col)
        all_lag_cols.append(ratio_col)
        
X_train = interactions[interactions['date_block_num'] < 33]['all_lag_cols']
y_train = all_data[all_data['date_block_num'] < 33]['target']

selected_interactions = select_features(X_train, y_train, 10)
for feature in selected_interactions:
    all_data[feature] = interactions[feature]
    
del X_train
del y_train

KeyError: 'all_lag_cols'