# Import packages and load data

In [1]:
import pandas as pd
import numpy as np
import os
from itertools import product
import matplotlib.pyplot as plt
import time
import gc
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.linear_model
from sklearn import preprocessing

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

import xgboost
import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras

%matplotlib inline 


In [2]:
# Folder holding the competition files; every path below is built relative to it.
DATA_FOLDER = '../readonly/final_project_data/'

# Daily sales records; the rest of the notebook reads the columns date, date_block_num,
# shop_id, item_id, item_price and item_cnt_day from this frame.
transactions    = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
# Item dictionary (item_id, item_name, item_category_id are used below).
items           = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
item_categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
# Shop dictionary (shop_id, shop_name are used below).
shops           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

# (ID, shop_id, item_id) triples for which a prediction has to be submitted.
test_set=pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))

# Dataset exploration 
based on Pandas basics notebook from week1

In [3]:
## From week 1:
# How many items are there, such that their price stays constant during the whole period of time?
# Remember, the data can sometimes be noisy. :)
# grouping3=pd.DataFrame(transactions[['item_id','item_price']],copy=True)
# grouping3['item_price']=abs(grouping3['item_price']).round(decimals=2)
# grouping3=grouping3.drop_duplicates()
# sum3=grouping3.groupby('item_id').count()
# sum3[sum3['item_price']==1].count()

# Maybe could be the feature (price changes count, last price delta, price changed time)

In [4]:
# this looks weird, but there are only 2 such items anyway
# items[items.item_name.str.startswith('!')==True]

## EDA - to do:
 1. Analyze time dependencies, if any, over different intervals
 2. Plot some of the dependencies?
 3. From the task: target distribution is visualized, time trend is assessed

# Preparing simple submission

<b>A good exercise is to reproduce previous_value_benchmark.</b>

<i> Stolen from somewhere from course materials </i>


As the name suggests, in this benchmark the prediction for each shop/item pair is simply its monthly sales from the previous month, i.e. October 2015.

The most important step at reproducing this score is correctly aggregating daily data and constructing monthly sales data frame. You need to get lagged values, fill NaNs with zeros and clip the values into [0,20] range. If you do it correctly, you'll get precisely 1.16777 on the public leaderboard.

In [5]:
# df_filter=transactions.set_index(pd.to_datetime(transactions["date"],format='%d.%m.%Y'))["2015-10"]
# df_october=pd.DataFrame(df_filter)

# df_october_groupped = df_october.groupby(['shop_id','item_id'],as_index=False).agg({'item_cnt_day':'sum'})

# predictions = pd.merge(test_set,df_october_groupped,
#                        left_on=('shop_id','item_id'), right_on=('shop_id','item_id'), how='left')
# ##filling NAs
# predictions.item_cnt_day=predictions.item_cnt_day.fillna(0.0)
# ## clipping
# predictions.loc[predictions.item_cnt_day>20,'item_cnt_day']=20
# predictions.loc[predictions.item_cnt_day<0,'item_cnt_day']=0
# ##dropping extra colums and indexes after merge
# predictions.drop(labels=['shop_id','item_id'],axis=1,inplace=True)
# predictions.rename(columns={'item_cnt_day':'item_cnt_month'},inplace=True)

# # converting  ID back to int
# predictions.ID = predictions.ID.astype(int)

# filename = 'submition_{}.csv'.format(time.strftime('%Y_%m_%d_%H_%M',time.localtime()))
# filename = os.path.join(DATA_FOLDER,filename)
# predictions.to_csv(filename,index=False)

# Train test split investigation


In [3]:
# Compare the number of distinct items/shops in the test set with the training data.
# If the test set is a full shop x item grid, its row count equals items_test * shops_test.
items_test = test_set.item_id.nunique()
shops_test = test_set.shop_id.nunique()

items_train = transactions.item_id.unique().shape[0]
shops_train = transactions.shop_id.unique().shape[0]

# 'total' is the actual number of test rows, 'supposed total' the size of the full grid.
print ('Test set: \n'+\
       ' items: ' + str(items_test) + ' shops: ' + str(shops_test) + \
       '\n total: '+ str(test_set["ID"].count()) + ' supposed total: '+str(items_test*shops_test) + \
       '\nData set: \n'+\
       ' items: ' + str(items_train) + ' shops: ' + str(shops_train) + \
       '\n total: ' + str (items_train*shops_train)
      )



Test set: 
 items: 5100 shops: 42
 total: 214200 supposed total: 214200
Data set: 
 items: 21807 shops: 60
 total: 1308420


We can see that the test set is a Cartesian product, but only some of the shops and items are included in it. This suggests a potential data leak, but further investigation is needed.
Another hypothesis to test: shops get closed and items go off sale, so the shops and items missing from the test set may simply be the ones that no longer exist.

In [4]:
# Shops that occur in the training transactions but not in the test set.
# setdiff1d (train minus test) is used instead of a symmetric difference so that a shop
# appearing only in the test set can never be reported as "excluded".
excluded_shops = np.setdiff1d(transactions.shop_id.unique(), test_set.shop_id.unique())
# Items that occur in the training transactions but not in the test set.
items_train_only = transactions[~transactions.item_id.isin(test_set.item_id)].item_id.unique()

# Report which shops/items are missing from the test set and compare when they were
# active (date_block_num) against the whole data set.
print ('excluded_shops ' + str(excluded_shops))
print ('items only in train count ' + str(items_train_only.shape[0]))

# There is also items only in test set, but let's ignore it for now
print ('items only in test count ' + str(test_set[~test_set.item_id.isin(transactions.item_id.unique())].\
                                         item_id.unique().shape[0]))
# First part: mean date_block_num over all transaction rows of the excluded shops/items.
# Second part ('last seen'): for every shop/item take the last date_block_num it occurs in,
# then average those last blocks over the group.
print ('average date_block for: \n'+\
       '  exluded shops: ' + \
        str(transactions[transactions.shop_id.isin(excluded_shops)].date_block_num.mean()) + \
       '\n  excluded items: ' + \
        str(transactions[transactions.item_id.isin(items_train_only)].date_block_num.mean()) + \
        '\n  average for all data : ' + \
        str(transactions.date_block_num.mean()) + \
       '\nlast seen:' + \
       '\n  exluded shops: ' + \
       str(transactions[transactions.shop_id.isin(excluded_shops)]. \
           groupby('shop_id'). \
           agg({'date_block_num':'max'}).mean().values[0]) +\
       '\n  excluded items: ' + \
       str(transactions[transactions.item_id.isin(items_train_only)]. \
           groupby('item_id'). \
           agg({'date_block_num':'max'}).mean().values[0]) +\
       '\n  average for all shops : ' + \
       str(transactions.groupby('shop_id').agg({'date_block_num':'max'}).mean().values[0]) +\
       '\n  average for all items : ' + \
       str(transactions.groupby('item_id').agg({'date_block_num':'max'}).mean().values[0]) 
      )




excluded_shops [ 0  1  8  9 11 13 17 20 23 27 29 30 32 33 40 43 51 54]
items only in train count 17070
items only in test count 363
average date_block for: 
  exluded shops: 11.962617895419658
  excluded items: 10.468482249471924
  average for all data : 14.56991146343017
last seen:
  exluded shops: 20.38888888888889
  excluded items: 18.841769185705918
  average for all shops : 29.216666666666665
  average for all items : 21.760489750997387


The average date is lower for the absent shops, so we can tentatively conclude that the shops not in the test set are closed. 
However, it is less clear for items why only about 1/4 of them are in the test set. 

It also doesn't look like the test set contains only items that were sold, since leaderboard probing showed a mean of 0.2839:
https://www.kaggle.com/c/competitive-data-science-predict-future-sales/discussion/79142

It could be that the excluded items are the ones that had 0 sales in all possible shops within the given month (which is also suggested by the train/test split in week4). This can be tested by grouping transactions by date_block_num and counting the unique items per block to see whether the count is close to the test set size.

This can be used, together with the mean information above, when predicting for items that are not in the dataset.

In [5]:
# Average number of distinct items sold per month (per date_block_num).
transactions.groupby('date_block_num').agg({'item_id':'nunique'}).mean().values[0]

## The per-month count of distinct items sold seems pretty close to the test set size
## (especially when looking at the individual monthly numbers rather than their mean -
## 5100 looks like a reasonable value).

6879.764705882353

# Models
Models are defined in this section so that they are available to the validation code below (even though they were developed later).


In [3]:
def constant_model(train_set, test_set,fit_col):
    """Baseline that predicts the same constant for every row of ``test_set``.

    ``train_set`` and ``fit_col`` are accepted only to match the common model
    signature and are not used. The global mean of the training data is not used
    because producing the proper fraction of zeros is not trivial, so a fixed
    constant (0.298) taken for some period is returned instead.

    Returns a float32 array of shape ``(len(test_set), 1)``.
    """
    n_rows = test_set.shape[0]
    return np.full((n_rows, 1), 0.298, dtype='float32')

In [4]:
def previous_value_model(train_set, test_set,fit_col):
    """Predict, for every shop/item pair, the target of the last month in ``train_set``.

    Parameters
    ----------
    train_set : DataFrame with columns ``shop_id``, ``item_id``, ``date_block_num``
        and ``target``.
    test_set : DataFrame with at least ``shop_id`` and ``item_id``; any other
        columns are ignored.
    fit_col : unused, kept for the common model signature.

    Returns
    -------
    1-D numpy array aligned with the rows of ``test_set``: the last-month target,
    0 for pairs that have no row in the last month, clipped into [0, 20].
    """
    key_cols = ['shop_id', 'item_id']
    last_block = train_set.date_block_num.max()

    # Only the keys and the target are merged: pulling in every train column is
    # wasteful and breaks the lookup of 'target' as soon as test_set carries a
    # column with the same name (pandas would rename both to target_x/target_y).
    last_month = train_set.loc[train_set.date_block_num == last_block, key_cols + ['target']]
    merged = test_set[key_cols].merge(last_month, how='left', on=key_cols)

    return merged['target'].fillna(0.0).clip(lower=0.0, upper=20.0).values

In [5]:
def lin_model (train_set, test_set,fit_col):
    """Ordinary least-squares baseline.

    Fits ``LinearRegression`` on the ``fit_col`` columns of ``train_set`` against
    its ``target`` column and predicts for the same columns of ``test_set``.
    Returns the predictions clipped into [0, 20].
    """
    features_train = train_set[fit_col].values
    labels_train = train_set['target'].values

    regressor = sklearn.linear_model.LinearRegression(n_jobs=4)
    regressor.fit(features_train, labels_train)

    raw_predictions = regressor.predict(test_set[fit_col].values)
    return raw_predictions.clip(min=0.0, max=20.0)


In [6]:
def lin_model_ellastic (train_set, test_set,fit_col):
    """Elastic-net regression baseline with default hyper-parameters.

    Fits ``ElasticNet`` on the ``fit_col`` columns of ``train_set`` against its
    ``target`` column and predicts for the same columns of ``test_set``.
    Returns the predictions clipped into [0, 20].
    """
    features_train = train_set[fit_col].values
    labels_train = train_set['target'].values

    regressor = sklearn.linear_model.ElasticNet()
    regressor.fit(features_train, labels_train)

    raw_predictions = regressor.predict(test_set[fit_col].values)
    return raw_predictions.clip(min=0.0, max=20.0)

In [7]:
def lin_model_ellastic_with_feature_norm (train_set, test_set,fit_col):
    """Elastic-net model on features rescaled with ``RobustScaler``.

    The scaler is fitted on the ``fit_col`` columns of ``train_set`` only and then
    applied to both frames; the scaled copies are passed to ``lin_model_ellastic``,
    whose (already clipped) predictions are returned. The input frames are not
    modified.

    Author's note: with MinMaxScaler the metrics were exactly the same as for
    lin_model, probably because LinearRegression centers the features and they are
    not too far from the 0-1 range anyway.
    """
    scaler = preprocessing.RobustScaler()

    # Scale a private copy of the training features (target is carried along untouched).
    train_scaled = train_set[fit_col + ['target']].copy()
    train_scaled[fit_col] = scaler.fit_transform(train_scaled[fit_col])

    # Apply the scaling learned on the training data to the test features.
    test_scaled = test_set[fit_col].copy()
    test_scaled[fit_col] = scaler.transform(test_scaled)

    predictions = lin_model_ellastic(train_scaled, test_scaled, fit_col)

    # Drop the scaled copies right away to keep the memory footprint down.
    del train_scaled, test_scaled
    gc.collect()

    return predictions

In [8]:
def xgboost_model(train_set, test_set,fit_col):
    """Gradient-boosted trees (XGBoost) with default hyper-parameters.

    Trains an ``XGBRegressor`` with a squared-error objective on the ``fit_col``
    columns of ``train_set`` against ``target`` and returns the predictions for
    ``test_set`` clipped into [0, 20].
    """
    features_train = train_set[fit_col].values
    labels_train = train_set['target'].values

    booster = xgboost.XGBRegressor(n_jobs=4, objective='reg:squarederror')
    booster.fit(features_train, labels_train)

    raw_predictions = booster.predict(test_set[fit_col].values)
    return raw_predictions.clip(min=0.0, max=20.0)

In [9]:
def lightgbm_model(train_set, test_set,fit_col):
    """LightGBM regressor trained for 100 boosting rounds with mostly default settings.

    Trains on the ``fit_col`` columns of ``train_set`` against ``target`` and
    returns the predictions for ``test_set`` clipped into [0, 20].
    """
    # Parameters that were tried but are currently switched off:
    #   'feature_fraction': 0.75, 'min_data_in_leaf': 2**7, 'bagging_fraction': 0.75,
    #   'learning_rate': 0.03, 'bagging_seed': 2**7, 'num_leaves': 2**7,
    #   'bagging_freq': 1, 'verbose': 0
    params = {
        'metric': 'rmse',
        'nthread': 4,
        'objective': 'mse',
    }

    train_data = lgb.Dataset(train_set[fit_col].values, label=train_set['target'].values)
    booster = lgb.train(params, train_data, 100)

    raw_predictions = booster.predict(test_set[fit_col].values)
    return raw_predictions.clip(min=0.0, max=20.0)

# Feature engineering


### idea for text features:
#### text
    1.1. (implemented) word count in items and categories
    1.2. number of similar items by description
    1.3. (implemented) TFIDF -> SVD 
    1.4. (implemented) cluster text (item+category) and then cluster number
(see https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py)
    1.5 the first word of the shop name is usually a city. Make it a categorical feature?
    
 #### outcome
    --> adding 150 SVD features from item descriptions increases the memory footprint, while RMSE gets even worse
    --> 10 SVD features are more feasible, but they don't change RMSE for the linear model and make it worse for 
 lightgbm
    --> however, when clustering on top of SVD, lightgbm seems to benefit slightly from it; 
 in this scenario the number of SVD components and the number of clusters have to be tuned

#### Others

    2.1. count of items in given category
    2.2. item price (maybe also price changes)
    2.3. average category price


In [3]:
def cartesian_product(*arrays):
    """Return the Cartesian product of several 1-D sequences as a 2-D array.

    Parameters
    ----------
    *arrays : 1-D array-likes (numpy arrays, lists, tuples, ...).

    Returns
    -------
    Array of shape ``(len(a0) * len(a1) * ..., len(arrays))``. Row order is
    lexicographic: the first input varies slowest, the last one fastest. The dtype
    is the common result type of all inputs.
    """
    # Accept any array-like, not just ndarrays (np.result_type needs arrays/dtypes).
    arrays = [np.asarray(a) for a in arrays]
    n_axes = len(arrays)
    dtype = np.result_type(*arrays)

    # One slot per combination, with a trailing axis holding the tuple members.
    out = np.empty([len(a) for a in arrays] + [n_axes], dtype=dtype)
    for axis, open_grid in enumerate(np.ix_(*arrays)):
        # np.ix_ yields broadcastable views, so this fills the whole slice at once.
        out[..., axis] = open_grid
    return out.reshape(-1, n_axes)

In [11]:
def group_transactions (transactions):
    '''
    Aggregate daily transactions into monthly sales on a full shop x item grid.

    For every date_block_num present in `transactions`, take the shops and the items
    that occur in that block, build all possible (shop, item) pairs and attach the
    summed item_cnt_day as `target` (0 for pairs without transactions).
    The logic was implemented in week4 for validation; it is also the way the test set
    is constructed.

    Parameters: transactions - DataFrame with columns shop_id, item_id, date_block_num
                and item_cnt_day.
    Returns:    DataFrame with columns shop_id, item_id, date_block_num, target.
                An empty input yields an empty frame with these columns.
    '''
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    if transactions.empty:
        # np.vstack below cannot handle an empty list, so short-circuit here.
        empty = pd.DataFrame(columns=index_cols + ['target'])
        return empty.astype({**{col: np.int32 for col in index_cols}, 'target': np.float64})

    grid = []
    for block_num in transactions['date_block_num'].unique():
        # Select the rows of this month once and reuse them for both shops and items.
        block_rows = transactions.loc[transactions['date_block_num'] == block_num, ['shop_id', 'item_id']]
        cur_shops = block_rows['shop_id'].unique()
        cur_items = block_rows['item_id'].unique()
        # All (shop, item, block) combinations for this month.
        grid.append(cartesian_product(cur_shops, cur_items, np.array([block_num])))
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Monthly sales per (shop, item, block) for the pairs that actually had transactions.
    gb = transactions.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'}).\
        rename(columns={'item_cnt_day':'target'})

    # Left join keeps the whole grid; pairs without sales get target 0.
    return pd.merge(grid, gb, how='left', on=index_cols).fillna(0.0)

In [12]:
def make_shifts(trainset, testset, fit_cols=None):
    '''
    Add lagged targets (1, 2, 3, 4, 5 and 12 months back) as features to both sets.

    For every lag three columns are generated: the shop/item target
    (`target_lag_N`), the total target of the shop (`target_shop_lag_N`) and the total
    target of the item (`target_item_lag_N`). Missing lags are filled with 0.
    testset is assumed to be one date_block_num ahead of trainset; its
    `date_block_num` column is set to max(trainset.date_block_num) + 1.

    NOTE: this transformation was only tested when called immediately after grouping.
    Only the key columns, the targets and the columns listed in `fit_cols` are kept in
    the outputs, so any other column of the inputs is dropped.

    Parameters
    ----------
    trainset : DataFrame with shop_id, item_id, date_block_num, target
               (as produced by group_transactions).
    testset  : DataFrame with shop_id and item_id. It is not modified.
    fit_cols : optional list of feature columns generated so far; they must already be
               present in both frames. It is not modified. Defaults to no features.

    Returns
    -------
    (trainset, testset, fit_cols) - new frames with the lag columns added and a new
    list holding the incoming feature names followed by the generated ones.
    '''
    shift_range = [1, 2, 3, 4, 5, 12]

    # Work on private copies so that neither the caller's feature list nor the
    # caller's test frame (possibly the global test_set) is changed as a side effect.
    fit_cols = [] if fit_cols is None else list(fit_cols)
    testset = testset.copy()

    max_block_num = trainset.date_block_num.max()
    testset.loc[:,'date_block_num']=max_block_num+1

    ## groups by shop and item
    # monthly totals per shop and per item, attached to every row of the same month
    trainset = trainset.merge(
        trainset.groupby(['shop_id','date_block_num'],as_index=False).\
            agg({'target':'sum'}).\
            rename(columns={'target':'target_shop'}),
        on=['shop_id','date_block_num'], copy=False, validate = 'm:1').fillna(0.0)
    trainset = trainset.merge(
        trainset.groupby(['item_id','date_block_num'],as_index=False).\
            agg({'target':'sum'}).\
            rename(columns={'target':'target_item'}),
        on=['item_id','date_block_num'], copy=False, validate = 'm:1').fillna(0.0)

    ## generating shifts
    for shift in shift_range:
        curr_fit_cols = ['target_lag_{}'.format(shift),'target_shop_lag_{}'.format(shift),'target_item_lag_{}'.format(shift)]
        ## a row of month m becomes the lag-`shift` source for month m + shift
        trainset.loc[:,'shifted_date_block_num'] = trainset['date_block_num']+shift

        # test rows pick up the train values whose shifted month equals the test month
        testset=\
            testset.merge(trainset[['shop_id', 'item_id', 'shifted_date_block_num', 'target','target_shop','target_item']],
                      left_on= ['shop_id', 'item_id', 'date_block_num'],
                      right_on=['shop_id', 'item_id', 'shifted_date_block_num'], how='left',
                      validate='1:1',copy=False,
            )[['shop_id', 'item_id', 'date_block_num', 'target','target_shop','target_item']+fit_cols].\
            copy().fillna(0).rename(columns={'target':curr_fit_cols[0],
                                      'target_shop':curr_fit_cols[1],
                                      'target_item':curr_fit_cols[2]})

        # self-join of the train set: the '_shift' columns hold the values from `shift` months earlier
        trainset=\
            trainset[['shop_id', 'item_id', 'date_block_num', 'target','target_shop','target_item']+fit_cols].merge(
            trainset[['shop_id', 'item_id', 'shifted_date_block_num', 'target','target_shop','target_item']],
                       left_on= ['shop_id', 'item_id', 'date_block_num'], 
                        right_on=['shop_id', 'item_id', 'shifted_date_block_num'], how='left',
                        suffixes = ('','_shift'), validate='1:1',copy=False,
            )[['shop_id', 'item_id', 'date_block_num', 'target','target_shift',
               'target_shop','target_item','target_shop_shift','target_item_shift']+fit_cols].\
            copy().fillna(0).rename(columns={'target_shift':curr_fit_cols[0],
                                      'target_shop_shift':curr_fit_cols[1],
                                      'target_item_shift':curr_fit_cols[2]})

        fit_cols.extend(curr_fit_cols)

    ###
    ### TODO: should the first months (which cannot have full lag history) be excluded?
    ###
    gc.collect()

    return (trainset,testset,fit_cols)

In [13]:
def text_items_SVD_preprocessor(trainset,testset,fit_cols):
    """Add a text-cluster id of the item name (`km_item`) to both sets.

    Item names from the global ``items`` frame are vectorized with TF-IDF, reduced
    with TruncatedSVD and clustered with KMeans; the cluster label of each item is
    merged into ``trainset`` and ``testset`` on ``item_id``.

    ``fit_cols`` is extended in place with ``'km_item'`` and also returned.
    Returns ``(trainset_updated, testset_updated, fit_cols)``.

    TODO: there is no need to compute this several times. Compute once, add the
    column to ``items`` and check at the start whether the column is already there.
    NOTE(review): no random_state is passed to TruncatedSVD / KMeans, so the
    clusters are presumably not reproducible between runs - confirm.
    """
    svd_components_count = 1000
    km_clusters = 100

    # item names -> TF-IDF matrix -> dense SVD representation
    tfidf_matrix = TfidfVectorizer().fit_transform(items.item_name)
    reduced_names = TruncatedSVD(svd_components_count).fit_transform(tfidf_matrix)

    # cluster the reduced representation; labels_ holds one cluster id per row of `items`
    km = KMeans(n_clusters=km_clusters, init='k-means++', n_init=3)
    km.fit(reduced_names)

    # labels are attached positionally (column-wise concat on the index of `items`)
    labels_frame = pd.DataFrame(km.labels_, columns=['km_item'])
    cluster_lookup = pd.concat([items, labels_frame], axis=1)[['item_id', 'km_item']]

    trainset_updated = trainset.merge(cluster_lookup, on='item_id', how='left')
    testset_updated = testset.merge(cluster_lookup, on='item_id', how='left')
    fit_cols.append('km_item')

    return (trainset_updated, testset_updated, fit_cols)

In [14]:
def item_text_counts(trainset,testset,fit_cols):
    """Add the word count and character count of the item name as features.

    Side effects: the two count columns are written into the global ``items`` frame
    and ``fit_cols`` is extended in place.
    Returns ``(trainset_updated, testset_updated, fit_cols)``.

    Author's note: adding this feature to tree and linear models makes them worse,
    not better.
    """
    count_cols = ['item_word_count', 'item_char_count']

    # number of space-separated tokens in the name
    items['item_word_count'] = items.item_name.map(lambda name: len(name.split(' ')))
    # number of characters in the name
    items['item_char_count'] = items.item_name.str.len()

    lookup = items[['item_id'] + count_cols]
    trainset_updated = trainset.merge(lookup, on='item_id', how='left')
    testset_updated = testset.merge(lookup, on='item_id', how='left')

    fit_cols.extend(count_cols)

    return (trainset_updated, testset_updated, fit_cols)

In [15]:
# should be probably merged with item_text_counts and parametrized..
def shop_text_counts(trainset,testset,fit_cols):
    """Add the word count and character count of the shop name as features.

    Side effects: the two count columns are written into the global ``shops`` frame
    and ``fit_cols`` is extended in place.
    Returns ``(trainset_updated, testset_updated, fit_cols)``.

    Author's note: adding this feature to tree and linear models makes them worse,
    not better.
    """
    count_cols = ['shop_word_count', 'shop_char_count']

    # number of space-separated tokens in the name
    shops['shop_word_count'] = shops.shop_name.map(lambda name: len(name.split(' ')))
    # number of characters in the name
    shops['shop_char_count'] = shops.shop_name.str.len()

    lookup = shops[['shop_id'] + count_cols]
    trainset_updated = trainset.merge(lookup, on='shop_id', how='left')
    testset_updated = testset.merge(lookup, on='shop_id', how='left')

    fit_cols.extend(count_cols)

    return (trainset_updated, testset_updated, fit_cols)

In [16]:
def category_id(trainset,testset,fit_cols):
    """Add ``item_category_id`` (looked up in the global ``items`` frame) as a feature.

    ``fit_cols`` is extended in place and also returned.
    Returns ``(trainset_updated, testset_updated, fit_cols)``.
    """
    category_lookup = items[['item_id', 'item_category_id']]

    trainset_updated = trainset.merge(category_lookup, on='item_id', how='left')
    testset_updated = testset.merge(category_lookup, on='item_id', how='left')

    fit_cols.append('item_category_id')
    return (trainset_updated, testset_updated, fit_cols)

# Validation

Before playing with any kind of model, validation has to be set up. 
Some ideas could be taken from week4. 

In contrast with week4, 4 cycles of validation are used: train on date_block_num < i -> validate on block i, where i is in [30,31,32,33]



Let's assume every model takes a dataset like transactions and test pairs (like test_set) and returns a dataset of predictions 
[shop_id, item_id, prediction] for the month following the last one present in the dataset. 



In [20]:
def validate (preprocessors, models, from_block=30, to_block=33):
    '''
    Time-based validation of every model after every cumulative preprocessing step.

    For each block i in [from_block, to_block] the models are trained on all
    transactions with date_block_num < i and scored (RMSE) on block i. Preprocessors
    are applied one after another; after each of them all models are evaluated, so
    the effect of every additional feature set is visible.

    Parameters
    ----------
    preprocessors : list of callables (trainset, testset, fit_cols) ->
                    (trainset, testset, fit_cols).
    models        : list of callables (train_set, test_set, fit_col) -> predictions.
    from_block, to_block : first and last (inclusive) validation block. The defaults
                    reproduce the original 4-fold setup; small values such as 2 and 3
                    are handy for a quick smoke test.

    Uses the global `transactions` frame.

    Returns
    -------
    list of dicts with keys preprocessors, model, block, metric, value.
    '''
    outcome = []

    for i in range(from_block,to_block+1):
        print ('validation: generating test and train for i='+str(i))
        trainset = group_transactions(transactions[transactions.date_block_num<i])
        testset = group_transactions(transactions[transactions.date_block_num==i])
        # models must not see the target of the validation month
        testset_without_target = testset[['shop_id','item_id']].copy()

        # According to the competition description the evaluation is done on clipped
        # targets, so we do the same. Each model is responsible for its own clipping.
        # NOTE(review): predictions are compared positionally, which presumes the
        # preprocessors keep the test rows in their original order - confirm.
        true_values = testset['target'].clip(lower=0.0, upper =20.0)

        applied_preprocessors = []
        fit_col=[]

        for preprocessor in preprocessors:
            (train_processed, test_processed, fit_col) = \
                preprocessor(trainset,testset_without_target,fit_col)

            applied_preprocessors.append(preprocessor.__name__)
            print('applied_preprocessors: '+str(applied_preprocessors))

            for model in models:
                predictions = model(train_processed, test_processed,fit_col)
                # r2_score could be tracked here as well (1 is perfect, 0 is a constant model).
                rmse = np.sqrt(mean_squared_error(true_values, predictions))
                print('validation: tested model ' + model.__name__+' rmse = '+str(rmse))
                outcome.append({'preprocessors':' '.join(applied_preprocessors),\
                                'model':model.__name__,'block':i,'metric':'rmse','value':rmse})

            # the next preprocessor builds on top of the features generated so far
            trainset = train_processed
            testset_without_target = test_processed

        print ()

    return outcome

In [27]:
# Earlier invocations, kept for reference only: they pass a single list of models, which
# does not match the current validate(preprocessors, models) signature.
# resut = validate([constant_model,previous_value_model,lin_model,xgboost_model])
# resut = validate([constant_model,previous_value_model,lin_model, lin_model_with_feature_norm, lightgbm_model])

# Evaluate both models after each cumulative preprocessing step.
resut = validate(
    [make_shifts, text_items_SVD_preprocessor,category_id],
    [lin_model_ellastic,lightgbm_model]
    )

validation: generating test and train for i=30
applied_preprocessors: ['make_shifts']
validation: tested model lin_model_ellastic rmse = 0.757732759772218
validation: tested model lightgbm_model rmse = 0.7307876680876421
applied_preprocessors: ['make_shifts', 'text_items_SVD_preprocessor']
validation: tested model lin_model_ellastic rmse = 0.757732759772218
validation: tested model lightgbm_model rmse = 0.737034133991881
applied_preprocessors: ['make_shifts', 'text_items_SVD_preprocessor', 'category_id']
validation: tested model lin_model_ellastic rmse = 0.757732759772218
validation: tested model lightgbm_model rmse = 0.7373853021542082

validation: generating test and train for i=31
applied_preprocessors: ['make_shifts']
validation: tested model lin_model_ellastic rmse = 0.8381509092447855
validation: tested model lightgbm_model rmse = 0.8161690531405288
applied_preprocessors: ['make_shifts', 'text_items_SVD_preprocessor']
validation: tested model lin_model_ellastic rmse = 0.838150909

In [28]:
# Widen the columns so the long preprocessor lists are not truncated in the output.
pd.options.display.max_colwidth = 150
# Mean metric over all validation blocks per (model, preprocessor chain).
pd.DataFrame(resut).groupby(['model','preprocessors','metric'],as_index=False).agg({'value':'mean'})

Unnamed: 0,model,preprocessors,metric,value
0,lightgbm_model,make_shifts,rmse,0.86587
1,lightgbm_model,make_shifts text_items_SVD_preprocessor,rmse,0.863314
2,lightgbm_model,make_shifts text_items_SVD_preprocessor category_id,rmse,0.854582
3,lin_model_ellastic,make_shifts,rmse,0.895319
4,lin_model_ellastic,make_shifts text_items_SVD_preprocessor,rmse,0.895319
5,lin_model_ellastic,make_shifts text_items_SVD_preprocessor category_id,rmse,0.895319


In [30]:
# Same aggregation as in the previous cell.
# NOTE(review): the output stored below lists other preprocessors/models than the
# validate(...) call above, so it presumably comes from a different run - confirm.
pd.options.display.max_colwidth = 150
pd.DataFrame(resut).groupby(['model','preprocessors','metric'],as_index=False).agg({'value':'mean'})

Unnamed: 0,model,preprocessors,metric,value
0,lightgbm_model,make_shifts,rmse,0.86587
1,lightgbm_model,make_shifts text_items_SVD_preprocessor,rmse,0.861515
2,lightgbm_model,make_shifts text_items_SVD_preprocessor item_text_counts,rmse,0.865068
3,lightgbm_model,make_shifts text_items_SVD_preprocessor item_text_counts shop_text_counts,rmse,0.865586
4,lin_model,make_shifts,rmse,0.904058
5,lin_model,make_shifts text_items_SVD_preprocessor,rmse,0.904204
6,lin_model,make_shifts text_items_SVD_preprocessor item_text_counts,rmse,0.90433
7,lin_model,make_shifts text_items_SVD_preprocessor item_text_counts shop_text_counts,rmse,0.904466
8,lin_model_ellastic,make_shifts,rmse,0.895319
9,lin_model_ellastic,make_shifts text_items_SVD_preprocessor,rmse,0.895319


Validation:

|preprocessors|model| 	metric 	|value|
|----|----|-----|-----------|
|make_shifts+text_items_SVD_preprocessor|constant_model|rmse|1.086481|
|make_shifts+text_items_SVD_preprocessor|lin_model|rmse|0.904204|
|make_shifts+text_items_SVD_preprocessor|previous_value_model|rmse|1.008575|
|make_shifts+text_items_SVD_preprocessor|xgboost_model|rmse|0.875871|
|make_shifts+text_items_SVD_preprocessor|lightgbm_model|rmse|0.861515|


<b> Local validation seems to correlate with the public leaderboard </b>

|Name|Score|Local score|Model|Comment|
|----|-----|-----------|-----|-------|
|submition_2019_06_03_09_29.csv.gz|1.16777|1.008575|Last month baseline||
|submition_lin_model_2019_12_02_13_50.csv.gz|1.96182||simple linear model with 2 month shifts and with clipping 0-300|wrong clipping|
|submition_lin_model_2019_12_02_13_56.csv.gz|4.34576||simple linear model with 2 month shifts with clipping removed|no clipping|
|submition_lin_model_2019_12_02_15_39.csv.gz|1.08014|0.905361|simple linear model, more shifts, with proper clipping|proper clipping applied|
|submition_xgboost_model_2019_12_03_15_43.csv.gz|1.02016|0.870930|simple xgboost without any parameters tuned||
|submition_lightgbm_model_2019_12_04_05_53.csv.gz|**1.00715**|0.862635|simple light gbm without any parameters tuned||



In [25]:
def prepare_submit(model):
    '''
    Train `model` on all available data and write a submission file.

    The model is fitted on the monthly-grouped global `transactions` with the lag
    features of make_shifts and predicts for the global `test_set`. The result is
    written to 'submition_<model name>_<timestamp>.csv.gz' in the current directory
    with the columns ID and item_cnt_month.

    Parameters: model - callable (train_set, test_set, fit_col) -> predictions,
                one prediction per row of test_set.
    '''
    trainset = group_transactions(transactions) # <-using global var in func

    # make_shifts requires the list of already generated features (none at this point).
    # A copy of the global test_set is passed so it stays untouched between calls.
    (train_groupped_with_shifted, test_set_with_shifted, fit_col) = \
            make_shifts(trainset, test_set.copy(), []) # <-using global var in func

    predictions = model(train_groupped_with_shifted, test_set_with_shifted,fit_col)

    # make_shifts keeps the test rows in their original order, so IDs and predictions
    # can be combined positionally. ravel() also accepts (n, 1)-shaped predictions.
    submission = pd.DataFrame({
        'ID': test_set.ID.astype(int).values,
        'item_cnt_month': np.ravel(predictions).astype(float),
    })

    filename = 'submition_{}_{}.csv.gz'.format(model.__name__,time.strftime('%Y_%m_%d_%H_%M',time.localtime()))
    submission[['ID','item_cnt_month']].to_csv(filename,index=False)
    print (filename + ' prepared')
    
    
# prepare_submit(lightgbm_model)


### Validation: TODO
1. From the task - Type of public/private split is identified
2. For a more complex model, compare which metric correlates better with the leaderboard - rmse or r2

In [18]:

# trainset = group_transactions(transactions[transactions.date_block_num<13])
# testset = group_transactions(transactions[transactions.date_block_num==13])
# testset_without_targer = testset[['shop_id','item_id']].copy()




From tds article
https://colab.research.google.com/drive/1lUwtvOInzoaNC5eBMljRMVk1K9zcKD-b

https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/

It seems the routines used for the classic models don't make sense for LSTM. 
The logic for LSTM should be:

0. generate pairs (item_id, shop_id) from all the items present in the test_set and 3 shops (for testing)
1. select time_window = 24 (2 years), n_features = 1 (sales last_month) 

2. generate train sequences so that the data has the shape [samples,time_window,n_features]
   
    
3. split dev_train and dev_test in a way [dev_train_X:dev_train_Y], 
   where dev_train_X  = (first n frames sales data) and dev_train_Y - last sales 

4. consider adding additional features later, for example from text_items_SVD_preprocessor






In [4]:
# Upper bound (exclusive) of the months used to build LSTM sequences; the last window
# below this bound is held out as the test chunk.
train_test_split_block=30

# Alternative for quick experiments: a random sample of 10 shops.
# shop_ids = np.random.choice(transactions[transactions.date_block_num==train_test_split_block].shop_id.unique(),10)
# All shops that have transactions in the split month.
shop_ids = transactions[transactions.date_block_num==train_test_split_block].shop_id.unique()


In [12]:
# Length (in months) of each input sequence; 24 was the original plan, 13 is used here.
# time_window = 24
time_window = 13
# Number of features per time step (only the monthly sales count).
n_features = 1

In [13]:
# Build the LSTM data set with a sliding window over the months:
#   X - sales of each (shop, item) pair in the `time_window` months before the target month
#   Y - sales of the same pair in the target month (cur_seq_window)
# The last window (target month train_test_split_block-1) is held out as the test chunk.
index_cols = ['shop_id', 'item_id', 'date_block_num']

train_X = []
train_Y = []

for cur_seq_window in range(time_window,train_test_split_block):
    start_interval = cur_seq_window-time_window
#     print(start_interval,cur_seq_window)
    # items sold in the target month (Y) by the selected shops
    item_ids = transactions[
        (transactions.date_block_num==cur_seq_window) &
        (transactions.shop_id.isin(shop_ids))
    ].item_id.unique()
    # full (shop, item, month) grid for the input window
    v_grid_X = \
        cartesian_product(shop_ids, item_ids, np.array(range(cur_seq_window-time_window,cur_seq_window)))    
    pd_grid_X = pd.DataFrame(v_grid_X, columns = index_cols,dtype=np.int32)
    
    # transactions of the selected shops/items inside the input window
    X_trans = transactions[
        (transactions.date_block_num>=start_interval) &
        (transactions.date_block_num<cur_seq_window) &
        (transactions.shop_id.isin(shop_ids)) & 
        (transactions.item_id.isin(item_ids))
    ]
    gb_X = X_trans.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'})
    # grid cells without transactions get 0 sales
    pd_grid_val_X = pd.merge(pd_grid_X, gb_X, how='left', on=index_cols).fillna(0.0)    
    seq_num = int(pd_grid_val_X.shape[0]/time_window)
#     print ("seq_num = " + str(seq_num))    
    # After sorting by (shop, item, month) every consecutive run of `time_window` rows is
    # the monthly series of one (shop, item) pair, so the reshape yields one sequence per pair.
    np_X_chunk = np.reshape(pd_grid_val_X.sort_values(by=index_cols)['item_cnt_day'].values,
                                  (seq_num,time_window,n_features))
    
    
    # same grid for the single target month
    v_grid_Y = cartesian_product(shop_ids, item_ids, np.array([cur_seq_window]))
    pd_grid_Y = pd.DataFrame(v_grid_Y, columns = index_cols,dtype=np.int32)    
    Y_trans = transactions[
        (transactions.date_block_num==cur_seq_window) &
        (transactions.shop_id.isin(shop_ids)) & 
        (transactions.item_id.isin(item_ids))
    ]
    gb_Y = Y_trans.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'})
    pd_grid_val_Y = pd.merge(pd_grid_Y, gb_Y, how='left', on=index_cols).fillna(0.0)
    
    # sorted by the same keys as X, so row k of Y belongs to sequence k of X
    np_Y_chunk = pd_grid_val_Y.sort_values(by=index_cols)['item_cnt_day'].values
    
#     print ("np_X_chunk.shape = "+str(np_X_chunk.shape)+"np_Y_chunk.shape = "+str(np_Y_chunk.shape))


    
    if (cur_seq_window==train_test_split_block-1):
        ## test set
        test_X = np_X_chunk
        test_Y = np_Y_chunk
    else:
        ## train set
        train_X.append(np_X_chunk)
        train_Y.append(np_Y_chunk)
        
# stack the per-window chunks into single arrays
train_X_final = np.vstack(train_X)
train_Y_final = np.hstack(train_Y)

In [14]:
# Sanity check: X is (samples, time_window, n_features), Y has one value per sample.
print(train_X_final.shape , train_Y_final.shape, test_X.shape , test_Y.shape)

(4221912, 13, 1) (4221912,) (224288, 13, 1) (224288,)


In [None]:
# Log line recorded from an earlier run of this cell:
# Epoch 24/30
# 1228682/1228682 - 126s - loss: 10.3635 - val_loss: 2.6240

# Single LSTM layer (32 units, dropout 0.4) followed by one linear output unit,
# trained with MSE loss and the Adam optimizer.
model = keras.Sequential()
model.add(keras.layers.LSTM(32, input_shape=(train_X_final.shape[1], train_X_final.shape[2]),dropout = 0.4))
model.add(keras.layers.Dense(1))
model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam())

# The held-out last window (test_X, test_Y) is used as validation data.
history = model.fit(
    train_X_final, train_Y_final, 
    epochs=30, 
    batch_size=128, 
    validation_data=(test_X, test_Y),
    verbose=2, 
    shuffle=True
)

In [None]:
# Variant with 64 LSTM units and no dropout. Log line recorded from an earlier run:
# 64 units
# Epoch 22/30
# 1228682/1228682 - 211s - loss: 10.1696 - val_loss: 2.4526
model2 = keras.Sequential()
model2.add(keras.layers.LSTM(64, input_shape=(train_X_final.shape[1], train_X_final.shape[2])))
model2.add(keras.layers.Dense(1))
model2.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam())

# Same training setup as the first model; the held-out last window is the validation data.
history2 = model2.fit(
    train_X_final, train_Y_final, 
    epochs=30, 
    batch_size=128, 
    validation_data=(test_X, test_Y),
    verbose=2, 
    shuffle=True
)
