In [152]:
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
from ggplot import *
import itertools
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold,train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
 

In [7]:
items = pd.read_csv('items.csv')
item_cat = pd.read_csv('item_categories.csv')
shops = pd.read_csv('shops.csv')
train = pd.read_csv('sales_train_v2.csv')
test = pd.read_csv('test.csv')

In [3]:
print('Train shape',train.shape[0])
print('Test shape',test.shape[0])

Train shape 2935849
Test shape 214200


<h3> Data First View </h3>

In [4]:
train.head(10)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0
5,10.01.2013,0,25,2564,349.0,1.0
6,02.01.2013,0,25,2565,549.0,1.0
7,04.01.2013,0,25,2572,239.0,1.0
8,11.01.2013,0,25,2572,299.0,1.0
9,03.01.2013,0,25,2573,299.0,3.0


In [5]:
test.head(10)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268
5,5,5,5039
6,6,5,5041
7,7,5,5046
8,8,5,5319
9,9,5,5003


In [6]:
train.isnull().sum(axis=0)

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

None of the columns are null

In [7]:
train.isnull().sum(axis=1).head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [18]:
some_list = [[1,2,3],[4,5]]

In [19]:
list(itertools.product(some_list))

[([1, 2, 3],), ([4, 5],)]

In [15]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
for date_block in train['date_block_num'].unique():
    shop_ids = train.loc[train.date_block_num == date_block,'shop_id'].unique()
    item_ids = train.loc[train.date_block_num == date_block,'item_id'].unique()
    prds = itertools.product(*[shop_ids,item_ids,[date_block]])

gb = train.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
#pd.DataFrame(list(prds),columns=index_cols)
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
grid = pd.DataFrame(np.vstack(prds),columns=index_cols,dtype=np.int32)
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

Looks like all rows are complete

<h3> EDA </h3>

In [8]:
train.get_dtype_counts()

float64    2
int64      3
object     1
dtype: int64

<h3> Baseleine Prediction </h3>

In [16]:
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target
189161,2,30,33,0.0
184395,2,31,33,1.0
185368,2,32,33,0.0
184193,2,33,33,0.0
189162,2,40,33,0.0


In [22]:
mean_values = all_data.groupby(['shop_id','item_id'])['target'].mean()

In [26]:
baseline_df = pd.merge(test,mean_values.reset_index(),on=['shop_id','item_id'],how='left')

In [41]:
baseline_df['target'] = baseline_df['target'].fillna(0.3343)
baseline_df[['ID','target']].rename(columns={'target': 'item_cnt_month'}).to_csv('baseline_submission.csv',index=False)

In [48]:
all_data = all_data.reset_index()

In [84]:
cat_cols = ['shop_id','item_id']

<h3> Appling Mean Encoding </h3>

In [96]:

kfolds = KFold(n_splits=5,shuffle=False)
for trn_idx,val_idx in kfolds.split(all_data):
    for col in cat_cols:
        X_trn,X_val = all_data.iloc[trn_idx],all_data.iloc[val_idx]
        cat_encoder = all_data.groupby(col)['target'].transform(np.mean)
        all_data.loc[val_idx,col + '_mean_enc'] = X_val[col].map(cat_encoder)

        

In [134]:
test_df = pd.merge(test,all_data[['shop_id','item_id','shop_id_mean_enc','item_id_mean_enc']],on=['shop_id','item_id'],how='left')

In [139]:
test_df['item_id_mean_enc'].fillna(all_data['item_id_mean_enc'].mean(),inplace=True)
test_df['shop_id_mean_enc'].fillna(all_data['shop_id_mean_enc'].mean(),inplace=True)

In [140]:
from sklearn.ensemble import RandomForestRegressor

In [141]:
rf = RandomForestRegressor(criterion='mse')

In [142]:
X_train,X_test,Y_train,Y_test = train_test_split(all_data.drop('target',axis=1),all_data['target'],test_size = -.2)

In [143]:
rf_mdl = rf.fit(X_train.loc[:,['shop_id_mean_enc','item_id_mean_enc']].values,Y_train.values)

In [144]:
pred = rf_mdl.predict(X_test.loc[:,['shop_id_mean_enc','item_id_mean_enc']].values)

In [145]:
rf_mdl.score(X_test.loc[:,['shop_id_mean_enc','item_id_mean_enc']].values,Y_test)

-0.00012062714836957868

In [146]:
test_df['item_cnt_month'] = rf_mdl.predict(test_df.loc[:,['shop_id_mean_enc','item_id_mean_enc']].values)

In [148]:
test_df[['ID','item_cnt_month']].to_csv('mean_encoding.csv',index=False)

One datetime column and 2 float columns with 3 id columns

In [156]:
tfidf = TfidfVectorizer(analyzer='word',
                        stop_words=stopwords.words('russian'),
                        token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')


In [157]:
tfidf.fit_transform(items['item_name'])

<22170x5825 sparse matrix of type '<class 'numpy.float64'>'
	with 51483 stored elements in Compressed Sparse Row format>

In [158]:

items['star_count'] = items['item_name'].str.count('\*')
items['item_name'] = items['item_name'].str.replace('[\(\)\*BD\/\!\d+]','')


In [159]:
items['item_name'] = items['item_name'].apply(lambda x: x.lower())

In [160]:
tfidf_values = tfidf.fit_transform(items['item_name'])
items = pd.concat([items,pd.DataFrame(tfidf_values.toarray(),columns=tfidf.get_feature_names())],axis=1)

In [161]:
items['Regional'] = (items['item_name'].str.contains('регион')).astype('int')
items['BlueRay'] = (items['item_name'].str.contains('BD')).astype('int')

In [162]:
train_df = pd.merge(all_data,items,on='item_id',how='left')

In [165]:
train_df = train_df.drop(['index', 'shop_id', 'item_id', 'date_block_num', 'target'],axis=1)

In [166]:
train_df.columns

Index(['shop_id_mean_enc', 'item_id_mean_enc', 'item_name', 'item_category_id',
       'star_count', 'a', 'aa', 'aam', 'aams', 'abandon',
       ...
       'шкатулкаmy', 'шкатулкаnutcracker', 'шкатулкаpink', 'шкатулкаsingin',
       'шкатулкаyesteray', 'энигмаcast', 'юcast', 'янанебiбув', 'Regional',
       'BlueRay'],
      dtype='object', length=5832)