# In-Depth Analysis: Machine Learning Models

## Import python libarary

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from itertools import product


## Loading the data

In [2]:
df_train_1 = pd.read_csv('data/sales_train.csv')
df_test = pd.read_csv('data/test.csv', index_col = 'ID')
df_item_1 = pd.read_csv('data/items.csv')

In [3]:
x = df_train_1.item_id.unique()
y = df_test.item_id.unique()
count = 0
for i in y:
    if i not in x:
        count += 1
print(count)

363


In [4]:
df_train_2 = df_train_1[((df_train_1.item_cnt_day > 0 ) & (df_train_1.item_cnt_day < 1000))]
df_train_2 = df_train_2[df_train_2.item_price > 0]

In [5]:
df_train = pd.merge(df_train_2, df_item_1, how = 'left')
df_train = df_train.drop('item_name', axis = 1)

In [6]:
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,02.01.2013,0,59,22154,999.0,1.0,37
1,03.01.2013,0,25,2552,899.0,1.0,58
2,06.01.2013,0,25,2554,1709.05,1.0,58
3,15.01.2013,0,25,2555,1099.0,1.0,56
4,10.01.2013,0,25,2564,349.0,1.0,59


In [7]:
missing = list(df_train.isnull().sum().sort_values(ascending = True).items())
missing

[('date', 0),
 ('date_block_num', 0),
 ('shop_id', 0),
 ('item_id', 0),
 ('item_price', 0),
 ('item_cnt_day', 0),
 ('item_category_id', 0)]

## Prepare the Data for Modeling

In [8]:
#Creating the data frame from the sale_traing file
#Groupby the  shop_id and item_id, date_block_num
temp_data = df_train.groupby(['shop_id','item_id']).agg({'item_cnt_day':'sum'})
temp_data = temp_data.reset_index()
temp_data['train_or_test'] = 1 #set as train set


In [9]:
temp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424098 entries, 0 to 424097
Data columns (total 4 columns):
shop_id          424098 non-null int64
item_id          424098 non-null int64
item_cnt_day     424098 non-null float64
train_or_test    424098 non-null int64
dtypes: float64(1), int64(3)
memory usage: 12.9 MB


In [10]:
temp_cat = df_train.groupby(['shop_id','item_id','item_category_id']).sum().reset_index()
temp_cat = temp_cat[['shop_id','item_id','item_category_id']]

In [11]:
df_test = pd.merge(df_test, temp_cat, how = 'left', on = ['shop_id','item_id'])
df_test.fillna(0, inplace = True)

In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 3 columns):
shop_id             214200 non-null int64
item_id             214200 non-null int64
item_category_id    214200 non-null float64
dtypes: float64(1), int64(2)
memory usage: 6.5 MB


In [13]:
#Merge Category_id to data frame
temp_data = pd.merge(temp_data, temp_cat, how = 'left', on = ['shop_id', 'item_id'])

In [14]:
temp_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 424098 entries, 0 to 424097
Data columns (total 5 columns):
shop_id             424098 non-null int64
item_id             424098 non-null int64
item_cnt_day        424098 non-null float64
train_or_test       424098 non-null int64
item_category_id    424098 non-null int64
dtypes: float64(1), int64(4)
memory usage: 19.4 MB


In [15]:
#Remove the item_id of test set in training set
test_item_id = df_test.item_id.unique()
temp_data = temp_data[~temp_data['item_id'].isin(test_item_id)]

In [16]:
temp_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 289898 entries, 4 to 424094
Data columns (total 5 columns):
shop_id             289898 non-null int64
item_id             289898 non-null int64
item_cnt_day        289898 non-null float64
train_or_test       289898 non-null int64
item_category_id    289898 non-null int64
dtypes: float64(1), int64(4)
memory usage: 13.3 MB


In [17]:
### Double checking if we have any item_id in train set same with test set
list1 = temp_data.item_id.unique()
count = 0
for i in list1:
    if i in test_item_id:
        count += 1
print(count)

0


In [18]:
#Create the date block number for df_test = 34
#df_test['date_block_num'] = 34
df_test['train_or_test'] = 0 #set as a test set
df_test['item_cnt_day'] = 0
#Then add the data to the df_temp_1
df_train_test = pd.concat([temp_data, df_test], ignore_index= True, sort = False, keys = ['shop_id','item_id'])
df_train_test.head()

Unnamed: 0,shop_id,item_id,item_cnt_day,train_or_test,item_category_id
0,0,35,15.0,1,40.0
1,0,36,1.0,1,37.0
2,0,40,1.0,1,57.0
3,0,43,1.0,1,40.0
4,0,49,2.0,1,57.0


In [19]:
df_train_test = df_train_test[['train_or_test','shop_id','item_id','item_category_id','item_cnt_day']]

In [20]:
df_train_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504098 entries, 0 to 504097
Data columns (total 5 columns):
train_or_test       504098 non-null int64
shop_id             504098 non-null int64
item_id             504098 non-null int64
item_category_id    504098 non-null float64
item_cnt_day        504098 non-null float64
dtypes: float64(2), int64(3)
memory usage: 19.2 MB


# Creating the feature for modeling

In [21]:
#Creating the average of item_cnt_day for shop_id and date_block_num
temp = df_train.groupby(['shop_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['shop_avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = ['shop_id'])

In [22]:
#Creating the average of item_cnt_day for date_block_num
temp = df_train.groupby(['item_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = 'item_id')

In [23]:
#Creating the average of item_cnt_day for date_block_num
temp = df_train.groupby(['item_category_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['avg_item_cnt_each_cat']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = 'item_category_id')

In [24]:
#Creating the average of item_cnt_day for date_block_num, category_id

temp = df_train.groupby(['shop_id', 'item_category_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['shop_cat_avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = ['shop_id','item_category_id'])

In [25]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id'])

In [26]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id','item_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_each_item_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id','item_id'])


In [27]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id','item_category_id','item_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_each_item_cat_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id','item_category_id','item_id'])


In [28]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['item_id']).agg({'item_price':['min','max','mean']})
temp_data_1.columns = ['min_price_each_item','max_price_each_item', 'mean_price_each_item']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['item_id'])

In [29]:
df_train_test.head(10)

Unnamed: 0,train_or_test,shop_id,item_id,item_category_id,item_cnt_day,shop_avg_item_cnt,avg_item_cnt,avg_item_cnt_each_cat,shop_cat_avg_item_cnt,avg_price_ofshop,avg_price_each_item_ofshop,avg_price_each_item_cat_ofshop,min_price_each_item,max_price_each_item,mean_price_each_item
0,1,0,35,40.0,15.0,1.187481,1.233333,1.125806,1.248923,563.444151,247.0,247.0,148.0,399.0,375.828056
1,1,0,36,37.0,1.0,1.187481,1.0,1.063038,1.071181,563.444151,357.0,357.0,58.0,549.0,183.012195
2,1,0,40,57.0,1.0,1.187481,1.0,1.009961,1.013072,563.444151,127.0,127.0,127.0,249.0,245.138298
3,1,0,43,40.0,1.0,1.187481,1.0,1.125806,1.248923,563.444151,221.0,221.0,98.0,248.0,167.222222
4,1,0,49,57.0,2.0,1.187481,1.005405,1.009961,1.013072,563.444151,127.0,127.0,127.0,299.0,271.681081
5,1,0,61,43.0,1.0,1.187481,1.0,1.004527,1.0,563.444151,195.0,195.0,195.0,349.0,319.916667
6,1,0,75,40.0,1.0,1.187481,1.0,1.125806,1.248923,563.444151,76.0,76.0,76.0,149.0,142.105882
7,1,0,85,37.0,1.0,1.187481,1.0,1.063038,1.071181,563.444151,190.0,190.0,190.0,299.0,289.368182
8,1,0,95,40.0,1.0,1.187481,1.0,1.125806,1.248923,563.444151,193.0,193.0,193.0,299.0,259.0
9,1,0,96,40.0,1.0,1.187481,1.0,1.125806,1.248923,563.444151,70.0,70.0,70.0,149.0,145.238095


In [31]:
df_train_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_or_test,504098.0,0.575083,0.494331,0.0,0.0,1.0,1.0,1.0
shop_id,504098.0,31.509905,17.236922,0.0,18.0,31.0,47.0,59.0
item_id,504098.0,11423.812642,6175.751291,0.0,6183.0,11648.0,16587.0,22169.0
item_category_id,504098.0,33.81476,21.795979,0.0,19.0,40.0,55.0,83.0
item_cnt_day,504098.0,3.429962,12.681862,0.0,0.0,1.0,3.0,1704.0
shop_avg_item_cnt,504098.0,1.232275,0.200483,1.057546,1.156484,1.185057,1.261944,4.240983
avg_item_cnt,488852.0,1.095128,0.527419,1.0,1.0,1.01487,1.069767,65.473684
avg_item_cnt_each_cat,504098.0,1.107126,0.173733,1.0,1.02926,1.063038,1.125806,7.32977
shop_cat_avg_item_cnt,404852.0,1.114135,0.784104,1.0,1.022305,1.055825,1.126519,333.666667
avg_price_ofshop,504098.0,921.241079,179.074771,299.237067,834.664881,917.856259,973.275624,1458.766652


In [32]:
df_train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504098 entries, 0 to 504097
Data columns (total 15 columns):
train_or_test                     504098 non-null int64
shop_id                           504098 non-null int64
item_id                           504098 non-null int64
item_category_id                  504098 non-null float64
item_cnt_day                      504098 non-null float64
shop_avg_item_cnt                 504098 non-null float64
avg_item_cnt                      488852 non-null float64
avg_item_cnt_each_cat             504098 non-null float64
shop_cat_avg_item_cnt             404852 non-null float64
avg_price_ofshop                  504098 non-null float64
avg_price_each_item_ofshop        401302 non-null float64
avg_price_each_item_cat_ofshop    401302 non-null float64
min_price_each_item               488852 non-null float64
max_price_each_item               488852 non-null float64
mean_price_each_item              488852 non-null float64
dtypes: float64(12), int6

In [33]:
df_train_test.fillna(0, inplace = True)

In [34]:
df_train_test.columns

Index(['train_or_test', 'shop_id', 'item_id', 'item_category_id',
       'item_cnt_day', 'shop_avg_item_cnt', 'avg_item_cnt',
       'avg_item_cnt_each_cat', 'shop_cat_avg_item_cnt', 'avg_price_ofshop',
       'avg_price_each_item_ofshop', 'avg_price_each_item_cat_ofshop',
       'min_price_each_item', 'max_price_each_item', 'mean_price_each_item'],
      dtype='object')

int_col = ['train_or_test','shop_id','item_id']
float_col = ['item_category_id',
       'item_cnt_day', 'shop_avg_item_cnt', 'avg_item_cnt',
       'avg_item_cnt_each_cat', 'shop_cat_avg_item_cnt', 'avg_price_ofshop',
       'avg_price_each_item_ofshop', 'avg_price_each_item_cat_ofshop',
       'min_price_each_item', 'max_price_each_item', 'mean_price_each_item']

for i in int_col:
    df_train_test[i]= df_train_test[i].astype(np.int32)
for i in float_col:
    df_train_test[i]= df_train_test[i].astype(np.float32)

In [35]:
missing = list(df_train_test.isnull().sum().sort_values(ascending = True).items())
missing

[('train_or_test', 0),
 ('shop_id', 0),
 ('item_id', 0),
 ('item_category_id', 0),
 ('item_cnt_day', 0),
 ('shop_avg_item_cnt', 0),
 ('avg_item_cnt', 0),
 ('avg_item_cnt_each_cat', 0),
 ('shop_cat_avg_item_cnt', 0),
 ('avg_price_ofshop', 0),
 ('avg_price_each_item_ofshop', 0),
 ('avg_price_each_item_cat_ofshop', 0),
 ('min_price_each_item', 0),
 ('max_price_each_item', 0),
 ('mean_price_each_item', 0)]

In [36]:
df_train_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_or_test,504098.0,0.575083,0.494331,0.0,0.0,1.0,1.0,1.0
shop_id,504098.0,31.509905,17.236922,0.0,18.0,31.0,47.0,59.0
item_id,504098.0,11423.812642,6175.751291,0.0,6183.0,11648.0,16587.0,22169.0
item_category_id,504098.0,33.81476,21.795979,0.0,19.0,40.0,55.0,83.0
item_cnt_day,504098.0,3.429962,12.681862,0.0,0.0,1.0,3.0,1704.0
shop_avg_item_cnt,504098.0,1.232275,0.200483,1.057546,1.156484,1.185057,1.261944,4.240983
avg_item_cnt,504098.0,1.062006,0.552207,0.0,1.0,1.013072,1.06701,65.473684
avg_item_cnt_each_cat,504098.0,1.107126,0.173733,1.0,1.02926,1.063038,1.125806,7.32977
shop_cat_avg_item_cnt,504098.0,0.894786,0.830689,0.0,1.005534,1.03884,1.099099,333.666667
avg_price_ofshop,504098.0,921.241079,179.074771,299.237067,834.664881,917.856259,973.275624,1458.766652


In [76]:
np.log1p()

1.6094379124341003

In [17]:
#df_test_1.loc[:,'normalized_price'] = (df_test_1['item_price'] - df_test_1['item_price'].min()) / (df_test_1['item_price'].max() - df_test_1['item_price'].min())
#df_test_1.loc[:,'standardized_price'] = (df_test_1['item_price'] - df_test_1['item_price'].mean()) / df_test_1['item_price'].std()
#df_test_1.loc[:,'price_bin_round'] = np.array(np.floor(np.array(df_test_1['item_price']) / 100.))
#df_test_1.head()

In [37]:
# Convert the feature  to the np.log

col_name = ['item_category_id',
       'item_cnt_day', 'shop_avg_item_cnt', 'avg_item_cnt',
       'avg_item_cnt_each_cat', 'shop_cat_avg_item_cnt', 'avg_price_ofshop',
       'avg_price_each_item_ofshop', 'avg_price_each_item_cat_ofshop',
       'min_price_each_item', 'max_price_each_item', 'mean_price_each_item']

#from sklearn.preprocessing import QuantileTransformer
#qt = QuantileTransformer(n_quantiles=10, random_state=42)
for i in col_name:
    df_train_test[i] = df_train_test[i].apply(np.log1p)
df_train_test.replace(-np.inf, np.nan)
df_train_test.fillna(0)

Unnamed: 0,train_or_test,shop_id,item_id,item_category_id,item_cnt_day,shop_avg_item_cnt,avg_item_cnt,avg_item_cnt_each_cat,shop_cat_avg_item_cnt,avg_price_ofshop,avg_price_each_item_ofshop,avg_price_each_item_cat_ofshop,min_price_each_item,max_price_each_item,mean_price_each_item
0,1,0,35,3.713572,2.772589,0.782751,0.803495,0.754151,0.810451,6.335841,5.513429,5.513429,5.003946,5.991465,5.931789
1,1,0,36,3.637586,0.693147,0.782751,0.693147,0.724180,0.728119,6.335841,5.880533,5.880533,4.077537,6.309918,5.215002
2,1,0,40,4.060443,0.693147,0.782751,0.693147,0.698116,0.699662,6.335841,4.852030,4.852030,4.852030,5.521461,5.505894
3,1,0,43,3.713572,0.693147,0.782751,0.693147,0.754151,0.810451,6.335841,5.402677,5.402677,4.595120,5.517453,5.125286
4,1,0,49,4.060443,1.098612,0.782751,0.695846,0.698116,0.699662,6.335841,4.852030,4.852030,4.852030,5.703782,5.608303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504093,0,45,18454,4.025352,0.000000,0.776387,0.721375,0.707671,0.702394,6.846986,5.241159,5.241159,4.595120,5.298317,5.245344
504094,0,45,16188,0.000000,0.000000,0.776387,0.693147,0.693147,0.000000,6.846986,0.000000,0.000000,7.109798,7.215240,7.137259
504095,0,45,15757,4.025352,0.000000,0.776387,0.694498,0.707671,0.702394,6.846986,5.298317,5.298317,4.709530,5.438079,5.293862
504096,0,45,19648,0.000000,0.000000,0.776387,0.711496,0.693147,0.000000,6.846986,0.000000,0.000000,4.262680,4.605170,4.596092


In [39]:
df_train_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_or_test,504098.0,0.575083,0.494331,0.0,0.0,1.0,1.0,1.0
shop_id,504098.0,31.509905,17.236922,0.0,18.0,31.0,47.0,59.0
item_id,504098.0,11423.812642,6175.751291,0.0,6183.0,11648.0,16587.0,22169.0
item_category_id,504098.0,2.939687,1.540997,0.0,2.995732,3.713572,4.025352,4.430817
item_cnt_day,504098.0,0.826628,0.946875,0.0,0.0,0.693147,1.386294,7.44132
shop_avg_item_cnt,504098.0,0.800057,0.071883,0.721514,0.768479,0.781642,0.816225,1.656509
avg_item_cnt,504098.0,0.709629,0.160044,0.0,0.693147,0.699662,0.726103,4.196806
avg_item_cnt_each_cat,504098.0,0.743072,0.062378,0.693147,0.707671,0.72418,0.754151,2.119836
shop_cat_avg_item_cnt,504098.0,0.597712,0.303895,0.0,0.695911,0.712381,0.741508,5.813135
avg_price_ofshop,504098.0,6.806619,0.207834,5.704572,6.728228,6.82313,6.881694,7.286032


# Data For Modeling

In [40]:
#Seperata the Train set
data = df_train_test[df_train_test['train_or_test'] == 1]
data = data.drop(['shop_id','item_id','train_or_test','item_category_id'], axis =1)
#Test set
test = df_train_test[df_train_test['train_or_test'] == 0]
test = test.drop(['shop_id','item_id','train_or_test','item_category_id'],axis= 1)



In [41]:
data

Unnamed: 0,item_cnt_day,shop_avg_item_cnt,avg_item_cnt,avg_item_cnt_each_cat,shop_cat_avg_item_cnt,avg_price_ofshop,avg_price_each_item_ofshop,avg_price_each_item_cat_ofshop,min_price_each_item,max_price_each_item,mean_price_each_item
0,2.772589,0.782751,0.803495,0.754151,0.810451,6.335841,5.513429,5.513429,5.003946,5.991465,5.931789
1,0.693147,0.782751,0.693147,0.724180,0.728119,6.335841,5.880533,5.880533,4.077537,6.309918,5.215002
2,0.693147,0.782751,0.693147,0.698116,0.699662,6.335841,4.852030,4.852030,4.852030,5.521461,5.505894
3,0.693147,0.782751,0.693147,0.754151,0.810451,6.335841,5.402677,5.402677,4.595120,5.517453,5.125286
4,1.098612,0.782751,0.695846,0.698116,0.699662,6.335841,4.852030,4.852030,4.852030,5.703782,5.608303
...,...,...,...,...,...,...,...,...,...,...,...
289893,0.693147,0.775001,0.719815,0.725105,0.701477,6.785056,7.313220,7.313220,6.908755,7.313220,7.297658
289894,1.098612,0.775001,0.693147,0.724180,0.701586,6.785056,5.298317,5.298317,4.896197,5.298317,5.274508
289895,1.386294,0.775001,0.837660,0.754151,0.723893,6.785056,5.991465,5.991465,5.594711,5.991465,5.942951
289896,0.693147,0.775001,0.693147,0.724180,0.701586,6.785056,4.915078,4.915078,4.895374,5.703782,5.342898


## Create the Target 'y' and feature 'X' for models

In [42]:
# Create Target y and feature X base on the train set
y = data['item_cnt_day']
X = data.drop('item_cnt_day', axis = 1)

In [43]:
#Split the data to traing set and test set base on the train set to train model
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [44]:
len(Xtrain), len(ytrain)

(231918, 231918)

## K-NN Regression model

In [45]:
#Import sklearn for model
from sklearn.neighbors import KNeighborsRegressor

### Selection the importance feature


In [46]:
###Hyperparameter tunning ###
######### FIND BEST n_neighbors ###############
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(5,15,4)}
knn = KNeighborsRegressor()
knn_cv = GridSearchCV(knn, param_grid, cv = 5)
knn_cv.fit(Xtrain,ytrain)
knn_cv.best_params_ , knn_cv.best_score_

({'n_neighbors': 5}, 0.5579750700662178)

In [47]:
#Predict the labels of the test data
y_pred = knn_cv.predict(Xtest)

In [48]:
#Checking score test and predict
test_accuray = knn_cv.score(Xtest, ytest)
train_accuray = knn_cv.score(Xtrain, ytrain)
rmse = np.sqrt(mean_squared_error(ytest, y_pred))

In [49]:
print('R^2 of model: ',test_accuray)
print('RMSE of model: ', rmse)

R^2 of model:  0.5687308790217138
RMSE of model:  0.5413899417674177


In [50]:
train_accuray

0.7216922278980897

#### Importance feature

# Random forest modeling

In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor

#### Feature selection

In [52]:
rf = RandomForestRegressor()
rf.fit(Xtrain, ytrain)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [53]:
features_name = data.drop('item_cnt_day', axis = 1).columns
Score = rf.feature_importances_
no_name = zip(features_name, Score)
temp_f = pd.DataFrame(no_name,  columns = ['name', 'score'])
temp_f = temp_f.sort_values('score', ascending = False)


In [54]:
temp_f

Unnamed: 0,name,score
1,avg_item_cnt,0.32907
3,shop_cat_avg_item_cnt,0.142872
9,mean_price_each_item,0.104777
7,min_price_each_item,0.091624
0,shop_avg_item_cnt,0.079432
4,avg_price_ofshop,0.07321
2,avg_item_cnt_each_cat,0.048555
5,avg_price_each_item_ofshop,0.046537
6,avg_price_each_item_cat_ofshop,0.046084
8,max_price_each_item,0.037838


In [55]:
#Select top 3 feature
col_name = temp_f.name.values[0:5]
col_name

array(['avg_item_cnt', 'shop_cat_avg_item_cnt', 'mean_price_each_item',
       'min_price_each_item', 'shop_avg_item_cnt'], dtype=object)

In [56]:
Xtrain_rf = Xtrain[col_name]
Xtest_rf = Xtest[col_name]

In [57]:
Xtrain_rf = Xtrain_rf.values
ytrain_rf = ytrain.values
Xtest_rf = Xtest_rf.values
ytest_rf = ytest.values

In [58]:
Xtrain_rf

array([[0.69314718, 0.72192692, 5.48925173, 3.36729583, 0.74076529],
       [0.70523033, 0.72246425, 4.95236546, 4.35670883, 0.75530188],
       [0.70829899, 0.71576023, 6.79285834, 5.98896142, 0.80465132],
       ...,
       [0.69314718, 0.82281785, 4.84213088, 3.40119738, 0.83337925],
       [0.69314718, 0.70000476, 5.27743097, 4.7095302 , 0.78931724],
       [0.71183931, 0.78608422, 7.58605035, 7.13089883, 0.83308805]])

##### end___

#Instantiate the model
rf = RandomForestRegressor()

In [59]:
####Hyperparameter tunning ###
###CHoice BEST PARAMETER ##

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 20, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [60]:

#Instantiate the tunning parameter using RandomizeSearchCV
rf_cv = RandomizedSearchCV(rf, random_grid, cv = 5, random_state=42)

In [61]:
rf_cv.fit(Xtrain_rf, ytrain_rf)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              


#Get the best parameter and best score
rf_cv.best_params_, rf_cv.best_score_


({'n_estimators': 20,
  'min_samples_split': 5,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': None,
  'bootstrap': False})

rf = RandomForestRegressor(n_estimators =  20,
                                  min_samples_split =  5,
                                  min_samples_leaf = 1,
                                  max_features = 'sqrt',
                                  max_depth = None,
                                  bootstrap =  False)

rf.fit(Xtrain_rf,ytrain_rf)

In [62]:
#Predict the data
y_pred_rf = rf_cv.predict(Xtest_rf)

In [63]:
#Get the R^2 and RMSE
test_accuary_rf = rf_cv.score(Xtest_rf, ytest_rf)
train_accuary_rf = rf_cv.score(Xtrain_rf, ytrain_rf)
rmse_rf = np.sqrt(mean_squared_error(ytest_rf, y_pred_rf))


In [64]:
print('R^2 of model: ',test_accuary_rf)
print('RMSE of model: ', rmse_rf)

R^2 of model:  0.6420898394285794
RMSE of model:  0.4931999398230363


In [65]:
train_accuary_rf

0.8082771957901278

## Try predict our test.csv file was provide 

In [66]:
#Create the array of predict the item_cnt in test file
#new_predict = df_test_1.values
#Use model to predict
#y_new_predict = rf_cv.predict(new_predict)
#e = 2.71828
#y_new_predict_item_cnt = e**y_new_predict


In [67]:
#solution = df_test_1
#solution['item_cnt_day'] = np.round(y_new_predict_item_cnt,1)
#solution_1=solution.drop(['shop_id','item_id','standardized_price','normalized_price','price_bin_round','item_price'], axis = 1).reset_index()
#solution_1['item_price'] =np.round(e ** df_test_1.item_price,0)
#solution_1.columns = ['ID', 'item_cnt_month']
#solution_1 = solution_1.set_index('ID')
#solution_1.head()

In [68]:
#solution_1.to_csv('submitfile.csv')

# Xgboost Model

In [69]:
from sklearn.ensemble import GradientBoostingRegressor

In [70]:
xg = GradientBoostingRegressor()
xg.fit(Xtrain,ytrain)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [71]:
features_name = data.drop('item_cnt_day', axis = 1).columns
Score = xg.feature_importances_
no_name = zip(features_name, Score)
temp_f = pd.DataFrame(no_name,  columns = ['name', 'score'])
temp_f = temp_f.sort_values('score', ascending = False)

In [72]:
temp_f

Unnamed: 0,name,score
1,avg_item_cnt,0.518453
3,shop_cat_avg_item_cnt,0.179163
0,shop_avg_item_cnt,0.109914
4,avg_price_ofshop,0.066979
7,min_price_each_item,0.037452
2,avg_item_cnt_each_cat,0.032355
5,avg_price_each_item_ofshop,0.01782
6,avg_price_each_item_cat_ofshop,0.016756
9,mean_price_each_item,0.014434
8,max_price_each_item,0.006674


In [73]:
#Select top 3 feature
col_name = temp_f.name.values[0:5]
Xtrain_xg = Xtrain[col_name]
Xtest_xg = Xtest[col_name]
Xtrain_xg = Xtrain_xg.values
ytrain_xg = ytrain.values
Xtest_xg = Xtest_xg.values
ytest_xg = ytest.values

In [74]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}

In [None]:
xg = GradientBoostingRegressor(**params)
xg.fit(Xtrain_xg,ytrain_xg)

In [None]:
#Predict 
y_pred_xg = xg.predict(Xtest_xg)

In [None]:
test_accuary_xg = xg.score(Xtest_xg, ytest_xg)
train_accuary_xg = xg.score(Xtrain_xg, ytrain_xg)
rmse_xg = np.sqrt(mean_squared_error(ytest_xg, y_pred_xg))

In [None]:
print('R^2 of model of test set: ',test_accuary_xg)
print('RMSE of model: ', rmse_xg)
print('R^2 train set: ', train_accuary_xg)

# DecisionTreeRegressor Model

In [None]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [None]:
#Instantiate the model
tree = DecisionTreeRegressor()
#Fit the data
tree.fit(Xtrain,ytrain)

In [None]:
#Predict 
y_pred_tree = tree.predict(Xtest)

In [None]:
test_accuary_tree = tree.score(Xtest, ytest)
train_accuary_tree = tree.score(Xtrain, ytrain)
rmse_tree = np.sqrt(mean_squared_error(ytest, y_pred_tree))

In [None]:
print('R^2 of model of test set: ',test_accuary_tree)
print('RMSE of model: ', rmse_tree)
print('R^2 train set: ', train_accuary_tree)

In [None]:
train_accuary_tree

# Solution

- Used feature engieering, to add more feature for modeling
- Data have alot items was only 1 per day, so I used log transform to have better data
- Also price is same, it crewed so I also use log trandform.
- standardided and normilized both item_cnt_data and item_price

- Overrall, that show random forest regression model have better R^2 score and RMSE score
- We choice the random forest for our problem here.