# In-Depth Analysis: Machine Learning Models

## Import python libarary

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from itertools import product
from sklearn.metrics import r2_score

## Loading the data

In [2]:
df_train_1 = pd.read_csv('data/sales_train.csv')
df_test = pd.read_csv('data/test.csv', index_col = 'ID')
df_item_1 = pd.read_csv('data/items.csv')

In [3]:
df_train_2 = df_train_1[((df_train_1.item_cnt_day > 0 ) & (df_train_1.item_cnt_day < 1000))]
df_train_2 = df_train_2[df_train_2.item_price > 0]

In [4]:
df_train = pd.merge(df_train_2, df_item_1, how = 'left')
df_train = df_train.drop('item_name', axis = 1)

## Prepare the Data for Modeling

In [5]:
#Creating the data frame from the sale_traing file
#Groupby the  shop_id and item_id, date_block_num
temp_data = df_train.groupby(['shop_id','item_id']).agg({'item_cnt_day':'sum'})
temp_data = temp_data.reset_index()
temp_data['train_or_test'] = 1 #set as train set


In [6]:
temp_cat = df_train.groupby(['shop_id','item_id','item_category_id']).sum().reset_index()
temp_cat = temp_cat[['shop_id','item_id','item_category_id']]

In [7]:
df_test = pd.merge(df_test, temp_cat, how = 'left', on = ['shop_id','item_id'])
df_test.fillna(0, inplace = True)

In [8]:
#Merge Category_id to data frame
temp_data = pd.merge(temp_data, temp_cat, how = 'left', on = ['shop_id', 'item_id'])

In [9]:
#Remove the item_id of test set in training set
test_item_id = df_test.item_id.unique()
temp_data = temp_data[~temp_data['item_id'].isin(test_item_id)]

In [10]:
#### Double checking if the item_id in test set still in train set 
x = temp_data.item_id.unique()
y = df_test.item_id.unique()
count = 0
for i in x:
    if i in y:
        count += 1
print(count)

0


In [11]:
#Create the date block number for df_test = 34
#df_test['date_block_num'] = 34
df_test['train_or_test'] = 0 #set as a test set
df_test['item_cnt_day'] = 0
#Then add the data to the df_temp_1
df_train_test = pd.concat([temp_data, df_test], ignore_index= True, sort = False, keys = ['shop_id','item_id'])

In [12]:
df_train_test = df_train_test[['train_or_test','shop_id','item_id','item_category_id','item_cnt_day']]

# Creating the feature for modeling

In [13]:
#Creating the average of item_cnt_day for shop_id and date_block_num
temp = df_train.groupby(['shop_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['shop_avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = ['shop_id'])

In [14]:
#Creating the average of item_cnt_day for date_block_num
temp = df_train.groupby(['item_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = 'item_id')

In [15]:
#Creating the average of item_cnt_day for date_block_num
temp = df_train.groupby(['item_category_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['avg_item_cnt_each_cat']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = 'item_category_id')

In [16]:
#Creating the average of item_cnt_day for date_block_num, category_id

temp = df_train.groupby(['item_category_id','item_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['cat_item_avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = ['item_category_id','item_id'])

In [17]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id'])

In [18]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id','item_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_each_item_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id','item_id'])


In [19]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['item_category_id','item_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_each_item_cat']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['item_category_id','item_id'])


In [20]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['item_id']).agg({'item_price':['min','max','mean']})
temp_data_1.columns = ['min_price_each_item','max_price_each_item', 'mean_price_each_item']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['item_id'])

In [21]:
df_train_test.head(3)

Unnamed: 0,train_or_test,shop_id,item_id,item_category_id,item_cnt_day,shop_avg_item_cnt,avg_item_cnt,avg_item_cnt_each_cat,cat_item_avg_item_cnt,avg_price_ofshop,avg_price_each_item_ofshop,avg_price_each_item_cat,min_price_each_item,max_price_each_item,mean_price_each_item
0,1,0,35,40.0,15.0,1.187481,1.233333,1.125806,1.233333,563.444151,247.0,375.828056,148.0,399.0,375.828056
1,1,0,36,37.0,1.0,1.187481,1.0,1.063038,1.0,563.444151,357.0,183.012195,58.0,549.0,183.012195
2,1,0,40,57.0,1.0,1.187481,1.0,1.009961,1.0,563.444151,127.0,245.138298,127.0,249.0,245.138298


In [22]:
df_train_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_or_test,504098.0,0.575083,0.494331,0.0,0.0,1.0,1.0,1.0
shop_id,504098.0,31.509905,17.236922,0.0,18.0,31.0,47.0,59.0
item_id,504098.0,11423.812642,6175.751291,0.0,6183.0,11648.0,16587.0,22169.0
item_category_id,504098.0,33.81476,21.795979,0.0,19.0,40.0,55.0,83.0
item_cnt_day,504098.0,3.429962,12.681862,0.0,0.0,1.0,3.0,1704.0
shop_avg_item_cnt,504098.0,1.232275,0.200483,1.057546,1.156484,1.185057,1.261944,4.240983
avg_item_cnt,488852.0,1.095128,0.527419,1.0,1.0,1.01487,1.069767,65.473684
avg_item_cnt_each_cat,504098.0,1.107126,0.173733,1.0,1.02926,1.063038,1.125806,7.32977
cat_item_avg_item_cnt,401302.0,1.081952,0.328331,1.0,1.0,1.016129,1.068584,65.473684
avg_price_ofshop,504098.0,921.241079,179.074771,299.237067,834.664881,917.856259,973.275624,1458.766652


In [23]:
df_train_test.fillna(0, inplace = True)

In [24]:
missing = list(df_train_test.isnull().sum().sort_values(ascending = True).items())
missing

[('train_or_test', 0),
 ('shop_id', 0),
 ('item_id', 0),
 ('item_category_id', 0),
 ('item_cnt_day', 0),
 ('shop_avg_item_cnt', 0),
 ('avg_item_cnt', 0),
 ('avg_item_cnt_each_cat', 0),
 ('cat_item_avg_item_cnt', 0),
 ('avg_price_ofshop', 0),
 ('avg_price_each_item_ofshop', 0),
 ('avg_price_each_item_cat', 0),
 ('min_price_each_item', 0),
 ('max_price_each_item', 0),
 ('mean_price_each_item', 0)]

In [25]:
#df_test_1.loc[:,'normalized_price'] = (df_test_1['item_price'] - df_test_1['item_price'].min()) / (df_test_1['item_price'].max() - df_test_1['item_price'].min())
#df_test_1.loc[:,'standardized_price'] = (df_test_1['item_price'] - df_test_1['item_price'].mean()) / df_test_1['item_price'].std()
#df_test_1.loc[:,'price_bin_round'] = np.array(np.floor(np.array(df_test_1['item_price']) / 100.))
#df_test_1.head()

https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html#sphx-glr-auto-examples-compose-plot-transformed-target-py

In [26]:
df_train_test.columns

Index(['train_or_test', 'shop_id', 'item_id', 'item_category_id',
       'item_cnt_day', 'shop_avg_item_cnt', 'avg_item_cnt',
       'avg_item_cnt_each_cat', 'cat_item_avg_item_cnt', 'avg_price_ofshop',
       'avg_price_each_item_ofshop', 'avg_price_each_item_cat',
       'min_price_each_item', 'max_price_each_item', 'mean_price_each_item'],
      dtype='object')

In [27]:
# Convert the feature  to the np.log

col_name = ['item_cnt_day', 'shop_avg_item_cnt', 'avg_item_cnt',
       'avg_item_cnt_each_cat', 'cat_item_avg_item_cnt', 'avg_price_ofshop',
       'avg_price_each_item_ofshop', 'avg_price_each_item_cat',
       'min_price_each_item', 'max_price_each_item', 'mean_price_each_item']
#from sklearn.preprocessing import QuantileTransformer
#qt = QuantileTransformer(n_quantiles=10, random_state=42)
for i in col_name:
    df_train_test[i] = df_train_test[i].apply(np.log1p)

In [28]:
df_train_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_or_test,504098.0,0.575083,0.494331,0.0,0.0,1.0,1.0,1.0
shop_id,504098.0,31.509905,17.236922,0.0,18.0,31.0,47.0,59.0
item_id,504098.0,11423.812642,6175.751291,0.0,6183.0,11648.0,16587.0,22169.0
item_category_id,504098.0,33.81476,21.795979,0.0,19.0,40.0,55.0,83.0
item_cnt_day,504098.0,0.826628,0.946875,0.0,0.0,0.693147,1.386294,7.44132
shop_avg_item_cnt,504098.0,0.800057,0.071883,0.721514,0.768479,0.781642,0.816225,1.656509
avg_item_cnt,504098.0,0.709629,0.160044,0.0,0.693147,0.699662,0.726103,4.196806
avg_item_cnt_each_cat,504098.0,0.743072,0.062378,0.693147,0.707671,0.72418,0.754151,2.119836
cat_item_avg_item_cnt,504098.0,0.580026,0.303157,0.0,0.693147,0.694888,0.717245,4.196806
avg_price_ofshop,504098.0,6.806619,0.207834,5.704572,6.728228,6.82313,6.881694,7.286032


# Data For Modeling

In [29]:
#Seperata the Train set
data = df_train_test[df_train_test['train_or_test'] == 1]
data = data.drop(['train_or_test','item_category_id','shop_id','item_id'], axis =1)
#submit file predict
submit = df_train_test[df_train_test['train_or_test'] == 0]
submit = submit.drop(['train_or_test','item_category_id','shop_id','item_id'],axis= 1)



## Create the Target 'y' and feature 'X' for models

In [30]:
# Create Target y and feature X base on the train set
y = data['item_cnt_day']
X = data.drop('item_cnt_day', axis = 1)

In [31]:
#Split the data to traing set and test set base on the train set to train model
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [32]:
len(Xtrain), len(ytrain)

(202928, 202928)

## K-NN Regression model

In [33]:
#Import sklearn for model
from sklearn.neighbors import KNeighborsRegressor

### Selection the importance feature


In [34]:
###Hyperparameter tunning ###
######### FIND BEST n_neighbors ###############
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(5,6,1)}
knn = KNeighborsRegressor()
knn_cv = GridSearchCV(knn, param_grid, cv = 5)
knn_cv.fit(Xtrain,ytrain)
knn_cv.best_params_ , knn_cv.best_score_

({'n_neighbors': 5}, 0.5389903524559968)

In [35]:
#Predict the labels of the test data
y_pred = knn_cv.predict(Xtest)

In [36]:
#Checking score test and predict
test_accuray = knn_cv.score(Xtest, ytest)
train_accuray = knn_cv.score(Xtrain, ytrain)
rmse = np.sqrt(mean_squared_error(ytest, y_pred))

In [37]:
print('R^2 of model: ',test_accuray)
print('RMSE of model: ', rmse)

R^2 of model:  0.5507134233545816
RMSE of model:  0.5513823603660406


In [38]:
train_accuray

0.7080143218064845

#### Importance feature

# Random forest modeling

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor

#### Feature selection

In [40]:
#rf = RandomForestRegressor()
#rf.fit(Xtrain, ytrain)

In [41]:
#features_name = data.drop('item_cnt_day', axis = 1).columns
#Score = rf.feature_importances_
#no_name = zip(features_name, Score)
#temp_f = pd.DataFrame(no_name,  columns = ['name', 'score'])
#temp_f = temp_f.sort_values('score', ascending = False)


In [42]:
#temp_f

In [43]:
#Select top 3 feature
#col_name = temp_f.name.values[0:5]
#col_name = ['avg_item_cnt', 'shop_cat_avg_item_cnt', 'mean_price_each_item',
 #      'min_price_each_item', 'shop_avg_item_cnt']

In [44]:
#Xtrain_rf = Xtrain[col_name]
#Xtest_rf = Xtest[col_name]

In [45]:
#Xtrain_rf = Xtrain_rf.values
#ytrain_rf = ytrain.values
#Xtest_rf = Xtest_rf.values
#ytest_rf = ytest.values

In [46]:
#Xtrain_rf

##### end___

In [47]:
#Instantiate the model
rf = RandomForestRegressor()

In [48]:
####Hyperparameter tunning ###
###CHoice BEST PARAMETER ##

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 20, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [49]:

#Instantiate the tunning parameter using RandomizeSearchCV
rf_cv = RandomizedSearchCV(rf, random_grid, cv = 5, random_state=42)

In [50]:
rf_cv.fit(Xtrain, ytrain)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [51]:

#Get the best parameter and best score
rf_cv.best_params_, rf_cv.best_score_

({'n_estimators': 20,
  'min_samples_split': 5,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': None,
  'bootstrap': False},
 0.6780957584868701)

In [52]:
#Predict the data
y_pred_rf = rf_cv.predict(Xtest)

In [53]:
#Get the R^2 and RMSE
test_accuary_rf = rf_cv.score(Xtest, ytest)
train_accuary_rf = rf_cv.score(Xtrain, ytrain)
rmse_rf = np.sqrt(mean_squared_error(ytest, y_pred_rf))


In [54]:
print('R^2 of model: ',test_accuary_rf)
print('RMSE of model: ', rmse_rf)

R^2 of model:  0.6815191048879072
RMSE of model:  0.46422961156563797


# Xgboost Model

In [56]:
from sklearn.ensemble import GradientBoostingRegressor

In [57]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}

In [58]:
xg = GradientBoostingRegressor(**params)
xg.fit(Xtrain,ytrain)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=4,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [59]:
#Predict 
y_pred_xg = xg.predict(Xtest)


In [60]:
test_accuary_xg = xg.score(Xtest, ytest)
train_accuary_xg = xg.score(Xtrain, ytrain)
rmse_xg = np.sqrt(mean_squared_error(ytest, y_pred_xg))

In [61]:
print('R^2 of model of test set: ',test_accuary_xg)
print('RMSE of model: ', rmse_xg)
print('R^2 train set: ', train_accuary_xg)

R^2 of model of test set:  0.47742190839230103
RMSE of model:  0.5946572878853934
R^2 train set:  0.4859775180855838


# Solution

- Used feature engieering, to add more feature for modeling
- Data have alot items was only 1 per day, so I used log transform to have better data
- Also price is same, it crewed so I also use log trandform.
- standardided and normilized both item_cnt_data and item_price

- Overrall, that show random forest regression model have better R^2 score and RMSE score
- We choice the random forest regression for our problem here.