# In-Depth Analysis: Machine Learning Models

## Import python libarary

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from itertools import product
from sklearn.metrics import r2_score

## Loading the data

In [2]:
df_train_1 = pd.read_csv('data/sales_train.csv')
df_test = pd.read_csv('data/test.csv', index_col = 'ID')
df_item_1 = pd.read_csv('data/items.csv')

In [3]:
df_train_2 = df_train_1[((df_train_1.item_cnt_day > 0 ) & (df_train_1.item_cnt_day < 1000))]
df_train_2 = df_train_2[df_train_2.item_price > 0]

In [4]:
df_train = pd.merge(df_train_2, df_item_1, how = 'left')
df_train = df_train.drop('item_name', axis = 1)

## Prepare the Data for Modeling

In [5]:
#Creating the data frame from the sale_traing file
#Groupby the  shop_id and item_id, date_block_num
temp_data = df_train.groupby(['shop_id','item_id']).agg({'item_cnt_day':'sum'})
temp_data = temp_data.reset_index()
temp_data['train_or_test'] = 1 #set as train set


In [6]:
temp_cat = df_train.groupby(['shop_id','item_id','item_category_id']).sum().reset_index()
temp_cat = temp_cat[['shop_id','item_id','item_category_id']]

In [7]:
df_test = pd.merge(df_test, temp_cat, how = 'left', on = ['shop_id','item_id'])
df_test.fillna(0, inplace = True)

In [8]:
#Merge Category_id to data frame
temp_data = pd.merge(temp_data, temp_cat, how = 'left', on = ['shop_id', 'item_id'])

In [None]:
#Remove the item_id of test set in training set
#test_item_id = df_test.item_id.unique()
#temp_data = temp_data[~temp_data['item_id'].isin(test_item_id)]

In [9]:
#### Double checking if the item_id in test set still in train set 
x = temp_data.item_id.unique()
y = df_test.item_id.unique()
count = 0
for i in x:
    if i in y:
        count += 1
print(count)

4737


In [10]:
#Create the date block number for df_test = 34
#df_test['date_block_num'] = 34
df_test['train_or_test'] = 0 #set as a test set
df_test['item_cnt_day'] = 0
#Then add the data to the df_temp_1
df_train_test = pd.concat([temp_data, df_test], ignore_index= True, sort = False, keys = ['shop_id','item_id'])

In [11]:
df_train_test = df_train_test[['train_or_test','shop_id','item_id','item_category_id','item_cnt_day']]

# Creating the feature for modeling

In [12]:
#Creating the average of item_cnt_day for shop_id and date_block_num
temp = df_train.groupby(['shop_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['shop_avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = ['shop_id'])

In [13]:
#Creating the average of item_cnt_day for date_block_num
temp = df_train.groupby(['item_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = 'item_id')

In [14]:
#Creating the average of item_cnt_day for date_block_num
temp = df_train.groupby(['item_category_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['avg_item_cnt_each_cat']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = 'item_category_id')

In [15]:
#Creating the average of item_cnt_day for date_block_num, category_id

temp = df_train.groupby(['shop_id', 'item_category_id']).agg({'item_cnt_day':['mean']})
temp.columns = ['shop_cat_avg_item_cnt']
temp = temp.reset_index()
#Merge the feature above to the data
df_train_test = pd.merge(df_train_test, temp, how = 'left', on = ['shop_id','item_category_id'])

In [16]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id'])

In [17]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id','item_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_each_item_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id','item_id'])


In [18]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['shop_id','item_category_id','item_id']).agg({'item_price':['mean']})
temp_data_1.columns = ['avg_price_each_item_cat_ofshop']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['shop_id','item_category_id','item_id'])


In [19]:
#Create the average price for each item by each shop_id and item_id
temp_data_1 = df_train.groupby(['item_id']).agg({'item_price':['min','max','mean']})
temp_data_1.columns = ['min_price_each_item','max_price_each_item', 'mean_price_each_item']
temp_data_1 = temp_data_1.reset_index()

df_train_test = pd.merge(df_train_test, temp_data_1, how = 'left', on = ['item_id'])

In [20]:
df_train_test.head(3)

Unnamed: 0,train_or_test,shop_id,item_id,item_category_id,item_cnt_day,shop_avg_item_cnt,avg_item_cnt,avg_item_cnt_each_cat,shop_cat_avg_item_cnt,avg_price_ofshop,avg_price_each_item_ofshop,avg_price_each_item_cat_ofshop,min_price_each_item,max_price_each_item,mean_price_each_item
0,1,0,30,40.0,31.0,1.187481,1.512301,1.125806,1.248923,563.444151,265.0,265.0,99.0,399.0,323.624704
1,1,0,31,37.0,11.0,1.187481,1.303167,1.063038,1.071181,563.444151,434.0,434.0,147.62,699.0,578.707941
2,1,0,32,40.0,16.0,1.187481,1.112646,1.125806,1.248923,563.444151,221.0,221.0,70.62,349.0,249.736179


In [21]:
df_train_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_or_test,638298.0,0.66442,0.472193,0.0,0.0,1.0,1.0,1.0
shop_id,638298.0,31.502413,17.165986,0.0,18.0,31.0,47.0,59.0
item_id,638298.0,11310.843249,6177.160419,0.0,5961.0,11488.0,16429.0,22169.0
item_category_id,638298.0,35.960052,21.249605,0.0,23.0,40.0,55.0,83.0
item_cnt_day,638298.0,5.722369,52.349474,0.0,0.0,1.0,4.0,19934.0
shop_avg_item_cnt,638298.0,1.23403,0.217899,1.057546,1.156484,1.185057,1.261944,4.240983
avg_item_cnt,623052.0,1.099146,0.485211,1.0,1.0,1.019231,1.079929,65.473684
avg_item_cnt_each_cat,638298.0,1.115873,0.176333,1.0,1.02926,1.064949,1.125806,7.32977
shop_cat_avg_item_cnt,539052.0,1.120238,0.837001,1.0,1.021739,1.05515,1.12963,333.666667
avg_price_ofshop,638298.0,922.939343,178.931993,299.237067,834.664881,917.856259,973.275624,1458.766652


In [22]:
df_train_test.fillna(0, inplace = True)

In [23]:
missing = list(df_train_test.isnull().sum().sort_values(ascending = True).items())
missing

[('train_or_test', 0),
 ('shop_id', 0),
 ('item_id', 0),
 ('item_category_id', 0),
 ('item_cnt_day', 0),
 ('shop_avg_item_cnt', 0),
 ('avg_item_cnt', 0),
 ('avg_item_cnt_each_cat', 0),
 ('shop_cat_avg_item_cnt', 0),
 ('avg_price_ofshop', 0),
 ('avg_price_each_item_ofshop', 0),
 ('avg_price_each_item_cat_ofshop', 0),
 ('min_price_each_item', 0),
 ('max_price_each_item', 0),
 ('mean_price_each_item', 0)]

In [None]:
#df_test_1.loc[:,'normalized_price'] = (df_test_1['item_price'] - df_test_1['item_price'].min()) / (df_test_1['item_price'].max() - df_test_1['item_price'].min())
#df_test_1.loc[:,'standardized_price'] = (df_test_1['item_price'] - df_test_1['item_price'].mean()) / df_test_1['item_price'].std()
#df_test_1.loc[:,'price_bin_round'] = np.array(np.floor(np.array(df_test_1['item_price']) / 100.))
#df_test_1.head()

https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html#sphx-glr-auto-examples-compose-plot-transformed-target-py

In [24]:
# Convert the feature  to the np.log

col_name = ['item_category_id',
       'item_cnt_day', 'shop_avg_item_cnt', 'avg_item_cnt',
       'avg_item_cnt_each_cat', 'shop_cat_avg_item_cnt', 'avg_price_ofshop',
       'avg_price_each_item_ofshop', 'avg_price_each_item_cat_ofshop',
       'min_price_each_item', 'max_price_each_item', 'mean_price_each_item']

#from sklearn.preprocessing import QuantileTransformer
#qt = QuantileTransformer(n_quantiles=10, random_state=42)
for i in col_name:
    df_train_test[i] = df_train_test[i].apply(np.log1p)

In [25]:
df_train_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_or_test,638298.0,0.66442,0.472193,0.0,0.0,1.0,1.0,1.0
shop_id,638298.0,31.502413,17.165986,0.0,18.0,31.0,47.0,59.0
item_id,638298.0,11310.843249,6177.160419,0.0,5961.0,11488.0,16429.0,22169.0
item_category_id,638298.0,3.10218,1.423093,0.0,3.178054,3.713572,4.025352,4.430817
item_cnt_day,638298.0,1.05128,1.063365,0.0,0.0,0.693147,1.609438,9.900232
shop_avg_item_cnt,638298.0,0.80045,0.07577,0.721514,0.768479,0.781642,0.816225,1.656509
avg_item_cnt,638298.0,0.716692,0.149195,0.0,0.693147,0.701958,0.730997,4.196806
avg_item_cnt_each_cat,638298.0,0.747093,0.064404,0.693147,0.707671,0.725105,0.754151,2.119836
shop_cat_avg_item_cnt,638298.0,0.63012,0.281428,0.0,0.698216,0.713988,0.74673,5.813135
avg_price_ofshop,638298.0,6.808481,0.207971,5.704572,6.728228,6.82313,6.881694,7.286032


# Data For Modeling

In [28]:
#Seperata the Train set
data = df_train_test[df_train_test['train_or_test'] == 1]
data = data.drop(['shop_id','item_id','train_or_test','item_category_id'], axis =1)
#submit file predict
submit = df_train_test[df_train_test['train_or_test'] == 0]
submit = submit.drop(['shop_id','item_id','train_or_test','item_category_id'],axis= 1)



## Create the Target 'y' and feature 'X' for models

In [29]:
# Create Target y and feature X base on the train set
y = data['item_cnt_day']
X = data.drop('item_cnt_day', axis = 1)

In [32]:
#Split the data to traing set and test set base on the train set to train model
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [33]:
len(Xtrain), len(ytrain)

(296868, 296868)

## K-NN Regression model

In [34]:
#Import sklearn for model
from sklearn.neighbors import KNeighborsRegressor

### Selection the importance feature


In [None]:
###Hyperparameter tunning ###
######### FIND BEST n_neighbors ###############
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(5,6,1)}
knn = KNeighborsRegressor()
knn_cv = GridSearchCV(knn, param_grid, cv = 5)
knn_cv.fit(Xtrain,ytrain)
knn_cv.best_params_ , knn_cv.best_score_

In [None]:
#Predict the labels of the test data
y_pred = knn_cv.predict(Xtest)

In [None]:
#Checking score test and predict
test_accuray = knn_cv.score(Xtest, ytest)
train_accuray = knn_cv.score(Xtrain, ytrain)
rmse = np.sqrt(mean_squared_error(ytest, y_pred))

In [None]:
print('R^2 of model: ',test_accuray)
print('RMSE of model: ', rmse)

In [None]:
train_accuray

#### Importance feature

# Random forest modeling

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor

#### Feature selection

In [None]:
#rf = RandomForestRegressor()
#rf.fit(Xtrain, ytrain)

In [None]:
#features_name = data.drop('item_cnt_day', axis = 1).columns
#Score = rf.feature_importances_
#no_name = zip(features_name, Score)
#temp_f = pd.DataFrame(no_name,  columns = ['name', 'score'])
#temp_f = temp_f.sort_values('score', ascending = False)


In [None]:
#temp_f

In [None]:
#Select top 3 feature
#col_name = temp_f.name.values[0:5]
#col_name = ['avg_item_cnt', 'shop_cat_avg_item_cnt', 'mean_price_each_item',
 #      'min_price_each_item', 'shop_avg_item_cnt']

In [None]:
#Xtrain_rf = Xtrain[col_name]
#Xtest_rf = Xtest[col_name]

In [None]:
#Xtrain_rf = Xtrain_rf.values
#ytrain_rf = ytrain.values
#Xtest_rf = Xtest_rf.values
#ytest_rf = ytest.values

In [None]:
#Xtrain_rf

##### end___

In [None]:
#Instantiate the model
rf = RandomForestRegressor()

In [None]:
####Hyperparameter tunning ###
###CHoice BEST PARAMETER ##

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 20, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [None]:

#Instantiate the tunning parameter using RandomizeSearchCV
rf_cv = RandomizedSearchCV(rf, random_grid, cv = 5, random_state=42)

In [None]:
rf_cv.fit(Xtrain, ytrain)

In [None]:

#Get the best parameter and best score
rf_cv.best_params_, rf_cv.best_score_

rf = RandomForestRegressor(n_estimators =  20,
                                  min_samples_split =  5,
                                  min_samples_leaf = 1,
                                  max_features = 'sqrt',
                                  max_depth = None,
                                  bootstrap =  False)

rf.fit(Xtrain_rf,ytrain_rf)

In [None]:
#Predict the data
y_pred_rf = rf_cv.predict(Xtest)

In [None]:
#Get the R^2 and RMSE
test_accuary_rf = rf_cv.score(Xtest, ytest)
train_accuary_rf = rf_cv.score(Xtrain, ytrain)
rmse_rf = np.sqrt(mean_squared_error(ytest, y_pred_rf))


In [None]:
print('R^2 of model: ',test_accuary_rf)
print('RMSE of model: ', rmse_rf)

In [None]:
train_accuary_rf

## Try predict our test.csv file was provide 

In [45]:
submit.head()

Unnamed: 0,item_cnt_day,shop_avg_item_cnt,avg_item_cnt,avg_item_cnt_each_cat,shop_cat_avg_item_cnt,avg_price_ofshop,avg_price_each_item_ofshop,avg_price_each_item_cat_ofshop,min_price_each_item,max_price_each_item,mean_price_each_item
424098,0.0,0.753664,0.780351,0.803492,0.74492,6.769229,7.39921,7.39921,6.620073,7.863267,7.562479
424099,0.0,0.753664,0.0,0.693147,0.0,6.769229,0.0,0.0,0.0,0.0,0.0
424100,0.0,0.753664,0.754556,0.803492,0.74492,6.769229,6.764654,6.764654,6.39693,7.090077,6.686831
424101,0.0,0.753664,0.742306,0.773344,0.726705,6.769229,6.39693,6.39693,6.39693,7.090077,6.669938
424102,0.0,0.753664,0.0,0.693147,0.0,6.769229,0.0,0.0,0.0,0.0,0.0


In [52]:
X_submit = submit.drop('item_cnt_day', axis = 1)
y_submit_pred = xg.predict(X_submit)
item_cnt_month = np.expm1(y_submit_pred)
solution = df_test
solution['item_cnt_month'] = np.round(item_cnt_month,0)
solution.head()
#solution = df_test_1
#solution['item_cnt_day'] = np.round(y_new_predict_item_cnt,1)
solution_1=solution.drop(['shop_id','item_id','item_category_id','train_or_test','item_cnt_day'], axis = 1).reset_index()
#solution_1['item_price'] =np.round(e ** df_test_1.item_price,0)
#solution_1.columns = ['ID', 'item_cnt_month']
#solution_1 = solution_1.set_index('ID')
solution_1.head()

Unnamed: 0,index,item_cnt_month
0,0,10.0
1,1,1.0
2,2,7.0
3,3,5.0
4,4,1.0


In [53]:
solution_1.head()

Unnamed: 0,index,item_cnt_month
0,0,10.0
1,1,1.0
2,2,7.0
3,3,5.0
4,4,1.0


In [54]:
solution_1.columns = ['ID', 'item_cnt_month']
solution_1 = solution_1.set_index('ID')

In [55]:
solution_1.to_csv('submitfile_1.csv')

# Xgboost Model

In [35]:
from sklearn.ensemble import GradientBoostingRegressor

In [36]:
#xg = GradientBoostingRegressor()
#xg.fit(Xtrain,ytrain)

In [37]:
#features_name = data.drop('item_cnt_day', axis = 1).columns
#Score = xg.feature_importances_
#no_name = zip(features_name, Score)
#temp_f = pd.DataFrame(no_name,  columns = ['name', 'score'])
#temp_f = temp_f.sort_values('score', ascending = False)

In [38]:
#temp_f

In [39]:
#Select top 3 feature
#col_name = temp_f.name.values[0:5]
#Xtrain_xg = Xtrain[col_name]
#Xtest_xg = Xtest[col_name]
#Xtrain_xg = Xtrain_xg.values
#ytrain_xg = ytrain.values
#Xtest_xg = Xtest_xg.values
#ytest_xg = ytest.values

In [40]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}

In [41]:
xg = GradientBoostingRegressor(**params)
xg.fit(Xtrain,ytrain)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=4,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [42]:
#Predict 
y_pred_xg = xg.predict(Xtest)


In [43]:
test_accuary_xg = xg.score(Xtest, ytest)
train_accuary_xg = xg.score(Xtrain, ytrain)
rmse_xg = np.sqrt(mean_squared_error(ytest, y_pred_xg))

In [44]:
print('R^2 of model of test set: ',test_accuary_xg)
print('RMSE of model: ', rmse_xg)
print('R^2 train set: ', train_accuary_xg)

R^2 of model of test set:  0.4981292072037954
RMSE of model:  0.6581896840941389
R^2 train set:  0.4980183386591611


# Support Vector Machines (SVM) 

In [None]:
from sklearn.svm import SVR
clf = SVR(C=1.0, epsilon=0.2)

In [None]:
clf.fit(Xtrain, ytrain)

In [None]:
y_pred_svr = clf.predict(Xtest)

In [None]:
test_accuary_tree = clf.score(Xtest, ytest)
train_accuary_tree = clf.score(Xtrain, ytrain)
rmse_tree = np.sqrt(mean_squared_error(ytest, y_pred_svr))

In [None]:
print('R^2 of model of test set: ',test_accuary_tree)
print('RMSE of model: ', rmse_tree)
print('R^2 train set: ', train_accuary_tree)

# Solution

- Used feature engieering, to add more feature for modeling
- Data have alot items was only 1 per day, so I used log transform to have better data
- Also price is same, it crewed so I also use log trandform.
- standardided and normilized both item_cnt_data and item_price

- Overrall, that show xgboots regression model have better R^2 score and RMSE score
- We choice the xgboots regression for our problem here.