# In-Depth Analysis: Machine Learning Models

## Import python libarary

In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading the data

In [132]:
df_1 = pd.read_csv('data/question1_data.csv').drop('Unnamed: 0', axis = 1)
df_2 = pd.read_csv('data/question2_data.csv')
df_3 = pd.read_csv('data/question3_data.csv')
df_test = pd.read_csv('data/test.csv', index_col = 'ID')


In [133]:
#Predict November itemsalfe, extract only data for november last 2 year to make a training data
#Cause we only predict item_cnt_day  sale for month november, so we only need the data last november of 2 year to predict
november_data = df_1[((df_1.date_block_num == 10) | (df_1.date_block_num == 22))].reset_index()

In [134]:
#Extract new dataframe only contain shop_id and item_id, so it can match with our test data file
df_new_1 = november_data[['shop_id','item_id','item_cnt_day']]

## Create the Target 'y' and feature 'X' for models

In [135]:
# Create Target y and feature X
y = df_new_1['item_cnt_day']
X = df_new_1.drop('item_cnt_day', axis = 1)

In [136]:
#Split the data to traing set and test set
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.4, random_state = 42)

## K-NN Regression model

In [137]:
#Import sklearn for model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [138]:
###Hyperparameter tunning ###
######### FIND BEST n_neighbors ###############
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(1,50)}
knn = KNeighborsRegressor()
knn_cv = GridSearchCV(knn, param_grid, cv = 5)
knn_cv.fit(Xtrain,ytrain)
knn_cv.best_params_ , knn_cv.best_score_

({'n_neighbors': 16}, 0.19631919716195562)

In [139]:
#Predict the labels of the test data
y_pred = knn_cv.predict(Xtest)

In [140]:
#Checking score test and predict
test_accuray = knn_cv.score(Xtest, ytest)
train_accuray = knn_cv.score(Xtrain, ytrain)
rmse = np.sqrt(mean_squared_error(ytest, y_pred))

In [141]:
print('R^2 of model: ',test_accuray)
print('RMSE of model: ', rmse)

R^2 of model:  0.146279253735217
RMSE of model:  8.023309020903122


#### Importance feature

In [142]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.4, normalize= True)
lasso.fit(X,y)
lasso.coef_

array([-0., -0.])

## Try predict our test.csv file was provide 

In [143]:
#Create the array of predict the item_cnt in test file
new_predict = df_test.values
#Use model to predict
y_new_predict = knn_cv.predict(new_predict)

In [144]:
solution = df_test
solution['item_cnt_day'] = y_new_predict
solution.head()

Unnamed: 0_level_0,shop_id,item_id,item_cnt_day
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,5037,1.75
1,5,5320,3.25
2,5,5233,5.3125
3,5,5232,5.3125
4,5,5268,2.4375


# Random forest modeling

In [146]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [151]:
#Instantiate the model
rf = RandomForestRegressor()

In [152]:
####Hyperparameter tunning ###
###CHoice BEST PARAMETER ##

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 20, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [153]:
#Instantiate the tunning parameter using RandomizeSearchCV
rf_cv = RandomizedSearchCV(rf, random_grid, cv = 3, random_state=42)

In [154]:
rf_cv.fit(Xtrain, ytrain)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [160]:
#Get the best parameter and best score
rf_cv.best_params_, rf_cv.best_score_

({'n_estimators': 12,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 'auto',
  'max_depth': None,
  'bootstrap': True},
 0.34442697965675206)

In [157]:
#Predict the data
y_pred_rf = rf_cv.predict(Xtest)

In [165]:
#Get the R^2 and RMSE
test_accuary_rf = rf_cv.score(Xtest, ytest)
train_accuary_rf = rf_cv.score(Xtrain, ytrain)
rmse_rf = np.sqrt(mean_squared_error(ytest, y_pred_rf))


In [166]:
print('R^2 of model: ',test_accuary_rf)
print('RMSE of model: ', rmse_rf)

R^2 of model:  0.4621953236687989
RMSE of model:  6.3680708842454825


# DecisionTreeRegressor Model

In [167]:
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [168]:
#Instantiate the model
tree = DecisionTreeRegressor()
#Fit the data
tree.fit(Xtrain,ytrain)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [169]:
#Predict 
y_pred_tree = tree.predict(Xtest)

In [174]:
test_accuary_tree = tree.score(Xtest_tree, ytest_tree)
train_accuary_tree = tree.score(Xtrain_tree, ytrain_tree)
rmse_tree = np.sqrt(mean_squared_error(ytest_tree, y_pred_tree))

In [176]:
print('R^2 of model: ',test_accuary_tree)
print('RMSE of model: ', rmse_tree)

R^2 of model:  0.3835276216824326
RMSE of model:  6.817928038598524


In [None]:

#Set up the parameter
param_grid = {"criterion": ["mse", "mae"],
              "min_samples_split": [10, 20, 40],
              "max_depth": [2, 6, 8],
              "min_samples_leaf": [20, 40, 100],
              "max_leaf_nodes": [5, 20, 100],
              }
#Instantiate a Decision Tree Regressor
tree = DecisionTreeRegressor()

#Instantiate the RandomizedSearchCV object
tree_cv = RandomizedSearchCV(tree, param_grid, cv = 5)

#Fit it to the data
tree_cv.fit(Xtrain_tree, ytrain_tree)

#Print the score and parameter
tree_cv.best_params_ , tree_cv.best_score_