## Modelling

Train the model

In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time

In [48]:
#from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [6]:
path = 'gdrive/MyDrive/DATA/Laptop Price/'

In [7]:
os.listdir(path)

['laptop_price.csv', 'laptop_price_interim.csv', 'data_interim.csv']

##Get the Data

In [8]:
df = pd.read_csv(path + 'data_interim.csv', encoding='latin-1')

In [9]:
df.head()

Unnamed: 0,Inches2SS,Weight2SS,Ram2SS,cpu_sizeSS,is_ssdSS,screen_typeSS,laptop_ID,is_train,Company2 Acer,Company2 Apple,Company2 Asus,Company2 Dell,Company2 HP,Company2 Lenovo,Company2 MSI,Company2 Toshiba,Company2 others,TypeName 2 in 1 Convertible,TypeName Gaming,TypeName Netbook,TypeName Notebook,TypeName Ultrabook,TypeName Workstation,OpSys Android,OpSys Chrome OS,OpSys Linux,OpSys Mac OS X,OpSys No OS,OpSys Windows 10,OpSys Windows 10 S,OpSys Windows 7,OpSys macOS,Price_euros
0,-1.231169,-1.014448,-0.087182,0.042451,0.745133,-0.385227,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1339.69
1,-1.231169,-1.059224,-0.087182,-0.881391,-1.342041,-1.487991,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,898.94
2,0.398827,-0.283106,-0.087182,0.411987,0.745133,0.717538,3,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,575.0
3,0.257089,-0.327882,1.520678,0.781524,0.745133,-0.385227,4,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2537.45
4,-1.231169,-1.014448,-0.087182,1.520597,0.745133,-0.385227,5,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1803.6


In [10]:
df.shape

(1303, 33)

In [11]:
col_num = ['Inches2SS', 'Weight2SS', 'Ram2SS', 'cpu_sizeSS', 'is_ssdSS', 'screen_typeSS']

col_cat = ['Company2 Acer',
 'Company2 Apple',
 'Company2 Asus',
 'Company2 Dell',
 'Company2 HP',
 'Company2 Lenovo',
 'Company2 MSI',
 'Company2 Toshiba',
 'Company2 others',
 'TypeName 2 in 1 Convertible',
 'TypeName Gaming',
 'TypeName Netbook',
 'TypeName Notebook',
 'TypeName Ultrabook',
 'TypeName Workstation',
 'OpSys Android',
 'OpSys Chrome OS',
 'OpSys Linux',
 'OpSys Mac OS X',
 'OpSys No OS',
 'OpSys Windows 10',
 'OpSys Windows 10 S',
 'OpSys Windows 7',
 'OpSys macOS']

In [12]:
col_all = col_num + col_cat
col_all.sort()

col_target = ['Price_euros']

In [13]:
X_train = df[df.is_train == 1][col_all]
X_val = df[df.is_train == 0][col_all]

Y_train = df[df.is_train == 1][col_target]
Y_val = df[df.is_train == 0][col_target]

In [34]:
def all_metrics(y_true, y_pred, name):
  mae = mean_absolute_error(y_true, y_pred)
  mape = mean_absolute_percentage_error(y_true, y_pred)
  r_sq = r2_score(y_true, y_pred)

  return {'1. model_name': name,'2. mape': mape, '3. mae': mae, '4. r_square': r_sq}

## Base Model

In [14]:
base_model = LinearRegression()

In [15]:
base_model.fit(X_train,Y_train)

LinearRegression()

In [16]:
Y_train_pred = base_model.predict(X_train)
Y_val_pred = base_model.predict(X_val)

In [37]:
m1 = all_metrics(Y_train,Y_train_pred,'Linear_Regression_Data_Train')
m2 = all_metrics(Y_val,Y_val_pred,'Linear_Regression_Data_Test')

In [44]:
m1

{'1. model_name': 'Linear_Regression_Data_Train',
 '2. mape': 0.2685678731999669,
 '3. mae': 255.14425143953935,
 '4. r_square': 0.7401105451818669}

In [45]:
m2

{'1. model_name': 'Linear_Regression_Data_Test',
 '2. mape': 0.31030542422367186,
 '3. mae': 349.53371647509573,
 '4. r_square': -0.7233032240038879}

## Model Tandingan

In [38]:
tandingan_model = RandomForestRegressor()

In [39]:
tandingan_model.fit(X_train,Y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor()

In [40]:
Y_train_pred = tandingan_model.predict(X_train)
Y_val_pred = tandingan_model.predict(X_val)

In [41]:
t1_1 = all_metrics(Y_train,Y_train_pred,'RF1_Data_Train')
t1_2 = all_metrics(Y_val,Y_val_pred,'RF1_Data_Test')

In [42]:
t1_1

{'1. model_name': 'RF1_Data_Train',
 '2. mape': 0.07191334428290758,
 '3. mae': 78.94025254247296,
 '4. r_square': 0.9697092114606931}

In [43]:
t1_2

{'1. model_name': 'RF1_Data_Test',
 '2. mape': 0.17211125938257912,
 '3. mae': 201.2556040863064,
 '4. r_square': 0.7692491853556045}

## Model Tandingan 2

In [56]:
tandingan_model2 = RandomForestRegressor(max_depth = 5)

In [57]:
tandingan_model2.fit(X_train,Y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(max_depth=5)

In [58]:
Y_train_pred = tandingan_model2.predict(X_train)
Y_val_pred = tandingan_model2.predict(X_val)

In [60]:
t2_1 = all_metrics(Y_train,Y_train_pred,'RF2_Data_Train')
t2_2 = all_metrics(Y_val,Y_val_pred,'RF2_Data_Test')

In [62]:
t2_1

{'1. model_name': 'RF2_Data_Train',
 '2. mape': 0.21438773607969147,
 '3. mae': 207.15965751451273,
 '4. r_square': 0.8347487153825021}

In [61]:
t2_2

{'1. model_name': 'RF2_Data_Test',
 '2. mape': 0.23023865852506387,
 '3. mae': 254.35858675160344,
 '4. r_square': 0.7049131268010285}

## Hyper Parameter Tuning

In [46]:
## Manual
## Grid Search
## Random Search

In [53]:
parameters = {'n_estimators':[10,20,30,40],'max_depth':[3,5,7,9], 'min_samples_split':[3,5,7,9,11],'max_leaf_nodes':[8,12,16,20]}
model_base = RandomForestRegressor()
clf_search1 = RandomizedSearchCV(model_base, param_distributions=parameters, n_iter=30,cv=3,verbose=1)

In [54]:
L0=time.time()
clf_search1.fit(X_train,Y_train)
print(time.time()-L0)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

5.967327356338501


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


In [55]:
clf_search1.best_estimator_

RandomForestRegressor(max_depth=9, max_leaf_nodes=20, min_samples_split=5,
                      n_estimators=30)

In [63]:
tandingan_model3 = RandomForestRegressor(max_depth=9, max_leaf_nodes=20, min_samples_split=5,
                      n_estimators=30)

In [64]:
tandingan_model3.fit(X_train,Y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(max_depth=9, max_leaf_nodes=20, min_samples_split=5,
                      n_estimators=30)

In [65]:
Y_train_pred = tandingan_model3.predict(X_train)
Y_val_pred = tandingan_model3.predict(X_val)

In [66]:
t3_1 = all_metrics(Y_train,Y_train_pred,'RF3_Data_Train')
t3_2 = all_metrics(Y_val,Y_val_pred,'RF3_Data_Test')

In [67]:
t3_1

{'1. model_name': 'RF3_Data_Train',
 '2. mape': 0.2229559134565118,
 '3. mae': 207.17338779901834,
 '4. r_square': 0.8361611900179843}

In [68]:
t3_2

{'1. model_name': 'RF3_Data_Test',
 '2. mape': 0.2367976673537733,
 '3. mae': 252.9023719675406,
 '4. r_square': 0.7103596782926446}