# AdaBoostClassifier with RandomForest Hyper Parameter Tuning using GridSearchCV

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier 
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix, precision_score, roc_auc_score
from sklearn.pipeline import Pipeline

import category_encoders as ce

# Using SAMPLE2.csv

In [3]:
# Load the service-line records from the CSV in the working directory.
master_df = pd.read_csv(
    'SAMPLE2.csv',
)

In [4]:
# Preview the first rows of the loaded frame.
master_df.head()

Unnamed: 0,Primary Key,Created,Mileage In,Work Order,Model,Line Type,Service Category,Service Package,Description,Definition,...,Quantity,Total,Discount,Net Total,ContactID,ServiceItemID,InvoiceID,labels,Make,Year
0,1,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,OIL FILTER HONDA,Engine Oil Filter,...,1.0,696.43,0.0,696.43,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
1,2,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,ENGINE FLUSHING,,...,1.0,142.86,0.0,142.86,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
2,3,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,BRAKE CLEANER 14oz,,...,1.0,227.68,0.0,227.68,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
3,4,12/28/2018 10:18,33320,49,civic,Labor,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,Labor,,...,3.0,1339.29,0.0,1339.29,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
4,5,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,REPSOL ELITE INJECTION 10W40 1L - GAS/DIESEL,Engine Oil,...,4.0,1178.56,0.0,1178.56,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012


In [5]:
# Summarise column names, non-null counts and dtypes of the raw frame.
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37548 entries, 0 to 37547
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Primary Key       37548 non-null  int64  
 1   Created           37548 non-null  object 
 2   Mileage In        37548 non-null  int64  
 3   Work Order        37548 non-null  int64  
 4   Model             37548 non-null  object 
 5   Line Type         37548 non-null  object 
 6   Service Category  37548 non-null  object 
 7   Service Package   37525 non-null  object 
 8   Description       37548 non-null  object 
 9   Definition        15582 non-null  object 
 10  Price             37548 non-null  float64
 11  Quantity          37548 non-null  float64
 12  Total             37548 non-null  float64
 13  Discount          37548 non-null  float64
 14  Net Total         37548 non-null  float64
 15  ContactID         37548 non-null  object 
 16  ServiceItemID     37548 non-null  object

In [6]:
# Count how many rows carry each service label.
master_df['labels'].value_counts()

preventive maintenance                          15899
change oil, lube and flushing                    4957
complete vehicle inspection                      2799
steering and wheel alignment                     1962
Brakes                                           1590
suspension                                       1252
Air Conditioner General Cleaning Service         1131
Ignition and fuel general services                988
clutch general services                           978
Engine                                            928
tire general services and repair                  851
electrical general services and replacement       625
Valve Cover Gasket                                545
Cooling Repair, charging and replacement          515
Transmission general services                     430
Vacuum/Charging Service                           369
Radiator general services                         349
Wheel Bearing                                     230
Drive train general services

In [7]:
# Service labels that occur in 100 rows or fewer. These rare labels are the
# ones removed from master_df further down, so only labels with more than
# 100 rows remain.
#
# The value_counts() Series is filtered directly instead of going through
# to_frame(): the name of the counts column produced by to_frame() differs
# between pandas versions ('labels' in pandas 1.x, 'count' in pandas 2.x), so
# indexing the frame by 'labels' raises KeyError on newer pandas.
label_counts = master_df['labels'].value_counts()
label_list = list(label_counts[label_counts <= 100].index)
label_list

['Door Servicing',
 'Counter / Cash Sale',
 'Body Repair and Paint Works',
 'Accessories General Service',
 'Car Horn Repair & Services',
 'Other Sublet Services (Machine Shop)',
 'Car Hood Services',
 'Towing',
 'Exhaust General Service',
 'electrical general services',
 'Car Seat Services']

In [8]:
# Keep only the rows whose label is not in the rare-label list.
rare_label_mask = master_df['labels'].isin(label_list)
master_df = master_df[~rare_label_mask]

In [9]:
# Car makes that occur in 100 rows or fewer. These low-volume makes are the
# ones removed from master_df further down, so only makes with more than
# 100 rows remain.
#
# The value_counts() Series is filtered directly instead of going through
# to_frame(): the name of the counts column produced by to_frame() differs
# between pandas versions ('Make' in pandas 1.x, 'count' in pandas 2.x), so
# indexing the frame by 'Make' raises KeyError on newer pandas.
make_counts = master_df['Make'].value_counts()
car_list = list(make_counts[make_counts <= 100].index)
car_list

['audi',
 'mercedes benz',
 'lexus',
 'volvo',
 'volkswagen',
 'dodge',
 'lincoln',
 'foton',
 'chrysler',
 'peugeot',
 'jeep',
 'mg']

In [10]:
# Discard the rows that belong to the low-volume makes collected in car_list.
rare_make_mask = master_df['Make'].isin(car_list)
master_df = master_df[~rare_make_mask]

In [11]:
# Drop every row filed under the accessories service category.
accessories_category = 'Accessories (Gauges, Power Accessories, Vision & Air Bag)'
master_df = master_df[master_df['Service Category'] != accessories_category]

In [12]:
# Labels excluded from the modelling set; each one is filtered out in turn.
for excluded_label in (
    'preventive maintenance',
    'Engine Detailing',
    'Drive train general services and replacement',
    'filter and sensor (air and fuel)',
):
    master_df = master_df[master_df['labels'] != excluded_label]

In [13]:
# Row count per remaining label, shown as a one-column DataFrame.
remaining_label_counts = master_df['labels'].value_counts()
labl = remaining_label_counts.to_frame()
labl

Unnamed: 0,labels
"change oil, lube and flushing",4924
complete vehicle inspection,2766
steering and wheel alignment,1930
Brakes,1575
suspension,1238
Air Conditioner General Cleaning Service,1103
clutch general services,978
Ignition and fuel general services,941
Engine,918
tire general services and repair,840


In [14]:
# Re-inspect row count, non-null counts and dtypes after the filtering steps.
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20397 entries, 13 to 37547
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Primary Key       20397 non-null  int64  
 1   Created           20397 non-null  object 
 2   Mileage In        20397 non-null  int64  
 3   Work Order        20397 non-null  int64  
 4   Model             20397 non-null  object 
 5   Line Type         20397 non-null  object 
 6   Service Category  20397 non-null  object 
 7   Service Package   20375 non-null  object 
 8   Description       20397 non-null  object 
 9   Definition        7957 non-null   object 
 10  Price             20397 non-null  float64
 11  Quantity          20397 non-null  float64
 12  Total             20397 non-null  float64
 13  Discount          20397 non-null  float64
 14  Net Total         20397 non-null  float64
 15  ContactID         20397 non-null  object 
 16  ServiceItemID     20397 non-null  objec

In [15]:
# Feature matrix is built from four columns; the target is the service label.
feature_columns = ['Year', 'Make', 'Model', 'Mileage In']
X = master_df[feature_columns]
Y = master_df['labels']

In [16]:
# Category encoding (needs the category_encoders package):
# !pip install category_encoders

# Binary-encode the three listed columns and return a DataFrame;
# 'Mileage In' is not listed in cols.
categorical_columns = ['Year', 'Make', 'Model']
encoder = ce.BinaryEncoder(cols=categorical_columns, return_df=True)

In [17]:
# Fit the binary encoder on the full feature frame.
# NOTE(review): this runs before the train/test split below, so the encoder
# also sees the category values of rows that later land in the test set —
# confirm this is intended.
encoder.fit(X)

BinaryEncoder(cols=['Year', 'Make', 'Model'])

# Splitting of Dataset

In [18]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=4244,
)

In [19]:
# Apply the already-fitted encoder to the training split, then the test split.
X_train_encoded, X_test_encoded = (
    encoder.transform(split) for split in (x_train, x_test)
)

In [20]:
# Sanity check: both encoded frames should have the same number of columns.
train_shape, test_shape = X_train_encoded.shape, X_test_encoded.shape
print(train_shape, test_shape)

(16317, 22) (4080, 22)


In [21]:
# Using gridsearch for RandomForestClassifier

In [22]:
# model_ = RandomForestClassifier(random_state = 4244, n_jobs = 4)

In [23]:
# Hyperparameter grid for RandomForestClassifier, intended for GridSearchCV.
#
# 'max_features' lists only 'sqrt'. For RandomForestClassifier 'auto' was an
# alias of 'sqrt', so listing both made the search fit every combination
# twice for identical models; 'auto' was also deprecated in scikit-learn 1.1
# and is rejected by later releases.
param1 = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20, 25],
    'max_features': ['sqrt'],
    'bootstrap': [True, False],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Using GridSearchCV to get the Optimum setting for the Hyperparameter

In [24]:
# grid_model = GridSearchCV(model_, param1, cv = 5)

In [25]:
# grid_model.fit(X_train_encoded, y_train)

In [26]:
# print(grid_model.best_params_)

In [27]:
# y_pred = grid_model.predict(X_test_encoded)

In [28]:
# print("Precision score: {:.3f}".format(precision_score(y_test,y_pred , average='macro')))
# print("Accuracy Score:{:.3f}".format(accuracy_score(y_test, y_pred)))
# print("Classification Report: \n", classification_report(y_test, y_pred))

In [29]:
# rf_model = RandomForestClassifier(n_estimators = 50, bootstrap = False, criterion = 'entropy', max_depth = 25, max_features = 'auto', min_samples_leaf = 1, min_samples_split = 2, random_state = 4244)

In [30]:
# rf_model.fit(X_train_encoded, y_train)

In [31]:
# y_pred = rf_model.predict(X_test_encoded)

In [32]:
# print("Precision score: {:.3f}".format(precision_score(y_test,y_pred , average='macro')))
# print("Accuracy Score:{:.3f}".format(accuracy_score(y_test, y_pred)))
# print("Classification Report: \n", classification_report(y_test, y_pred))

# AdaBoostClassifier with optimized RandomForestClassifier

In [33]:
# AdaBoostClassifier with its own settings left at their defaults, boosting a
# random forest configured with the tuned hyperparameters.

# max_features='sqrt' is what 'auto' meant for RandomForestClassifier; 'auto'
# was deprecated in scikit-learn 1.1 and is rejected by later releases.
random_forest_model = RandomForestClassifier(
    n_estimators=50,
    bootstrap=False,
    criterion='entropy',
    max_depth=25,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=4244,
)

# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, but it is the first
# positional parameter under either name, so this works on both.
abc_model = AdaBoostClassifier(random_forest_model, random_state=4244)

In [None]:
# Train the boosted ensemble on the encoded training split.
abc_model.fit(X_train_encoded, y_train)

In [None]:
# Predict labels for the encoded hold-out split.
y_pred = abc_model.predict(X_test_encoded)

In [None]:
# Report hold-out metrics: macro-averaged precision, overall accuracy, and the
# per-class classification report.
macro_precision = precision_score(y_test, y_pred, average='macro')
print("Precision score: {:.3f}".format(macro_precision))

overall_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:{:.3f}".format(overall_accuracy))

report_text = classification_report(y_test, y_pred)
print("Classification Report: \n", report_text)