# AdaBoostClassifier with RandomForest Hyper Parameter Tuning using GridSearchCV.
## Note: Converting 'Mileage In' as categorical feature

In [116]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier 
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix, precision_score, roc_auc_score
from sklearn.pipeline import Pipeline

import category_encoders as ce

In [117]:
master_df = pd.read_csv('SAMPLE1.csv')

In [118]:
master_df.head()

Unnamed: 0,Primary Key,Created,Mileage In,Work Order,Model,Line Type,Service Category,Service Package,Description,Definition,...,Quantity,Total,Discount,Net Total,ContactID,ServiceItemID,InvoiceID,labels,Make,Year
0,1,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,OIL FILTER HONDA,Engine Oil Filter,...,1.0,696.43,0.0,696.43,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
1,2,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,ENGINE FLUSHING,,...,1.0,142.86,0.0,142.86,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
2,3,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,BRAKE CLEANER 14oz,,...,1.0,227.68,0.0,227.68,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
3,4,12/28/2018 10:18,33320,49,civic,Labor,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,Labor,,...,3.0,1339.29,0.0,1339.29,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012
4,5,12/28/2018 10:18,33320,49,civic,Material,"Oil, Lube & Preventive Maintenance Service",MINOR Preventive Maintenance Service (MEDIUM S...,REPSOL ELITE INJECTION 10W40 1L - GAS/DIESEL,Engine Oil,...,4.0,1178.56,0.0,1178.56,a271834a-1afc-4e62-8c71-638025174c1f,ff01e2a4-d2d5-48ff-8ccc-d5605a653df4,99833bf9-865b-4e2c-9bd1-a0ed0cebea8e,preventive maintenance,honda,2012


In [119]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37548 entries, 0 to 37547
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Primary Key       37548 non-null  int64  
 1   Created           37548 non-null  object 
 2   Mileage In        37548 non-null  int64  
 3   Work Order        37548 non-null  int64  
 4   Model             37548 non-null  object 
 5   Line Type         37548 non-null  object 
 6   Service Category  37548 non-null  object 
 7   Service Package   37525 non-null  object 
 8   Description       37548 non-null  object 
 9   Definition        15582 non-null  object 
 10  Price             37548 non-null  float64
 11  Quantity          37548 non-null  float64
 12  Total             37548 non-null  float64
 13  Discount          37548 non-null  float64
 14  Net Total         37548 non-null  float64
 15  ContactID         37548 non-null  object 
 16  ServiceItemID     37548 non-null  object

In [120]:
master_df['labels'].value_counts()

preventive maintenance                          15899
change oil, lube and flushing                    4957
complete vehicle inspection                      2799
steering and wheel alignment                     1962
Brakes                                           1590
suspension                                       1252
Air Conditioner General Cleaning Service         1131
clutch general services                           978
Engine                                            928
tire general services and repair                  851
Ignition and fuel general services                808
electrical general services and replacement       625
Valve Cover Gasket                                545
Cooling Repair, charging and replacement          515
Transmission general services                     430
Vacuum/Charging Service                           369
Radiator general services                         349
Wheel Bearing                                     230
Drive train general services

In [121]:
# car labels; getting only the entries with greater than 250
label_list = master_df['labels'].value_counts().to_frame()
label_list = label_list[label_list['labels'] <= 100]
label_list = list(label_list.index)
label_list

['Door Servicing',
 'Counter / Cash Sale',
 'Body Repair and Paint Works',
 'Accessories General Service',
 'Car Horn Repair & Services',
 'Other Sublet Services (Machine Shop)',
 'Car Hood Services',
 'Towing',
 'Exhaust General Service',
 'electrical general services',
 'Car Seat Services']

In [122]:
master_df = master_df[~master_df['labels'].isin(label_list)]

In [123]:
# car make; getting only the entries with greater than 100 
car_list = master_df['Make'].value_counts().to_frame()
car_list = car_list[car_list['Make'] <= 100]
car_list = list(car_list.index)
car_list

['audi',
 'mercedes benz',
 'lexus',
 'volvo',
 'volkswagen',
 'dodge',
 'lincoln',
 'foton',
 'chrysler',
 'peugeot',
 'jeep',
 'mg']

In [124]:
master_df = master_df[~master_df['Make'].isin(car_list)]

In [125]:
master_df = master_df[master_df['Service Category'] != 'Accessories (Gauges, Power Accessories, Vision & Air Bag)']

In [126]:
master_df = master_df[master_df['labels'] != 'preventive maintenance']
master_df = master_df[master_df['labels'] != 'Engine Detailing'] # engine detailing -- in depth cleaning

# 
master_df = master_df[master_df['labels'] != 'Drive train general services and replacement']
master_df = master_df[master_df['labels'] != 'filter and sensor (air and fuel)']

In [127]:
labl = master_df['labels'].value_counts().to_frame()
labl

Unnamed: 0,labels
"change oil, lube and flushing",4924
complete vehicle inspection,2766
steering and wheel alignment,1930
Brakes,1575
suspension,1238
Air Conditioner General Cleaning Service,1103
clutch general services,978
Engine,918
tire general services and repair,840
Ignition and fuel general services,762


In [128]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20218 entries, 13 to 37547
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Primary Key       20218 non-null  int64  
 1   Created           20218 non-null  object 
 2   Mileage In        20218 non-null  int64  
 3   Work Order        20218 non-null  int64  
 4   Model             20218 non-null  object 
 5   Line Type         20218 non-null  object 
 6   Service Category  20218 non-null  object 
 7   Service Package   20196 non-null  object 
 8   Description       20218 non-null  object 
 9   Definition        7843 non-null   object 
 10  Price             20218 non-null  float64
 11  Quantity          20218 non-null  float64
 12  Total             20218 non-null  float64
 13  Discount          20218 non-null  float64
 14  Net Total         20218 non-null  float64
 15  ContactID         20218 non-null  object 
 16  ServiceItemID     20218 non-null  objec

In [129]:
# Converting the 'Mileage In' to Categorical Values

In [130]:
def mileage_func(x):
    if x <= 10000:
        return '10k'
    if 10001 <= x <= 20000:
        return '20k'
    if 20001 <= x <= 30000:
        return '30k'
    if 30001 <= x <= 40000:
        return '40k'
    if 40001 <= x <= 50000:
        return '50k'
    if 50001 <= x <= 60000:
        return '60k'
    if 60001 <= x <= 70000:
        return '70k'
    if 70001 <= x <= 80000:
        return '80k'
    if 80001 <= x <= 90000:
        return '90k'
    if 90001 <= x <= 100000:
        return '100k'
    if 100001 <= x <= 150000:
        return '150k'
    if 150001 <= x <= 200000:
        return '200k'
    else:
        return 'above 200k'

In [131]:
master_df['Mileage In_'] = master_df['Mileage In'].apply(mileage_func)
master_df['Mileage In_'].value_counts()

150k          4009
50k           1981
60k           1773
70k           1725
200k          1623
80k           1618
40k           1568
100k          1298
90k           1297
30k           1018
above 200k     905
10k            709
20k            694
Name: Mileage In_, dtype: int64

In [132]:
X = master_df[['Year', 'Make', 'Model','Mileage In_']]
Y = master_df['labels']

In [133]:
# encoding all the categorical values

encoder = ce.BinaryEncoder(cols=['Year', 'Make', 'Model', 'Mileage In_'], return_df = True)

In [134]:
encoder.fit(X)

BinaryEncoder(cols=['Year', 'Make', 'Model', 'Mileage In_'])

In [135]:
# Splitting of the dataset to train and test set

In [136]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 4244, test_size = 0.20, stratify = Y)

In [137]:
X_train_encoded = encoder.transform(x_train)
X_test_encoded = encoder.transform(x_test)

In [138]:
print(X_train_encoded.shape, X_test_encoded.shape)

(16174, 26) (4044, 26)


In [143]:
# defifning the entry parameters that we will be using

param1 = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini','entropy'],
    'max_depth': [5, 10, 15, 20, 25],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4]    
}

In [145]:
# model_ = GridSearchCV(estimator = RandomForestClassifier(random_state = 4244, n_jobs = 4), param_grid = param1, cv = 5)

In [146]:
# model_.fit(X_train_encoded, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_jobs=4, random_state=4244),
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20, 25],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 150, 200]})

In [150]:
# print(model_.best_params_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


In [147]:
# y_pred = model_.predict(X_test_encoded)

In [148]:
# print("Precision score: {:.3f}".format(precision_score(y_test,y_pred , average='macro')))
# print("Accuracy Score:{:.3f}".format(accuracy_score(y_test, y_pred)))
# print("Classification Report: \n", classification_report(y_test, y_pred))

Precision score: 0.437
Accuracy Score:0.509
Classification Report: 
                                              precision    recall  f1-score   support

   Air Conditioner General Cleaning Service       0.44      0.65      0.52       221
    Alternator Diagnosis / Component Repair       0.47      0.28      0.35        29
                                     Brakes       0.49      0.42      0.45       315
   Cooling Repair, charging and replacement       0.26      0.17      0.21       103
                                     Engine       0.44      0.52      0.48       184
         Ignition and fuel general services       0.46      0.18      0.26       152
                  Radiator general services       0.44      0.31      0.37        70
              Transmission general services       0.36      0.25      0.29        85
                    Vacuum/Charging Service       0.38      0.38      0.38        72
                         Valve Cover Gasket       0.33      0.25      0.29      

In [153]:
random_forest = RandomForestClassifier(bootstrap = False, criterion = 'entropy', max_depth = 15, max_features = 'auto', min_samples_leaf = 1, min_samples_split = 2, n_estimators = 150)

In [154]:
random_forest.fit(X_train_encoded, y_train)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=15,
                       n_estimators=150)

In [156]:
y_pred = random_forest.predict(X_test_encoded)

In [157]:
print("Precision score: {:.3f}".format(precision_score(y_test,y_pred , average='macro')))
print("Accuracy Score:{:.3f}".format(accuracy_score(y_test, y_pred)))
print("Classification Report: \n", classification_report(y_test, y_pred))

Precision score: 0.433
Accuracy Score:0.505
Classification Report: 
                                              precision    recall  f1-score   support

   Air Conditioner General Cleaning Service       0.44      0.65      0.53       221
    Alternator Diagnosis / Component Repair       0.47      0.28      0.35        29
                                     Brakes       0.48      0.42      0.45       315
   Cooling Repair, charging and replacement       0.26      0.17      0.21       103
                                     Engine       0.43      0.51      0.46       184
         Ignition and fuel general services       0.48      0.18      0.26       152
                  Radiator general services       0.42      0.31      0.36        70
              Transmission general services       0.36      0.25      0.29        85
                    Vacuum/Charging Service       0.36      0.33      0.35        72
                         Valve Cover Gasket       0.33      0.25      0.28      