In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix, precision_score, roc_auc_score
from sklearn.pipeline import Pipeline

import category_encoders as ce

In [2]:
master_df = pd.read_csv('FINAL3.csv')

In [3]:
master_df['Work Order'].nunique()

4561

In [4]:
master_df.isna().sum()

Primary Key             0
Created                 0
Mileage In              0
Work Order              0
Model                   0
Line Type               0
Service Category        0
Service Package        23
Description             0
Definition          21966
Price                   0
Quantity                0
Total                   0
Discount                0
Net Total               0
ContactID               0
ServiceItemID           0
InvoiceID               0
labels                  0
Make                    0
Year                    0
dtype: int64

In [5]:
master_df['Service Category'].value_counts()

Oil, Lube & Preventive Maintenance Service                            23617
Steering & Suspension                                                  3214
Brakes                                                                 1820
HVAC (Heating, Ventilation & Air Conditioning)                         1500
Engine                                                                 1473
Transmission & Clutch                                                  1416
Cooling System                                                          853
Tires & Wheels                                                          851
Driveability (Fuel, Ignition & Emission Systems)                        811
Electrical (Belts, Lighting, Starting & Charging Systems)               796
Accessories (Gauges, Power Accessories, Vision & Air Bag)               445
Drive Train (Final Drive, Transaxle, Differential & Transfer Case)      223
Batteries                                                               199
Car Detailin

In [6]:
master_df = master_df[master_df['Service Category'] != 'Accessories (Gauges, Power Accessories, Vision & Air Bag)']
master_df = master_df[master_df['labels'] != 'Engine Detailing']

In [7]:
car_list = master_df['Make'].value_counts().to_frame()
car_list = car_list[car_list['Make'] <= 100]
car_list = list(car_list.index)

master_df = master_df[~master_df['Make'].isin(car_list)]

In [8]:
label_list = master_df['labels'].value_counts().to_frame()
label_list = label_list[label_list['labels'] <= 100]
label_list = list(label_list.index)

master_df = master_df[~master_df['labels'].isin(label_list)]

In [9]:
master_df['labels'].value_counts()

preventive maintenance                                       15758
change oil, lube and flushing                                 4924
complete vehicle inspection                                   2766
Brakes                                                        1575
suspension                                                    1238
wheel alignment                                                968
steering                                                       962
Engine                                                         918
Air Conditioner General Cleaning Service                       808
Mount, Install, & Balance                                      727
Clutch General Service (Replacement of Clutch Components)      646
Valve Cover Gasket                                             539
Vacuum/Charging Service                                        362
Clutch Repair                                                  294
Drive Belt / Tensioner                                        

In [10]:
# taking only the unique values

master_df = master_df[['Work Order','Make','Model','Year','Mileage In','labels']]

In [11]:
# to get only the unique values when sampling

master_df = master_df.drop_duplicates()

In [12]:
master_df['labels'] = master_df['labels'].apply(lambda x: x.lower())

In [13]:
master_df = master_df.reset_index().drop('index', axis = 'columns')

In [14]:
master_df

Unnamed: 0,Work Order,Make,Model,Year,Mileage In,labels
0,49,honda,civic,2012,33320,preventive maintenance
1,63,honda,city,2012,68598,preventive maintenance
2,33,hyundai,getz,2010,68624,engine
3,33,hyundai,getz,2010,68624,"change oil, lube and flushing"
4,36,hyundai,getz,2010,61548,engine
...,...,...,...,...,...,...
7923,2041,isuzu,sportivo,2013,71511,wheel alignment
7924,2292,mitsubishi,strada,2015,34068,egr valve motor cleaning
7925,2306,toyota,wigo,2019,24616,"change oil, lube and flushing"
7926,2308,mitsubishi,montero,2015,104214,brakes


In [15]:
master_df['labels'].value_counts()

preventive maintenance                                       1995
complete vehicle inspection                                  1301
change oil, lube and flushing                                 902
wheel alignment                                               693
brakes                                                        482
suspension                                                    323
steering                                                      263
mount, install, & balance                                     229
engine                                                        190
valve cover gasket                                            148
scan computerized engine controls                             118
drive belt / tensioner                                        118
battery replacement                                           115
clutch general service (replacement of clutch components)     109
vacuum/charging service                                       104
air condit

In [16]:
# rebalancing all the labels

# preventive maintenance
preventive_maintenance = master_df[master_df['labels'] == 'preventive maintenance'].sample(n = 2000, replace = True, random_state = 4244)
cvi = master_df[master_df['labels'] == 'complete vehicle inspection'].sample(n = 2000, replace = True, random_state = 4244)
change_oil = master_df[master_df['labels'] == 'change oil, lube and flushing'].sample(n = 2000, replace = True, random_state = 4244)
alignment = master_df[master_df['labels'] == 'wheel alignment'].sample(n=2000, replace = True, random_state = 4244)
brakes = master_df[master_df['labels'] == 'brakes'].sample(n=2000, replace=True, random_state = 4244)
suspension = master_df[master_df['labels'] == 'suspension'].sample(n=2000, replace = True, random_state = 4244)
steering = master_df[master_df['labels'] == 'steering'].sample(n=2000, replace = True, random_state = 4244)
mib = master_df[master_df['labels'] == 'mount, install, & balance'].sample(n=2000, replace = True, random_state=4244)
engine = master_df[master_df['labels'] == 'engine'].sample(n=2000, replace = True, random_state=42444)
valve_cover = master_df[master_df['labels'] == 'valve cover gasket'].sample(n=2000, replace = True, random_state=4244)
scec = master_df[master_df['labels'] == 'scan computerized engine controls'].sample(n=2000, replace = True, random_state=4244)
drive_belt = master_df[master_df['labels'] == 'drive belt / tensioner'].sample(n=2000, replace=True, random_state=4244)
battery_replacement = master_df[master_df['labels'] == 'battery replacement'].sample(n=2000, replace=True, random_state=4244)
clutch = master_df[master_df['labels'] == 'clutch general service (replacement of clutch components)'].sample(n=2000, replace = True, random_state = 4244)
vacuum = master_df[master_df['labels'] == 'vacuum/charging service'].sample(n=2000, replace = True, random_state=4244)
aircon = master_df[master_df['labels'] == 'air conditioner general cleaning service'].sample(n=2000, replace = True, random_state=4244)
filter_fuel = master_df[master_df['labels'] == 'fuel filter / air filter'].sample(n=2000, replace=True, random_state=4244)
lighting = master_df[master_df['labels'] == 'lighting service / light bulb replacement'].sample(n=2000, replace=True, random_state=4244)
ignition = master_df[master_df['labels'] == 'ignition coil assembly / spark plugs'].sample(n=2000, replace=True, random_state=4244)
transmission = master_df[master_df['labels'] == 'transmission fluid replacement & filter service'].sample(n=2000, replace=True, random_state=4244)
aux_motor = master_df[master_df['labels'] == 'replacement of aux fan motor'].sample(n=2000, replace=True, random_state=4244)
engine_coolant = master_df[master_df['labels'] == 'engine, radiator coolant / hose'].sample(n=2000, replace=True, random_state=4244)
bearing = master_df[master_df['labels'] == 'wheel bearing'].sample(n=2000, replace=True, random_state=4244)
alternator = master_df[master_df['labels'] == 'alternator diagnosis / component repair'].sample(n=2000, replace=True, random_state=4244)
clutch_repair = master_df[master_df['labels'] == 'clutch repair'].sample(n=2000, replace=True, random_state=4244)
egr = master_df[master_df['labels'] == 'egr valve motor cleaning'].sample(n=2000, replace=True, random_state=4244)
drive_shaft = master_df[master_df['labels'] == 'servicing of drive shaft'].sample(n=2000, replace = True, random_state = 4244)
rad_assembly = master_df[master_df['labels'] == 'radiator assembly'].sample(n=2000, replace=True, random_state=4244)

In [17]:
new_df = pd.concat([preventive_maintenance, cvi, change_oil, alignment, brakes, suspension, steering, mib, engine, valve_cover, scec, drive_belt, battery_replacement, clutch, vacuum, aircon, filter_fuel, lighting, ignition, transmission, aux_motor, engine_coolant, bearing, alternator, clutch_repair, egr, drive_shaft, rad_assembly], axis = 0)

In [18]:
new_df = new_df[['Year','Make','Model','Mileage In','labels']]
new_df['labels'].value_counts()

vacuum/charging service                                      2000
transmission fluid replacement & filter service              2000
engine                                                       2000
drive belt / tensioner                                       2000
lighting service / light bulb replacement                    2000
battery replacement                                          2000
engine, radiator coolant / hose                              2000
fuel filter / air filter                                     2000
ignition coil assembly / spark plugs                         2000
egr valve motor cleaning                                     2000
servicing of drive shaft                                     2000
wheel alignment                                              2000
steering                                                     2000
radiator assembly                                            2000
clutch repair                                                2000
suspension

In [19]:
# encoding 

X = new_df[['Year', 'Make', 'Model','Mileage In']]
Y = new_df['labels']

In [20]:
encoder = ce.BinaryEncoder(cols=['Year', 'Make', 'Model'], return_df = True)

In [21]:
encoder.fit(X)

BinaryEncoder(cols=['Year', 'Make', 'Model'])

In [22]:
# train test splitting of the dataset

x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 4244, test_size = 0.20, stratify = Y)

In [23]:
X_train_encoded = encoder.transform(x_train)
X_test_encoded = encoder.transform(x_test)

In [24]:
# using RandomForestClassifier
model_ = RandomForestClassifier(n_jobs = 4)

In [25]:
model_.fit(X_train_encoded, y_train)

RandomForestClassifier(n_jobs=4)

In [26]:
y_pred = model_.predict(X_test_encoded)

In [27]:
print("Precision score: {:.3f}".format(precision_score(y_test,y_pred , average='macro')))
print("Accuracy Score:{:.3f}".format(accuracy_score(y_test, y_pred)))
print("Classification Report: \n", classification_report(y_test, y_pred))

Precision score: 0.765
Accuracy Score:0.763
Classification Report: 
                                                            precision    recall  f1-score   support

                 air conditioner general cleaning service       0.73      0.85      0.79       400
                  alternator diagnosis / component repair       0.85      0.94      0.89       400
                                      battery replacement       0.88      0.94      0.91       400
                                                   brakes       0.75      0.57      0.64       400
                            change oil, lube and flushing       0.82      0.72      0.77       400
clutch general service (replacement of clutch components)       0.79      0.78      0.79       400
                                            clutch repair       0.82      1.00      0.90       400
                              complete vehicle inspection       0.80      0.49      0.61       400
                                   driv

In [28]:
# getting the optimum hyperparameter

param1 = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini','entropy'],
    'max_depth': [5, 10, 15, 20, 25],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4]    
}

In [29]:
grid_model = GridSearchCV(model_, param1, cv = 5)

In [30]:
grid_model.fit(X_train_encoded, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=4),
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20, 25],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 150, 200]})

In [31]:
print(grid_model.best_params_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 25, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


In [32]:
y_pred = grid_model.predict(X_test_encoded)

In [33]:
print("Precision score: {:.3f}".format(precision_score(y_test,y_pred , average='macro')))
print("Accuracy Score:{:.3f}".format(accuracy_score(y_test, y_pred)))
print("Classification Report: \n", classification_report(y_test, y_pred))

Precision score: 0.767
Accuracy Score:0.765
Classification Report: 
                                                            precision    recall  f1-score   support

                 air conditioner general cleaning service       0.73      0.88      0.80       400
                  alternator diagnosis / component repair       0.85      0.94      0.89       400
                                      battery replacement       0.88      0.94      0.91       400
                                                   brakes       0.74      0.59      0.65       400
                            change oil, lube and flushing       0.80      0.74      0.77       400
clutch general service (replacement of clutch components)       0.79      0.78      0.79       400
                                            clutch repair       0.82      1.00      0.90       400
                              complete vehicle inspection       0.80      0.49      0.61       400
                                   driv