In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import CategoricalDtype
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import time

In [2]:
data = pd.read_csv('car_kick.csv')
# Deleting Unwanted Columns of Data
data = data.drop(['PurchDate','Color', 'VNZIP1'], axis=1)

# Deleting Missing Values
data = data[data['MMRAcquisitionAuctionAveragePrice']!=0]
data = data[data['MMRAcquisitionAuctionCleanPrice']!=0]
data = data[data['MMRAcquisitionRetailAveragePrice']!=0]
data = data[data['MMRAcquisitonRetailCleanPrice']!=0]

X = data.iloc[:, :data.shape[1]-1]
Y = data.iloc[:, data.shape[1]-1]

In [3]:
# Name of Data Attributes
N = np.array(X.columns)
# print(N)

encoder = LabelEncoder()

for i in range(len(N)):
    if type(X.loc[0, N[i]]) == str :
        X[N[i]] = encoder.fit_transform(X[N[i]])

# Normailising the Data
for i in range(len(N)):
    X[N[i]] = ( ( X[N[i]] - X[N[i]].min() ) /  (X[N[i]].max() - X[N[i]].min()))

In [4]:
# Classifying Attributes as Ordinal and Nominal
X['Auction'] = X['Auction'].astype(CategoricalDtype(ordered=False))
X['Make'] = X['Make'].astype(CategoricalDtype(ordered=False))
X['Model'] = X['Model'].astype(CategoricalDtype(ordered=False))
X['Trim'] = X['Trim'].astype(CategoricalDtype(ordered=False))
X['SubModel'] = X['SubModel'].astype(CategoricalDtype(ordered=False))
X['Transmission'] = X['Transmission'].astype(CategoricalDtype(ordered=False))
X['WheelTypeID'] = X['WheelTypeID'].astype(CategoricalDtype(ordered=False))
X['WheelType'] = X['WheelType'].astype(CategoricalDtype(ordered=False))
X['Nationality'] = X['Nationality'].astype(CategoricalDtype(ordered=False))
X['TopThreeAmericanName'] = X['TopThreeAmericanName'].astype(CategoricalDtype(ordered=False))
X['BYRNO'] = X['BYRNO'].astype(CategoricalDtype(ordered=False))
X['VNST'] = X['VNST'].astype(CategoricalDtype(ordered=False))

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 40, test_size = 0.14)
clf = RandomForestClassifier()

In [7]:
x_time = time.time()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [ 35, 45],
    'n_estimators': [ 100, 150, 200]
}

gs = GridSearchCV( clf, param_grid, cv=5, n_jobs=2 )
gs.fit(X_train, Y_train)

y_time = time.time()
print("Time required(in min)", (y_time-x_time)/60 )

print(gs.best_params_)
print(gs.best_score_)

Time required(in min) 34.5823571006457
{'criterion': 'entropy', 'max_depth': 35, 'n_estimators': 100}
0.9052803020828384


In [8]:
clf_best = RandomForestClassifier(**gs.best_params_)
clf_best.fit(X_train, Y_train)

In [9]:
Y_pred = clf_best.predict( X_test )
print('Accuracy Score on train data: ', accuracy_score(y_true=Y_train, y_pred=clf_best.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=Y_test, y_pred=Y_pred))

Accuracy Score on train data:  0.9999474946181983
Accuracy Score on test data:  0.8994839819393678


In [10]:
print("For the Test Cases : ")
f1score = f1_score(Y_test, Y_pred, average='macro')
print("F1 Score: ", f1score)
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(cm)

For the Test Cases : 
F1 Score:  0.4819294468794174
Confusion Matrix:
[[8359   14]
 [ 921    8]]
