In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import CategoricalDtype
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import time

In [3]:
data = pd.read_csv('car_kick.csv')
# Deleting Unwanted Columns of Data
data = data.drop(['PurchDate','Color', 'VNZIP1'], axis=1)

# Deleting Missing Values
data = data[data['MMRAcquisitionAuctionAveragePrice']!=0]
data = data[data['MMRAcquisitionAuctionCleanPrice']!=0]
data = data[data['MMRAcquisitionRetailAveragePrice']!=0]
data = data[data['MMRAcquisitonRetailCleanPrice']!=0]

X = data.iloc[:, :data.shape[1]-1]
Y = data.iloc[:, data.shape[1]-1]

In [4]:
# Name of Data Attributes
N = np.array(X.columns)
# print(N)

encoder = LabelEncoder()

for i in range(len(N)):
    if type(X.loc[0, N[i]]) == str :
        X[N[i]] = encoder.fit_transform(X[N[i]])


# Normailising the Data
for i in range(len(N)):
    X[N[i]] = ( ( X[N[i]] - X[N[i]].min() ) /  (X[N[i]].max() - X[N[i]].min()))

In [5]:
# Classifying Attributes as Ordinal and Nominal
X['Auction'] = X['Auction'].astype(CategoricalDtype(ordered=False))
X['Make'] = X['Make'].astype(CategoricalDtype(ordered=False))
X['Model'] = X['Model'].astype(CategoricalDtype(ordered=False))
X['Trim'] = X['Trim'].astype(CategoricalDtype(ordered=False))
X['SubModel'] = X['SubModel'].astype(CategoricalDtype(ordered=False))
X['Transmission'] = X['Transmission'].astype(CategoricalDtype(ordered=False))
X['WheelTypeID'] = X['WheelTypeID'].astype(CategoricalDtype(ordered=False))
X['WheelType'] = X['WheelType'].astype(CategoricalDtype(ordered=False))
X['Nationality'] = X['Nationality'].astype(CategoricalDtype(ordered=False))
X['TopThreeAmericanName'] = X['TopThreeAmericanName'].astype(CategoricalDtype(ordered=False))
X['BYRNO'] = X['BYRNO'].astype(CategoricalDtype(ordered=False))
X['VNST'] = X['VNST'].astype(CategoricalDtype(ordered=False))

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 40, test_size = 0.14)
clf = DecisionTreeClassifier()
# clf.fit(X_train, Y_train)

In [7]:
x_time = time.time()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [ 40, 50, None],
    'min_samples_split': [20, 50, 70, None],
    'max_features': ['sqrt', 'log2', None]
}

gs = GridSearchCV( clf, param_grid, cv=5, n_jobs=-1 )
gs.fit(X_train, Y_train)

y_time = time.time()
print("Time for Grid Parameters ",  y_time-x_time)

print(gs.best_params_)
print(gs.best_score_)

90 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 889, in fit
    super().fit(
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 177, in fit
    self._validate_params()
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\b

Time for Grid Parameters  43.40403866767883
{'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_split': 70}
0.8946916493457472


In [8]:
clf_best = DecisionTreeClassifier(**gs.best_params_)
clf_best.fit(X_train, Y_train)

In [9]:
Y_pred = clf_best.predict( X_test )
print('Accuracy Score on train data: ', accuracy_score(y_true=Y_train, y_pred=clf_best.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=Y_test, y_pred=Y_pred))

Accuracy Score on train data:  0.9132786110576334
Accuracy Score on test data:  0.8874435605246184


In [10]:
print("For the Test Cases : ")
f1score = f1_score(Y_test, Y_pred, average='macro')
print("F1 Score: ", f1score)
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(cm)

For the Test Cases : 
F1 Score:  0.496328348691677
Confusion Matrix:
[[8226  147]
 [ 900   29]]
