In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import CategoricalDtype
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import time

In [2]:
data = pd.read_csv('car_kick.csv')
# Deleting Unwanted Columns of Data
data = data.drop(['PurchDate','Color', 'VNZIP1'], axis=1)

# Deleting Missing Values
data = data[data['MMRAcquisitionAuctionAveragePrice']!=0]
data = data[data['MMRAcquisitionAuctionCleanPrice']!=0]
data = data[data['MMRAcquisitionRetailAveragePrice']!=0]
data = data[data['MMRAcquisitonRetailCleanPrice']!=0]

X = data.iloc[:, :data.shape[1]-1]
Y = data.iloc[:, data.shape[1]-1]

In [3]:
# Name of Data Attributes
N = np.array(X.columns)
# print(N)

encoder = LabelEncoder()

for i in range(len(N)):
    if type(X.loc[0, N[i]]) == str :
        X[N[i]] = encoder.fit_transform(X[N[i]])

# Normailising the Data
for i in range(len(N)):
    X[N[i]] = ( ( X[N[i]] - X[N[i]].min() ) /  (X[N[i]].max() - X[N[i]].min()))

In [4]:
# Classifying Attributes as Ordinal and Nominal
X['Auction'] = X['Auction'].astype(CategoricalDtype(ordered=False))
X['Make'] = X['Make'].astype(CategoricalDtype(ordered=False))
X['Model'] = X['Model'].astype(CategoricalDtype(ordered=False))
X['Trim'] = X['Trim'].astype(CategoricalDtype(ordered=False))
X['SubModel'] = X['SubModel'].astype(CategoricalDtype(ordered=False))
X['Transmission'] = X['Transmission'].astype(CategoricalDtype(ordered=False))
X['WheelTypeID'] = X['WheelTypeID'].astype(CategoricalDtype(ordered=False))
X['WheelType'] = X['WheelType'].astype(CategoricalDtype(ordered=False))
X['Nationality'] = X['Nationality'].astype(CategoricalDtype(ordered=False))
X['TopThreeAmericanName'] = X['TopThreeAmericanName'].astype(CategoricalDtype(ordered=False))
X['BYRNO'] = X['BYRNO'].astype(CategoricalDtype(ordered=False))
X['VNST'] = X['VNST'].astype(CategoricalDtype(ordered=False))

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 40, test_size = 0.14)
clf = GaussianNB()
# clf.fit(X_train, Y_train)

In [7]:
x_time = time.time()
param_grid = {
    'priors' : [ [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],  None],
    'var_smoothing' : np.arange( 0.1, 1, 0.1)
}

random_search = RandomizedSearchCV(clf, param_distributions=param_grid, n_iter=20, cv=5, random_state=42, n_jobs=1 )
random_search.fit(X_train, Y_train)

y_time = time.time()
print("Time required(in sec)", y_time-x_time)

print('Best hyperparameters:', random_search.best_params_)
print('Best score:', random_search.best_score_)



Time required(in sec) 2.6027917861938477
Best hyperparameters: {'var_smoothing': 0.5, 'priors': None}
Best score: 0.905455310833276


45 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\naive_bayes.py", line 267, in fit
    return self._partial_fit(
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\harsh\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\naive_bayes.py", line 454, in _partial_fit
    raise ValueError("Number of priors must match number of classes.")


In [8]:
clf_best = GaussianNB(**random_search.best_params_)
clf_best.fit(X_train, Y_train)

In [9]:
Y_pred = clf_best.predict( X_test )
print('Accuracy Score on train data: ', accuracy_score(y_true=Y_train, y_pred=clf_best.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=Y_test, y_pred=Y_pred))

Accuracy Score on train data:  0.9054553091691898
Accuracy Score on test data:  0.900129004515158


In [10]:
print("For the Test Cases : ")
f1score = f1_score(Y_test, Y_pred, average='macro')
print("F1 Score: ", f1score)
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(cm)

For the Test Cases : 
F1 Score:  0.4737199434229137
Confusion Matrix:
[[8373    0]
 [ 929    0]]
