# Kimiya Ghanai Machine Learning

## Comparing ML Classification Algorithms

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import BMW sales dataset

In [2]:
# Read the BMW sales CSV (path is relative to the notebook's working directory)
# into a DataFrame and preview its first rows.
csv_path = 'BMW sales data (2010-2024).csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,Low


In [3]:
# Class balance of the target: how many rows carry each Sales_Classification label.
target_labels = df['Sales_Classification']
target_labels.value_counts()

Sales_Classification
Low     34754
High    15246
Name: count, dtype: int64

## Encoding data (label-encoding features and target)

In [6]:
# Separate the predictors (x) from the target label (y).
# 'Unnamed: 0' appears to be the row index that was saved into the CSV (it
# mirrors the DataFrame index in the preview above); it says nothing about the
# car, so it is dropped rather than handed to the models as a feature.
# errors='ignore' keeps the cell working if the CSV is re-exported without it.
# NOTE(review): Sales_Classification may be a thresholded copy of Sales_Volume.
# If so, keeping Sales_Volume as a feature leaks the label and would explain
# near-perfect scores -- confirm, and consider dropping it as well.
target_column = 'Sales_Classification'
x = df.drop(columns=[target_column, 'Unnamed: 0'], errors='ignore')
y = df[target_column]

In [7]:
from sklearn.preprocessing import LabelEncoder

# Replace every feature column with integer codes, one encoder fit per column.
# Note that this is applied to ALL columns of x, so numeric columns are also
# turned into the rank of each distinct value, not kept as raw magnitudes.
x = x.apply(lambda column: LabelEncoder().fit_transform(column))

# Encode the target labels as integers too (LabelEncoder assigns codes in
# sorted label order), then show the resulting shape as the cell output.
y = LabelEncoder().fit_transform(y)
y.shape

(50000,)

## Train/test split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Convert each of the four splits to a NumPy array in one pass. The tuple of
# inputs is built before any name is rebound, so every split is converted from
# its original value. The final expression displays the test-label shape.
x_train, x_test, y_train, y_test = (
    np.array(split) for split in (x_train, x_test, y_train, y_test)
)
y_test.shape

(10000,)

## Prediction with Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

# Train Gaussian Naive Bayes on the training split, then predict labels for the
# held-out test rows. fit() returns the estimator itself, so construction and
# training are chained.
nb = GaussianNB().fit(x_train, y_train)
y_pred = nb.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [11]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Score the current predictions against the held-out labels.
# precision_score / recall_score run with their defaults, so label 1 is the
# positive class. NOTE(review): with LabelEncoder's sorted codes that should be
# 'Low', not 'High' -- confirm this is the class you mean to report on.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

In [12]:
# Print the three scalar scores, then the confusion matrix on its own lines.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.9997
Precision: 1.0
Recall: 0.9995694603903559

Confusion Matrix:
 [[3032    0]
 [   3 6965]]


### <span style="color:green;"> Naive Bayes classification shows great accuracy, precision, and recall for this dataset, but this algorithm is too simple for this dataset, so I gave it 8/10 </span>

## Prediction with KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k = 5: fit on the training split and predict the
# held-out rows.
# NOTE(review): distances are computed on the unscaled integer codes, so
# wide-range columns presumably dominate -- consider scaling the features.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = knn.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [14]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Score the current predictions against the held-out labels.
# precision_score / recall_score run with their defaults, so label 1 is the
# positive class. NOTE(review): with LabelEncoder's sorted codes that should be
# 'Low', not 'High' -- confirm this is the class you mean to report on.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

In [15]:
# Print the three scalar scores, then the confusion matrix on its own lines.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.9798
Precision: 0.9862029318769762
Recall: 0.9847876004592423

Confusion Matrix:
 [[2936   96]
 [ 106 6862]]


### <span style="color:red;"> K-nearest neighbor classification is the weakest among all the algorithms for this dataset, so I gave it 4/10</span>

## Prediction with Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split and predict the held-out rows.
# random_state pins the tree's internal randomness (features are permuted at
# each split and ties between equally good splits are broken randomly), so a
# Restart & Run All rebuilds the same tree. 42 matches the seed already used
# for train_test_split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [17]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Score the current predictions against the held-out labels.
# precision_score / recall_score run with their defaults, so label 1 is the
# positive class. NOTE(review): with LabelEncoder's sorted codes that should be
# 'Low', not 'High' -- confirm this is the class you mean to report on.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

In [18]:
# Print the three scalar scores, then the confusion matrix on its own lines.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)
print("\nConfusion Matrix:\n", cm)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0

Confusion Matrix:
 [[3032    0]
 [   0 6968]]


### <span style="color:green;"> Decision tree classification shows great accuracy, precision, and recall for this dataset, but this algorithm can cause overfitting due to large numbers, so I gave it 9/10 </span>

## Prediction with Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and predict the held-out rows.
# The forest draws bootstrap samples and random feature subsets, so it is
# seeded to make the run reproducible under Restart & Run All. 42 matches the
# seed already used for train_test_split.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [20]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Score the current predictions against the held-out labels.
# precision_score / recall_score run with their defaults, so label 1 is the
# positive class. NOTE(review): with LabelEncoder's sorted codes that should be
# 'Low', not 'High' -- confirm this is the class you mean to report on.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

In [21]:
# Print the three scalar scores, then the confusion matrix on its own lines.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)
print("\nConfusion Matrix:\n", cm)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0

Confusion Matrix:
 [[3032    0]
 [   0 6968]]


### <span style="color:green;"> Random forest classification is the best choice for this dataset because it can handle numerical and categorical features very well so I gave it 10/10 </span>

## Prediction with SVM

In [22]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier: fit on the training split and
# predict the held-out rows.
# SVC is imported directly so that the name `svm` only ever refers to the
# fitted classifier; previously it was first the sklearn module and was then
# rebound to an estimator instance inside the same cell.
svm = SVC(kernel='linear')
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [23]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Score the current predictions against the held-out labels.
# precision_score / recall_score run with their defaults, so label 1 is the
# positive class. NOTE(review): with LabelEncoder's sorted codes that should be
# 'Low', not 'High' -- confirm this is the class you mean to report on.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

In [24]:
# Print the three scalar scores, then the confusion matrix on its own lines.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.9999
Precision: 1.0
Recall: 0.9998564867967853

Confusion Matrix:
 [[3032    0]
 [   1 6967]]


### <span style="color:green;"> SVM classification is also very good but a little bit slow for this dataset so I gave it 9/10</span>

## Prediction with Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training split and predict the held-out rows.
# max_iter is set far above the library default to give the solver room to
# converge on these features.
lr = LogisticRegression(max_iter=10000).fit(x_train, y_train)
y_pred = lr.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [26]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Score the current predictions against the held-out labels.
# precision_score / recall_score run with their defaults, so label 1 is the
# positive class. NOTE(review): with LabelEncoder's sorted codes that should be
# 'Low', not 'High' -- confirm this is the class you mean to report on.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

In [27]:
# Print the three scalar scores, then the confusion matrix on its own lines.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)
print("\nConfusion Matrix:\n", cm)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0

Confusion Matrix:
 [[3032    0]
 [   0 6968]]


### <span style="color:green;">Logistic regression is also good but not as good as SVM or DT, so I gave it 8.5/10</span>

## Prediction with ANN

In [28]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with a single hidden layer of 100 units.
# random_state seeds the weight initialisation and sample shuffling; without it
# every run trains a different network and the reported scores cannot be
# reproduced. 42 matches the seed already used for train_test_split.
# hidden_layer_sizes is written as a tuple (one entry per hidden layer), which
# is the documented form; it describes the same architecture as the bare 100.
# NOTE(review): the inputs are unscaled integer codes and max_iter=150 may stop
# before convergence -- MLPs are sensitive to feature scale, so consider
# standardising the features before judging this model.
ann = MLPClassifier(hidden_layer_sizes=(100,), max_iter=150, random_state=42)
ann.fit(x_train, y_train)
y_pred = ann.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [29]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Score the current predictions against the held-out labels.
# precision_score / recall_score run with their defaults, so label 1 is the
# positive class. NOTE(review): with LabelEncoder's sorted codes that should be
# 'Low', not 'High' -- confirm this is the class you mean to report on.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

In [30]:
# Print the three scalar scores, then the confusion matrix on its own lines.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.8776
Precision: 0.9465174129353234
Recall: 0.8737083811710677

Confusion Matrix:
 [[2688  344]
 [ 880 6088]]


### <span style="color:red;">ANN classification is one of the weakest algorithms for this dataset, so I gave it 6/10</span>

# <span style="color:purple;"> So, as we can see, Random Forest is the best one, with 100% accuracy, precision, and recall for this dataset, whereas K-nearest neighbor is the weakest one, with 96% accuracy and 93% for both precision and recall; thus, the best option here is Random Forest.</span>