Prepare a naïve bayes classification model for prediction of purchase power of a user.

In [1]:
import pandas as pd 
import numpy as np 
from matplotlib.colors import ListedColormap
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics 
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score 

In [2]:
df = pd.read_csv(r"C:\Users\faizu\OneDrive\Desktop\User_Data.csv")

In [3]:
df.shape

(400, 5)

In [4]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [5]:
df.tail()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0
399,15594041,Female,49,36000,1


In [6]:
df.drop('User ID',axis = 1)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


In [7]:
# Label Encoder is the class which is used to convert a categorical variable into a numerical variable.
# since a machine learning model is a mathematical model so it understands only numerical values.
# this is also known as one hot encoding.
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender']) 
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0
...,...,...,...,...,...
395,15691863,0,46,41000,1
396,15706071,1,51,23000,1
397,15654296,0,50,20000,1
398,15755018,1,36,33000,0


In [8]:
# y = df['Purchased']
# x = df.drop('Purchased',axis = 1)
x = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [9]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.25 , random_state = True) 

In [10]:
sc = StandardScaler() 
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test) 

In [11]:
# Classifier
classifier = GaussianNB() 
classifier.fit(x_train,y_train)

In [12]:
# Prediction 
y_pred = classifier.predict(x_test) 
# accuracy 
accuracy_score(y_test, y_pred)

0.84

In [13]:
# Classification report
print(f"Classification Report: \n {classification_report(y_test,y_pred)}") 

Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.91      0.87        58
           1       0.86      0.74      0.79        42

    accuracy                           0.84       100
   macro avg       0.84      0.83      0.83       100
weighted avg       0.84      0.84      0.84       100



In [14]:
# confusion matrix
cf_matrix = confusion_matrix(y_test , y_pred)
print(cf_matrix) 

[[53  5]
 [11 31]]
