***Importing Libraries***

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# These vectorizers convert text data to numerical features: CountVectorizer converts
# text into a matrix of token counts; TfidfVectorizer converts text into a matrix of
# TF-IDF (term frequency-inverse document frequency) features.

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# The three variants of the Naive Bayes classifier: [1] GaussianNB is for continuous data,
# [2] MultinomialNB is for discrete data such as word counts,
# [3] BernoulliNB is for binary/boolean features.

from sklearn.model_selection import train_test_split
#  this splits your dataset into training and testing sets

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# this is used to evaluate metrics for the performance of your machine learning model
# accuracy_score used to measure how many predictions were correct out of the total prediction
# confusion_matrix shows matrix of true positives, true negatives, false positives, and false negatives
# classification_report gives a detailed report including precision, recall,F1-score and support for each class


import warnings
warnings.filterwarnings("ignore")
# Suppresses every warning message for the rest of the session.
# NOTE(review): this also hides deprecation and data-handling warnings raised by
# pandas and scikit-learn - confirm that silencing all of them is intended.

***Data Collections***

In [2]:
# Load the dataset from the CSV file into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer="/content/Naive Bayesian Classification.csv")
df


Unnamed: 0,outlook,temperature,humidity,windy,play
0,overcast,hot,high,0.0,yes
1,rainy,mild,normal,1.0,no
2,sunny,cool,,,
3,,hot,high,0.0,yes
4,overcast,mild,normal,1.0,no
5,rainy,cool,,,
6,sunny,hot,high,0.0,yes
7,,mild,normal,1.0,no
8,overcast,cool,,,
9,rainy,hot,high,0.0,yes


***Exploratory Data analysis and feature engineering***

In [3]:
# Keep only the rows that contain no NaN in any column; df itself is left untouched.
df1 = df.dropna(axis="index", how="any")
df1

Unnamed: 0,outlook,temperature,humidity,windy,play
0,overcast,hot,high,0.0,yes
1,rainy,mild,normal,1.0,no
4,overcast,mild,normal,1.0,no
6,sunny,hot,high,0.0,yes
9,rainy,hot,high,0.0,yes
10,sunny,mild,normal,1.0,no
12,overcast,hot,high,0.0,yes
13,rainy,mild,normal,1.0,no
16,overcast,mild,normal,1.0,no
18,sunny,hot,high,0.0,yes


***Encoding or converting categorical column to numerical one***

In [4]:
# Encode the outlook categories as integers and assign the result back to df1.
# Calling replace(..., inplace=True) on the df1["outlook"] selection is chained
# assignment: pandas warns about it and, with Copy-on-Write, df1 is not updated.
# astype("int64") makes the numeric dtype explicit and raises if a label was not mapped.
df1["outlook"] = df1["outlook"].replace({"overcast":1,"rainy":2,"sunny":3}).astype("int64")

In [5]:
# Encode the temperature categories as integers and assign the result back to df1.
# Calling replace(..., inplace=True) on the df1["temperature"] selection is chained
# assignment: pandas warns about it and, with Copy-on-Write, df1 is not updated.
# astype("int64") makes the numeric dtype explicit and raises if a label was not mapped.
df1["temperature"] = df1["temperature"].replace({"mild":0,"hot":1,"cool":2}).astype("int64")

In [6]:
# Encode the humidity categories as integers and assign the result back to df1.
# Calling replace(..., inplace=True) on the df1["humidity"] selection is chained
# assignment: pandas warns about it and, with Copy-on-Write, df1 is not updated.
# astype("int64") makes the numeric dtype explicit and raises if a label was not mapped.
df1["humidity"] = df1["humidity"].replace({"high":0,"normal":1}).astype("int64")

In [7]:
# Encode the target labels as integers (yes -> 1, no -> 0) and assign the result back to df1.
# Calling replace(..., inplace=True) on the df1["play"] selection is chained
# assignment: pandas warns about it and, with Copy-on-Write, df1 is not updated.
# astype("int64") makes the numeric dtype explicit and raises if a label was not mapped.
df1["play"] = df1["play"].replace({"yes":1,"no":0}).astype("int64")

***Train Test Split***

In [8]:
# Features: every column of df1 except the target column "play".
x = df1.drop(columns="play")
# Target: the "play" column.
y = df1["play"]

# test_size=0.2    : 20% of the rows go to the test set, the remaining 80% to training.
# random_state=11  : fixed seed so the random split is reproducible.
# stratify=y       : keeps the class proportions of y in both the train and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=11,
    stratify=y,
)

***Model Training***

In [9]:
# Instantiate a Gaussian Naive Bayes classifier with its default settings.
gnb_model = GaussianNB()
# Fit the classifier on the training features and training labels.
gnb_model.fit(X=x_train, y=y_train)

***Evaluating the model on the test data***

In [10]:
# Predict the labels of the held-out test features with the trained model.
y_pred = gnb_model.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order. The confusion matrix
# therefore has the true classes as rows and the predicted classes as columns.
cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

# Proportion of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:",accuracy)
print("-"*60)

# Per-class precision, recall, F1-score and support. The true labels must come
# first: swapping the arguments swaps precision with recall and counts support
# from the predictions instead of from the true labels.
clf_report = classification_report(y_test, y_pred)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[1 0]
 [0 1]]
------------------------------------------------------------
Accuracy: 1.0
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



***Training Model Evaluation***

In [11]:
# Predict the labels of the training features to see how well the model fits
# the data it was trained on.
y_pred_train = gnb_model.predict(x_train)

# scikit-learn metrics take (y_true, y_pred) in that order. The confusion matrix
# therefore has the true classes as rows and the predicted classes as columns.
cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix:\n",cnf_matrix)
print("-"*60)

# Proportion of training samples whose predicted label equals the true label.
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy:",accuracy)
print("-"*60)

# Per-class precision, recall, F1-score and support. The true labels must come
# first: swapping the arguments swaps precision with recall and counts support
# from the predictions instead of from the true labels.
clf_report = classification_report(y_train, y_pred_train)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[4 0]
 [0 4]]
------------------------------------------------------------
Accuracy: 1.0
------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



***Single User Input Testing***

In [12]:
# Keep the column labels of the feature frame x, in order, for reuse in later cells.
column_names=x.columns
column_names

Index(['outlook', 'temperature', 'humidity', 'windy'], dtype='object')

In [13]:
# Predefined Dictionary Approach:
# one lookup table per categorical feature, mapping each label to its integer code.
outlook_value = dict(overcast=1, rainy=2, sunny=3)
temperature_value = dict(mild=0, hot=1, cool=2)
humidity_value = dict(high=0, normal=1)

In [14]:
# Bundle the three encoding tables, each stored under its feature name, together
# with the ordered list of feature columns in a single dictionary.
json_data = dict(
    outlook=outlook_value,
    temperature=temperature_value,
    humidity=humidity_value,
    columns=list(column_names),
)

json_data

{'outlook': {'overcast': 1, 'rainy': 2, 'sunny': 3},
 'temperature': {'mild': 0, 'hot': 1, 'cool': 2},
 'humidity': {'high': 0, 'normal': 1},
 'columns': ['outlook', 'temperature', 'humidity', 'windy']}

In [15]:
import json

# Serialize json_data and write it to json_data.json. The with-block guarantees
# the file is closed after the write, even if an error occurs.
with open("json_data.json","w") as f:
    f.write(json.dumps(json_data))

In [16]:
# Raw values of the single sample to classify: three category labels and the windy flag.
outlook, temperature, humidity, windy = "rainy", "mild", "high", 1.0

In [17]:
# Start from an all-zero array with one slot per column in column_names.
test_array = np.zeros(len(column_names))

# Fill the slots in order: the three categorical inputs are translated to their
# integer codes through json_data, and windy is stored as given.
test_array[0], test_array[1], test_array[2], test_array[3] = (
    json_data['outlook'][outlook],
    json_data['temperature'][temperature],
    json_data['humidity'][humidity],
    windy,
)

# The displayed values depend on the inputs chosen in the previous cell.
test_array

array([2., 0., 0., 1.])

In [18]:
# Predict the class of the single encoded sample. The sample is wrapped in a
# one-row DataFrame labelled with column_names so that it carries the same
# feature names the model was fitted with, instead of a bare unnamed array.
play = gnb_model.predict(pd.DataFrame([test_array], columns=column_names))[0]

# The target was encoded as yes -> 1 and no -> 0, so class 1 means play happens.
if play==1:
    print("yes,Play is happening")
else:
    print("No, Play is not happening")

No, Play is not happening
