In [1]:
import numpy as np
import pandas as pd

In [2]:
golf_df = pd.read_csv("./TestData/golf-dataset.csv")

In [3]:
#golf_df["Play Golf"]=golf_df["Play Golf"].map({"Yes":1,"No":0})

# Prior probability

In [4]:
# Prior probability calculation 

def prior_prob(df,val):
    total = df.shape[0]
    val_count = df["Play Golf"].value_counts()[val]
    return val_count/total

In [5]:
yvalue = prior_prob(golf_df,"Yes")
yvalue

0.6428571428571429

In [6]:
prior = {"Yes" : prior_prob(golf_df,"Yes"),
        "No": prior_prob(golf_df,"No")}
prior

{'Yes': 0.6428571428571429, 'No': 0.35714285714285715}

# Conditional Probability 

In [7]:
def conditional_prob(df,feature,feature_value,label):
    filtered_data = df[df["Play Golf"]==label]
    numerator = np.sum(filtered_data[feature]==feature_value)
    denominator = filtered_data.shape[0]
   
    return numerator/denominator 

In [8]:
conditional_prob(golf_df,"Outlook","Sunny","Yes")

0.3333333333333333

# Likelihood probability

In [9]:
# likelihood probability for all the features

cond_prob = {}

for label in golf_df["Play Golf"].unique():  # to get 0 and 1
    cond_prob[label] = {}
    features = list(golf_df.columns)[:-1]
    for feature in features: # to get outlook, temp, humidity and windy ==> features name
        cond_prob[label][feature] = {}
        for feat_val in golf_df[feature].unique(): # To get features values like outlook = sunny, overcast and rainy
            cond_prob[label][feature][feat_val] = round(conditional_prob(golf_df,feature,feat_val,label),2)
cond_prob

{'No': {'Outlook': {'Rainy': 0.6, 'Overcast': 0.0, 'Sunny': 0.4},
  'Temp': {'Hot': 0.4, 'Mild': 0.4, 'Cool': 0.2},
  'Humidity': {'High': 0.8, 'Normal': 0.2},
  'Windy': {False: 0.4, True: 0.6}},
 'Yes': {'Outlook': {'Rainy': 0.22, 'Overcast': 0.44, 'Sunny': 0.33},
  'Temp': {'Hot': 0.22, 'Mild': 0.44, 'Cool': 0.33},
  'Humidity': {'High': 0.33, 'Normal': 0.67},
  'Windy': {False: 0.67, True: 0.33}}}

 # Posterior Probability

In [10]:
features

['Outlook', 'Temp', 'Humidity', 'Windy']

In [11]:
# posterior probability calculation for given weather condition

x_test = ["Sunny","Hot","Normal",False]

for label in golf_df["Play Golf"].unique():
    
    prior = prior_prob(golf_df,label)
    likelihood = 1.0
    
    for i in range (len(features)):
        feature = features[i]
        feat_val = x_test[i]
        
        likelihood  *= cond_prob[label][feature][feat_val]
    post = likelihood * prior
    print(label,post)
    


No 0.004571428571428573
Yes 0.02095080428571429


# Implementation using Sklearn

 Note: Sklearn cannot work with text data, so we need to convert text data to numerical data.
 We can use Label encoder from sklearn to do this.

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
le1 = LabelEncoder()
golf_df["Outlook"] = le1.fit_transform(golf_df["Outlook"])

In [14]:
le2 = LabelEncoder()
golf_df["Temp"] = le2.fit_transform(golf_df["Temp"])

In [15]:
le3 = LabelEncoder()
golf_df["Humidity"] = le3.fit_transform(golf_df["Humidity"])

In [16]:
le4 = LabelEncoder()
golf_df["Windy"] = le4.fit_transform(golf_df["Windy"])

In [17]:
le5 = LabelEncoder()
golf_df["Play Golf"]= le5.fit_transform(golf_df["Play Golf"])

In [18]:
X = golf_df.iloc[:,:-1]
Y = golf_df.iloc[:,-1]

In [19]:
from sklearn.naive_bayes import CategoricalNB

In [20]:
model = CategoricalNB()

In [21]:
model.fit(X,Y)

In [22]:
x_test = ["Sunny","Hot","Normal",False]

In [23]:
le1.transform(['Sunny'])

array([2])

In [24]:
le2.transform(['Hot'])

array([1])

In [25]:
le3.transform(['Normal'])

array([1])

In [26]:
le4.transform([False])

array([0], dtype=int64)

In [27]:
x_test = np.array([[2,1,1,0]])

In [28]:
model.predict(x_test)



array([1])

So, model is prediction class yes in the encoded i.e., 1

In [30]:
model.predict_proba(x_test)



array([[0.22086561, 0.77913439]])

From the above probability we can get to know that the probability of class 'yes' is 0.77 and class 'no' is 0.22. Hence given dataset belongs to class 'yes'(1)