# Naive Bayes - Golf Dataset

In [45]:
import pandas as pd

In [46]:
df = pd.read_csv("../data/golf-dataset.csv")
df

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


## Bayes Theorem

$P(A|B) = \dfrac{P(B|A)P(A)}{P(B)}$

The goal is to predict for:

Should we play golf if:

Outlook = Sunny, Temperature = Cool, Humidity = High, Windy = True

Mathimatically:

**P(Yes | Outlook = Sunny, Temperature = Cool, Humidity = High, Windy = True)**

In [47]:
p_yes = 9/14
p_no = 5/14

print(p_yes, p_no)

0.6428571428571429 0.35714285714285715


In [48]:
p_yes = df[df["Play Golf"] == 'Yes'].shape[0] / df.shape[0]
p_no = df[df["Play Golf"] == 'No'].shape[0] / df.shape[0]

print(p_yes, p_no)

0.6428571428571429 0.35714285714285715


In [49]:
# P(Outlook = Sunny | play = Yes)

p_outlook_sunny_yes = 3/9
p_temp_cool_yes = 3/9
p_humidity_high_yes = 3/9
p_wind_true_yes = 3/9

# P(Outlook = Sunny | play = No)

p_outlook_sunny_no = 2/5
p_temp_cool_no = 1/5
p_humidity_high_no = 4/5
p_wind_true_no = 3/5


In [50]:
# P(Sunny, Cool, High, Windy | Yes)

p_sunny_cool_high_windy_yes = (3/9) * (3/9) * (3/9) * (3/9)
p_sunny_cool_high_windy_yes


0.012345679012345678

In [51]:
# P(Sunny, Cool, High, Windy | No)

p_sunny_cool_high_windy_no = (2/5) * (1/5) * (4/5) * (3/5)
p_sunny_cool_high_windy_no

0.03840000000000001

In [52]:
# normalizing the resulted value by
# P(Outlook = Sunny, Temperature = Cool, Humidity = High, Windy = True)

p_sunny = 5/14
p_cool = 4/14
p_high = 7/14
p_windy = 6/14

p_sunny_cool_high_windy = p_sunny * p_cool * p_high * p_windy

p_sunny_cool_high_windy

0.021865889212827987

$P(A|B) = \dfrac{P(B|A)P(A)}{P(B)}$

$P(Yes|Sunny, Cool, High, Windy) = \dfrac{P(Sunny, Cool, High, Windy|Yes)P(Yes)}{P(Sunny, Cool, High, Windy)}$

$P(No|Sunny, Cool, High, Windy) = \dfrac{P(Sunny, Cool, High, Windy|Yes)P(No)}{P(Sunny, Cool, High, Windy)}$

In [53]:
p_final_yes = p_sunny_cool_high_windy_yes * p_yes / p_sunny_cool_high_windy
p_final_yes

0.36296296296296293

In [54]:
p_final_no = p_sunny_cool_high_windy_no * p_no / p_sunny_cool_high_windy
p_final_no

0.6272000000000002

## Scikit Learn

In [55]:
from sklearn.preprocessing import LabelEncoder

columns = ['Outlook', 'Temp', 'Humidity', 'Windy', 'Play Golf']

# encoders = {"Outlook": le_object, "Temp": le2_object}
encoders = {}

for col in columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
    
df

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,1,1,0,0,0
1,1,1,0,1,0
2,0,1,0,0,1
3,2,2,0,0,1
4,2,0,1,0,1
5,2,0,1,1,0
6,0,0,1,1,1
7,1,2,0,0,0
8,1,0,1,0,1
9,2,2,1,0,1


In [69]:
encoders

{'Outlook': LabelEncoder(),
 'Temp': LabelEncoder(),
 'Humidity': LabelEncoder(),
 'Windy': LabelEncoder(),
 'Play Golf': LabelEncoder()}

In [64]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

X = df.drop(columns=["Play Golf"])
y = df["Play Golf"]

nb.fit(X, y)

nb

In [65]:
data = ['Sunny', 'Cool', 'High', 'True']
columns = ['Outlook', 'Temp', 'Humidity', 'Windy']

df_test = pd.DataFrame(data=[data], columns=columns)
df_test

Unnamed: 0,Outlook,Temp,Humidity,Windy
0,Sunny,Cool,High,True


In [66]:
for col in columns:
    df_test[col] = encoders[col].transform(df_test[col])
    
df_test

Unnamed: 0,Outlook,Temp,Humidity,Windy
0,2,0,0,1


In [67]:
prediction = nb.predict(df_test)
prediction

array([0])

In [68]:
encoders["Play Golf"].inverse_transform(prediction)

array(['No'], dtype=object)

In [70]:
prediction_proba = nb.predict_proba(df_test)
prediction_proba

array([[0.80106965, 0.19893035]])