# Naive Bayes
Because scikit-learn only works with numeric features, the method for calculating conditional probabilities covered in lectures does not apply.  
Here we use one-hot encoding (using scikit-learn's `OneHotEncoder`) to convert the Swimming dataset to a numeric format. 

In [1]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix 
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Every column in this dataset is categorical text, and "None" is a genuine
# Sunshine level. pandas >= 2.0 treats the string "None" as a missing value by
# default, which would silently turn that category into NaN and make the later
# one-hot encoding reject a "None" query. keep_default_na=False keeps it as text.
swim = pd.read_csv('Swimming.csv', keep_default_na=False)
swim.head()

Unnamed: 0,Rain_Recently,Rain_Today,Temp,Wind,Sunshine,Swimming
0,Moderate,Moderate,Warm,Light,Some,Yes
1,Light,Moderate,Warm,Moderate,,No
2,Moderate,Moderate,Cold,Gale,,No
3,Moderate,Moderate,Warm,Light,,Yes
4,Moderate,Light,Cold,Light,Some,No


In [3]:
# Pull the 'Swimming' column out of the frame to use as the target (y).
# pop() removes the column from `swim` in place, so `swim` holds only the
# feature columns afterwards; re-running this cell without reloading the
# data raises KeyError.
target_column = swim.pop('Swimming')
y = target_column.values
print(y)

['Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'No']


### One-Hot-Encode the training data

In [4]:
# Ask for a dense array rather than a sparse matrix. The keyword was renamed
# from `sparse` to `sparse_output` in scikit-learn 1.2 and the old spelling
# was removed in 1.4, so try the current name first and fall back for
# older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Learn the categories of every feature column and encode the training rows
# as one 0/1 indicator column per (feature, category) pair.
swimOH = onehot_encoder.fit_transform(swim)
swimOH

array([[0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1.]])

In [5]:
# Names of the one-hot columns, in the same order as the columns of swimOH.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
onehot_encoder.get_feature_names_out(swim.columns)

array(['Rain_Recently_Heavy', 'Rain_Recently_Light',
       'Rain_Recently_Moderate', 'Rain_Today_Heavy', 'Rain_Today_Light',
       'Rain_Today_Moderate', 'Temp_Cold', 'Temp_Warm', 'Wind_Gale',
       'Wind_Light', 'Wind_Moderate', 'Sunshine_None', 'Sunshine_Some'],
      dtype=object)

In [6]:
# Create one instance of each Naive Bayes variant; only the multinomial
# model is trained in this cell.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

# fit() returns the estimator itself, so swim_numNB is the same object as mnb.
mnb.fit(swimOH, y)
swim_numNB = mnb

# Predict on the very rows the model was trained on (training-set predictions).
y_dash = swim_numNB.predict(swimOH)

In [7]:
# Compare the true labels with the predictions made on the training rows.
# Rows are true classes and columns are predicted classes.
confusion = confusion_matrix(y_true=y, y_pred=y_dash)
report_template = "Confusion matrix:\n{}"
print(report_template.format(confusion))

Confusion matrix:
[[6 0]
 [1 3]]


In [8]:
# Class labels known to the fitted model, in the order the model stores them.
swim_numNB.classes_

array(['No', 'Yes'], dtype='<U3')

In [9]:
# Three query examples, two from the lecture and one from the training data.
# Each row lists its values in the same order as the feature columns of `swim`.
query_rows = [
    ["Moderate", "Moderate", "Warm", "Light", "Some"],
    ["Moderate", "Moderate", "Cold", "Moderate", "Some"],
    ["Moderate", "Light", "Warm", "Light", "None"],
]
squery = pd.DataFrame(query_rows, columns=swim.columns)

In [10]:
# Encode the query rows with the encoder that was already fitted on the
# training data (transform, not fit_transform), so the one-hot columns line
# up with those of swimOH. Display the encoded array together with its shape.
X_query = onehot_encoder.transform(squery)
X_query, X_query.shape

(array([[0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.],
        [0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1.],
        [0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0.]]), (3, 13))

In [11]:
# Predicted class label for each encoded query row.
y_query = swim_numNB.predict(X_query)
y_query

array(['Yes', 'No', 'Yes'], dtype='<U3')

In [12]:
# Class probabilities for each query row; one row per query, one column per
# class, with columns ordered as in swim_numNB.classes_.
q_probs = swim_numNB.predict_proba(X_query)
q_probs

array([[0.3716943 , 0.6283057 ],
       [0.78019522, 0.21980478],
       [0.34743898, 0.65256102]])

In [13]:
# Show the class order again so the probability columns can be read off.
swim_numNB.classes_

array(['No', 'Yes'], dtype='<U3')