### Titanic Dataset

In [35]:
import pandas as pd
from sklearn import naive_bayes
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

Read in the actual dataset of 887 of the 2435 people aboard the Titanic.

In [2]:
# Load the Titanic passenger table, report its dimensions, then preview the first rows.
titanic_csv = 'data/titanic.csv'
df = pd.read_csv(titanic_csv)
print(df.shape)
df.head(n=5)

(887, 8)


Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


### Question 1:
Create an input matrix of the explanatory variables (one-hot encode the Sex column and drop the Name column), then use it to predict the response variable, Survived, with the Naive Bayes algorithm.

In [3]:
# Build the modelling frame WITHOUT reassigning `df`, so this cell can be re-run
# on its own: drop the free-text Name column and swap Sex for its one-hot
# columns (appended at the end, one per distinct value of Sex).
titanic_encoded = df.drop(columns=['Name', 'Sex']).join(pd.get_dummies(df['Sex']))

# Explanatory variables vs. the response.
X = titanic_encoded.drop(columns=['Survived'])
y = titanic_encoded['Survived']

# Fixed random_state keeps the 70/30 split (and the scores below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=.30, random_state=4444
)

model = naive_bayes.GaussianNB()
model.fit(X_train, y_train)

print('train accuracy', model.score(X_train, y_train))
print('test accuracy', model.score(X_test, y_test))

train accuracy 0.7919354838709678
test accuracy 0.8127340823970037


### Question 2:
How many dead passengers were incorrectly predicted to survive? How many survivors were incorrectly predicted to be deceased?

In [4]:
# Rows of the matrix are the true labels and columns are the predicted labels,
# in sorted label order (which matches the printed model.classes_ here).
# So entry [0, 1] counts passengers who died but were predicted to survive, and
# entry [1, 0] counts survivors who were predicted to have died.
y_pred = model.predict(X_test)
print(model.classes_)
confusion_matrix(y_test, y_pred)

[0 1]


array([[144,  19],
       [ 31,  73]])

### Question 3:
Would you predict survival or death of a 3rd class, 18 year old, male passenger who had no family aboard and paid $1?

Would you predict survival or death of a 1st class, 18 year old, female passenger who had no family aboard and paid $50?

In [5]:
# One row per hypothetical passenger. Values follow the training column order:
# Pclass, Age, Siblings/Spouses Aboard, Parents/Children Aboard, Fare, female, male.
# Reusing X_train.columns keeps the feature names the model was fitted with and
# fails loudly if the number of features ever drifts.
passengers = pd.DataFrame(
    [
        [3, 18, 0, 0, 1, 0, 1],   # 3rd class, 18 years old, no family, paid $1, male
        [1, 18, 0, 0, 50, 1, 0],  # 1st class, 18 years old, no family, paid $50, female
    ],
    columns=X_train.columns,
)

# Print the predicted class, then the per-class probabilities, for each passenger.
for row in range(len(passengers)):
    passenger = passengers.iloc[[row]]  # double brackets keep a 2-D, one-row frame
    print(model.predict(passenger))
    print(model.predict_proba(passenger))


[0]
[[0.96937592 0.03062408]]
[1]
[[0.00290691 0.99709309]]


### Question 4: 
Return to the golf example from yesterday. Write a function called NaiveBayes that takes in an outlook, temp, humidity, and wind, and returns whether we predict that we will play golf or not. Within the function, print the probabilities of both yes and no. If you want to be fancy, you can make this function more general, but it's okay to make this very specific to the golf example.

In [6]:
# Load the golf observations and preview the first rows.
golf_csv = "data/golf.csv"
df = pd.read_csv(golf_csv)
df.head(n=5)

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Sunny,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Rainy,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes


First iteration:

In [19]:
# Re-load the golf data so this cell runs on its own. The former mid-cell
# `df.head()` was dropped: only a cell's last expression is displayed, so it
# was evaluated and thrown away.
df = pd.read_csv("data/golf.csv")

def NaiveBayes(outlook, temp, humidity, windy, data=None):
    """Predict "Yes"/"No" for playing golf with a categorical Naive Bayes rule.

    For each class the score is P(class) times the product of
    P(feature value | class) over the four features, with every probability
    estimated as a plain relative frequency (no smoothing). The normalised
    probabilities of both classes are printed.

    Parameters
    ----------
    outlook, temp, humidity, windy
        Values looked up in the 'Outlook', 'Temp', 'Humidity' and 'Windy'
        columns respectively.
    data : pandas.DataFrame, optional
        Table holding those four columns plus 'Play Golf'. Defaults to the
        notebook-level ``df`` so existing four-argument calls keep working.

    Returns
    -------
    str
        "Yes" if the "Yes" score is at least the "No" score, otherwise "No".

    Raises
    ------
    ZeroDivisionError
        If a class has no rows in ``data``, or if both class scores are zero
        (the given combination of values was never observed with either class).
    """
    if data is None:
        data = df  # fall back to the golf frame loaded by the notebook

    evidence = {
        'Outlook': outlook,
        'Temp': temp,
        'Humidity': humidity,
        'Windy': windy,
    }

    def joint(label):
        # Unnormalised posterior for one class: likelihood product times prior.
        in_class = data[data['Play Golf'] == label]
        likelihood = 1.0
        for column, value in evidence.items():
            likelihood *= len(in_class[in_class[column] == value]) / len(in_class)
        return likelihood * (len(in_class) / len(data))

    Y = joint("Yes")
    N = joint("No")

    print(f"Prob Yes: {Y/(N+Y)}")
    print(f"Prob No: {N/(N+Y)}")
    return "Yes" if Y >= N else "No"
    
# Example forecast: rainy, hot, normal humidity, windy.
NaiveBayes(outlook='Rainy', temp='Hot', humidity='Normal', windy=True)

Prob Yes: 0.6067961165048542
Prob No: 0.3932038834951457


Improved:

In [20]:
# Re-load the golf data so this cell is self-contained. The mid-cell
# `df.head()` was removed because its result was never displayed.
df = pd.read_csv("data/golf.csv")

def NaiveBayes(outlook, temp, humidity, windy, data=None):
    """Categorical Naive Bayes over every class found in 'Play Golf'.

    Each class is scored as P(class) times the product of
    P(feature value | class) for the four features, using relative
    frequencies without smoothing. The normalised probability of every class
    is printed in order of first appearance in the 'Play Golf' column.

    Parameters
    ----------
    outlook, temp, humidity, windy
        Values looked up in the 'Outlook', 'Temp', 'Humidity' and 'Windy'
        columns respectively.
    data : pandas.DataFrame, optional
        Table holding those four columns plus 'Play Golf'. Defaults to the
        notebook-level ``df`` so existing four-argument calls keep working.

    Returns
    -------
    The class label with the highest score; ties go to the class that appears
    first in the 'Play Golf' column.

    Raises
    ------
    ZeroDivisionError
        If every class score is zero, i.e. the given combination of values
        was never observed with any class.
    """
    if data is None:
        data = df  # fall back to the golf frame loaded by the notebook

    evidence = {
        'Outlook': outlook,
        'Temp': temp,
        'Humidity': humidity,
        'Windy': windy,
    }

    classes = data['Play Golf'].unique()
    probabilities = []
    for class_ in classes:
        in_class = data[data['Play Golf'] == class_]
        classNum = len(in_class)
        likelihood = 1.0
        for column, value in evidence.items():
            likelihood *= len(in_class[in_class[column] == value]) / classNum
        # Weight the likelihood by the class prior.
        probabilities.append(likelihood * (classNum / len(data)))

    total = sum(probabilities)  # normalising constant, computed once
    for class_, prob in zip(classes, probabilities):
        print(f"Prob {class_}: {prob/total}")

    return classes[probabilities.index(max(probabilities))]
    
# Same example forecast as before: rainy, hot, normal humidity, windy.
forecast = ('Rainy', 'Hot', 'Normal', True)
NaiveBayes(*forecast)

Prob No: 0.3932038834951457
Prob Yes: 0.6067961165048542


Better:

In [41]:
# Re-load the golf data so this cell is self-contained. The mid-cell
# `df.head()` was removed because its result was never displayed.
df = pd.read_csv("data/golf.csv")

def NaiveBayes(X,y,instance):
    """Categorical Naive Bayes for arbitrary feature columns.

    Every class in ``y`` is scored as P(class) times the product over the
    columns of ``X`` of P(column == instance value | class), using relative
    frequencies without smoothing. All counts come from ``X`` and ``y``
    themselves, so the function works on any aligned pair (for example a
    training subset), not only on the notebook-level frame. The normalised
    probability of each class is printed in order of first appearance in ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Categorical feature columns.
    y : pandas.Series
        Class labels, sharing an index with ``X``.
    instance : sequence
        One value per column of ``X``, in column order.

    Returns
    -------
    The class label with the highest score; ties go to the class that appears
    first in ``y``. If every score is zero the printed values are not
    meaningful.
    """
    classes = y.unique()
    probabilities = []
    for class_ in classes:
        in_class = (y == class_)
        classNum = int(in_class.sum())
        # Per-feature conditional frequencies within this class.
        probs = [
            int(((X.iloc[:, i] == instance[i]) & in_class).sum()) / classNum
            for i in range(len(X.columns))
        ]
        prior = classNum / len(y)
        probabilities.append(np.prod(probs) * prior)

    total = sum(probabilities)  # normalising constant, computed once
    for class_, prob in zip(classes, probabilities):
        print(f"Prob {class_}: {prob/total}")

    return classes[int(np.argmax(probabilities))]

        
# Split the golf frame into feature columns and the target, then classify one forecast.
feature_columns = ['Outlook', 'Temp', 'Humidity', 'Windy']
X = df[feature_columns]
y = df['Play Golf']
instance = ['Rainy', 'Hot', 'Normal', True]

NaiveBayes(X, y, instance)

Prob No: 0.3932038834951457
Prob Yes: 0.6067961165048542
