# NaiveBayes Exercise

## Larry Larkin

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


from sklearn.naive_bayes import GaussianNB  # Import Gaussian Bayes modeling function
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
#path to the Titanic workbook, relative to the notebook's working directory
location = "datasets/titanic.xls"

#read the workbook into a DataFrame and preview the first five rows
df = pd.read_excel(io=location)
df.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## Clean the Data

#### Handle missing values

In [3]:
#count the missing values in every column
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

Let's clean up 'age' and 'embarked'

In [4]:
#keep only the rows whose age is missing, using a boolean mask
missing_age = df[df['age'].isna()]
missing_age.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
15,1,0,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S,,,"New York, NY"
37,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S,9.0,,"Los Angeles, CA"
40,1,0,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C,,,"Philadelphia, PA"
46,1,0,"Cairns, Mr. Alexander",male,,0,0,113798,31.0,,S,,,
59,1,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",female,,0,0,17770,27.7208,,C,5.0,,"New York, NY"


In [5]:
#remember the index labels of the rows with missing age - used later to verify the fill
mals = missing_age.index.tolist()

In [6]:
#average age for every combination of survival status, sex, and passenger class
(
    df
    .groupby(by=['survived', 'sex', 'pclass'])['age']
    .mean()
)

survived  sex     pclass
0         female  1         35.200000
                  2         34.090909
                  3         23.418750
          male    1         43.658163
                  2         33.092593
                  3         26.679598
1         female  1         37.109375
                  2         26.711051
                  3         20.814815
          male    1         36.168240
                  2         17.449274
                  3         22.436441
Name: age, dtype: float64

In [7]:
#fill missing values for age with the mean age of passengers that share the same
#survival status, sex, and passenger class
#the filled column is assigned back to df: calling fillna(inplace=True) on df['age']
#operates on an intermediate column object, is deprecated in pandas 2.x, and leaves
#df unchanged when Copy-on-Write is enabled
#NOTE(review): grouping on 'survived' uses the target to impute a feature (label leakage)
#- consider grouping on ['sex', 'pclass'] only
df['age'] = df['age'].fillna(
    df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean')
)

In [8]:
#verify filled missing values
#mals holds index labels (taken from missing_age.index), so select by label with .loc;
#.iloc is positional and only matches while the index is an unbroken 0..n-1 range
df.loc[mals].head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
15,1,0,"Baumann, Mr. John D",male,43.658163,0,0,PC 17318,25.925,,S,,,"New York, NY"
37,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,36.16824,0,0,111427,26.55,,S,9.0,,"Los Angeles, CA"
40,1,0,"Brewe, Dr. Arthur Jackson",male,43.658163,0,0,112379,39.6,,C,,,"Philadelphia, PA"
46,1,0,"Cairns, Mr. Alexander",male,43.658163,0,0,113798,31.0,,S,,,
59,1,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",female,37.109375,0,0,17770,27.7208,,C,5.0,,"New York, NY"


In [9]:
#re-count missing values per column - 'age' should now be 0
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [10]:
#rows where the embarkation port is missing
embark = df[df['embarked'].isna()]
embark

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,6,,
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,6,,"Cincinatti, OH"


In [11]:
#keep the index labels of the rows with missing 'embarked' so the fill can be verified later
embarkls = embark.index.tolist()

In [12]:
#only 2 values are missing, so we'll fill them with the most common embarkation point
#count passengers per port (missing values are excluded from the counts)
df['embarked'].value_counts(dropna=True)

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [13]:
#fill missing embarkation ports with 'S', the most frequent value in the counts above
#assign the result back rather than using fillna(inplace=True) on df['embarked']:
#the in-place form acts on an intermediate column object, is deprecated in pandas 2.x,
#and leaves df unchanged when Copy-on-Write is enabled
df['embarked'] = df['embarked'].fillna('S')

In [14]:
#check that they're filled
#embarkls holds index labels (taken from embark.index), so look the rows up by label
#with .loc instead of by position with .iloc
df.loc[embarkls]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S,6,,
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S,6,,"Cincinatti, OH"


In [15]:
#re-count missing values per column - 'embarked' should now be 0
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64

Get rid of columns that we don't want to use in the model

In [16]:
#build the modeling frame from df without the columns we won't feed to the model
modeldf = df.drop(
    columns=['name', 'ticket', 'fare', 'cabin', 'boat', 'body', 'home.dest']
)

In [17]:
#columns left in our dataframe after dropping the unused ones
modeldf.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'embarked'], dtype='object')

Create dummy variables for categorical values

In [18]:
#one-hot encode passenger class and embarkation port
#get_dummies replaces each listed column with its indicator columns,
#so the original 'pclass' and 'embarked' columns are dropped automatically
modeldf = pd.get_dummies(modeldf, columns=['pclass', 'embarked'])
modeldf.head()

Unnamed: 0,survived,sex,age,sibsp,parch,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
0,1,female,29.0,0,0,1,0,0,0,0,1
1,1,male,0.9167,1,2,1,0,0,0,0,1
2,0,female,2.0,1,2,1,0,0,0,0,1
3,0,male,30.0,1,2,1,0,0,0,0,1
4,0,female,25.0,1,2,1,0,0,0,0,1


In [19]:
#encode sex as a binary number
#female=0, male=1
sex_codes = {'female': 0, 'male': 1}
modeldf['sex'] = modeldf['sex'].map(sex_codes)
modeldf.head()

Unnamed: 0,survived,sex,age,sibsp,parch,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
0,1,0,29.0,0,0,1,0,0,0,0,1
1,1,1,0.9167,1,2,1,0,0,0,0,1
2,0,0,2.0,1,2,1,0,0,0,0,1
3,0,1,30.0,1,2,1,0,0,0,0,1
4,0,0,25.0,1,2,1,0,0,0,0,1


In [20]:
#number of family members travelling with the passenger: sibsp + parch
modeldf['family_num'] = modeldf['sibsp'].add(modeldf['parch'])
#the two source columns are no longer needed once they are combined
modeldf = modeldf.drop(columns=['sibsp', 'parch'])
modeldf.head()

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num
0,1,0,29.0,1,0,0,0,0,1,0
1,1,1,0.9167,1,0,0,0,0,1,3
2,0,0,2.0,1,0,0,0,0,1,3
3,0,1,30.0,1,0,0,0,0,1,3
4,0,0,25.0,1,0,0,0,0,1,3


In [21]:
#flag passengers travelling alone
#TravelAlone=0 when family_num is greater than 0, otherwise 1
modeldf['TravelAlone'] = np.where(modeldf['family_num'].gt(0), 0, 1)
modeldf.head()

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num,TravelAlone
0,1,0,29.0,1,0,0,0,0,1,0,1
1,1,1,0.9167,1,0,0,0,0,1,3,0
2,0,0,2.0,1,0,0,0,0,1,3,0
3,0,1,30.0,1,0,0,0,0,1,3,0
4,0,0,25.0,1,0,0,0,0,1,3,0


## Correlation Analysis

In [22]:
#pairwise Pearson correlation between all columns of the modeling frame
modeldf.corr(method='pearson')

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num,TravelAlone
survived,1.0,-0.528693,-0.060032,0.279449,0.05079,-0.283428,0.182123,-0.016071,-0.150542,0.026876,-0.201719
sex,-0.528693,1.0,0.080752,-0.107371,-0.028862,0.116562,-0.066564,-0.088651,0.115193,-0.188583,0.284537
age,-0.060032,0.080752,1.0,0.428501,0.005843,-0.375549,0.082706,-0.085716,-0.018446,-0.206087,0.116266
pclass_1,0.279449,-0.107371,0.428501,1.0,-0.296526,-0.622172,0.325722,-0.166101,-0.1818,-0.029656,-0.126551
pclass_2,0.05079,-0.028862,0.005843,-0.296526,1.0,-0.56318,-0.134675,-0.121973,0.196532,-0.039976,-0.035075
pclass_3,-0.283428,0.116562,-0.375549,-0.622172,-0.56318,1.0,-0.17143,0.243706,-0.003805,0.05843,0.13825
embarked_C,0.182123,-0.066564,0.082706,0.325722,-0.134675,-0.17143,1.0,-0.164166,-0.778262,-0.036553,-0.107874
embarked_Q,-0.016071,-0.088651,-0.085716,-0.166101,-0.121973,0.243706,-0.164166,1.0,-0.491656,-0.08719,0.127214
embarked_S,-0.150542,0.115193,-0.018446,-0.1818,0.196532,-0.003805,-0.778262,-0.491656,1.0,0.087771,0.014246
family_num,0.026876,-0.188583,-0.206087,-0.029656,-0.039976,0.05843,-0.036553,-0.08719,0.087771,1.0,-0.688864


In [23]:
# With regard to survival, the strongest correlations are with sex (-0.53) and passenger class (pclass_1: 0.28, pclass_3: -0.28)

## Gaussian Naive Bayes Analysis

#### Split data into train and test

In [24]:
#target variable: select the 'survived' column from the modeling frame
y = modeldf.loc[:, 'survived']

In [25]:
#feature matrix: every column of modeldf except the target 'survived'
X = modeldf.drop(columns='survived')

In [26]:
# Split features and target into training and test sets
# test_size=0.25 is the train_test_split default, written out explicitly
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=109,
)

In [27]:
#create the Gaussian Naive Bayes classifier with its default settings spelled out
gnb = GaussianNB(priors=None, var_smoothing=1e-09)

In [28]:
#fit the Gaussian Naive Bayes model on the training features and labels
gnb.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [29]:
#accuracy of the model on the training data
#(the same value a classifier's .score(X, y) reports)
accuracy_score(y_train, gnb.predict(X_train))

0.7696228338430173

In [30]:
#run the model on the unseen test data
#store the predicted labels in a variable
y_pred = gnb.predict(X=X_test)

In [32]:
#confusion matrix as a labelled table: rows are the true classes, columns the predicted ones
cm = pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    index=['True Not_Survived', 'True Survived'],
    columns=['Predicted Not_Survived', 'Predicted Survived'],
)
cm

Unnamed: 0,Predicted Not_Survived,Predicted Survived
True Not_Survived,169,31
True Survived,44,84


In [33]:
#number of non-survivors (0) and survivors (1) in the test dataset, as raw counts
y_test.value_counts(normalize=False)

0    200
1    128
Name: survived, dtype: int64

In [34]:
#accuracy of the model on the test data
#(the same value a classifier's .score(X, y) reports)
accuracy_score(y_test, gnb.predict(X_test))

0.7713414634146342

In [35]:
#precision, recall, and f1-score for each class on the test data
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.79      0.84      0.82       200
           1       0.73      0.66      0.69       128

   micro avg       0.77      0.77      0.77       328
   macro avg       0.76      0.75      0.75       328
weighted avg       0.77      0.77      0.77       328



## Bernoulli Naive Bayes Analysis

In [36]:
#import Bernoulli NaiveBayes function from scikit-learn library
from sklearn.naive_bayes import BernoulliNB

In [37]:
#create the Bernoulli Naive Bayes classifier with its default settings spelled out
#NOTE: binarize=0.0 thresholds every feature at 0 (values above 0 become 1),
#so continuous columns such as 'age' are reduced to a 0/1 indicator
bnb = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [38]:
#fit the Bernoulli Naive Bayes model on the training features and labels
bnb.fit(X=X_train, y=y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [39]:
#accuracy of the Bernoulli model on the training data
#(the same value a classifier's .score(X, y) reports)
accuracy_score(y_train, bnb.predict(X_train))

0.7522935779816514

In [40]:
#run the Bernoulli model on the unseen test data
#store the predicted labels in a variable (this replaces the earlier Gaussian predictions)
y_pred = bnb.predict(X=X_test)

In [41]:
#confusion matrix as a labelled table: rows are the true classes, columns the predicted ones
cm = pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    index=['True Not_Survived', 'True Survived'],
    columns=['Predicted Not_Survived', 'Predicted Survived'],
)
cm

Unnamed: 0,Predicted Not_Survived,Predicted Survived
True Not_Survived,164,36
True Survived,43,85


In [42]:
#accuracy of the Bernoulli model on the test data
#(the same value a classifier's .score(X, y) reports)
accuracy_score(y_test, bnb.predict(X_test))

0.7591463414634146

## Conclusion

In [None]:
# Both models give comparable results with a good but not great predictive score on the test data (about 77% accuracy for Gaussian NB and 76% for Bernoulli NB)