In [1]:
# Import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import *

In [2]:
# Load dataset
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Preprocess data
le = preprocessing.LabelEncoder()

In [4]:
df['Sex'] = le.fit_transform(df['Sex'])

In [5]:
df['Embarked'] = le.fit_transform(df['Embarked'])

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [7]:
df = df.drop(['Name','Ticket','Cabin'],axis=1)

In [8]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked'],
      dtype='object')

In [9]:
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [10]:
category = pd.cut(df.Age,bins=[0,2,17,65,99],labels=['Toddler/baby','Child','Adult','Elderly'])

In [11]:
df['Age'] = category

In [12]:
df['Age'] = le.fit_transform(df['Age'])

In [13]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,446.0,0.382452,2.311586,0.649044,0.1991,0.524184,0.382452,32.096681,1.535433
std,256.998173,0.48626,0.8347,0.477538,0.58294,1.103705,0.806761,49.697504,0.792088
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,224.0,0.0,2.0,0.0,0.0,0.0,0.0,7.8958,1.0
50%,446.0,0.0,3.0,1.0,0.0,0.0,0.0,14.4542,2.0
75%,668.0,1.0,3.0,1.0,0.0,1.0,0.0,31.0,2.0
max,891.0,1.0,3.0,1.0,3.0,8.0,6.0,512.3292,2.0


In [14]:
Fare_category = pd.cut(df.Fare,bins=[0,20,40,100,515],labels=[4,3,2,1])

In [15]:
df['Fare'] = Fare_category

In [16]:
# Data after preprocessing
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,0,1,0,4,2
1,2,1,1,0,0,1,0,2,0
2,3,1,3,0,0,0,0,4,2
3,4,1,1,0,0,1,0,2,2
4,5,0,3,1,0,0,0,4,2


In [17]:
df.isnull().sum()

PassengerId     0
Survived        0
Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare           15
Embarked        0
dtype: int64

In [18]:
df = df.fillna(method='bfill')

In [19]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [20]:
# Create function for creating multiple model
def naive_bayes(DV):
    X = df.drop(['PassengerId',DV],axis=1)
    y = df[DV]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)
    clf = GaussianNB()
    y_pred = clf.fit(X_train,y_train).predict(X_test)
    ac = accuracy_score(y_test,y_pred,normalize=True)
    cm = confusion_matrix(y_test,y_pred)
    print('The accuracy_score for {} is:'.format(DV) + '\033[1m' + ' {}'.format(ac)+'\033[0;0m')
    print('The confusion_matrix for {} is:\n {}'.format(DV,cm))

In [21]:
DV = ['Survived','Pclass','Sex','SibSp','Parch','Embarked']
for i in DV:
    print('The accuracy_score & confusion_matrix for {} is'.format(i))
    print('------------------------------------')
    naive_bayes(i)
    print('------------------------------------')

The accuracy_score & confusion_matrix for Survived is
------------------------------------
The accuracy_score for Survived is:[1m 0.7602996254681648[0;0m
The confusion_matrix for Survived is:
 [[119  38]
 [ 26  84]]
------------------------------------
The accuracy_score & confusion_matrix for Pclass is
------------------------------------
The accuracy_score for Pclass is:[1m 0.6928838951310862[0;0m
The confusion_matrix for Pclass is:
 [[ 54  14   2]
 [  4  23  22]
 [  5  35 108]]
------------------------------------
The accuracy_score & confusion_matrix for Sex is
------------------------------------
The accuracy_score for Sex is:[1m 0.6966292134831461[0;0m
The confusion_matrix for Sex is:
 [[ 55  43]
 [ 38 131]]
------------------------------------
The accuracy_score & confusion_matrix for SibSp is
------------------------------------
The accuracy_score for SibSp is:[1m 0.50187265917603[0;0m
The confusion_matrix for SibSp is:
 [[116  10   0  52   4   0   0]
 [ 17  10   0  35 