In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Name         891 non-null    object 
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Survived     891 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208,0.383838
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429,0.486592
min,1.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104,0.0
50%,446.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,668.5,3.0,38.0,1.0,0.0,31.0,1.0
max,891.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [7]:
df.columns

Index(['PassengerId', 'Name', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived'],
      dtype='object')

In [10]:
df.shape

(891, 12)

In [12]:
df.isnull().sum()

PassengerId      0
Name             0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Survived         0
dtype: int64

In [13]:
# Now identify the columns that you dont want or the features that have no effect on your target
# once Identified then drop them

df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],inplace=True,axis=1)
df.head()



Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [14]:
# Check if there are any null values in the column so that it can be fixed as some operations connot be performed on columns that
# have null values

df.isnull().sum()

Pclass        0
Sex           0
Age         177
Fare          0
Survived      0
dtype: int64

In [15]:
# For age lets fill the NA values with the average age

df['Age'] = df['Age'].fillna(df['Age'].mean())

In [16]:
df.isnull().sum()

Pclass      0
Sex         0
Age         0
Fare        0
Survived    0
dtype: int64

In [18]:
# Now lets break the columns into test and target

target = df['Survived']
inputs= df[['Pclass','Sex','Age','Fare']]


In [21]:
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [23]:
# Now as we can see that sex is either male or female and we know that any model will not work well with strings so we will use
# hot encoding to convert it into a seprate column

dummies = pd.get_dummies(inputs.Sex)
dummies.head()

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [24]:
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [25]:
# NOw lets concat those dummy columns in the main input dataframe


inputs = pd.concat([inputs,dummies],axis = 1)
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,False,True
1,1,female,38.0,71.2833,True,False
2,3,female,26.0,7.925,True,False
3,1,female,35.0,53.1,True,False
4,3,male,35.0,8.05,False,True


In [27]:
inputs.drop(['Sex'],inplace=True,axis=1)

In [29]:
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,False,True
1,1,38.0,71.2833,True,False
2,3,26.0,7.925,True,False
3,1,35.0,53.1,True,False
4,3,35.0,8.05,False,True


In [32]:
# Now our input dataframe is completely ready lets split this df into test train dataset using train_test_split

from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test=train_test_split(inputs,target,test_size=0.2)

In [36]:
# lets check how the data has been divided

print("Xtrain", len(X_train))
print("Xtest", len(X_test))
print("ytrain", len(y_train))
print("ytest", len(y_test))


Xtrain 712
Xtest 179
ytrain 712
ytest 179


In [35]:
len(inputs)

891

In [37]:
# Now we will apply Gausian Naive Naive algorithm
# creating an objet of the model

from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

In [38]:
# Now train the model and for training we call fit method

model.fit(X_train,y_train)

In [40]:
model.score(X_test,y_test)

0.8100558659217877

In [41]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
442,3,25.0,7.775,False,True
476,2,34.0,21.0,False,True
807,3,18.0,7.775,True,False
115,3,21.0,7.925,False,True
93,3,26.0,20.575,False,True
0,3,22.0,7.25,False,True
499,3,24.0,7.7958,False,True
861,2,21.0,11.5,False,True
677,3,18.0,9.8417,True,False
34,1,28.0,82.1708,False,True


In [42]:
y_test[:10]

442    0
476    0
807    0
115    0
93     0
0      0
499    0
861    0
677    1
34     0
Name: Survived, dtype: int64

In [43]:
model.predict(X_test[:10])

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0])

In [45]:
model.predict_proba(X_test[:10])

array([[0.98993745, 0.01006255],
       [0.97898705, 0.02101295],
       [0.07713825, 0.92286175],
       [0.98910616, 0.01089384],
       [0.99021465, 0.00978535],
       [0.98930385, 0.01069615],
       [0.98975914, 0.01024086],
       [0.97536153, 0.02463847],
       [0.07790807, 0.92209193],
       [0.66282154, 0.33717846]])