In [1]:
#Import all the libraries
#import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [3]:
train = pd.read_csv("/home/lauvindra/Titanic/train.csv")
test = pd.read_csv("/home/lauvindra/Titanic/test.csv")

In [4]:
train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
171,172,0,3,"Rice, Master. Arthur",male,4.0,4,1,382652,29.125,,Q
695,696,0,2,"Chapman, Mr. Charles Henry",male,52.0,0,0,248731,13.5,,S
777,778,1,3,"Emanuel, Miss. Virginia Ethel",female,5.0,0,0,364516,12.475,,S
836,837,0,3,"Pasic, Mr. Jakob",male,21.0,0,0,315097,8.6625,,S
525,526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q


In [5]:
train.shape

(891, 12)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
#remove the column that we don't use
train = train.drop(['Cabin', 'Ticket', 'Name', 'PassengerId'], axis = 1)
train.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [8]:
#check the number of null values
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
#Dealing with the missing values
freq = train.Embarked.dropna().mode()
print(freq, '\n')
train['Embarked'] = train['Embarked'].fillna(freq[0])#fill the NAN values with the most freq values


mean = train['Age'].dropna().mean()
train['Age'] = train['Age'].fillna(round(mean))
print(round(mean))

0    S
dtype: object 

30


In [10]:
"""Convert the categorial features into numeric features"""
train['Sex'].replace('female', 0, inplace = True)
train['Sex'].replace('male', 1, inplace = True)

train['Embarked'].replace('S', 0, inplace = True)
train['Embarked'].replace('C', 1, inplace = True)
train['Embarked'].replace('Q', 2, inplace = True)

In [11]:
print(train.isnull().sum(), train.shape, train.head(), train.describe().T,
      sep = ' \n ***********   *************  *********** \n ' )

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64 
 ***********   *************  *********** 
 (891, 8) 
 ***********   *************  *********** 
    Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         0
1         1       1    0  38.0      1      0  71.2833         1
2         1       3    0  26.0      0      0   7.9250         0
3         1       1    0  35.0      1      0  53.1000         0
4         0       3    1  35.0      0      0   8.0500         0 
 ***********   *************  *********** 
           count       mean        std   min      25%      50%   75%       max
Survived  891.0   0.383838   0.486592  0.00   0.0000   0.0000   1.0    1.0000
Pclass    891.0   2.308642   0.836071  1.00   2.0000   3.0000   3.0    3.0000
Sex       891.0   0.647587   0.477990  0.00   0.0000   1.0000   1.0    1.0000
Age       891.0  29.758889  13.0025

In [12]:
cols = ['Pclass','Sex','SibSp' ,'Parch','Embarked']
for col in cols :
    print(train[[col, 'Survived']].groupby([col],as_index=False).mean().sort_values(by=
           'Survived', ascending = False), end=' \n ******** ******* ********* \n ')


   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363 
 ******** ******* ********* 
    Sex  Survived
0    0  0.742038
1    1  0.188908 
 ******** ******* ********* 
    SibSp  Survived
1      1  0.535885
2      2  0.464286
0      0  0.345395
3      3  0.250000
4      4  0.166667
5      5  0.000000
6      8  0.000000 
 ******** ******* ********* 
    Parch  Survived
3      3  0.600000
1      1  0.550847
2      2  0.500000
0      0  0.343658
5      5  0.200000
4      4  0.000000
6      6  0.000000 
 ******** ******* ********* 
    Embarked  Survived
1         1  0.553571
2         2  0.389610
0         0  0.339009 
 ******** ******* ********* 
 

In [13]:
test.sample(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
236,1128,1,"Warren, Mr. Frank Manley",male,64.0,1,0,110813,75.25,D37,C
85,977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C
374,1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54.0,1,1,33638,81.8583,A34,S
224,1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53.0,0,0,PC 17606,27.4458,,C
204,1096,2,"Andrew, Mr. Frank Thomas",male,25.0,0,0,C.A. 34050,10.5,,S


In [14]:
#check the null values
print(test.shape, test.info(), test.isnull().sum(),
     sep = ' \n ***********   *************  *********** \n ')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
(418, 11) 
 ***********   *************  *********** 
 None 
 ***********   *************  *********** 
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin     

In [15]:
#Drop the column we don't want to use
test = test.drop(['Cabin','Ticket','Name','PassengerId'], axis = 1)


freq = test.Fare.dropna().mode()
print(freq, '\n')
test['Fare'] = train['Fare'].fillna(freq[0])#fill the NAN values with the most freq values


mean = train['Age'].dropna().mean()
test['Age'] = test['Age'].fillna(round(mean))
print(round(mean))

0    7.75
dtype: float64 

30


In [16]:
#Convert the categorial features into numeric features
test['Sex'].replace('female', 0, inplace = True)
test['Sex'].replace('male', 1, inplace = True)

test['Embarked'].replace('S', 0, inplace = True)
test['Embarked'].replace('C', 1, inplace = True)
test['Embarked'].replace('Q', 2, inplace = True)

In [17]:
test.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
201,3,1,0.33,0,2,69.55,0
79,3,0,24.0,0,0,12.475,2
173,3,1,30.0,0,0,7.925,1
356,1,0,59.0,2,0,55.0,0
45,3,1,25.0,0,0,8.05,0


In [18]:
X_test = test
X_train = train.drop("Survived", axis = 1)
y_train = train["Survived"]

In [19]:
#Using KNN
model1 = KNeighborsClassifier(n_neighbors=3)
model1.fit(X_train, y_train)
prediction = model1.predict(X_test)
prediction[:10]

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1])

In [20]:
#Using Logistic Regression
model2 = LogisticRegression(solver = 'liblinear')
model2.fit(X_train, y_train)
prediction = model2.predict(X_test)
prediction[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [21]:
#Using Support Vector Machine(SVM)
model3 = SVC(kernel = 'linear')
model3.fit(X_train, y_train)
prediction = model3.predict(X_test)
prediction[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [22]:
model4 = GaussianNB()
model4.fit(X_train, y_train)
prediction = model4.predict(X_test)
prediction[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [23]:
score1 = round(model1.score(X_train, y_train) * 100, 2)
score2 = round(model2.score(X_train, y_train) * 100, 2)
score3 = round(model3.score(X_train, y_train) * 100, 2)
score4 = round(model4.score(X_train, y_train) * 100, 2)

In [24]:
dict = {'Model': ['K Nearest Neighbor', 'Logistic Regression', 'Support Vector Machine', 'Naive Bayes'],
        'Score' : [score1, score2, score3, score4]}
model_score = pd.DataFrame(dict)

In [25]:
model_score

Unnamed: 0,Model,Score
0,K Nearest Neighbor,83.05
1,Logistic Regression,80.36
2,Support Vector Machine,78.68
3,Naive Bayes,79.24


In [26]:
#To submit to kaggle
submission = pd.DataFrame({ 'Survived': prediction})
submission .to_csv('my_submission.csv', index=False)