# Titanic - Machine Learning from Disaster

<p>Kaggle Competition</p>

## Importing libraries and datasets

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
dfTrain = pd.read_csv('data/train.csv')
dfTrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Exploratory Data Analysis

### Cleaning Data

In [3]:
dfTrain.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
dfTrain.isnull().describe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891,891,891,891,891,891,891,891,891,891,891,891
unique,1,1,1,1,1,2,1,1,1,1,2,2
top,False,False,False,False,False,False,False,False,False,False,True,False
freq,891,891,891,891,891,714,891,891,891,891,687,889


<p>There are missing data in Age, Cabin and Embarked columns, we'll need to fill in thoses missing columns.</p>

In [5]:
fig = px.box(dfTrain, y='Age', x='Survived', color='Sex')
fig.show()

<p>We'll force age values for the missing data based on if they survived or not.</p>

In [6]:
def newAge(cols):
    Age = cols[0]
    Sex = cols[1]
    Survived = cols[2]
    
    if (pd.isnull(Age)):
        if (Survived == 1 and Sex == 'male'):
            return 28
        elif (Survived == 1 and Sex == 'female'):
            return 28
        elif (Survived == 0 and Sex == 'male'):
            return 29
        elif (Survived == 0 and Sex == 'female'):
            return 25
    else:
        return Age

In [7]:
dfTrain['Age'] = dfTrain[['Age', 'Sex', 'Survived']].apply(newAge, axis=1)

In [8]:
dfTrain.isnull().describe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891,891,891,891,891,891,891,891,891,891,891,891
unique,1,1,1,1,1,1,1,1,1,1,2,2
top,False,False,False,False,False,False,False,False,False,False,True,False
freq,891,891,891,891,891,891,891,891,891,891,687,889


In [9]:
dfTrain[dfTrain['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [10]:
fig = px.histogram(dfTrain, y='Survived', x='Pclass')
fig.show()

<p>I'll keep Pclass in our dataset, but it doesn't seen to affect the survival chance that much.</p>

In [11]:
fig = px.histogram(dfTrain, x='Survived', color='Sex')
fig.show()

In [12]:
dfTrain['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [13]:
def sex(col):
    Sex = col[0]
    if (Sex == 'male'):
        return 0
    else:
        return 1

In [14]:
dfTrain['Sex']

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [15]:
dfTrain['Sex'] = dfTrain[['Sex']].apply(sex, axis=1)
dfTrain

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,25.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,C148,C


## Building the model

In [16]:
train = dfTrain.iloc[0:624, :]
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
619,620,0,2,"Gavey, Mr. Lawrence",0,26.0,0,0,31028,10.5000,,S
620,621,0,3,"Yasbeck, Mr. Antoni",0,27.0,1,0,2659,14.4542,,C
621,622,1,1,"Kimball, Mr. Edwin Nelson Jr",0,42.0,1,0,11753,52.5542,D19,S
622,623,1,3,"Nakid, Mr. Sahid",0,20.0,1,1,2653,15.7417,,C


In [17]:
train = train.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

In [18]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,0,22.0,1,0,7.2500
1,2,1,1,1,38.0,1,0,71.2833
2,3,1,3,1,26.0,0,0,7.9250
3,4,1,1,1,35.0,1,0,53.1000
4,5,0,3,0,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
619,620,0,2,0,26.0,0,0,10.5000
620,621,0,3,0,27.0,1,0,14.4542
621,622,1,1,0,42.0,1,0,52.5542
622,623,1,3,0,20.0,1,1,15.7417


In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
model = LogisticRegression()

In [21]:
model.fit(train.drop(columns=['Survived']), train['Survived'])


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [22]:
test = dfTrain.iloc[625:891, :]

In [23]:
test = test.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])
test

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
625,626,0,1,0,61.0,0,0,32.3208
626,627,0,2,0,57.0,0,0,12.3500
627,628,1,1,1,21.0,0,0,77.9583
628,629,0,3,0,26.0,0,0,7.8958
629,630,0,3,0,29.0,0,0,7.7333
...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.0,0,0,13.0000
887,888,1,1,1,19.0,0,0,30.0000
888,889,0,3,1,25.0,1,2,23.4500
889,890,1,1,0,26.0,0,0,30.0000


In [24]:
from sklearn.metrics import classification_report

In [25]:
prediction = model.predict(test.drop(columns=['Survived'], axis=1))

In [26]:
print(classification_report(test['Survived'], prediction))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       170
           1       0.75      0.68      0.71        96

    accuracy                           0.80       266
   macro avg       0.79      0.77      0.78       266
weighted avg       0.80      0.80      0.80       266

