# Predict Titanic survivors

Using scikit-learn's Logistic Regression to try to predict the chance of survival in the Titanic disaster.

In [334]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [335]:
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv('data/train.csv')

In [336]:
# Preview the first rows of the raw training data.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Function: insertAgeByClass
**Description:** This function replaces a null age with the mean age of the passenger's class.

In [337]:
def insertAgeByClass(df, average_age):
  """Return a passenger's age as an int, falling back to the class mean.

  Args:
    df: a single record indexable by 'Age' and 'Pclass' (a row, despite
      the parameter name).
    average_age: mapping from a Pclass value to the mean age for that class.

  Returns:
    int: the record's own age when it is not null, otherwise
    ``average_age[Pclass]``; either way the value is truncated by ``int()``.
  """
  own_age = df['Age']
  passenger_class = df['Pclass']

  # Only consult the lookup when the age is actually missing.
  chosen_age = average_age[passenger_class] if pd.isnull(own_age) else own_age
  return int(chosen_age)

In [338]:
def insertFareByClass(df, average_fare):
  """Return a passenger's fare, falling back to the class mean when missing.

  Args:
    df: a single record indexable by 'Fare' and 'Pclass' (a row, despite
      the parameter name).
    average_fare: mapping from a Pclass value to the mean fare for that class.

  Returns:
    The record's own fare when it is not null, otherwise
    ``average_fare[Pclass]``. The value is returned unchanged (no rounding).
  """
  own_fare = df['Fare']
  passenger_class = df['Pclass']

  # A present fare is passed straight through.
  if not pd.isnull(own_fare):
    return own_fare
  return average_fare[passenger_class]

## Treating the data

In [339]:
def normalizeData(df):
  """Turn a raw passenger frame into the numeric feature frame used for modelling.

  Steps, in order:
    1. Compute the mean 'Age' and mean 'Fare' per 'Pclass' from ``df`` itself.
    2. One-hot encode 'Embarked' and 'Sex', dropping the first level of each.
    3. Add 'NameLength' and 'TicketLength' (character counts).
    4. Fill missing 'Age' / 'Fare' with the mean of the passenger's class
       (ages are additionally truncated to int by insertAgeByClass).
    5. Drop 'Name', 'Ticket', 'Cabin' and 'PassengerId'.

  Args:
    df: DataFrame containing at least the columns 'Pclass', 'Age', 'Fare',
      'Embarked', 'Sex', 'Name', 'Ticket', 'Cabin' and 'PassengerId'.

  Returns:
    A new DataFrame; any other columns (e.g. 'Survived') are passed through.

  Notes:
    - The imputation means come from the frame passed in, so calling this on
      two different frames uses two different sets of means.
    - The dummy columns produced depend on the category values present in
      ``df``; two frames only get identical columns when they contain the
      same 'Embarked' / 'Sex' values.
  """
  # Build {Pclass: mean} lookups directly from the group-by result, so the
  # function does not assume exactly three classes labelled 1, 2 and 3.
  average_age = df.groupby('Pclass')['Age'].mean().to_dict()
  average_fare = df.groupby('Pclass')['Fare'].mean().to_dict()

  df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)
  df = pd.get_dummies(df, columns=['Sex'], drop_first=True)

  df['NameLength'] = df['Name'].apply(len)
  df['TicketLength'] = df['Ticket'].apply(len)

  # Row-wise imputation through the shared helpers.
  df['Age'] = df[['Age', 'Pclass']].apply(lambda row: insertAgeByClass(row, average_age), axis=1)
  df['Fare'] = df[['Fare', 'Pclass']].apply(lambda row: insertFareByClass(row, average_fare), axis=1)

  return df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

In [340]:
# Rebinds `df` to the preprocessed frame. Re-running this cell without first
# re-running the load cell fails, because normalizeData expects raw columns
# (e.g. 'Embarked', 'Name') that its own output no longer contains.
df = normalizeData(df)

In [341]:
# Preview the first six rows after preprocessing.
df.head(6)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male,NameLength,TicketLength
0,0,3,22,1,0,7.25,0,1,1,23,9
1,1,1,38,1,0,71.2833,0,0,0,51,8
2,1,3,26,0,0,7.925,0,1,0,22,16
3,1,1,35,1,0,53.1,0,1,0,44,6
4,0,3,35,0,0,8.05,0,1,1,24,6
5,0,3,25,0,0,8.4583,1,0,1,16,6


## Training a new model with the training data

In [342]:
from sklearn.linear_model import LogisticRegression

In [343]:
# Feature matrix: every preprocessed column except the target.
x = df.drop(columns=['Survived'])
# Target vector: the 'Survived' column.
y = df['Survived']

In [344]:
# Logistic regression with the 'liblinear' solver; no other hyperparameters are set here.
model = LogisticRegression(solver='liblinear')

In [345]:
# Fit on the whole preprocessed training frame (no hold-out split is made in this notebook).
model.fit(x, y)

LogisticRegression(solver='liblinear')

## Importing the test data and predicting

In [346]:
# Raw test features; preprocessed in a later cell.
x_test = pd.read_csv('data/test.csv')
# Labels the predictions are compared against.
# NOTE(review): in the Kaggle Titanic dataset, gender_submission.csv appears to be a
# sample submission (survival guessed from sex), not ground-truth labels - confirm.
# If so, the metrics below measure agreement with that baseline, not true accuracy.
y_test = pd.read_csv('data/gender_submission.csv')['Survived']

## Normalizing the test data in the same way

In [347]:
# Apply the same preprocessing to the test features. normalizeData computes its
# per-class Age/Fare means from the frame it is given, so missing test values are
# filled with test-set means rather than the training-set means.
# Like the training cell, this rebinds its input and cannot be re-run on its own.
x_test = normalizeData(x_test)

In [348]:
# Predict a class label for every test row.
# Assumes x_test ended up with the same feature columns, in the same order, as x.
predict = model.predict(x_test)

## Viewing forecast data

In [349]:
from sklearn.metrics import classification_report, confusion_matrix

In [351]:
# Print the per-class precision/recall/F1 report followed by the confusion matrix.
# The three arguments are joined by print's default separator (a space), which is
# why the newline and the matrix's first row are preceded by a space in the output.
print(classification_report(y_test, predict), '\n', confusion_matrix(y_test, predict))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       266
           1       0.91      0.93      0.92       152

    accuracy                           0.94       418
   macro avg       0.94      0.94      0.94       418
weighted avg       0.94      0.94      0.94       418
 
 [[252  14]
 [ 10 142]]
