# Predict Titanic Survival

The RMS Titanic set sail on its maiden voyage in 1912, crossing the Atlantic from Southampton, England to New York City. The ship never completed the voyage: it sank to the bottom of the Atlantic Ocean after hitting an iceberg, claiming the lives of 1,502 of the 2,224 people on board.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [74]:
# Load the passenger records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="passengers.csv")
# Preview the first five rows to check the available columns.
dataset.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Clean Data

In [84]:
# Encode the categorical 'Sex' column numerically: male -> 1, female -> 0.
# Values absent from the mapping become NaN, so running this cell a second
# time on the already-encoded column would blank it out.
sex_codes = {'male': 1, 'female': 0}
dataset['Sex'] = dataset['Sex'].map(sex_codes)
print(dataset['Sex'])

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Sex, Length: 891, dtype: int64


In [75]:
# Print the raw Age values to expose the missing (nan) entries.
print(dataset['Age'].values)

[22.   38.   26.   35.   35.     nan 54.    2.   27.   14.    4.   58.
 20.   39.   14.   55.    2.     nan 31.     nan 35.   34.   15.   28.
  8.   38.     nan 19.     nan   nan 40.     nan   nan 66.   28.   42.
   nan 21.   18.   14.   40.   27.     nan  3.   19.     nan   nan   nan
   nan 18.    7.   21.   49.   29.   65.     nan 21.   28.5   5.   11.
 22.   38.   45.    4.     nan   nan 29.   19.   17.   26.   32.   16.
 21.   26.   32.   25.     nan   nan  0.83 30.   22.   29.     nan 28.
 17.   33.   16.     nan 23.   24.   29.   20.   46.   26.   59.     nan
 71.   23.   34.   34.   28.     nan 21.   33.   37.   28.   21.     nan
 38.     nan 47.   14.5  22.   20.   17.   21.   70.5  29.   24.    2.
 21.     nan 32.5  32.5  54.   12.     nan 24.     nan 45.   33.   20.
 47.   29.   25.   23.   19.   37.   16.   24.     nan 22.   24.   19.
 18.   19.   27.    9.   36.5  42.   51.   22.   55.5  40.5    nan 51.
 16.   30.     nan   nan 44.   40.   26.   17.    1.    9.     nan 45.


In [76]:
# Replace the missing ages with the mean of the known ages.
# NOTE: the mean is computed over the whole dataset, i.e. before the
# train/test split performed further down.
mean_age = dataset['Age'].mean()
dataset['Age'] = dataset['Age'].fillna(mean_age)
dataset['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [80]:
# Indicator column: 1 for first-class passengers (Pclass == 1), 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the result is
# the same int64 column of 0/1 values.
dataset['FirstClass'] = (dataset['Pclass'] == 1).astype('int64')


In [82]:
# Indicator column: 1 for second-class passengers (Pclass == 2), 0 otherwise.
# Passengers with both class indicators at 0 are in neither first nor second class.
# A vectorized comparison replaces the per-row Python lambda; the result is
# the same int64 column of 0/1 values.
dataset['SecondClass'] = (dataset['Pclass'] == 2).astype('int64')

In [85]:
# Preview the first rows to confirm that 'Sex' is now numeric and that the
# FirstClass / SecondClass indicator columns were added.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,0,0


## Select and Split the Data

In [150]:
# Model inputs: encoded sex, imputed age and the two class indicators.
feature_columns = ['Sex', 'Age', 'FirstClass', 'SecondClass']
X = dataset[feature_columns]
# Model target: the 0/1 'Survived' column.
y = dataset['Survived']


In [187]:
# Keep 80% of the rows for training and hold out the rest for testing.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)


## Normalize the Data

In [188]:
# Standardize the features. The scaler is fitted on the training rows only,
# and that same fitted scaler is then applied to both splits.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

## Create and Evaluate the Model

In [189]:
# Fit a logistic regression with default hyperparameters on the scaled training data.
classifier = LogisticRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
classifier

LogisticRegression()

In [155]:
# Mean accuracy of the classifier on the held-out test split.
classifier.score(X_test, y_test)

0.7988826815642458

In [156]:
# Learned weights, one per feature in the order of X's columns
# (Sex, Age, FirstClass, SecondClass). They apply to the standardized features.
classifier.coef_

array([[-1.20797072, -0.46188153,  0.91609258,  0.45550712]])

In [157]:
# Learned intercept (bias) term of the fitted model.
classifier.intercept_

array([-0.63926296])

## Predict with the model

In [207]:
from sklearn.metrics import confusion_matrix

# Predict on the held-out rows and tabulate actual vs. predicted classes.
# Rows correspond to the actual classes, columns to the predicted ones.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[93, 17],
       [19, 50]], dtype=int64)

The number of correct predictions is high (93 + 50 = 143 of the 179 test passengers, in line with the accuracy of about 0.80 reported above), so this is a good model. Now let's make predictions on other data:

In [192]:
# Hypothetical passenger: male (Sex=1), 25 years old, neither first nor second class.
# The row is wrapped in a DataFrame carrying X's column names so that it matches
# the named features the scaler was fitted on; passing a bare list makes recent
# scikit-learn versions warn about missing feature names.
Jack = sc_X.transform(pd.DataFrame([[1, 25.0, 0, 0]], columns=X.columns))

classifier.predict(Jack)

array([0], dtype=int64)

In [194]:
# Hypothetical passenger: female (Sex=0), 25 years old, first class.
# The row is wrapped in a DataFrame carrying X's column names so that it matches
# the named features the scaler was fitted on; passing a bare list makes recent
# scikit-learn versions warn about missing feature names.
Rose = sc_X.transform(pd.DataFrame([[0, 25.0, 1, 0]], columns=X.columns))

classifier.predict(Rose)

array([1], dtype=int64)

In [195]:
# Hypothetical passenger: male (Sex=1), 31 years old, neither first nor second class.
# The row is wrapped in a DataFrame carrying X's column names so that it matches
# the named features the scaler was fitted on; passing a bare list makes recent
# scikit-learn versions warn about missing feature names.
Yo = sc_X.transform(pd.DataFrame([[1, 31.0, 0, 0]], columns=X.columns))

classifier.predict(Yo)

array([0], dtype=int64)

In [203]:
# Batch of three hypothetical passengers, one per row, with the columns in the
# same order as X (Sex, Age, FirstClass, SecondClass).
# Building a DataFrame with X's column names matches the named features the
# scaler was fitted on; passing bare lists makes recent scikit-learn versions
# warn about missing feature names.
sample_passengers = pd.DataFrame(
    [
        [1, 25.0, 0, 0],
        [0, 25.0, 1, 0],
        [1, 31.0, 0, 0],
    ],
    columns=X.columns,
)
# Apply the scaling learned from the training data, then display the scaled rows.
sample_passengers = sc_X.transform(sample_passengers)
sample_passengers

array([[ 0.72882288, -0.36207656, -0.56004744, -0.5078883 ],
       [-1.37207547, -0.36207656,  1.78556302, -0.5078883 ],
       [ 0.72882288,  0.09662937, -0.56004744, -0.5078883 ]])

In [204]:
# Predicted class (0 or 1) for each of the scaled sample rows.
classifier.predict(sample_passengers)

array([0, 1, 0], dtype=int64)

In [205]:
# Class probabilities for each sample row: the first column is the probability
# of class 0 and the second column the probability of class 1.
classifier.predict_proba(sample_passengers)

array([[0.89059423, 0.10940577],
       [0.0697981 , 0.9302019 ],
       [0.90959444, 0.09040556]])