### Predict Titanic Survival

In this project you will create a Logistic Regression model that predicts which passengers survived the sinking of the Titanic, based on features like age and class.

#### Import Modules

In [27]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

#### Import Data

In [28]:
# Load the passenger records; the path is relative, so 'passengers.csv' must
# sit in the notebook's working directory.
passengers = pd.read_csv('passengers.csv')

#### Explore Data

In [29]:
# Preview the first two rows to see the raw column values.
passengers.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [30]:
# List every column name available for feature selection.
print(passengers.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


#### Clean the Data

In [31]:
# Encode Sex numerically: 0 for "male", 1 for every other value.
# The comparison is vectorized over the whole column instead of calling a
# Python lambda once per row.
# The dtype guard keeps the cell safe to re-run: once the column is numeric,
# comparing it against "male" again would turn every row into 1.
if not pd.api.types.is_numeric_dtype(passengers['Sex']):
    passengers['Sex'] = (passengers['Sex'] != "male").astype(int)
passengers.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C


#### Fill NaN Values

In [32]:
# Inspect the Age column; missing ages show up as NaN.
passengers['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [33]:
# Impute missing ages with the mean of the observed ages, then show the column.
age_mean = passengers['Age'].mean()
passengers['Age'] = passengers['Age'].fillna(value=age_mean)
print(passengers['Age'])

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64


#### Create Dummy Variables

In [34]:
# Build one indicator column per distinct Pclass value.
passenger_dummies = pd.get_dummies(data=passengers.loc[:, 'Pclass'])
passenger_dummies

Unnamed: 0,1,2,3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1
...,...,...,...
886,0,1,0
887,1,0,0
888,0,0,1
889,1,0,0


In [35]:
# Append the class indicator columns to the right of the passenger table.
# NOTE: re-running this cell appends the indicator columns a second time.
passengers = pd.concat(objs=[passengers, passenger_dummies], axis=1)

In [36]:
# Give the indicator columns (labelled 1, 2, 3 after get_dummies) readable names.
passengers = passengers.rename(
    columns={
        1: 'FirstClass',
        2: 'SecondClass',
        3: 'ThirdClass',
    }
)
passengers.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass,ThirdClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0


#### Select and Split the Data

In [37]:
# Model inputs. ThirdClass is left out of the selection, so a row with
# FirstClass == 0 and SecondClass == 0 is a third-class passenger.
features = passengers.loc[:, ['Sex', 'Age', 'FirstClass', 'SecondClass']]
features.head()

Unnamed: 0,Sex,Age,FirstClass,SecondClass
0,0,22.0,0,0
1,1,38.0,1,0
2,1,26.0,0,0
3,1,35.0,1,0
4,0,35.0,0,0


In [38]:
# Target labels: the Survived column (0/1 in the preview).
survival = passengers.loc[:, 'Survived']
survival.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [39]:
# Hold out 20% of the passengers for evaluation.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    survival,
    test_size=0.2,
    random_state=42,
)

#### Normalize the Data

Because scikit-learn's `LogisticRegression` applies regularization by default, the feature data needs to be scaled before fitting.

In [41]:
# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(train_features)
features_train = scaler.transform(train_features)
features_test = scaler.transform(test_features)

#### Create and Evaluate the Model

In [42]:
model = LogisticRegression()

In [43]:
model.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
print("Training set score: {:.3f}".format(model.score(train_features, train_labels)))
print("Test set score: {:.3f}".format(model.score(test_features, test_labels)))

Training set score: 0.798
Test set score: 0.771


In [45]:
# One learned weight per feature, in the order the feature columns were
# selected: Sex, Age, FirstClass, SecondClass.
print(model.coef_)

[[ 2.57895924 -0.02864767  2.10922616  1.11611001]]


#### Predict with the Model

In [46]:
# Hand-built passengers; values follow the feature column order
# [Sex, Age, FirstClass, SecondClass].
Jack = np.array([0.0, 20.0, 0.0, 0.0])
Rose = np.array([1.0, 17.0, 1.0, 0.0])
You = np.array([0.0, 33.0, 0.0, 0.0])

# Stack the three 1-D vectors into a (3, 4) matrix, one passenger per row.
sample_passengers = np.stack([Jack, Rose, You])

In [47]:
# Apply the scaler fitted on the training features to the sample passengers.
# NOTE: this overwrites sample_passengers, so running the cell twice scales
# the already-scaled values again; re-run the cell that builds the array first.
sample_passengers = scaler.transform(sample_passengers)

In [48]:
# Predicted Survived label for each sample passenger, in row order
# (Jack, Rose, You).
model.predict(sample_passengers)

array([0, 1, 0], dtype=int64)

In [49]:
# Class probabilities per sample passenger, one row each.
# NOTE(review): columns presumably follow model.classes_, i.e. [P(0), P(1)]
# for the 0/1 Survived labels; confirm against model.classes_.
model.predict_proba(sample_passengers)

array([[0.9933517 , 0.0066483 ],
       [0.00446815, 0.99553185],
       [0.99353774, 0.00646226]])