#### FOUNDATIONS OF MACHINE LEARNING: SUPERVISED LEARNING

<br>

# Predict Titanic Survival

The RMS Titanic set sail on its maiden voyage in 1912, crossing the Atlantic from Southampton, England to New York City. The ship never completed the voyage, sinking to the bottom of the Atlantic Ocean after hitting an iceberg, bringing down 1,502 of 2,224 passengers onboard.

In this project you will create a Logistic Regression model that predicts which passengers survived the sinking of the Titanic, based on features like age and class.

<hr>

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Load the Data

In [84]:
passengers = pd.read_csv('passengers.csv')
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Clean the Data

We are updating the `Sex` column so `female` will now be `1`, and `male` will be `0`.

In [85]:
passengers = passengers.replace(to_replace = ['female', 'male'], value = [1, 0])
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


We are going to update the `nan` values in age with the total mean.

In [86]:
print(passengers['Age'])

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64


In [87]:
passengers['Age'].fillna(value = passengers['Age'].mean(), inplace = True) #inplace as True will make it so it replaces the old dataframe
print(passengers['Age'])

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64


In [88]:
passengers['FirstClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 1 else 0)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0


In [89]:
passengers['SecondClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 2 else 0)
passengers.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0
5,6,0,3,"Moran, Mr. James",0,29.699118,0,0,330877,8.4583,,Q,0,0
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,S,1,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,S,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,S,0,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,C,0,1


### Select and Split Data

In [90]:
features = passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]
features.head()

Unnamed: 0,Sex,Age,FirstClass,SecondClass
0,0,22.0,0,0
1,1,38.0,1,0
2,1,26.0,0,0
3,1,35.0,1,0
4,0,35.0,0,0


In [91]:
survival = passengers['Survived']
survival.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [94]:
feature_train, feature_test, survival_train, survival_test = train_test_split(features, survival, test_size = 0.2, random_state = 1)

### Normalize the Data

In [95]:
scaler = StandardScaler()
scaler.fit_transform(feature_train)
scaler.transform(feature_test)

array([[ 1.36361202,  1.35216325,  1.7583953 , -0.51662744],
       [-0.73334642, -0.02810499, -0.56870034, -0.51662744],
       [ 1.36361202, -0.98588332, -0.56870034,  1.93563082],
       [ 1.36361202, -0.02810499, -0.56870034, -0.51662744],
       [ 1.36361202, -1.74009189, -0.56870034,  1.93563082],
       [-0.73334642, -0.6841999 , -0.56870034, -0.51662744],
       [-0.73334642,  0.48482339, -0.56870034,  1.93563082],
       [ 1.36361202,  0.3716921 ,  1.7583953 , -0.51662744],
       [-0.73334642, -0.1562539 ,  1.7583953 , -0.51662744],
       [ 1.36361202, -0.02810499, -0.56870034, -0.51662744],
       [-0.73334642, -0.38251647, -0.56870034, -0.51662744],
       [ 1.36361202, -0.30709561, -0.56870034, -0.51662744],
       [-0.73334642,  1.80468839,  1.7583953 , -0.51662744],
       [-0.73334642, -0.02810499, -0.56870034,  1.93563082],
       [ 1.36361202, -0.02810499, -0.56870034, -0.51662744],
       [-0.73334642, -0.1562539 ,  1.7583953 , -0.51662744],
       [-0.73334642, -1.

### Create and Evaluate the Model

In [96]:
model = LogisticRegression()
model.fit(feature_train, survival_train)

LogisticRegression()

In [97]:
print(model.score(feature_train, survival_train))
#this will return the correct classifications/accuracy

0.7963483146067416


In [98]:
print(model.score(feature_test, survival_test))

0.7932960893854749


In [102]:
#model.coef_ will print the coefficients of features
#print(model.coef_) 
print(list(zip(['Sex','Age','FirstClass','SecondClass'], model.coef_[0])))

[('Sex', 2.5078893699882325), ('Age', -0.03268692052766674), ('FirstClass', 2.2170933220846787), ('SecondClass', 1.2325603185417122)]


### Predict With Model

In [117]:
Jack = np.array([0.0, 20.0, 0.0, 0.0])
Rose = np.array([1.0, 17.0, 1.0, 0.0])
Me = np.array([1.0, 25.0, 0.0, 1.0])

In [118]:
sample_passengers = np.array([Jack, Rose, Me])

In [119]:
sample_passengers = scaler.transform(sample_passengers)



In [122]:
#Jack won't survive, but Rose and Me will

print(model.predict(sample_passengers))

[0 1 1]




In [123]:
#Jack has a 0.99 probability of not surviving, and 0.006 probability of surviving
#Rose has a 0.004 probability of not surviving, and 0.995 proability of surviving
#Me will have a 0.0399 probability of not surviving, and a 0.96 proability of surviving

print(model.predict_proba(sample_passengers))

[[0.99387787 0.00612213]
 [0.00479192 0.99520808]
 [0.03994582 0.96005418]]


