# Titanic - Logistic Regression

![title](image/Titanic_wreck.jpg)

## Kaggle Data Set

In [92]:
# https://www.kaggle.com/c/titanic/data

## Exploratory Data Analysis

In [93]:
import pandas as pd

In [94]:
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [95]:
train_data.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [96]:
test_data = pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [97]:
test_data.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

## Fill the NaN values in the age column

In [98]:
train_data['Age'].fillna(value = round(train_data['Age'].mean()), inplace = True)
train_data['Age'][0:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5    30.0
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [99]:
test_data['Age'].fillna(value = round(test_data['Age'].mean()), inplace = True)
test_data['Age'][0:10]

0    34.5
1    47.0
2    62.0
3    27.0
4    22.0
5    14.0
6    30.0
7    26.0
8    18.0
9    21.0
Name: Age, dtype: float64

## Create a First Class column from Pclass

In [100]:
train_data['FirstClass'] = train_data['Pclass'].apply(lambda x: 1 if x == 1 else 0)
train_data['FirstClass'][0:10]

0    0
1    1
2    0
3    1
4    0
5    0
6    1
7    0
8    0
9    0
Name: FirstClass, dtype: int64

In [101]:
test_data['FirstClass'] = test_data['Pclass'].apply(lambda x: 1 if x == 1 else 0)
test_data['FirstClass'][0:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: FirstClass, dtype: int64

## Create a Second Class column from Pclass

In [102]:
train_data['SecondClass'] = train_data['Pclass'].apply(lambda x: 1 if x == 2 else 0)
train_data['SecondClass'][0:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    1
Name: SecondClass, dtype: int64

In [103]:
test_data['SecondClass'] = test_data['Pclass'].apply(lambda x: 1 if x == 2 else 0)
test_data['SecondClass'][0:10]

0    0
1    0
2    1
3    0
4    0
5    0
6    0
7    1
8    0
9    0
Name: SecondClass, dtype: int64

## Update Sex column to numerical

In [104]:
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
train_data['Sex'][0:10]

0    0
1    1
2    1
3    1
4    0
5    0
6    0
7    0
8    1
9    1
Name: Sex, dtype: int64

In [105]:
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})
test_data['Sex'][0:10]

0    0
1    1
2    0
3    0
4    1
5    0
6    1
7    0
8    1
9    0
Name: Sex, dtype: int64

## Select the desired features

In [106]:
train_data_features = train_data[['Sex', 'Age', 'FirstClass', 'SecondClass']]
train_data_survival = train_data['Survived']

In [107]:
test_data_features = train_data[['Sex', 'Age', 'FirstClass', 'SecondClass']]
test_data_survival = train_data['Survived']

# Sklearn

## Perform train, test, split with

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
train_features, test_features, train_labels, test_labels = train_test_split(train_data_features, train_data_survival)

## Scale the feature data so it has mean = 0 and standard deviation = 1

In [110]:
from sklearn.preprocessing import StandardScaler

In [111]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

In [112]:
train_features[0:10]

array([[-0.71506099,  0.58423631,  1.81164325, -0.52654217],
       [ 1.39848211, -0.42381219, -0.55198505,  1.89918313],
       [-0.71506099,  0.42915192,  1.81164325, -0.52654217],
       [-0.71506099, -1.04414973, -0.55198505, -0.52654217],
       [ 1.39848211,  0.04144096, -0.55198505,  1.89918313],
       [-0.71506099,  0.04144096, -0.55198505,  1.89918313],
       [-0.71506099, -0.42381219,  1.81164325, -0.52654217],
       [-0.71506099,  0.04144096, -0.55198505, -0.52654217],
       [-0.71506099,  0.04144096, -0.55198505, -0.52654217],
       [ 1.39848211, -1.12169192, -0.55198505, -0.52654217]])

In [113]:
test_features[0:10]

array([[ 1.39848211, -0.26872781, -0.55198505,  1.89918313],
       [ 1.39848211,  1.12703165,  1.81164325, -0.52654217],
       [ 1.39848211, -1.97465603, -0.55198505,  1.89918313],
       [-0.71506099, -2.05219823, -0.55198505, -0.52654217],
       [-0.71506099,  0.42915192, -0.55198505, -0.52654217],
       [ 1.39848211,  0.6617785 ,  1.81164325, -0.52654217],
       [-0.71506099, -0.11364342, -0.55198505, -0.52654217],
       [ 1.39848211,  0.04144096,  1.81164325, -0.52654217],
       [-0.71506099,  0.04144096, -0.55198505, -0.52654217],
       [ 1.39848211, -0.42381219, -0.55198505, -0.52654217]])

## Create and train the model

In [114]:
from sklearn.linear_model import LogisticRegression

In [115]:
model = LogisticRegression()
model.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Score the model on the train data

In [116]:
print(model.score(train_features, train_labels))

0.7934131736526946


In [117]:
print(model.score(test_features, test_labels))

0.7982062780269058


## Analyze the coefficients

In [118]:
print(list(zip(['Sex','Age','FirstClass','SecondClass'],model.coef_[0])))

[('Sex', 1.2158815350006071), ('Age', -0.362745429180737), ('FirstClass', 0.933525662478166), ('SecondClass', 0.4944098240780029)]


## Prediction - Did you survived?

In [119]:
import numpy as np

In [122]:
# Age, Sex, FirstClass, SecondClass
me = np.array([0.0,44,0.0,1.0])
me = me.reshape(1, -1)

In [123]:
# Scale your features
me = scaler.transform(me)
print(me)

[[-0.71506099  1.12703165 -0.55198505  1.89918313]]


## Make survival predictions

In [124]:
print(model.predict(me))
print(model.predict_proba(me))

[0]
[[0.82703287 0.17296713]]
