Predicting the survival of Titanic passengers with logistic regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = '/home/temi/Desktop/ML/titanic/train.csv'
passengers = pd.read_csv(train_csv_path)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [62]:
# Pearson correlation between the numeric columns, shaded with a diverging colormap.
# Text columns (Name, Ticket, Cabin, Embarked, and Sex before it is encoded)
# cannot be correlated; pandas >= 2.0 raises on them instead of silently
# dropping them, so restrict the frame to numeric dtypes first.
numeric_columns = passengers.select_dtypes(include='number')
numeric_columns.corr(method='pearson').style.background_gradient(cmap='coolwarm')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.033019,-0.057527,-0.001652,0.012658,0.034303,-8.6e-05
Survived,-0.005007,1.0,-0.338481,0.543351,-0.070657,-0.035322,0.081629,0.257307,0.285904,0.093349
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.329727,0.083081,0.018443,-0.5495,-0.885924,-0.188432
Sex,-0.042939,0.543351,-0.1319,1.0,-0.08466,0.114631,0.245489,0.182333,0.098013,0.064746
Age,0.033019,-0.070657,-0.329727,-0.08466,1.0,-0.23244,-0.18033,0.090632,0.31912,0.004949
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.23244,1.0,0.414838,0.159651,-0.054582,-0.055932
Parch,-0.001652,0.081629,0.018443,0.245489,-0.18033,0.414838,1.0,0.216225,-0.017633,-0.000734
Fare,0.012658,0.257307,-0.5495,0.182333,0.090632,0.159651,0.216225,1.0,0.591711,-0.118557
FirstClass,0.034303,0.285904,-0.885924,0.098013,0.31912,-0.054582,-0.017633,0.591711,1.0,-0.288585
SecondClass,-8.6e-05,0.093349,-0.188432,0.064746,0.004949,-0.055932,-0.000734,-0.118557,-0.288585,1.0


In [22]:
# Encode the Sex column numerically: female -> 1, male -> 0.
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# column would match none of the dictionary keys and turn every value into NaN.
sex_codes = {'female': 1, 'male': 0}
if not pd.api.types.is_numeric_dtype(passengers['Sex']):
    passengers['Sex'] = passengers['Sex'].map(sex_codes)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [24]:
# Replace missing ages with the mean age rounded to a whole number.
# The result is assigned back to the column: calling fillna(inplace=True) on
# passengers['Age'] operates on a selected Series, which under pandas
# copy-on-write is a temporary, so the DataFrame would keep its NaN values.
train_mean_age = round(passengers['Age'].mean())
passengers['Age'] = passengers['Age'].fillna(train_mean_age)

# Indicator columns: 1 when Pclass equals the given class, 0 otherwise.
# Any other Pclass value gets 0 in both columns.
passengers['FirstClass'] = (passengers['Pclass'] == 1).astype(int)
passengers['SecondClass'] = (passengers['Pclass'] == 2).astype(int)

passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0


In [12]:
# Choose the predictor columns and the target column for the model.
feature_columns = ['Sex', 'Age', 'FirstClass', 'SecondClass']
features = passengers[feature_columns]
survival = passengers['Survived']

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    survival,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Standardize the features (mean = 0, standard deviation = 1).
# The scaler is fitted on the training rows only and then applied, unchanged,
# to both the training rows and the held-out rows.
scalar = StandardScaler()
scalar.fit(train_features)
train_features = scalar.transform(train_features)
test_features = scalar.transform(test_features)

In [16]:
# Build a logistic regression classifier with default settings and fit it
# to the scaled training features and their survival labels.
model = LogisticRegression()
model.fit(X=train_features, y=train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Evaluate the fitted model on the rows it was trained on.
train_accuracy = model.score(train_features, train_labels)
print(train_accuracy)

0.7949438202247191


In [18]:
# Evaluate the fitted model on the held-out rows.
holdout_accuracy = model.score(test_features, test_labels)
print(holdout_accuracy)

0.8044692737430168


In [19]:
# Show the fitted coefficients; columns follow the feature order
# Sex, Age, FirstClass, SecondClass.
coefficients = model.coef_
print(coefficients)

[[ 1.21311641 -0.32759544  0.8610146   0.51069337]]


In [36]:
# Read the separate test CSV into a DataFrame and preview its first rows.
test_csv_path = '/home/temi/Desktop/ML/titanic/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [37]:
# Encode the Sex column numerically: female -> 1, male -> 0.
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# column would match none of the dictionary keys and turn every value into NaN.
sex_codes = {'female': 1, 'male': 0}
if not pd.api.types.is_numeric_dtype(test_data['Sex']):
    test_data['Sex'] = test_data['Sex'].map(sex_codes)

# Replace missing ages with this file's mean age rounded to a whole number.
# The result is assigned back to the column: calling fillna(inplace=True) on
# test_data['Age'] operates on a selected Series, which under pandas
# copy-on-write is a temporary, so the DataFrame would keep its NaN values.
# NOTE(review): the fill value is computed from the test file itself, not from
# the training data — confirm that this is intended.
test_mean_age = round(test_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_mean_age)

# Indicator columns: 1 when Pclass equals the given class, 0 otherwise.
test_data['FirstClass'] = (test_data['Pclass'] == 1).astype(int)
test_data['SecondClass'] = (test_data['Pclass'] == 2).astype(int)

test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S,0,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q,0,1
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S,0,0


In [40]:
# Pick the model's input columns from test_data and scale them with the
# scaler fitted earlier (transform only, no refit).
# Note: this rebinds test_features, replacing the held-out split from the
# train/test split cell.
feature_columns = ['Sex', 'Age', 'FirstClass', 'SecondClass']
test_features = scalar.transform(test_data[feature_columns])

array([[-0.75592895,  0.33942386, -0.58655899, -0.5349335 ],
       [ 1.32287566,  1.32992353, -0.58655899, -0.5349335 ],
       [-0.75592895,  2.51852314, -0.58655899,  1.86939125],
       ...,
       [-0.75592895,  0.65638375, -0.58655899, -0.5349335 ],
       [-0.75592895, -0.01715602, -0.58655899, -0.5349335 ],
       [-0.75592895, -0.01715602, -0.58655899, -0.5349335 ]])

In [43]:
# Predict a survival class for every scaled row, and also keep the
# per-class probabilities behind those predictions.
prediction_prob = model.predict_proba(test_features)
prediction = model.predict(test_features)

In [56]:
# Map each PassengerId to its predicted label by pairing the id column and
# the prediction array position by position.
passenger_ids = test_data['PassengerId'].values
submission_dict = dict(zip(passenger_ids, prediction))

In [63]:
# Write the predictions to a two-column CSV file.
import csv

submission_path = '/home/temi/Desktop/ML/titanic/titanic_submission.csv'

# newline='' hands line-ending control to the csv writer, which is told to
# emit '\n' so the rows end the same way as before.
with open(submission_path, 'w', newline='') as submission:
    writer = csv.writer(submission, lineterminator='\n')
    # Header row naming the two columns; previously the file started directly
    # with data. NOTE(review): assumes the consumer of this file (presumably a
    # Kaggle Titanic submission) expects a "PassengerId,Survived" header — confirm.
    writer.writerow(['PassengerId', 'Survived'])
    # One "id,prediction" row per entry, in the dictionary's insertion order.
    writer.writerows(submission_dict.items())