# Logistic Regression Model for the Titanic Dataset

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Global seaborn plotting defaults for the notebook: white grid background,
# a fully saturated HUSL colour palette, and the larger 'talk' context.
sns.set(style='whitegrid', palette=sns.husl_palette(s=1), context='talk')

## Load data

In [2]:
# Read the training data from the working directory; 'train.csv' is a relative path.
df = pd.read_csv('train.csv')

In [3]:
# Preview the first ten rows to check the columns and spot missing values.
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(891, 12)

## Feature Engineering

One-Hot Encoding of the 'Embarked' column

In [5]:
# One indicator column per value found in 'Embarked'.
onehot = pd.get_dummies(df['Embarked'])
# Attach the indicator columns to the right of the original frame (row index is shared).
df2 = pd.concat(objs=[df, onehot], axis='columns')

Target Encoding of the 'Sex' column (Alternative to be checked: One-Hot encoding)

In [6]:
# Target encoding: each 'Sex' category is replaced by the mean of 'Survived'
# observed for that category.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/test split further down, so held-out rows contribute to their own
# feature value (target leakage). Consider computing them on the training rows only.
means = df2.groupby('Sex')['Survived'].mean()
# Series.map performs the category -> mean lookup directly and yields a float
# column; it does not rely on replace() downcasting an object column.
df2['sex_target_enc'] = df2['Sex'].map(means)
means.to_dict()

{'female': 0.7420382165605095, 'male': 0.18890814558058924}

Use Quantile Binning for 'Age' column

In [7]:
# Quantile binning of 'Age': four buckets holding (roughly) the same number of passengers.
age_bin_labels = ['age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4']
age_quartiles = pd.qcut(df2['Age'], q=4, labels=age_bin_labels)
# One indicator column per bucket; rows with a missing 'Age' fall into no bucket
# and therefore get 0 in all four columns.
qbins = pd.get_dummies(age_quartiles)
# Drop bin columns left over from a previous run of this cell before attaching
# the new ones; without this, re-running the cell would duplicate the columns in df2.
df2 = pd.concat([df2.drop(columns=age_bin_labels, errors='ignore'), qbins], axis=1)
qbins.head()

Unnamed: 0,age_qbin1,age_qbin2,age_qbin3,age_qbin4
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


Make column 'SibSp' binary

In [8]:
# 1 if the passenger has at least one sibling/spouse recorded in 'SibSp', else 0.
df2['SibSp_binary'] = df['SibSp'].ge(1).astype(int)

## Define Model Data

In [9]:
from sklearn.model_selection import train_test_split

# Engineered columns used as model inputs.
feature_cols = [
    'sex_target_enc', 'Pclass',
    'age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4',
    'S', 'C', 'Q',
    'SibSp_binary',
]
X = df2[feature_cols]
# Target column; df and df2 share the same rows.
y = df['Survived']

# Hold out 20% of the rows; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((712, 10), (179, 10), (712,), (179,))

## Create a Baseline Model to Compare to

In [10]:
from sklearn.dummy import DummyClassifier
# Baseline that ignores the features: strategy='most_frequent' always predicts
# the most frequent class seen during fit.
dummy_clf = DummyClassifier(strategy='most_frequent') 

In [11]:
# Fit the baseline on the training split only.
dummy_clf.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [12]:
# Baseline accuracy on the training split; the logistic regression should beat this value.
dummy_clf.score(X_train, y_train) #accuracy that is achieved by dummy classifier

0.6235955056179775

## Create a Logistic Regression Model

In [13]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default hyperparameters (no arguments are overridden).
lr_clf = LogisticRegression()

In [14]:
# Train on the training split; the held-out split is not touched here.
lr_clf.fit(X_train, y_train) 


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Accuracy on the data the model was fitted on (an optimistic estimate; compare with the baseline).
lr_clf.score(X_train, y_train)

0.8047752808988764

## Cross validation

In [16]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validated accuracy, computed on the training split only.
scores = cross_val_score(
    estimator=lr_clf,
    X=X_train,
    y=y_train,
    cv=4,
    scoring='accuracy',
)
scores

array([0.76966292, 0.83707865, 0.76966292, 0.78089888])

In [17]:
# Mean and spread of the fold accuracies, rounded to three decimals.
cv_mean, cv_std = scores.mean(), scores.std()
cv_mean.round(3), cv_std.round(3)

(0.789, 0.028)

## Test against test data 

In [18]:
# Accuracy on the training split versus the held-out split.
train_acc = lr_clf.score(X_train, y_train)
test_acc = lr_clf.score(X_test, y_test)
# The builtin round() works whether score() returns a NumPy scalar or a plain
# Python float (which has no .round method); the printed text is unchanged.
print('training score: ', round(train_acc, 3))
print('test score: ', round(test_acc, 3))

training score:  0.805
test score:  0.816


## Load Kaggle Test Data (feature-engineered in a separate notebook: Testdata_LogReg01_Titanic.ipynb)

In [22]:
# Load the pre-engineered Kaggle test features; the first CSV column is used as the row index.
# NOTE(review): the columns must match the training features in name and order --
# this depends on the separate feature-engineering notebook; confirm when either side changes.
X_testdataset = pd.read_csv('test_feateng_LogReg01.csv', index_col=0)
X_testdataset

Unnamed: 0,sex_target_enc,Pclass,age_qbin1,age_qbin2,age_qbin3,age_qbin4,S,C,Q,SibSp_binary
0,0.188908,3,0,0,1,0,0,0,1,0
1,0.742038,3,0,0,0,1,1,0,0,1
2,0.188908,2,0,0,0,1,0,0,1,0
3,0.188908,3,0,1,0,0,1,0,0,0
4,0.742038,3,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,0.188908,3,0,0,0,0,1,0,0,0
414,0.742038,1,0,0,1,0,0,1,0,0
415,0.188908,3,0,0,1,0,1,0,0,0
416,0.188908,3,0,0,0,0,1,0,0,0


Predict outcome for test dataset

In [33]:
# Predicted class for every row of the Kaggle test features, as a one-column frame.
test_predictions = lr_clf.predict(X_testdataset)
y_pred_testdataset = pd.DataFrame({'Survived': test_predictions})
y_pred_testdataset

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


## Put Together CSV for submitting to Kaggle

In [35]:
# Passenger ids come from the raw Kaggle test file; only that column is read.
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])
# Place the ids and the predictions side by side; rows are matched on the
# default 0..n-1 index, so both frames must be in the same row order.
Kaggle_LogReg01 = pd.concat([passenger_ids, y_pred_testdataset], axis=1)
Kaggle_LogReg01.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [37]:
# Write the submission file; index=False keeps the row index out of the CSV so
# only the PassengerId and Survived columns are written.
Kaggle_LogReg01.to_csv('Kaggle_LogReg01.csv', index=False)