# Random Forest Model for the Titanic Dataset

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Configure seaborn's global plot style once for the whole notebook:
# 'whitegrid' axes style, a HUSL colour palette with saturation s=1, and the 'talk' context.
sns.set(style='whitegrid', palette=sns.husl_palette(s=1), context='talk')

## Load data

In [2]:
# Read the raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [3]:
# Preview the first ten rows to see the available columns and spot missing values (e.g. Age, Cabin).
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# (number of rows, number of columns) of the raw training frame.
df.shape

(891, 12)

## Feature Engineering

One-Hot Encoding of the 'Embarked' column

In [5]:
# Build one indicator column per value of 'Embarked'; the new columns are
# named after the values themselves (the feature list later selects 'S', 'C' and 'Q').
onehot = pd.get_dummies(data=df['Embarked'])

# Append the indicator columns to the right of the raw frame (row-aligned on the index).
df2 = pd.concat(objs=[df, onehot], axis='columns')

Target Encoding of the 'Sex' column (Alternative to be checked: One-Hot encoding)

In [6]:
# Target encoding: replace each 'Sex' label by the mean of 'Survived' for that label.
means = df2.groupby('Sex')['Survived'].mean()

# Series.map looks every label up in `means` and always yields a numeric column.
# The previous Series.replace(dict) call depended on pandas silently downcasting an
# object column to float, which newer pandas versions deprecate. A label that is
# missing from `means` becomes NaN here instead of being left as a string.
df2['sex_target_enc'] = df2['Sex'].map(means)

# NOTE(review): the means are computed over all rows of df2, i.e. before the
# train/test split further down, so the hold-out rows contribute to this feature
# (target leakage). Consider computing the means on the training rows only.

# Show the label -> survival-rate mapping that was applied.
means.to_dict()

{'female': 0.7420382165605095, 'male': 0.18890814558058924}

Make binary 'Sex' columns

In [7]:
# Derive one 0/1 indicator column per label of the 'Sex' column;
# each new column carries the label's name ('male', 'female').
for sex_label in ('male', 'female'):
    df2[sex_label] = df2['Sex'].eq(sex_label).astype(int)

Use Quantile Binning for 'Age' column

In [8]:
# Quantile binning of 'Age': four buckets delimited by the quartiles, so each
# bucket holds roughly the same number of passengers with a known age.
age_labels = ['age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4']
age_quartile = pd.qcut(df2['Age'], q=4, labels=age_labels)

# One indicator column per bucket, appended to the feature frame.
qbins = pd.get_dummies(age_quartile)
df2 = pd.concat(objs=[df2, qbins], axis='columns')
qbins.head()

Unnamed: 0,age_qbin1,age_qbin2,age_qbin3,age_qbin4
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


Make 'family' column

In [9]:
# Family size on board: siblings/spouses + parents/children + the passenger.
family_size = df['SibSp'] + df['Parch'] + 1
df2['family'] = family_size

# Three mutually exclusive 0/1 size classes: alone, 2-4 members, more than 4.
df2['fam_single'] = family_size.eq(1).astype(int)
df2['fam_small'] = family_size.between(2, 4).astype(int)
df2['fam_big'] = family_size.gt(4).astype(int)

Make 'male singles' column

In [10]:
# Interaction feature: 1 only where both 0/1 flags 'fam_single' and 'male' are 1.
df2['single_male'] = df2['fam_single'].mul(df2['male'])

## Define Model Data

In [11]:
# Target vector and feature matrix
feature_cols = [
    'sex_target_enc', 'Pclass',
    'age_qbin1', 'age_qbin2', 'age_qbin3', 'age_qbin4',
    'S', 'C', 'Q',
    'single_male', 'fam_small', 'fam_big',
]
y = df['Survived']
X = df2[feature_cols]

# Hold out 20% of the rows as test data; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shapes of the four resulting pieces, in the order train X, test X, train y, test y.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((712, 12), (179, 12), (712,), (179,))

## Create a Baseline Model to Compare to

In [12]:
from sklearn.dummy import DummyClassifier
# Baseline that ignores the features and always predicts the most frequent class seen during fit.
dummy_clf = DummyClassifier(strategy='most_frequent') 

In [13]:
# Fit the baseline on the training split only.
dummy_clf.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [14]:
# Accuracy of the baseline on the training split, i.e. the share of the majority class there.
# Any real model should beat this number.
dummy_clf.score(X_train, y_train)

0.6235955056179775

## Create a Random Forest Model

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 50 trees. oob_score=True additionally requests an out-of-bag
# score during fit; random_state pins the randomness so that results are repeatable.
rf_clf = RandomForestClassifier(
    n_estimators=50,
    oob_score=True,
    random_state=42,
)

In [25]:
# Train the forest on the training split only; the test split stays untouched for later evaluation.
rf_clf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [26]:
# Accuracy on the same rows the model was fitted on, so this is an optimistic estimate.
rf_clf.score(X_train, y_train)

0.8370786516853933

## Cross validation

In [27]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validated accuracy of the forest, computed on the training split only.
scores = cross_val_score(
    estimator=rf_clf,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=4,
)
scores

array([0.76404494, 0.84831461, 0.75842697, 0.82022472])

In [28]:
# Mean and standard deviation of the per-fold accuracies, rounded to three decimals.
scores.mean().round(3), scores.std().round(3)

(0.798, 0.038)

## Hyperparameter Optimization
Randomized Search

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [33]:
# Search space for the randomized search: candidate tree counts 20, 40, ..., 480.
# Passing a plain list means candidates are drawn from exactly these values.
param_distributions = {
    'n_estimators': [n_trees for n_trees in range(20, 500, 20)],
}

In [34]:
# Randomized hyperparameter search over the number of trees.
# The number of sampled candidates and the number of CV folds are left at their defaults.
randomizedcv = RandomizedSearchCV(
    rf_clf,
    param_distributions=param_distributions,
    random_state=42,  # fixed seed: without it a different set of candidates is drawn on every run
)
randomizedcv.fit(X_train, y_train)

# One row per sampled candidate; keep only the columns needed to compare them.
random_results = pd.DataFrame(randomizedcv.cv_results_)
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'param_n_estimators']

# NOTE(review): the outcome of this search is not used further down; the following
# cells keep scoring and refitting `rf_clf` with its original n_estimators.
random_results[columns].sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_n_estimators
2,0.794928,0.018155,0.037814,20
6,0.794918,0.02347,0.355101,200
5,0.793519,0.018307,0.601724,340
7,0.793519,0.024318,0.180865,100
3,0.792121,0.016553,0.690318,380
0,0.792111,0.020398,0.548093,300
1,0.790712,0.017644,0.849105,480
4,0.790712,0.017644,0.822033,460
9,0.790712,0.01523,0.744842,420
8,0.786487,0.016143,0.4226,240


## Test against test data 

In [36]:
# Compare accuracy on the rows used for fitting with accuracy on the held-out rows;
# a large gap between the two would point to overfitting.
train_accuracy = rf_clf.score(X_train, y_train).round(3)
test_accuracy = rf_clf.score(X_test, y_test).round(3)
print('training score: ', train_accuracy)
print('test score: ', test_accuracy)

training score:  0.837
test score:  0.81


## Train model with whole dataset

In [38]:
# Refit the same estimator on all labelled rows before predicting the Kaggle test set.
# After this call X_test is part of the fitted data, so scoring on it is no longer a hold-out estimate.
rf_clf.fit(X, y) 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [39]:
# Accuracy on the full labelled data the model was just fitted on (training accuracy, not a generalisation estimate).
rf_clf.score(X,y)

0.8406285072951739

## Load Kaggle Test Data (feature-engineered in a separate notebook: Testdata_LogReg01_Titanic.ipynb)

In [40]:
# Load the pre-engineered Kaggle test features; index_col=0 uses the first CSV column as the row index.
# The columns shown below are the same features, in the same order, as the ones selected for X.
X_testdataset = pd.read_csv('test_feateng_LogReg02.csv', index_col=0)
X_testdataset

Unnamed: 0,sex_target_enc,Pclass,age_qbin1,age_qbin2,age_qbin3,age_qbin4,S,C,Q,single_male,fam_small,fam_big
0,0.188908,3,0,0,1,0,0,0,1,1,0,0
1,0.742038,3,0,0,0,1,1,0,0,0,1,0
2,0.188908,2,0,0,0,1,0,0,1,1,0,0
3,0.188908,3,0,1,0,0,1,0,0,1,0,0
4,0.742038,3,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.188908,3,0,0,0,0,1,0,0,1,0,0
414,0.742038,1,0,0,1,0,0,1,0,0,0,0
415,0.188908,3,0,0,1,0,1,0,0,1,0,0
416,0.188908,3,0,0,0,0,1,0,0,1,0,0


Predict outcome for test dataset

In [41]:
# Predict the class label for every Kaggle test row and wrap the result in a
# one-column frame named 'Survived', as the submission file expects.
survival_pred = rf_clf.predict(X_testdataset)
y_pred_testdataset = pd.DataFrame({'Survived': survival_pred})
y_pred_testdataset

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,0
...,...
413,0
414,1
415,0
416,0


## Put Together CSV for submitting to Kaggle

In [43]:
# Take the passenger ids from the raw Kaggle test file and put the predictions next to them.
# The two frames are aligned on their row index, so both must be in the same row order.
passenger_ids = pd.read_csv('test.csv')[['PassengerId']]
Kaggle_RandomForest01 = pd.concat(objs=[passenger_ids, y_pred_testdataset], axis='columns')
Kaggle_RandomForest01.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [44]:
# Write the submission file; index=False keeps the row index out so only PassengerId and Survived are written.
Kaggle_RandomForest01.to_csv('Kaggle_RandomForest01.csv', index=False)