In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
### Data management
## input
df = pd.read_csv("data/train.csv")

## grouping into int categories
# Series.map is used instead of Series.replace: replacing strings with ints
# relies on pandas silently downcasting the object column, which newer pandas
# releases deprecate. With map, any label missing from the dict (including a
# missing value) becomes NaN instead of being left as a string.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
# Missing Embarked values stay NaN here; they are imputed in the
# "handling nans" step further down in this cell.
df["Embarked"] = df["Embarked"].map({'C': 0, 'S': 1, 'Q': 2})
# Cabin is reduced to a 0/1 "has a cabin entry" flag. notna() tests for
# missingness by value rather than by object identity (`x is np.nan`).
df["Cabin"] = df["Cabin"].notna().astype(int)

## handling names
# Honorific token (as it appears in the name string, trailing dot included)
# -> integer title category. Synonymous titles share a category.
title_dict = {"Mr.":1, "Mrs.":2, "Mme.":2, "Miss.":3, "Mlle.":3, "Master.":4, "Dr.":5, "Rev.": 6}

def name_lookup(name_str):
    """Return the title category of the first known honorific in a name.

    Parameters
    ----------
    name_str : str
        Full name; it is split on whitespace and each token is compared
        against the keys of ``title_dict``.

    Returns
    -------
    int
        ``title_dict`` value of the first matching token, or 0 when no token
        matches or when ``name_str`` is not a string (e.g. a NaN coming from
        a missing value).
    """
    # Missing names arrive as float NaN, which has no .split(); treat them
    # like any other name without a recognised title.
    if not isinstance(name_str, str):
        return 0
    for word in name_str.split():
        code = title_dict.get(word)
        if code is not None:
            return code
    return 0

# Replace each full name by its integer title category.
df["Name"] = df["Name"].map(name_lookup)

## handling nans
# Age: impute with the column median.
age_median = df["Age"].median()
df["Age"] = df["Age"].fillna(age_median)
# Embarked: impute with the most frequent value (first one if there are ties).
embarked_mode = df["Embarked"].mode().iloc[0]
df["Embarked"] = df["Embarked"].fillna(embarked_mode)

## get dummies
# One-hot encode each categorical column: the original column is dropped and
# its indicator columns (named "<column>_<value>") are appended on the right.
for cat_col in ("Pclass", "Name"):
    indicator_cols = pd.get_dummies(df[cat_col], prefix=cat_col)
    remaining_cols = df.drop(columns=cat_col)
    df = pd.concat([remaining_cols, indicator_cols], axis=1)

## creating input and output dataframes
# Target is the Survived column; the inputs are everything except the target
# and the PassengerId / Ticket columns.
yf = df["Survived"]
Xf = df.drop(columns=["PassengerId", "Survived", "Ticket"])

## normalizing
# Z-score the continuous columns using statistics of the full table
# (pandas' default sample standard deviation).
features = ["Age", "Fare"]
feature_mean = Xf[features].mean()
feature_std = Xf[features].std()
Xf[features] = (Xf[features] - feature_mean) / feature_std

In [3]:
## Logistic regression calculation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before removing it,
# since dropping it may change the fitted model.
clf_logReg = LogisticRegression(solver='newton-cg', multi_class='multinomial')

# One seeded generator is shared by all splits: every iteration still draws a
# different random subset, but re-running the cell reproduces the same curve.
SPLIT_SEED = 0
split_rng = np.random.RandomState(SPLIT_SEED)

# frac_train: share of the drawn samples used for training (rest is test).
frac_train = 0.75
m, n = Xf.shape
# frac_tot: share of the whole dataset drawn (train + test) at each step.
frac_tot_arr = np.linspace(0.05, 1, 401)
score_train_arr_logReg, score_test_arr_logReg = [], []
for frac_tot in frac_tot_arr:
    # single-line progress bar, overwritten in place via '\r'
    print("#"*int(100*frac_tot), "{0:.0f}%".format(100*frac_tot), end='\r')
    # randomly choosing samples and splitting into train and test
    Xnp_train, Xnp_test, ynp_train, ynp_test = train_test_split(
        Xf, yf,
        train_size=frac_tot*frac_train,
        test_size=frac_tot*(1-frac_train),
        random_state=split_rng,
    )
    # training (fit() is called again on every split and returns the estimator itself)
    clf_logReg.fit(Xnp_train, ynp_train)
    # evaluating -- accuracy_score takes (y_true, y_pred) in that order
    score_train_arr_logReg.append(accuracy_score(ynp_train, clf_logReg.predict(Xnp_train)))
    score_test_arr_logReg.append(accuracy_score(ynp_test, clf_logReg.predict(Xnp_test)))

#################################################################################################### 100%

In [4]:
## Logistic regression plotting
# Learning curve: train and test accuracy against the fraction of the dataset
# that was drawn (train + test together) for each fit.
fig, fax = plt.subplots()
fax.plot(frac_tot_arr, score_train_arr_logReg, label="train")
fax.plot(frac_tot_arr, score_test_arr_logReg, label="test")
fax.set_xlabel("fraction of dataset used (train + test)")
fax.set_ylabel("accuracy")
fax.set_title("Logistic regression learning curve")
fax.legend()
plt.show()
# Uses the classifier and test split left over from the last loop iteration
# of the calculation cell; accuracy_score takes (y_true, y_pred).
print("Final accuracy for logistic regression", accuracy_score(ynp_test, clf_logReg.predict(Xnp_test)))

Final accuracy for logistic regression 0.8609865470852018
