In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

In [3]:
data = pd.read_csv('/data/titanic.csv')

In [4]:
data.head(n=7)

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses,Parents/Children,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05
5,0,3,male,27.0,0,0,8.4583
6,0,1,male,54.0,0,0,51.8625


In [5]:
data['Gender'] = data['Sex'] == 'male'

In [6]:
data.head(n=7)

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses,Parents/Children,Fare,Gender
0,0,3,male,22.0,1,0,7.25,True
1,1,1,female,38.0,1,0,71.2833,False
2,1,3,female,26.0,0,0,7.925,False
3,1,1,female,35.0,1,0,53.1,False
4,0,3,male,35.0,0,0,8.05,True
5,0,3,male,27.0,0,0,8.4583,True
6,0,1,male,54.0,0,0,51.8625,True


In [7]:
Q = data[['Pclass', 'Gender', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
R = data['Survived'].values

In [8]:
model = LogisticRegression()

In [9]:
Q_train, Q_test, R_train, R_test = train_test_split(Q, R, test_size=0.3, random_state=1, stratify=R)

In [10]:
print('Q Train:', Q_train.shape)
print('Q Train:', Q_test.shape)

print('Q Train:', R_train.shape)
print('Q Train:', R_test.shape)

Q Train: (620, 6)
Q Train: (267, 6)
Q Train: (620,)
Q Train: (267,)


In [11]:
model.fit(Q_train, R_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
prediction_data = [[4, 35, True, 0, 0, 80000],
                   [3, 26, False, 1, 1, 70000]]

In [13]:
model.predict(prediction_data)

array([1, 1], dtype=int64)

In [14]:
model.predict_proba(prediction_data)

array([[0., 1.],
       [0., 1.]])

In [15]:
model.score(Q_test, R_test)

0.797752808988764

In [19]:
kf = KFold(n_splits=5, shuffle=True)

splits = list(kf.split(Q))
train_indices, test_indices = splits[0]

newQ_train = Q[train_indices]
newQ_test = Q[test_indices]
newR_train = R[train_indices]
newR_test = R[test_indices]

model2 = LogisticRegression()
model2.fit(newQ_train, newR_train)
print('K-Fold Score:', model2.score(newQ_test, newR_test))
print('70-30 split score:', model.score(Q_test, R_test))

K-Fold Score: 0.8202247191011236
70-30 split score: 0.797752808988764


