# Logistic regression
## Titanic data


In [1]:
### load the data
# Read only the four columns this notebook works with, then show the first
# rows and the overall shape as a quick sanity check.
import pandas as pd
df = pd.read_csv(
    'data/titanic3.csv',
    usecols=['pclass', 'survived', 'sex', 'age'],
)
print(df.head())
print('\nDimensions of data frame:', df.shape)

   pclass  survived     sex      age
0       1         1  female  29.0000
1       1         1    male   0.9167
2       1         0  female   2.0000
3       1         0    male  30.0000
4       1         0  female  25.0000

Dimensions of data frame: (1309, 4)


In [2]:
# Replace each categorical column with its integer category codes
# (same order as before: survived, pclass, sex).
for column in ('survived', 'pclass', 'sex'):
    df[column] = df[column].astype('category').cat.codes
df.head()

Unnamed: 0,pclass,survived,sex,age
0,0,1,0,29.0
1,0,1,1,0.9167
2,0,0,0,2.0
3,0,0,1,30.0
4,0,0,0,25.0


In [3]:
# number of missing (NaN) entries in each column

df.isna().sum()

pclass        0
survived      0
sex           0
age         263
dtype: int64

In [4]:
# fill missing values
import numpy as np

# Impute missing ages with the mean age. The mean of a pandas Series skips
# NaN entries, so this is the mean of the observed ages only.
# NOTE(review): the mean is taken over the whole data set, before the
# train/test split further down, so test rows contribute to the imputed value.
age_mean = np.mean(df.age)

# Assign the filled column back explicitly. The previous form,
# `df.age.fillna(age_mean, inplace=True)`, calls an in-place method on a
# chained attribute access: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write (the default in pandas 3) it no longer modifies `df` at all,
# leaving the NaN ages in place.
df['age'] = df['age'].fillna(age_mean)

In [5]:
# train test split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
from sklearn.model_selection import train_test_split

X = df[['pclass', 'age', 'sex']]
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

print('train size:', X_train.shape)
print('test size:', X_test.shape)

train size: (1047, 3)
test size: (262, 3)


In [6]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split
# (fit returns the estimator itself), then report its training accuracy.
clf = LogisticRegression().fit(X_train, y_train)
clf.score(X_train, y_train)

0.7831900668576887

In [7]:
# predict the survival label for every row of the held-out test split

pred = clf.predict(X=X_test)

In [8]:
# evaluate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the test-set predictions with each metric in turn.
for label, metric in (
    ('accuracy score: ', accuracy_score),
    ('precision score: ', precision_score),
    ('recall score: ', recall_score),
    ('f1 score: ', f1_score),
):
    print(label, metric(y_test, pred))

accuracy score:  0.7977099236641222
precision score:  0.7901234567901234
recall score:  0.64
f1 score:  0.7071823204419889


In [9]:
# confusion matrix
# Rows are the true labels and columns the predicted labels.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=pred)

array([[145,  17],
       [ 36,  64]])

In [10]:
# build a second classifier
# Same model as before, but with class_weight='balanced'; it is scored on
# the test split with the same four metrics.

clf2 = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
pred2 = clf2.predict(X_test)
for label, metric in (
    ('accuracy score: ', accuracy_score),
    ('precision score: ', precision_score),
    ('recall score: ', recall_score),
    ('f1 score: ', f1_score),
):
    print(label, metric(y_test, pred2))

accuracy score:  0.7977099236641222
precision score:  0.7326732673267327
recall score:  0.74
f1 score:  0.7363184079601991


In [11]:
# confusion matrix for the second classifier's test-set predictions
confusion_matrix(y_true=y_test, y_pred=pred2)

array([[135,  27],
       [ 26,  74]])