In [1]:
import pandas as pd

In [21]:
# Load the preprocessed Titanic data from the CSV file.
titanic_df = pd.read_csv('datasets/titanic_processed.csv')

# Preview only a handful of rows; a short sample is enough to check the columns
# without flooding the notebook with output.
titanic_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SiblingSpouse,ParentChild,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,27.0,0,0,7.925,0,0,1
1,1,3,1,0.42,0,1,8.5167,1,0,0
2,0,3,0,22.0,0,0,9.8375,0,0,1
3,1,3,0,5.0,4,2,31.3875,0,0,1
4,0,3,1,28.0,0,0,9.5,0,0,1
5,1,3,0,19.0,0,0,7.8792,0,1,0
6,1,3,0,22.0,0,0,7.75,0,1,0
7,0,3,1,17.0,0,0,7.125,0,0,1
8,1,2,0,6.0,0,1,33.0,0,0,1
9,1,3,0,31.0,0,0,8.6833,0,0,1


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
titanic_df.shape

(712, 10)

In [4]:
from sklearn.model_selection import train_test_split

# Features: every column except the target. Target: the 'Survived' column.
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']

# Hold out 20% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore every metric computed from this split, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [5]:
# Shapes of the training features and the training labels.
x_train.shape, y_train.shape

((569, 9), (569,))

In [6]:
# Shapes of the held-out test features and test labels.
x_test.shape, y_test.shape

((143, 9), (143,))

### Logistic regression for classification

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [7]:
from sklearn.linear_model import LogisticRegression

# Configure the classifier first, then train it on the training split.
# fit() returns the estimator, so logistic_model is bound to the fitted model.
classifier = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
logistic_model = classifier.fit(x_train, y_train)

In [8]:
# Predict labels for the held-out test features with the fitted model.
y_pred = logistic_model.predict(x_test)

### Confusion matrix

In [9]:
# Put the true test labels and the model's predictions side by side, one row
# per test sample.
pred_results = pd.DataFrame(data=dict(y_test=y_test, y_pred=y_pred))

In [13]:
# Preview the first rows of actual vs. predicted labels.
pred_results.head()

Unnamed: 0,y_test,y_pred
527,0,0
55,1,0
93,1,0
402,0,0
461,1,0


In [11]:
# Cross-tabulate predicted labels (rows) against true labels (columns) to get
# the confusion matrix as a DataFrame.
predicted_labels = pred_results['y_pred']
actual_labels = pred_results['y_test']
titanic_crosstab = pd.crosstab(index=predicted_labels, columns=actual_labels)

titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,71,21
1,14,37


### Precision-recall scores

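When these metrics are used for multiclass classification, we need to specify an averaging method (the `average` parameter) that determines how the per-label precision and recall scores are combined. For the binary problem in this notebook the default, `average='binary'`, reports the scores for the positive class only (`Survived = 1`).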

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [15]:
# Score the test-set predictions with scikit-learn's accuracy, precision and
# recall functions, evaluated in that order.
acc, prec, recall = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)

# Print each score next to its label.
for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
):
    print(label, value)

accuracy_score :  0.7552447552447552
precision_score :  0.7254901960784313
recall_score :  0.6379310344827587


In [16]:
# Re-display the confusion matrix (rows: y_pred, columns: y_test).
titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,71,21
1,14,37


In [17]:
# titanic_crosstab has predicted labels as rows and actual labels as columns.
# .loc[row, col] names both axes in one lookup, avoiding chained [col][row]
# indexing where the order of the two subscripts is easy to misread.
TP = titanic_crosstab.loc[1, 1]  # predicted 1, actual 1
TN = titanic_crosstab.loc[0, 0]  # predicted 0, actual 0
FP = titanic_crosstab.loc[1, 0]  # predicted 1, actual 0
FN = titanic_crosstab.loc[0, 1]  # predicted 0, actual 1

In [18]:
# Accuracy by hand: correctly classified samples over all samples.
correct_predictions = TP + TN
all_predictions = TP + FP + TN + FN
accuracy_score_verified = correct_predictions / all_predictions

accuracy_score_verified

0.7552447552447552

In [19]:
# Precision by hand: of everything predicted as class 1, the fraction that
# really is class 1.
predicted_positives = TP + FP
precision_score_survived = TP / predicted_positives

precision_score_survived

0.7254901960784313

In [20]:
# Recall by hand: of everything that really is class 1, the fraction the model
# predicted as class 1.
actual_positives = TP + FN
recall_score_survived = TP / actual_positives

recall_score_survived

0.6379310344827587