In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# read the data from csv file
passengers = pd.read_csv('train.csv')
# Per-row dict view of the raw table, kept under its original name; the
# feature extraction below no longer reads it.
data = passengers.T.to_dict()

# don't use data if age is absent: drop those rows in one vectorized step
# instead of testing every row in a Python loop. dropna keeps the remaining
# rows in their original order, so the seeded split below is unaffected.
known_age = passengers.dropna(subset=['Age'])

# Plain Python lists, one entry per kept passenger:
#   X_categorical -> [sex, p_class], X_age -> [age], y -> survived
X_categorical = known_age[['Sex', 'Pclass']].to_numpy().tolist()
X_age = known_age[['Age']].to_numpy().tolist()
y = known_age['Survived'].tolist()

# Use one hot encoding to transform the categorical data:
enc = OneHotEncoder()
features = enc.fit_transform(X_categorical).toarray()

# Combine the age vector with the transformed matrix (age is column 0,
# the one-hot columns follow)
X = np.hstack((X_age, features))

# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

print('X_train:', X_train)
print('y_train:', y_train)

# Use logistic regression to fit the model
clf = LogisticRegression().fit(X_train, y_train)

# One hand-built feature row: age first, then the one-hot columns in the
# order the encoder emitted them (inspect enc.categories_ for the mapping).
# NOTE(review): presumably the Sex columns come before the Pclass columns,
# matching the [sex, p_class] input order -- confirm before editing values.
sample = [[20, 0, 1, 0, 0, 1]]

# Print out the prediction
print(clf.predict(sample))
print(clf.predict_proba(sample))


X_train: [[50.  0.  1.  1.  0.  0.]
 [35.  1.  0.  0.  1.  0.]
 [51.  1.  0.  1.  0.  0.]
 ...
 [27.  1.  0.  0.  0.  1.]
 [41.  1.  0.  0.  1.  0.]
 [20.  1.  0.  0.  0.  1.]]
y_train: [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 

In [149]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training split.
nb = MultinomialNB().fit(X_train, y_train)

# Same hand-built feature row that the logistic regression cell scores.
query_row = [[20, 0, 1, 0, 0, 1]]

# Show the predicted class first, then the per-class probabilities.
for estimate in (nb.predict, nb.predict_proba):
    print(estimate(query_row))

[0]
[[0.89927105 0.10072895]]


In [143]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Predict the held-out test rows with each fitted model.
y_pred_logistic_reg, y_pred_naive_bayes = (
    model.predict(X_test) for model in (clf, nb)
)

# Report accuracy, then F1, for each model against the true test labels.
model_predictions = (
    ('logistic regression', y_pred_logistic_reg),
    ('naive bayes', y_pred_naive_bayes),
)
for label, y_pred in model_predictions:
    print(f'{label} accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'{label} f1 score: {f1_score(y_test, y_pred)}')

logistic regression accuracy: 0.813953488372093
logistic regression f1 score: 0.7802197802197802
naive bayes accuracy: 0.786046511627907
naive bayes f1 score: 0.7415730337078651
