In [134]:
import pandas as pd
import numpy as np
import math
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Load the passenger table from disk; one row per passenger.
passengers_raw = pd.read_csv('train.csv')

# Row-indexed dict view ({row index: {column name: value}}), the same
# structure the former `.T.to_dict()` produced. Kept so that any other cell
# still reading `data` keeps working.
data = passengers_raw.to_dict(orient='index')

# Age is used as a model feature, so rows without an age are dropped
# (not imputed). Row order of the remaining passengers is preserved.
passengers = passengers_raw.dropna(subset=['Age'])

# Plain Python lists, one entry per kept row:
#   X_categorical -> [sex, p_class]
#   X_age         -> [age]
#   y_train       -> survived label
X_categorical = passengers[['Sex', 'Pclass']].values.tolist()
X_age = passengers[['Age']].values.tolist()
y_train = passengers['Survived'].tolist()

# Use one hot encoding to transform the categorical data; the encoder stays
# fitted in `enc` and the dense 0/1 matrix is stored in `features`.
enc = OneHotEncoder()
features = enc.fit_transform(X_categorical).toarray()

# Combine the age vector with the transformed matrix.
# Column layout: [age, one-hot(Sex) columns..., one-hot(Pclass) columns...]
X_train = np.hstack((X_age, features))
assert X_train.shape[0] == len(y_train), "feature/label row counts differ"

# Use logistic regression (default hyper-parameters) to fit the model
clf = LogisticRegression().fit(X_train, y_train)

# Print the class probabilities for one hand-encoded passenger: age 29
# followed by five one-hot flags in the layout described above.
# NOTE(review): the flag positions rely on the encoder's category order --
# confirm against enc.categories_ before changing the sample.
print(clf.predict_proba([[29,1,0,0,0,1]]))


[[0.4685322 0.5314678]]
0.7899159663865546


In [109]:
from sklearn.naive_bayes import MultinomialNB

# Build and fit a multinomial naive Bayes classifier on X_train / y_train.
nb = MultinomialNB().fit(X_train, y_train)

# A single hand-encoded sample with six feature values.
# NOTE(review): presumably this follows X_train's column order -- confirm
# against the cell that builds X_train.
query = [[20,0,1,0,0,1]]

# First line: predicted class label; second line: per-class probabilities.
print(nb.predict(query), nb.predict_proba(query), sep='\n')

[0]
[[0.88810194 0.11189806]]


In [140]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Both models are scored on the data they were fitted on: the reference
# labels are y_train and the predictions are made for X_train, so these are
# in-sample (training) scores.
y_true = y_train
y_pred_logistic_reg, y_pred_naive_bayes = (
    model.predict(X_train) for model in (clf, nb)
)

# Report accuracy and F1 for each model, one metric per output line.
print(
    f'logistic regression accuracy: {accuracy_score(y_true, y_pred_logistic_reg)}',
    f'logistic regression f1 score: {f1_score(y_true, y_pred_logistic_reg)}',
    f'naive bayes accuracy: {accuracy_score(y_true, y_pred_naive_bayes)}',
    f'naive bayes f1 score: {f1_score(y_true, y_pred_naive_bayes)}',
    sep='\n',
)

logistic regression accuracy: 0.7899159663865546
logistic regression f1 score: 0.7340425531914895
naive bayes accuracy: 0.7801120448179272
naive bayes f1 score: 0.7150635208711434
