In [149]:
import pandas as pd
import numpy as np
import os, sys

import logging

# Obtain the logger registered under the empty name and set its threshold to INFO.
logger = logging.getLogger("")
logger.setLevel(logging.INFO)

In [203]:
## helper functions
def train_test_split(data, split_ratio, pred_col):
    """Split a frame sequentially into train/test parts and separate the label.

    The first ``int(len(data) * split_ratio)`` rows become the training part
    and the remaining rows the test part; rows are not shuffled.

    Args:
        data: DataFrame holding the feature columns and the label column.
        split_ratio: Fraction of rows assigned to the training part.
        pred_col: Name of the label column.

    Returns:
        Tuple ``(train_X, train_y, test_X, test_y)``. The ``X`` frames hold
        every column except ``pred_col``; the ``y`` frames are single-column
        DataFrames containing only ``pred_col``.
    """
    cut = int(len(data) * split_ratio)

    def features_and_label(part):
        # Keep the original column order, leaving out only the label column.
        feature_names = [name for name in part.columns if name != pred_col]
        return part[feature_names], part[[pred_col]]

    train_X, train_y = features_and_label(data[:cut])
    test_X, test_y = features_and_label(data[cut:])
    return train_X, train_y, test_X, test_y

def cat2num(data, cols):
    """Encode categorical columns as integer codes.

    Within each column, distinct values are numbered 0, 1, 2, ... in order of
    first appearance. All NaN cells of a column share a single code, assigned
    at the position where the first NaN appears.

    Args:
        data: DataFrame to encode; it is not modified.
        cols: Iterable of column names to encode.

    Returns:
        A copy of ``data`` in which every column in ``cols`` holds integer codes.
    """
    output_data = data.copy()
    for c in cols:
        codes = {}
        missing = object()  # single dictionary key standing in for every NaN cell

        def encode(value):
            # NaN != NaN, so a NaN cell can only be found in a dict through
            # object identity; that fails (KeyError) as soon as two NaN cells
            # are distinct float objects. Route every NaN to one shared key.
            if isinstance(value, (float, np.floating)) and value != value:
                value = missing
            # setdefault evaluates len(codes) before inserting, so an unseen
            # value receives the next free code.
            return codes.setdefault(value, len(codes))

        output_data[c] = output_data[c].apply(encode)
    return output_data

def normalize(data, cols, method):
    """Rescale the given columns of a frame.

    Args:
        data: DataFrame to rescale; it is not modified.
        cols: Iterable of column names to rescale.
        method: ``'z-score'`` subtracts the column mean and divides by the
            population standard deviation (``ddof=0``); ``'min_max'`` maps the
            column onto ``[0, 1]`` via ``(x - min) / (max - min)``. Any other
            value leaves the columns unchanged.

    Returns:
        A copy of ``data`` with the selected columns rescaled. A column whose
        scale is zero (all values equal) becomes all zeros instead of NaN.
    """
    output_data = data.copy()
    for c in cols:
        column = output_data[c]
        if method == 'z-score':
            offset = column.mean()
            scale = column.std(ddof=0)
        elif method == 'min_max':
            offset = column.min()
            # Divide by the value range, not by the maximum, so the result
            # actually spans [0, 1].
            scale = column.max() - offset
        else:
            continue

        centered = column - offset
        # A constant column has zero scale; dividing would yield NaN (0/0).
        output_data[c] = centered / scale if scale != 0 else centered.astype(float)

    return output_data

def accuracy(pred, true):
    """Return the fraction of positions where prediction and label are equal.

    Args:
        pred: Iterable of predicted values.
        true: Iterable of ground-truth values, compared position by position
            with ``pred``.

    Returns:
        Number of equal positions divided by the number of predictions.

    Raises:
        ValueError: If the two inputs differ in length (a silent truncation
            would report a misleading score) or if they are empty (the
            accuracy is undefined).
    """
    pred_list = list(pred)
    true_list = list(true)
    if len(pred_list) != len(true_list):
        raise ValueError(
            'pred and true must have the same length, got %d and %d'
            % (len(pred_list), len(true_list))
        )
    if not pred_list:
        raise ValueError('accuracy is undefined for empty input')

    correct = sum(1 for p, t in zip(pred_list, true_list) if p == t)
    return correct / len(pred_list)

In [155]:
# Read the labelled training file and the test file, both indexed by passenger id.
raw_data = pd.read_csv('data/titanic/train.csv', index_col='PassengerId')
test_data = pd.read_csv('data/titanic/test.csv', index_col='PassengerId')

# Label column; every other remaining column is treated as a feature.
pred_column = 'Survived'

# Drop the columns that are not used as features (raw_data itself stays untouched).
cleaned_data = raw_data.drop(columns=['Name', 'Ticket', 'Cabin'])
feature_cols = [col for col in cleaned_data.columns if col != pred_column]

# Encode the categorical columns, fill remaining gaps with column means,
# then z-score the feature columns.
cleaned_data = (
    cleaned_data
    .pipe(cat2num, ['Sex', 'Embarked'])
    .pipe(lambda frame: frame.fillna(frame.mean()))
    .pipe(normalize, feature_cols, method='z-score')
)

# First 80% of the rows for training, the rest for validation.
X_train, y_train, X_val, y_val = train_test_split(cleaned_data, 0.8, pred_column)

### Linear Regression

In [191]:
class LinearRegression:
    """Ordinary least-squares linear model with an intercept term.

    ``fit`` stores the coefficient array in ``self.B``; the last row of
    ``self.B`` is the intercept, because a constant ``bias`` column is appended
    to the features as the last column.
    """

    def __init__(self):

        print('Linear Regression Initiated')

    @staticmethod
    def _design_matrix(features):
        """Return the feature values with a trailing column of ones (the bias)."""
        X = features.copy()
        X['bias'] = 1
        return X.values

    def fit(self, train_X, train_y):
        """Estimate the coefficients from training data.

        Args:
            train_X: DataFrame of feature columns.
            train_y: DataFrame (or Series) holding the target values.

        Returns:
            None. The fitted coefficients are stored in ``self.B``.
        """
        X = self._design_matrix(train_X)
        y = train_y.values
        # Solve the least-squares problem directly instead of forming
        # inv(X^T X) @ X^T y: the explicit inverse fails or is numerically
        # meaningless when X^T X is singular (e.g. collinear or duplicated
        # features), whereas lstsq returns the minimum-norm solution.
        self.B = np.linalg.lstsq(X, y, rcond=None)[0]

        print('Model trained')
        return

    def predict(self, pred_X, categorical=True):
        """Predict targets for new data.

        Args:
            pred_X: DataFrame with the same feature columns, in the same
                order, as the frame passed to ``fit``.
            categorical: When True, round the predictions to the nearest
                integer; when False, return the raw values.

        Returns:
            One-dimensional NumPy array of predictions.
        """
        pred_y = np.matmul(self._design_matrix(pred_X), self.B).ravel()
        if categorical:
            return pred_y.round()
        return pred_y

In [218]:
# Train the model defined in this notebook on the training split and predict the validation split.
myLR = LinearRegression()
myLR.fit(X_train, y_train)
y_val_pred = myLR.predict(X_val)
# Last expression of the cell, so the notebook displays the validation accuracy.
accuracy(y_val_pred, y_val['Survived'])

Linear Regression Initiated
Model trained


0.8212290502793296

In [201]:
# Reference implementation for comparison; aliased as LR so it does not shadow
# the LinearRegression class defined in this notebook.
from sklearn.linear_model import LinearRegression as LR

In [215]:
# Repeat the fit / predict / score sequence with the estimator imported as LR.
skLR = LR()
skLR.fit(X_train, y_train)
# NOTE: rebinds y_val_pred, replacing the predictions of the previous model.
y_val_pred = skLR.predict(X_val)
# Flatten and round the predictions before scoring; the cell displays the accuracy.
accuracy(y_val_pred.ravel().round(), y_val['Survived'])

0.8212290502793296

### Logistic Regression