# Classification Models on Titanic Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score

## Initialize Variables

In [2]:
r = 4635 # random seed
np.random.seed(r)

param_grid = {
    "SVC": {
        'kernel': ['linear', 'rbf', "poly"], 
        'C': [1, 10, 100, 1000],
    },
    "kNN": {
        "n_neighbors": [2, 3, 5, 7, 9, 11, 15],
        "weights": ["uniform", "distance"],
        "metric": ["manhattan"]
    },
    "LR": {
        "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }
}
    
classifiers = []
true, predicted = [], []
best_grids = []

## Import Data

In [3]:
data = pd.read_csv("data/train_clean.csv").sample(frac=1).reset_index(drop=True)
data = data.iloc[:1000,]
data = data.drop(columns=["PassengerId", "Name", "Cabin"])
X = data.drop(columns="Survived").to_numpy() # Predictors
y = data['Survived'].to_numpy() # Response

## Train and Test Models

In [4]:
def train(train_features, train_label, test_features, classifier):

    if classifier == "SVC":
        clf = GridSearchCV(
            estimator = SVC(), param_grid = param_grid[classifier], n_jobs = 1
        )
    elif classifier == 'kNN':
        clf = GridSearchCV(
            estimator = KNeighborsClassifier(), param_grid = param_grid[classifier], n_jobs = 1
        )
    elif classifier == 'LR':
        clf = GridSearchCV(
            estimator = LogisticRegression(), param_grid = param_grid[classifier], n_jobs = 1
    )
    grid_search = clf.fit(train_features, train_label) # balanced training data
    best_grid = grid_search.best_estimator_
    result = grid_search.predict(test_features)
    return result, best_grid

In [5]:
# 5-fold cross validation
kf = KFold()
fold = 0
for train_index, test_index in kf.split(X):
    fold += 1
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # normalize
    X_train_normalized = preprocessing.normalize(X_train[:, 0:7], axis=0)
    X_train_normalized_full = np.concatenate((X_train_normalized, X_train[:,7:]), axis=1)
    X_train = pd.DataFrame(X_train_normalized_full) # to dataframe
    
    X_test_normalized = preprocessing.normalize(X_test[:, 0:7], axis=0)
    X_test_normalized_full = np.concatenate((X_test_normalized, X_test[:,7:]), axis=1)
    X_test = pd.DataFrame(X_test_normalized_full) # to dataframe
    
    for classifier in param_grid.keys():
        print("fold", str(fold) + ",", classifier)
        result, best_grid = train(X_train, y_train, X_test, classifier)

        classifiers.append(classifier)
        true.append(y_test)
        predicted.append(result)
        best_grids.append(best_grid)

fold 1, SVC
fold 1, kNN
fold 1, LR
fold 2, SVC
fold 2, kNN
fold 2, LR
fold 3, SVC
fold 3, kNN
fold 3, LR
fold 4, SVC
fold 4, kNN
fold 4, LR
fold 5, SVC
fold 5, kNN
fold 5, LR


## Compile Results

In [6]:
results = pd.DataFrame(list(zip(classifiers, best_grids, true, predicted)), 
                                columns=["classifier", "best_grid", "true", "predicted",
                                ])

## Evaluate Results

In [7]:
accuracies = []
for i, row in results.iterrows(): 
    accuracies.append(accuracy_score(row.true, row.predicted))
results['accuracy'] = accuracies

In [8]:
results.sort_values(by=['accuracy'], ascending=False)

Unnamed: 0,classifier,best_grid,true,predicted,accuracy
12,SVC,"SVC(C=10, kernel='linear')","[0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, ...","[0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, ...",0.835
6,SVC,"SVC(C=10, kernel='linear')","[1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, ...",0.79
13,kNN,"KNeighborsClassifier(metric='manhattan', n_nei...","[0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, ...","[0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...",0.775
9,SVC,"SVC(C=10, kernel='linear')","[1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, ...","[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, ...",0.765
7,kNN,"KNeighborsClassifier(metric='manhattan', n_nei...","[1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, ...","[1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",0.76
3,SVC,"SVC(C=10, kernel='linear')","[1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, ...","[1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, ...",0.745
4,kNN,"KNeighborsClassifier(metric='manhattan', n_nei...","[1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, ...","[1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",0.735
10,kNN,"KNeighborsClassifier(metric='manhattan', n_nei...","[1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, ...","[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...",0.735
0,SVC,"SVC(C=10, kernel='linear')","[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...",0.72
14,LR,LogisticRegression(solver='newton-cg'),"[0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, ...","[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0.685
