In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.ensemble
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns

In [8]:
#Read in data
data = pd.read_csv('train.csv')

#Remove attributes that are presumed to be irrelevant
data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

#Replace strings with their hashes, as sklearn can only handle numeric input
#NOTE(review): Python salts str hashes per interpreter process (see PYTHONHASHSEED),
#so these codes are only stable within one kernel session. The same hash() encoding
#is applied to test.csv further down in this notebook, so the two places must be
#changed together if this is ever replaced by a fixed mapping.
data['Sex'] = data['Sex'].map(lambda x: hash(x))
data['Embarked'] = data['Embarked'].map(lambda x: hash(x))

#Replace every remaining missing value with a placeholder. The fill is applied to
#all columns, not only to the ages. No axis is passed: it is meaningless for a
#scalar fill, and a column-wise fill keeps each column's own dtype (so integer
#columns such as 'Survived' are not touched).
data = data.fillna(value=-10)

In [9]:
#Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(data, random_state=67854895, test_size=0.1)

#Separate the 'Survived' label from the remaining feature columns of each split
x_train, y_train = train_data.drop('Survived', axis=1), train_data['Survived']
x_test, y_test = test_data.drop('Survived', axis=1), test_data['Survived']

In [10]:
def accuracy(classifier) -> float:
    """Return the fraction of hold-out rows that `classifier` labels correctly.

    Reads the module-level `x_test` (feature frame) and `y_test` (labels).

    Args:
        classifier: A fitted estimator exposing `predict(X)`.

    Returns:
        Number of correct predictions divided by the number of test rows,
        or 0.0 when the test set is empty (mirrors the zero-denominator
        fallback used by `false_rates`).
    """
    num_rows = len(x_test)
    if num_rows == 0:
        return 0.0

    #Predict the whole frame in one call instead of once per row: this avoids
    #rebuilding x_test.values on every iteration and hands the estimator the
    #same column-labelled input it was fitted on.
    predictions = np.asarray(classifier.predict(x_test))
    num_correct = int(np.count_nonzero(predictions == y_test.values))
    return num_correct / num_rows

def false_rates(classifier) -> (float, float):
    """Return the false positive and false negative rate of a classifier.

    Reads the module-level `x_test` (feature frame) and `y_test` (labels).
    A label of 0 is treated as negative; any other label is positive.

    Args:
        classifier: A fitted estimator exposing `predict(X)`.

    Returns:
        A tuple `(fp, fn)`:
        fp - share of negative rows that were predicted incorrectly,
        fn - share of positive rows that were predicted incorrectly.
        A rate is 0.0 when its class does not occur in the test set.
    """
    if len(x_test) == 0:
        return (0.0, 0.0)

    #Predict the whole frame in one call instead of once per row: this avoids
    #rebuilding x_test.values on every iteration and hands the estimator the
    #same column-labelled input it was fitted on.
    predictions = np.asarray(classifier.predict(x_test))
    truth = y_test.values

    is_negative = truth == 0
    is_wrong = predictions != truth

    num_negative = int(np.count_nonzero(is_negative))
    num_positive = len(truth) - num_negative
    incorrect_negative = int(np.count_nonzero(is_wrong & is_negative))
    incorrect_positive = int(np.count_nonzero(is_wrong & ~is_negative))

    #Guard the divisions so a test set containing a single class still works
    fp = incorrect_negative / num_negative if num_negative else 0.0
    fn = incorrect_positive / num_positive if num_positive else 0.0
    return (fp, fn)

In [11]:
#Fit an AdaBoost ensemble on the training split, then report its hold-out
#(false positive rate, false negative rate) pair followed by its accuracy
classifier = AdaBoostClassifier().fit(x_train, y_train)
for metric in (false_rates, accuracy):
    print(metric(classifier))

(0.2, 0.26666666666666666)
0.7777777777777778


In [12]:
#Load the unlabeled test set and apply the same preprocessing as the training data.
#The resulting columns must match the features the classifier was fitted on.
test_data = pd.read_csv('test.csv')
test_data = test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
#NOTE(review): hash() of a str is salted per interpreter process, so this only lines
#up with the training encoding when both cells run in the same kernel session.
test_data['Sex'] = test_data['Sex'].map(lambda x: hash(x))
test_data['Embarked'] = test_data['Embarked'].map(lambda x: hash(x))
#Scalar fill of every remaining missing value; no axis is needed for a scalar and
#a column-wise fill leaves the integer 'PassengerId' column as it is.
test_data = test_data.fillna(value=-10)

#Predict all rows in one call and take the ids from the file itself instead of
#assuming they start at 892 and are contiguous. The labels are cast to int so the
#'Survived' column is written as 0/1 even if the classifier was trained on floats.
passenger_ids = test_data['PassengerId'].tolist()
predictions = np.asarray(classifier.predict(test_data)).astype(int)

output = [{'PassengerId' : pid, 'Survived' : pred} for pid, pred in zip(passenger_ids, predictions)]
pd.DataFrame(output).to_csv(path_or_buf='ADA_Output.csv', index=False)