In [2]:
# Classification template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the two CSV files from the local input directory.
TRAIN_CSV = 'input/train.csv'
TEST_CSV = 'input/test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

**Cleaning Data**

In [3]:
# Cleaning training and test data ------------------------------
# NOTE: LabelBinarizer is not used anywhere in this cell; the import is kept
# only in case other cells rely on the name being in scope.
from sklearn.preprocessing import LabelBinarizer


def clean_passengers(passengers):
    """Return a model-ready copy of a raw passenger table.

    The same steps are applied to the training and the test data so both
    end up with the same feature layout:

    * drop the 'Name', 'Ticket' and 'Cabin' columns;
    * encode 'Sex' as a single 0/1 column (0 for male, 1 for female);
    * one-hot encode the port of embarkation ('Embarked').

    Parameters
    ----------
    passengers : pandas.DataFrame
        Raw table containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # drop() returns a new frame, so the caller's frame stays untouched.
    cleaned = passengers.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    # pd.get_dummies() yields one column per category, and a multi-column
    # frame cannot be stored in the single 'Sex' column (recent pandas raises
    # "Columns must be same length as key"). Build the indicator directly.
    cleaned['Sex'] = (cleaned['Sex'] == 'female').astype(int)
    # The Embarked_* dummy columns are appended after the remaining columns.
    return pd.get_dummies(cleaned, columns=['Embarked'])


train = clean_passengers(train)
test = clean_passengers(test)

# Fill missing ages (and the missing test fare) with the column mean.
# NOTE(review): the test set is imputed with its own means rather than the
# training means -- confirm this is intended.
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

# The modelling cells slice features by position (train from column 2, test
# from column 1), so the two layouts must line up exactly.
assert list(train.columns[2:]) == list(test.columns[1:]), (
    'train/test feature columns differ: {} vs {}'.format(
        list(train.columns[2:]), list(test.columns[1:])))

**Training the model on a local hold-out split**

In [4]:
# Column 1 is used as the target; every column from index 2 onward is a
# feature (column 0 is left out of the model).
feature_frame = train.iloc[:, 2:]
target_series = train.iloc[:, 1]

X = feature_frame.values
y = target_series.values

In [5]:
# Splitting the dataset into the Training set and Test set
# sklearn.cross_validation was removed in scikit-learn 0.20; the function now
# lives in sklearn.model_selection. Fall back to the old module only on very
# old installations that do not have model_selection yet.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 30% of the labelled rows for a local evaluation; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_local_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Fitting classifier to the Training set
classifier = KNeighborsClassifier(n_neighbors = 5)
classifier.fit(X_train, y_train)

# Predicting the local hold-out results
y_pred = classifier.predict(X_test)



**Accuracy Summary for local test**

In [6]:
# Evaluate the hold-out predictions: confusion matrix, R2 and accuracy.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score, classification_report, accuracy_score

cm = confusion_matrix(y_local_test, y_pred)

# NOTE(review): R2 is a regression metric; on 0/1 class labels it is hard to
# interpret (and can be negative) -- accuracy and the confusion matrix are
# the meaningful numbers here.
local_r2 = r2_score(y_local_test, y_pred)
local_accuracy = accuracy_score(y_local_test, y_pred)

for summary_line in (
    'This is R2: {}'.format(local_r2),
    'This is the accuracy score: {}'.format(local_accuracy),
    'Confusion Matrix',
):
    print(summary_line)
print(cm)

This is R2: -0.21238095238095234
This is the accuracy score: 0.7164179104477612
Confusion Matrix
[[137  31]
 [ 45  55]]


In [7]:
# Persist the local evaluation summary as a plain-text report.
with open('output/knn_stats.txt','w') as file:
    report_lines = [
        'Summary Statistics\n',
        '-------------------\n',
        'This is R2: {}\n'.format(r2_score(y_local_test, y_pred)),
        'This is the accuracy score: {}\n\n'.format(accuracy_score(y_local_test, y_pred)),
        'Confusion Matrix:\n',
    ]
    # One line per confusion-matrix row, in matrix order.
    report_lines.extend('{}\n'.format(matrix_row) for matrix_row in cm)
    file.writelines(report_lines)

**Predict on the test set**

In [8]:
# Score the test file with the fitted classifier. Column 0 of `test` is left
# out of the features (it is written out as the id in the submission cell).
# Note: this rebinds X_test and y_pred, which until now held the local
# hold-out features and predictions.
submission_features = test.iloc[:, 1:]
X_test = submission_features.values
y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1,

**Output Prediction**

In [9]:
# Write the submission file: one "<id>,<prediction>" row per test passenger.
with open('output/knn_submission.csv', 'w') as file:
    # No space after the comma: a header of 'PassengerId, Survived' would name
    # the second column ' Survived' (with a leading blank) for any CSV reader.
    file.write('PassengerId,Survived\n')
    # Column 0 of `test` holds the id that pairs with each prediction.
    for passenger_id, survived in zip(test.iloc[:, 0], y_pred):
        file.write('{},{}\n'.format(passenger_id, survived))