In [148]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [149]:
# Load the labelled training set from the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [150]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [151]:
# Load the unlabelled test set from the notebook's working directory.
test = pd.read_csv(filepath_or_buffer='test.csv')

# Cleaning
## Non-numeric columns are converted to numeric codes, and columns with too many NaNs (e.g., Cabin) are removed

In [152]:
# Report the size of the train set and how many values are missing per column.
n_train_rows = len(train)
missing_per_column = train.isnull().sum()

print("Total number of samples in train set is:", n_train_rows)
print("-" * 45)
print("The number of null (NaN) values in each column of the train set is:")
print(missing_per_column)

Total number of samples in train set is: 891
---------------------------------------------
The number of null (NaN) values in each column of the train set is:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


### From the above cell output, the Cabin column has too many NaN values (687 of 891), so we remove it. The missing Age values are replaced with the column's mean, and the two rows with a missing Embarked value are dropped further below

In [153]:
# Columns removed from both the train and the test frames:
#   - Cabin:  mostly missing (687 of the 891 train rows are NaN).
#   - Name:   treated as irrelevant for this model.
#   - Ticket: considered an (almost) per-passenger identifier by the original
#             author -- NOTE(review): uniqueness is not verified in this notebook.
#
# `columns=` is used instead of the positional axis argument (`drop(label, 1)`):
# passing the axis positionally was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
DROPPED_COLUMNS = ['Cabin', 'Name', 'Ticket']

trData = train.drop(columns=DROPPED_COLUMNS)
testData = test.drop(columns=DROPPED_COLUMNS)

In [154]:
# Show the test rows whose Fare is missing.
testData.loc[testData['Fare'].isna()]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
152,1044,3,male,60.5,0,0,,S


In [155]:
# Impute the missing test fare with the mean fare of the training set.
train_fare_mean = trData['Fare'].mean()
testData['Fare'] = testData['Fare'].fillna(train_fare_mean)

In [156]:
# Preview the first five test rows after the column drops and the Fare fill.
testData.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [157]:
# Number of test fares still missing after the fill.
testData['Fare'].isna().sum()

0

In [158]:
# Age is missing in 177 of the 891 training rows; the gaps are filled with the mean age.
trData['Age'] = trData['Age'].fillna(trData['Age'].mean())

# The test set is filled with the mean of the (already imputed) training ages,
# so no statistic is taken from the test data itself.
imputed_train_age_mean = trData['Age'].mean()
testData['Age'] = testData['Age'].fillna(imputed_train_age_mean)

In [159]:
# Frequency of each embarkation port in the train set (NaN excluded).
trData['Embarked'].value_counts(dropna=True)

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [160]:
# Show the training rows whose Embarked value is missing.
trData.loc[trData['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,62,1,1,female,38.0,0,0,80.0,
829,830,1,1,female,62.0,0,0,80.0,


In [161]:
# Remove the rows without Embarked info (two rows in the train set, index 61 and 829).
# The rows are selected by condition instead of by hard-coded positions so that the
# cell stays correct if the input changes and is safe to re-run: once the index has
# been reset, positions 61 and 829 refer to valid rows, which a second run of a
# position-based drop would delete.
trData = trData.dropna(subset=['Embarked'])
len(trData)

889

In [162]:
# Renumber the rows from 0 after the row removal. The previous index is kept
# as a regular column named 'index' (drop=False is the pandas default).
trData = trData.reset_index(drop=False)

In [163]:
# Preview the first five training rows after the index reset.
trData.head(n=5)

Unnamed: 0,index,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,1,0,3,male,22.0,1,0,7.25,S
1,1,2,1,1,female,38.0,1,0,71.2833,C
2,2,3,1,3,female,26.0,0,0,7.925,S
3,3,4,1,1,female,35.0,1,0,53.1,S
4,4,5,0,3,male,35.0,0,0,8.05,S


In [164]:
# Discard the 'index' column that reset_index() added; it only holds the old row labels.
# `columns=` replaces the positional axis argument, which pandas deprecated in 1.3
# and rejects from 2.0 onwards.
trData = trData.drop(columns='index')

In [165]:
# Encode Sex as an integer code, using the same mapping for train and test.
sex_codes = {'female': 0, 'male': 1}
trData['Sex'] = trData['Sex'].replace(sex_codes)
testData['Sex'] = testData['Sex'].replace(sex_codes)

In [166]:
# List the distinct Embarked values in the train frame, then in the test frame.
for frame in (trData, testData):
    print(frame['Embarked'].unique())

['S' 'C' 'Q']
['Q' 'S' 'C']


In [167]:
# Encode the embarkation port as an integer code, using the same mapping for train and test.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
trData['Embarked'] = trData['Embarked'].replace(embarked_codes)
testData['Embarked'] = testData['Embarked'].replace(embarked_codes)

In [168]:
# Preview the training rows after Sex and Embarked were encoded.
trData.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,0
1,2,1,1,0,38.0,1,0,71.2833,1
2,3,1,3,0,26.0,0,0,7.925,0
3,4,1,1,0,35.0,1,0,53.1,0
4,5,0,3,1,35.0,0,0,8.05,0


In [169]:
# Preview the test rows after Sex and Embarked were encoded.
testData.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,2
1,893,3,0,47.0,1,0,7.0,0
2,894,2,1,62.0,0,0,9.6875,2
3,895,3,1,27.0,0,0,8.6625,0
4,896,3,0,22.0,1,1,12.2875,0


In [170]:
# Feature matrix used to fit the classifier.
# NOTE(review): 'Pclass' is listed twice, so Xtrain carries a duplicate column
# (it shows up as 'Pclass.1' in the preview). This looks unintentional; for a
# distance-based model it counts Pclass twice. The list is left unchanged here
# because the fit and predict cells must see the same column layout, so removing
# the duplicate has to be done together with the prediction input.
feature_columns = ['Pclass', 'Sex', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
Xtrain = trData[feature_columns]

In [171]:
# Target vector: the Survived column of the training frame.
ytrain = trData.loc[:, 'Survived']

In [172]:
# Preview the first five rows of the feature matrix.
Xtrain.head(n=5)

Unnamed: 0,Pclass,Sex,Pclass.1,Age,SibSp,Parch,Fare,Embarked
0,3,1,3,22.0,1,0,7.25,0
1,1,0,1,38.0,1,0,71.2833,1
2,3,0,3,26.0,0,0,7.925,0
3,1,0,1,35.0,1,0,53.1,0
4,3,1,3,35.0,0,0,8.05,0


In [173]:
# Preview the test frame once more before modelling (note it still contains PassengerId).
testData.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,2
1,893,3,0,47.0,1,0,7.0,0
2,894,2,1,62.0,0,0,9.6875,2
3,895,3,1,27.0,0,0,8.6625,0
4,896,3,0,22.0,1,1,12.2875,0


In [174]:
# k-nearest-neighbours classifier; every setting other than k is left at its default.
N_NEIGHBORS = 3
kneighbor = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

In [175]:
# Fit the classifier on the training features and labels; fit() returns the
# estimator itself, which the notebook displays as the cell output.
kneighbor.fit(X=Xtrain, y=ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [276]:
# Predict on exactly the columns the model was fitted on, in the same order.
# Passing the whole testData frame is wrong: it also contains PassengerId and,
# because it happens to have as many columns as Xtrain, the features are silently
# misaligned (PassengerId is read as Pclass, Pclass as Sex, and so on).
# Recent scikit-learn versions may also reject the mismatched column names.
arPredict = kneighbor.predict(testData[Xtrain.columns])

In [280]:
# Submission table: one row per test passenger with its predicted Survived label.
yPredict = testData[['PassengerId']].assign(Survived=arPredict)

In [281]:
# Preview the first five predictions.
yPredict.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [282]:
# Write the predictions one directory above the notebook, without the row index.
yPredict.to_csv(path_or_buf='../predictions.csv', index=False)