In [142]:
import pandas as pd
import numpy as np
import tabulate as tab
import numbers

# Load the training passengers and swap the textual Sex column for the
# integer codes pandas assigns to its categories.
dataset = pd.read_csv('titanic.csv').assign(
    Sex=lambda frame: frame["Sex"].astype('category').cat.codes
)
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,C


In [143]:
# Keep only the columns used for modelling.
# The explicit copy makes `data` independent of `dataset`: the Age column is
# written into further down, and assigning into a plain column slice triggers
# pandas' SettingWithCopyWarning (and is unreliable under Copy-on-Write).
data = dataset[["Sex","Age","Pclass","Fare","Survived"]].copy()
data

Unnamed: 0,Sex,Age,Pclass,Fare,Survived
0,1,22.0,3,7.2500,0
1,0,38.0,1,71.2833,1
2,0,26.0,3,7.9250,1
3,0,35.0,1,53.1000,1
4,1,35.0,3,8.0500,0
...,...,...,...,...,...
886,1,27.0,2,13.0000,0
887,0,19.0,1,30.0000,1
888,0,,3,23.4500,0
889,1,26.0,1,30.0000,1


In [144]:
# Features for the age-imputation model: only passengers whose Age is recorded.
age_known = data["Age"].notnull()
train_data = (
    data[["Sex","Pclass","Fare","Survived"]]
    .where(age_known)   # rows without an Age are blanked to NaN ...
    .dropna()           # ... and removed here
    # Sex is cast back to integer codes after the NaN round-trip.
    .assign(Sex=lambda frame: frame["Sex"].astype(int))
)
train_data

Unnamed: 0,Sex,Pclass,Fare,Survived
0,1,3.0,7.2500,0.0
1,0,1.0,71.2833,1.0
2,0,3.0,7.9250,1.0
3,0,1.0,53.1000,1.0
4,1,3.0,8.0500,0.0
...,...,...,...,...
885,0,3.0,29.1250,0.0
886,1,2.0,13.0000,0.0
887,0,1.0,30.0000,1.0
889,1,1.0,30.0000,1.0


In [145]:
# Regression target for the age-imputation model: the recorded ages,
# kept as a one-column frame.
train_label = data.loc[data["Age"].notnull(), ["Age"]]
train_label

Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
885,39.0
886,27.0
887,19.0
889,26.0


In [146]:
# Rows whose Age is missing: these are the passengers the age model predicts for.
age_missing = data["Age"].isnull()
test_data = (
    data[["Sex","Pclass","Fare","Survived"]]
    .where(age_missing)   # rows that already have an Age are blanked to NaN ...
    .dropna()             # ... and removed here
    # Sex is cast back to integer codes after the NaN round-trip.
    .assign(Sex=lambda frame: frame["Sex"].astype(int))
)
test_data

Unnamed: 0,Sex,Pclass,Fare,Survived
5,1,3.0,8.4583,0.0
17,1,2.0,13.0000,1.0
19,0,3.0,7.2250,1.0
26,1,3.0,7.2250,0.0
28,0,3.0,7.8792,1.0
...,...,...,...,...
859,1,3.0,7.2292,0.0
863,0,3.0,69.5500,0.0
868,1,3.0,9.5000,0.0
878,1,3.0,7.8958,0.0


In [147]:
# Min-max scale every column of the age-model features into [newMin, newMax],
# using the training frame's own per-column minimum and maximum.
newMin, newMax = 0, 1
col_min = train_data.min()
col_span = train_data.max() - col_min
train_minmax = (train_data - col_min) * (newMax - newMin) / col_span + newMin

# Extremes of the *scaled* columns (read from train_minmax, not train_data).
minFare, maxFare = train_minmax["Fare"].min(), train_minmax["Fare"].max()
minPclass, maxPclass = train_minmax["Pclass"].min(), train_minmax["Pclass"].max()
minSurvived, maxSurvived = train_minmax["Survived"].min(), train_minmax["Survived"].max()

# Scaling produced floats; Sex is stored as integer again.
train_minmax["Sex"] = train_minmax["Sex"].astype(int)
train_minmax

Unnamed: 0,Sex,Pclass,Fare,Survived
0,1,1.0,0.014151,0.0
1,0,0.0,0.139136,1.0
2,0,1.0,0.015469,1.0
3,0,0.0,0.103644,1.0
4,1,1.0,0.015713,0.0
...,...,...,...,...
885,0,1.0,0.056848,0.0
886,1,0.5,0.025374,0.0
887,0,0.0,0.058556,1.0
889,1,0.0,0.058556,1.0


In [148]:
# Scale the rows awaiting an age prediction with the *training* frame's
# minimum and range, so both frames share one transform.
train_floor = train_data.min()
train_span = train_data.max() - train_floor
test_minmax = (test_data - train_floor) * (newMax - newMin) / train_span + newMin
test_minmax = test_minmax.assign(Sex=test_minmax["Sex"].astype(int))
test_minmax

Unnamed: 0,Sex,Pclass,Fare,Survived
5,1,1.0,0.016510,0.0
17,1,0.5,0.025374,1.0
19,0,1.0,0.014102,1.0
26,1,1.0,0.014102,0.0
28,0,1.0,0.015379,1.0
...,...,...,...,...
859,1,1.0,0.014110,0.0
863,0,1.0,0.135753,0.0
868,1,1.0,0.018543,0.0
878,1,1.0,0.015412,0.0


In [149]:
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Predict the missing ages with a 3-nearest-neighbour regressor trained on the
# scaled features. The variable keeps the name `classifier` although the
# estimator is a regressor.
classifier = KNeighborsRegressor(n_neighbors=3).fit(train_minmax, train_label)
ypred = classifier.predict(test_minmax)
ypred

array([[31.66666667],
       [35.66666667],
       [16.66666667],
       [36.83333333],
       [21.33333333],
       [26.66666667],
       [46.        ],
       [20.        ],
       [22.        ],
       [26.66666667],
       [33.5       ],
       [34.66666667],
       [20.        ],
       [18.66666667],
       [41.33333333],
       [38.5       ],
       [10.66666667],
       [26.66666667],
       [33.5       ],
       [20.        ],
       [33.5       ],
       [33.5       ],
       [26.66666667],
       [24.33333333],
       [23.33333333],
       [33.5       ],
       [38.        ],
       [23.33333333],
       [29.33333333],
       [24.33333333],
       [28.33333333],
       [13.33333333],
       [32.        ],
       [55.66666667],
       [33.        ],
       [33.33333333],
       [41.33333333],
       [52.        ],
       [18.        ],
       [38.        ],
       [20.        ],
       [13.33333333],
       [38.        ],
       [26.66666667],
       [28.33333333],
       [32

In [150]:
# Fill every missing Age with the regressor's prediction.
# Predictions are matched to rows through the index of the missing entries,
# in the same top-to-bottom order the row-by-row loop used before.
# Building the Series raises ValueError if the number of predictions differs
# from the number of missing ages, instead of filling misaligned values.
missing_age_idx = data.index[data["Age"].isna()]
if len(missing_age_idx) > 0:  # nothing to do when the cell is re-run
    predicted_age = pd.Series(np.ravel(ypred), index=missing_age_idx)
    # assign() builds a new frame, so no chained assignment into a slice occurs.
    data = data.assign(Age=data["Age"].fillna(predicted_age))
data

Unnamed: 0,Sex,Age,Pclass,Fare,Survived
0,1,22.000000,3,7.2500,0
1,0,38.000000,1,71.2833,1
2,0,26.000000,3,7.9250,1
3,0,35.000000,1,53.1000,1
4,1,35.000000,3,8.0500,0
...,...,...,...,...,...
886,1,27.000000,2,13.0000,0
887,0,19.000000,1,30.0000,1
888,0,14.333333,3,23.4500,0
889,1,26.000000,1,30.0000,1


In [151]:
# Load the hold-out passengers and encode Sex as integer category codes,
# the same way the training file is encoded.
test_dataset = pd.read_csv('titanic_test.csv').assign(
    Sex=lambda frame: frame["Sex"].astype('category').cat.codes
)
test_dataset

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",1,,0,0,359309,8.0500,,S


In [152]:
# Features for the survival classifier (Age now has its gaps filled in).
train_data = data.loc[:, ["Sex","Age","Pclass","Fare"]]
train_data

Unnamed: 0,Sex,Age,Pclass,Fare
0,1,22.000000,3,7.2500
1,0,38.000000,1,71.2833
2,0,26.000000,3,7.9250
3,0,35.000000,1,53.1000
4,1,35.000000,3,8.0500
...,...,...,...,...
886,1,27.000000,2,13.0000
887,0,19.000000,1,30.0000
888,0,14.333333,3,23.4500
889,1,26.000000,1,30.0000


In [153]:
# Target for the survival classifier, kept as a one-column frame.
train_label = data.loc[:, ["Survived"]]
train_label

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [154]:
# Hold-out feature matrix. A row is usable only when both Age and Fare are
# present and non-negative (a NaN also fails the >= 0 comparison).
# The same mask yields both `deletedIdx` and the filtered frame, so the labels
# dropped later via `deletedIdx` always match the rows removed here.
test_features_all = test_dataset[["Sex","Age","Pclass","Fare"]]
usable_rows = test_features_all["Age"].ge(0) & test_features_all["Fare"].ge(0)
deletedIdx = test_features_all.index[~usable_rows].tolist()
test_data = test_features_all.loc[usable_rows].copy()
test_data

Unnamed: 0,Sex,Age,Pclass,Fare
0,1,34.5,3,7.8292
1,0,47.0,3,7.0000
2,1,62.0,2,9.6875
3,1,27.0,3,8.6625
4,0,22.0,3,12.2875
...,...,...,...,...
409,0,3.0,3,13.7750
411,0,37.0,1,90.0000
412,0,28.0,3,7.7750
414,0,39.0,1,108.9000


In [155]:
# Ground-truth Survived values for the hold-out file, minus the rows whose
# features were discarded (index labels collected in deletedIdx).
test_label = pd.read_csv('titanic_testlabel.csv')["Survived"].drop(deletedIdx)
test_label

0      0
1      1
2      0
3      0
4      1
      ..
409    1
411    1
412    1
414    1
415    0
Name: Survived, Length: 331, dtype: int64

In [156]:
# Min-max scale the survival-model features into [newMin, newMax] using the
# training frame's own per-column minimum and maximum.
newMin, newMax = 0, 1
feature_min = train_data.min()
feature_span = train_data.max() - feature_min
train_minmax = (train_data - feature_min) * (newMax - newMin) / feature_span + newMin
# Scaling produced floats; Sex is stored as integer again.
train_minmax = train_minmax.assign(Sex=train_minmax["Sex"].astype(int))

# Extremes of the *scaled* columns (read from train_minmax, not train_data).
minAge, maxAge = train_minmax["Age"].min(), train_minmax["Age"].max()
minPclass, maxPclass = train_minmax["Pclass"].min(), train_minmax["Pclass"].max()
minFare, maxFare = train_minmax["Fare"].min(), train_minmax["Fare"].max()
train_minmax

Unnamed: 0,Sex,Age,Pclass,Fare
0,1,0.271174,1.0,0.014151
1,0,0.472229,0.0,0.139136
2,0,0.321438,1.0,0.015469
3,0,0.434531,0.0,0.103644
4,1,0.434531,1.0,0.015713
...,...,...,...,...
886,1,0.334004,0.5,0.025374
887,0,0.233476,0.0,0.058556
888,0,0.174835,1.0,0.045771
889,1,0.321438,0.0,0.058556


In [157]:
# Scale the hold-out features with the *training* frame's minimum and range,
# so train and test share one transform.
reference_min = train_data.min()
reference_span = train_data.max() - reference_min
test_minmax = (test_data - reference_min) * (newMax - newMin) / reference_span + newMin
test_minmax = test_minmax.assign(Sex=test_minmax["Sex"].astype(int))
test_minmax

Unnamed: 0,Sex,Age,Pclass,Fare
0,1,0.428248,1.0,0.015282
1,0,0.585323,1.0,0.013663
2,1,0.773813,0.5,0.018909
3,1,0.334004,1.0,0.016908
4,0,0.271174,1.0,0.023984
...,...,...,...,...
409,0,0.032420,1.0,0.026887
411,0,0.459663,0.0,0.175668
412,0,0.346569,1.0,0.015176
414,0,0.484795,0.0,0.212559


In [158]:
# Store the Sex column of both scaled frames as integers.
for scaled_frame in (train_minmax, test_minmax):
    scaled_frame["Sex"] = scaled_frame["Sex"].astype(int)

In [162]:
# Survival model: 3-nearest-neighbour classifier fitted on the scaled training
# features, then applied to the scaled hold-out rows.
classifier = KNeighborsClassifier(n_neighbors=3).fit(
    train_minmax, train_label["Survived"].values
)
ypred = classifier.predict(test_minmax)
ypred

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,

In [163]:
# Score the classifier on the scaled hold-out rows and report the result
# together with its complement.
score = classifier.score(test_minmax, test_label)
print(f"Accuracy: {score}")
print(f"Error: {1-score}")

Accuracy: 0.8459214501510574
Error: 0.15407854984894265


In [164]:
# The same error, expressed as a percentage.
print(f"Error ratio: {(1-score)*100} %")

Error ratio: 15.407854984894264 %
