In [10]:
# Imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from methods import *
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from sklearn.pipeline import Pipeline



# Locations of the input CSV files, relative to the notebook's working directory.
# Forward slashes are used because they are accepted as a path separator on
# Windows as well as on POSIX systems, whereas a backslash is only a separator
# on Windows (elsewhere 'data\train.csv' is a single, oddly named file).
train = 'data/train.csv'
test = 'data/test.csv'

# Columns removed from both frames after imputation and normalization.
dropped_features = ["Embarked","Cabin","SibSp","Parch","Fare"]
# Neighbour count shared by the KNN imputer and the KNN classifier.
num_neighbors = 7

In [11]:
# Read the training set and discard passenger names and ticket labels right away.
data = pd.read_csv(train).drop(columns=["Name", "Ticket"])

# Reduce every cabin entry to its leading character, leaving untouched any value
# whose string form starts with "n" (this is how missing entries, printed as
# "nan", are skipped), then turn the result into a number with cabinToFloat.
data["Cabin"] = (
    data["Cabin"]
    .map(lambda cabin: cabin if str(cabin)[0] == "n" else str(cabin)[0])
    .map(cabinToFloat)
)

# Encode the remaining categorical columns as numbers:
# Sex becomes 1 for 'male' and 0 for anything else; Embarked goes through condit.
data["Sex"] = data["Sex"].eq("male").astype("int64")
data["Embarked"] = data["Embarked"].map(condit)

# Fit a KNN imputer on the frame and fill in its missing values in one step.
imputer = KNNImputer(
    n_neighbors=num_neighbors,
    weights="uniform",
    metric="nan_euclidean",
)
dataT = imputer.fit_transform(data)

# The imputer hands back a plain array, so rebuild a DataFrame using the
# feature names reported by the imputer as column labels.
data = pd.DataFrame(dataT, columns=imputer.get_feature_names_out())

# Normalize with normData, then remove the features listed in dropped_features.
data = normData(data).drop(columns=dropped_features)
data


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1.0,0.0,0.826913,0.737281,-0.578731
1,2.0,1.0,-1.565228,-1.354813,0.630129
2,3.0,1.0,0.826913,-1.354813,-0.276516
3,4.0,1.0,-1.565228,-1.354813,0.403468
4,5.0,0.0,0.826913,0.737281,0.403468
...,...,...,...,...,...
886,887.0,0.0,-0.369158,0.737281,-0.200962
887,888.0,1.0,-1.565228,-1.354813,-0.805392
888,889.0,0.0,0.826913,-1.354813,-0.168582
889,890.0,1.0,-1.565228,0.737281,-0.276516


In [12]:
# Same preprocessing as the training set, applied to the test set
# (the frame whose "Survived" status is predicted later).
# Passenger names and ticket labels are dropped immediately after loading.
dataTest = pd.read_csv(test).drop(columns=["Name", "Ticket"])

# Reduce every cabin entry to its leading character, leaving untouched any value
# whose string form starts with "n" (this is how missing entries, printed as
# "nan", are skipped), then turn the result into a number with cabinToFloat.
dataTest["Cabin"] = (
    dataTest["Cabin"]
    .map(lambda cabin: cabin if str(cabin)[0] == "n" else str(cabin)[0])
    .map(cabinToFloat)
)

# Encode the remaining categorical columns as numbers:
# Sex becomes 1 for 'male' and 0 for anything else; Embarked goes through condit.
dataTest["Sex"] = dataTest["Sex"].eq("male").astype("int64")
dataTest["Embarked"] = dataTest["Embarked"].map(condit)

# Fit a KNN imputer on the test frame and fill in its missing values in one step.
# NOTE(review): a fresh imputer is fitted on the test frame here instead of
# reusing one fitted on the training frame, and normData below is likewise
# applied to the test frame on its own — confirm this is intended.
imputer = KNNImputer(
    n_neighbors=num_neighbors,
    weights="uniform",
    metric="nan_euclidean",
)
dataT = imputer.fit_transform(dataTest)

# The imputer hands back a plain array, so rebuild a DataFrame using the
# feature names reported by the imputer as column labels.
dataTest = pd.DataFrame(dataT, columns=imputer.get_feature_names_out())

# Normalize with normData, then remove the features listed in dropped_features.
dataTest = normData(dataTest).drop(columns=dropped_features)
dataTest

Unnamed: 0,PassengerId,Pclass,Sex,Age
0,892.0,0.872436,0.755024,0.382358
1,893.0,0.872436,-1.321292,1.350439
2,894.0,-0.315441,0.755024,2.512136
3,895.0,0.872436,0.755024,-0.198490
4,896.0,0.872436,-1.321292,-0.585723
...,...,...,...,...
413,1305.0,0.872436,0.755024,-0.458489
414,1306.0,-1.503319,-1.321292,0.730867
415,1307.0,0.872436,0.755024,0.692144
416,1308.0,0.872436,0.755024,-0.458489


In [13]:
# Build a numeric matrix from the preprocessed training frame
# (the second value returned by matrixBuilder is not needed here).
matrix, _ = matrixBuilder(data)

# Remove column index 1 so the survival status does not take part in the
# projection. Presumably index 1 is "Survived", assuming matrixBuilder keeps
# the column order of `data` — TODO confirm.
matrix = np.delete(matrix, 1, axis=1)

# Pair every passenger id with its survival status for use by the plots.
labels = list(zip(data["PassengerId"], data["Survived"]))

# Project the matrix onto 2D and 3D space
proj_matrix_2D = project_to_2d(matrix)
proj_matrix_3D = project_to_3d(matrix)

# Plot the projections. Each plot receives the projection of matching
# dimensionality; previously the 2D plot was handed the 3D projection and the
# 2D projection computed above was never used.
plot_3d_projection(proj_matrix_3D, labels)
plot_2d_projection(proj_matrix_2D, labels)

In [14]:
# Training target: the survival status column.
y_train = data["Survived"]
# Training features: every remaining column except the target and the passenger id.
x_train = data.drop(columns=["Survived", "PassengerId"])

# Test features: the test frame without the passenger id.
x_test = dataTest.drop(columns="PassengerId")

In [15]:
# Fit a k-nearest-neighbours classifier (Euclidean distance) on the training
# features, then collect its predictions for the test features in a list.
model = KNeighborsClassifier(
    n_neighbors=num_neighbors,
    metric="euclidean",
).fit(x_train, y_train)
predicted_labels = [label for label in model.predict(x_test)]

In [16]:
# Assemble the submission table: passenger ids and predicted survival flags,
# both converted to plain integers, then save it as a CSV without the index.
out = pd.DataFrame(
    {
        "PassengerId": list(map(int, dataTest["PassengerId"])),
        "Survived": list(map(int, predicted_labels)),
    }
)
out.to_csv('submission.csv', index=False)

out

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
