In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.cluster import KMeans

In [3]:
test_data = pd.read_csv('../data/spaceship-titanic/test.csv')
train_data = pd.read_csv('../data/spaceship-titanic/train.csv')

In [4]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [14]:
train_data.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64

In [13]:
# clean data
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['HomePlanet'].fillna(train_data['HomePlanet'].mode()[0], inplace=True)
train_data['CryoSleep'].fillna(train_data['CryoSleep'].mode()[0], inplace=True)
train_data['Destination'].fillna(train_data['Destination'].mode()[0], inplace=True)
train_data['Cabin'].fillna(train_data['Cabin'].mode()[0], inplace=True)
train_data['VIP'].fillna(train_data['VIP'].mode()[0], inplace=True)
train_data['RoomService'].fillna(train_data['RoomService'].median(), inplace=True)
train_data['FoodCourt'].fillna(train_data['FoodCourt'].median(), inplace=True)
train_data['ShoppingMall'].fillna(train_data['ShoppingMall'].median(), inplace=True)
train_data['Spa'].fillna(train_data['Spa'].median(), inplace=True)
train_data['VRDeck'].fillna(train_data['VRDeck'].median(), inplace=True)

test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['HomePlanet'].fillna(test_data['HomePlanet'].mode()[0], inplace=True)
test_data['CryoSleep'].fillna(test_data['CryoSleep'].mode()[0], inplace=True)
test_data['Destination'].fillna(test_data['Destination'].mode()[0], inplace=True)
test_data['Cabin'].fillna(test_data['Cabin'].mode()[0], inplace=True)
test_data['VIP'].fillna(test_data['VIP'].mode()[0], inplace=True)
test_data['RoomService'].fillna(test_data['RoomService'].median(), inplace=True)
test_data['FoodCourt'].fillna(test_data['FoodCourt'].median(), inplace=True)
test_data['ShoppingMall'].fillna(test_data['ShoppingMall'].median(), inplace=True)
test_data['Spa'].fillna(test_data['Spa'].median(), inplace=True)
test_data['VRDeck'].fillna(test_data['VRDeck'].median(), inplace=True)

In [16]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [17]:
train_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [24]:
# features = ['Age', 'HomePlanet', 'CryoSleep', 'Destination', 'Cabin', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

cat_features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

enc = preprocessing.OrdinalEncoder()
enc.fit(train_data[cat_features])
train_data[cat_features] = enc.transform(train_data[cat_features])

enc.fit(test_data[cat_features])
test_data[cat_features] = enc.transform(test_data[cat_features])

In [25]:
num_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

scaler = preprocessing.StandardScaler()
scaler.fit(train_data[num_features])

train_data[num_features] = scaler.transform(train_data[num_features])
test_data[num_features] = scaler.transform(test_data[num_features])

In [26]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,1.0,0.0,149.0,2.0,0.711945,0.0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,Maham Ofracculy,False
1,0002_01,0.0,0.0,2184.0,2.0,-0.334037,0.0,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,Juanna Vines,True
2,0003_01,1.0,0.0,1.0,2.0,2.036857,1.0,-0.268001,1.959998,-0.283579,5.695623,-0.219796,Altark Susent,False
3,0003_02,1.0,0.0,1.0,2.0,0.293552,0.0,-0.333105,0.52301,0.336851,2.687176,-0.092818,Solam Susent,False
4,0004_01,0.0,0.0,2186.0,2.0,-0.891895,0.0,0.125652,-0.237159,-0.031059,0.231374,-0.26124,Willy Santantines,True


In [61]:
X_train = train_data.drop(['PassengerId', 'Name', 'Transported'], axis=1).values
X_test = test_data.drop(['PassengerId', 'Name'], axis=1).values

y = train_data['Transported'].values

In [62]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y)

y_pred = knn.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': y_pred})
output.to_csv('submissions/spaceship/submissionKNNNew.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
