In [10]:
import torch
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mss
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
PROCESSED_DATA_PATH = "./data/processed"
train_df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH,'train.csv'))
test_df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH,'test.csv'))

In [3]:
train_df.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Transported
0,0,0.696428,0,-0.335475,-0.287209,-0.283024,-0.274476,-0.269909,0,1,0,0,0,1,0
1,0,-0.337018,0,-0.173732,-0.281828,-0.243568,0.199108,-0.230813,1,0,0,0,0,1,1
2,0,2.00546,1,-0.271668,1.850568,-0.283024,5.518075,-0.22637,0,1,0,0,0,1,0
3,0,0.28305,0,-0.335475,0.479784,0.302505,2.597215,-0.098421,0,1,0,0,0,1,0
4,0,-0.88819,0,0.114141,-0.245362,-0.044709,0.21291,-0.268132,1,0,0,0,0,1,1


In [4]:
test_df.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,1,-0.112544,0,-0.356108,-0.295677,-0.307818,-0.274546,-0.254932,1,0,0,0,0,1
1,0,-0.67771,0,-0.356108,-0.289583,-0.307818,2.327261,-0.254932,1,0,0,0,0,1
2,1,0.170039,0,-0.356108,-0.295677,-0.307818,-0.274546,-0.254932,0,1,0,1,0,0
3,0,0.664559,0,-0.356108,4.208659,-0.307818,-0.107728,0.254054,0,1,0,0,0,1
4,0,-0.607065,0,-0.339717,-0.295677,0.816417,-0.274546,-0.254932,1,0,0,0,0,1


In [6]:
y = train_df.Transported.copy()
y

0       0
1       1
2       0
3       0
4       1
       ..
6918    0
6919    0
6920    1
6921    0
6922    1
Name: Transported, Length: 6923, dtype: int64

In [7]:
X = train_df.drop(['Transported'],axis=1)
X

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,0.696428,0,-0.335475,-0.287209,-0.283024,-0.274476,-0.269909,0,1,0,0,0,1
1,0,-0.337018,0,-0.173732,-0.281828,-0.243568,0.199108,-0.230813,1,0,0,0,0,1
2,0,2.005460,1,-0.271668,1.850568,-0.283024,5.518075,-0.226370,0,1,0,0,0,1
3,0,0.283050,0,-0.335475,0.479784,0.302505,2.597215,-0.098421,0,1,0,0,0,1
4,0,-0.888190,0,0.114141,-0.245362,-0.044709,0.212910,-0.268132,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6918,0,0.834221,1,-0.335475,3.789273,-0.283024,1.142823,-0.204157,0,1,0,1,0,0
6919,1,-0.750397,0,-0.335475,-0.287209,-0.283024,-0.274476,-0.269909,1,0,0,0,1,0
6920,0,-0.199225,0,-0.335475,-0.287209,2.671452,-0.273613,-0.269909,1,0,0,0,0,1
6921,0,0.214153,0,-0.335475,0.339896,-0.283024,0.030032,2.604501,0,1,0,1,0,0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [9]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((5538, 14), (1385, 14), (5538,), (1385,))

In [11]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)
svc_pred = svc.predict(X_test)
svc_acc = accuracy_score(svc_pred,y_test)
svc_acc

0.7870036101083032

In [12]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
sgd_accuracy = accuracy_score(sgd_pred, y_test)

sgd_accuracy

0.8036101083032491

In [13]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_test)
decision_tree_accuracy = accuracy_score(decision_tree_pred, y_test)

decision_tree_accuracy

0.7393501805054151

In [14]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
random_forest_pred = random_forest.predict(X_test)
random_forest_accuracy = accuracy_score(random_forest_pred, y_test)

random_forest_accuracy

0.7826714801444044

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state = 42)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)
gbc_accuracy = accuracy_score(gbc_pred, y_test)

gbc_accuracy

0.8036101083032491

In [16]:
test_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,1,-0.112544,0,-0.356108,-0.295677,-0.307818,-0.274546,-0.254932,1,0,0,0,0,1
1,0,-0.677710,0,-0.356108,-0.289583,-0.307818,2.327261,-0.254932,1,0,0,0,0,1
2,1,0.170039,0,-0.356108,-0.295677,-0.307818,-0.274546,-0.254932,0,1,0,1,0,0
3,0,0.664559,0,-0.356108,4.208659,-0.307818,-0.107728,0.254054,0,1,0,0,0,1
4,0,-0.607065,0,-0.339717,-0.295677,0.816417,-0.274546,-0.254932,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3433,0,1.017788,0,-0.279072,-0.295677,6.510182,-0.274546,-0.254932,0,0,1,0,0,1
3434,0,0.805851,0,-0.356108,0.290049,-0.307818,-0.271781,-0.254932,1,0,0,0,0,1
3435,1,0.381976,0,-0.356108,-0.295677,-0.307818,-0.274546,-0.254932,1,0,0,0,0,1
3436,0,0.947142,0,-0.356108,0.277860,-0.277721,-0.265330,-0.129643,1,0,0,0,0,1


In [17]:
submission_pred = sgd.predict(test_df)
submission_pred

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [18]:
len(submission_pred)

3438

In [19]:
RAW_DATA_PATH = "./data/raw_data"
submission_df = pd.read_csv(os.path.join(RAW_DATA_PATH,"sample_submission.csv"))

In [20]:
submission_df

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False
