In [24]:
import pandas as pd
import os
import numpy as np
from sklearn import preprocessing

In [41]:
train = pd.read_csv('/Users/kassandramadulka/Downloads/spaceship-titanic/train.csv')
test = pd.read_csv('/Users/kassandramadulka/Downloads/spaceship-titanic/test.csv')
sample_submission = pd.read_csv('/Users/kassandramadulka/Downloads/spaceship-titanic/sample_submission.csv')

In [33]:
# PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
# HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
# CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
# Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
# Destination - The planet the passenger will be debarking to.
# Age - The age of the passenger.
# VIP - Whether the passenger has paid for special VIP service during the voyage.
# RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
# Name - The first and last names of the passenger.
# Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [36]:
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [42]:
# change True/False to dummy variables
train['Transported'] = train['Transported'].astype(int)
train['CryoSleep'] = train['CryoSleep'].fillna(0).astype(int)
train['VIP'] = train['VIP'].fillna(0).astype(int)
train['Deck'] = train['Cabin'].str.split(pat="/").str[0]
train['CabinNum'] = train['Cabin'].str.split(pat="/").str[1]
train['CabinSide'] = train['Cabin'].str.split(pat="/").str[2] 
# make P = 1, S = 0
train['CabinSide'] = np.where(train['CabinSide']=='P', 1, 0) 
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNum,CabinSide
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,1
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,0
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,0
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,0
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,A/98/P,55 Cancri e,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,A,98,1
8689,9278_01,Earth,1,G/1499/S,PSO J318.5-22,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,G,1499,0
8690,9279_01,Earth,0,G/1500/S,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,G,1500,0
8691,9280_01,Europa,0,E/608/S,55 Cancri e,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,E,608,0


In [90]:
# check if data is somewhat balanced
train.groupby(['Transported']).size()

Transported
0    4315
1    4378
dtype: int64

In [46]:
# change HomePlanet, Deck, Destination to dummy variables
homeplanet_list = list(train['HomePlanet'].fillna('Unknown').unique())
for home in homeplanet_list:
    train[home]=np.where(train['HomePlanet']==home, 1,0)

deck_list = list(train['Deck'].fillna('Unknown').unique())
for deck in deck_list:
    train['Deck'+deck]=np.where(train['Deck']==deck, 1,0)

destination_list = list(train['Destination'].fillna('Unknown').unique())
for dest in destination_list:
    train['Dest'+dest]=np.where(train['Destination']==dest, 1,0)


train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,DeckG,DeckUnknown,DeckE,DeckD,DeckC,DeckT,DestTRAPPIST-1e,DestPSO J318.5-22,Dest55 Cancri e,DestUnknown
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,...,0,0,0,0,0,0,1,0,0,0
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,...,0,0,0,0,0,0,1,0,0,0
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,A/98/P,55 Cancri e,41.0,1,0.0,6819.0,0.0,...,0,0,0,0,0,0,0,0,1,0
8689,9278_01,Earth,1,G/1499/S,PSO J318.5-22,18.0,0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,1,0,0
8690,9279_01,Earth,0,G/1500/S,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,...,1,0,0,0,0,0,1,0,0,0
8691,9280_01,Europa,0,E/608/S,55 Cancri e,32.0,0,0.0,1049.0,0.0,...,0,0,1,0,0,0,0,0,1,0


In [56]:
# normalize features
x_columns = ['CryoSleep', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
        'CabinSide', 'Europa',
       'Earth', 'Mars', 'Unknown', 'DeckB', 'DeckF', 'DeckA', 'DeckG',
       'DeckUnknown', 'DeckE', 'DeckD', 'DeckC', 'DeckT', 'DestTRAPPIST-1e',
       'DestPSO J318.5-22', 'Dest55 Cancri e', 'DestUnknown']
y_column = 'Transported'


min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(train[x_columns])
x_train_scaled_df = pd.DataFrame(x_scaled, columns=x_columns)
x_train_scaled_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinSide,Europa,...,DeckG,DeckUnknown,DeckE,DeckD,DeckC,DeckT,DestTRAPPIST-1e,DestPSO J318.5-22,Dest55 Cancri e,DestUnknown
0,0.0,0.493671,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.303797,0.0,0.007608,0.000302,0.001064,0.024500,0.001823,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.734177,1.0,0.003001,0.119948,0.000000,0.299670,0.002030,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.417722,0.0,0.000000,0.043035,0.015793,0.148563,0.007997,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.202532,0.0,0.021149,0.002348,0.006428,0.025214,0.000083,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,0.518987,1.0,0.000000,0.228726,0.000000,0.073322,0.003066,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8689,1.0,0.227848,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8690,0.0,0.329114,0.0,0.000000,0.000000,0.079687,0.000045,0.000000,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8691,0.0,0.405063,0.0,0.000000,0.035186,0.000000,0.015753,0.134049,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [55]:
# transform and normalize test data
# change True/False to dummy variables
test['CryoSleep'] = test['CryoSleep'].fillna(0).astype(int) # could also remove na's
test['VIP'] = test['VIP'].fillna(0).astype(int)
test['Deck'] = test['Cabin'].str.split(pat="/").str[0]
test['CabinNum'] = test['Cabin'].str.split(pat="/").str[1]
test['CabinSide'] = test['Cabin'].str.split(pat="/").str[2] 
# make P = 1, S = 0
test['CabinSide'] = np.where(test['CabinSide']=='P', 1, 0) 
# make dummy variables
for home in homeplanet_list:
    test[home]=np.where(test['HomePlanet']==home, 1,0)

for deck in deck_list:
    test['Deck'+deck]=np.where(test['Deck']==deck, 1,0)

for dest in destination_list:
    test['Dest'+dest]=np.where(test['Destination']==dest, 1,0)

# normalize test data
min_max_scaler = preprocessing.MinMaxScaler()
x_test_scaled = min_max_scaler.fit_transform(test[x_columns])
x_test_scaled_df = pd.DataFrame(x_test_scaled, columns=x_columns)
x_test_scaled_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinSide,Europa,...,DeckG,DeckUnknown,DeckE,DeckD,DeckC,DeckT,DestTRAPPIST-1e,DestPSO J318.5-22,Dest55 Cancri e,DestUnknown
0,1.0,0.341772,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.240506,0.0,0.000000,0.000356,0.00000,0.142260,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.392405,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.481013,0.0,0.000000,0.263206,0.00000,0.009121,0.026266,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.253165,0.0,0.000865,0.000000,0.07658,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1.0,0.430380,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4273,0.0,0.531646,0.0,0.000000,0.033514,0.00205,0.000504,0.006466,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4274,1.0,,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4275,0.0,,0.0,0.000000,0.106042,0.00000,0.000000,0.023482,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# put data through classifier?
# Logistic Regression, Decision Tree, SVM, AdaBoost? - Try logit and random forest first
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split


# create test, train split separate from CV
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(x_train_scaled_df.fillna(0), train[y_column], test_size=0.20, random_state=42)


In [80]:
# fit logistic regression
clf = LogisticRegressionCV(cv=5, random_state=0, max_iter=500).fit(X_train_train, y_train_train)
# get score
clf.score(X_train_test, y_train_test)

0.7832087406555491

In [94]:
# predictions
clf_predict = clf.predict(X_train_test)

# 10-Fold Cross validation
clf_cv_score = cross_val_score(clf, X_train_train, y_train_train, cv=10, scoring='roc_auc')

print("=== Confusion Matrix ===")
print(confusion_matrix(y_train_test, clf_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_train_test, clf_predict))
print('\n')
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", clf_cv_score.mean())

=== Confusion Matrix ===
[[645 216]
 [161 717]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.80      0.75      0.77       861
           1       0.77      0.82      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



=== All AUC Scores ===
[0.8711891  0.87959125 0.87682907 0.88806358 0.87352795 0.8740207
 0.884      0.85921325 0.89233954 0.89099793]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8789772376404695


In [93]:
# Decision Tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix



rfc = RandomForestClassifier()
rfc.fit(X_train_train, y_train_train)
# predictions
rfc_predict = rfc.predict(X_train_test)

# 10-Fold Cross validation
rfc_cv_score = cross_val_score(rfc, X_train_train, y_train_train, cv=10, scoring='roc_auc')

print("=== Confusion Matrix ===")
print(confusion_matrix(y_train_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_train_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[689 172]
 [204 674]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.77      0.80      0.79       861
           1       0.80      0.77      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



=== All AUC Scores ===
[0.8614327  0.86687036 0.86682907 0.863782   0.85281159 0.84735818
 0.871147   0.85504762 0.8690559  0.87793789]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.863227229861535


In [97]:
# try tuning parameters
# focus on 3 hyperparameters: n_estimators, max_features, and max_depth
from sklearn.model_selection import RandomizedSearchCV
# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)
# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }
# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the model
rfc_random.fit(X_train_train, y_train_train)
# print results
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 1200, 'max_features': 'auto', 'max_depth': 420}


In [98]:
# test new parameters- doesnt make a big difference
rfc = RandomForestClassifier(n_estimators=1200, max_depth=420, max_features='auto')
rfc.fit(X_train_train,y_train_train)
rfc_predict = rfc.predict(X_train_test)
rfc_cv_score = cross_val_score(rfc, X_train_train, y_train_train, cv=10, scoring='roc_auc')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_train_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_train_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[685 176]
 [201 677]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.77      0.80      0.78       861
           1       0.79      0.77      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



=== All AUC Scores ===
[0.8599422  0.87003303 0.87136251 0.86748555 0.85657971 0.85483644
 0.87038923 0.8582236  0.87221532 0.88100621]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8662073804138395


In [107]:
# use random forest model to get new predictions
rfc_predict_test = rfc.predict(x_test_scaled_df.fillna(0))
d = {'PassengerId': test['PassengerId'], 'Transported': rfc_predict_test }
df = pd.DataFrame(data=d)
df

Unnamed: 0,PassengerId,Transported
0,0013_01,1
1,0018_01,0
2,0019_01,1
3,0021_01,1
4,0023_01,1
...,...,...
4272,9266_02,1
4273,9269_01,0
4274,9271_01,1
4275,9273_01,1


In [110]:
df.replace({0: False, 1: True}, inplace=True)
df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
