In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

## Data exploration

Load and explore the dataset

In [91]:
df = pd.read_csv('spaceship-titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [92]:
print(df.dtypes)
original_datapoints = len(df)
print(f'\nDataset has {original_datapoints} elements.')

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

Dataset has 8693 elements.


Attributes details: 

* **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
* **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
* **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
* **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
* **Destination** - The planet the passenger will be debarking to.
* **Age** - The age of the passenger.
* **VIP** - Whether the passenger has paid for special VIP service during the voyage.
* **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
* **Name** - The first and last names of the passenger.
* **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [93]:
def nan_count(df):
    for col in df.columns:
        hasNan = df[col].isnull().values.any()
        if hasNan:
            nan_count = df[col].isna().sum()
            nan_perc = nan_count/len(df[col])
            print(f'{col} has: {nan_perc*100} % of NaN values')
        
        else:
            print(f'{col} has no NaN values')

nan_count(df)

PassengerId has no NaN values
HomePlanet has: 2.312205222592891 % of NaN values
CryoSleep has: 2.4962613597147127 % of NaN values
Cabin has: 2.289198205452663 % of NaN values
Destination has: 2.093638559760727 % of NaN values
Age has: 2.0591280340503855 % of NaN values
VIP has: 2.3352122397331185 % of NaN values
RoomService has: 2.082135051190613 % of NaN values
FoodCourt has: 2.105142068330841 % of NaN values
ShoppingMall has: 2.392729782583688 % of NaN values
Spa has: 2.105142068330841 % of NaN values
VRDeck has: 2.1626596111814105 % of NaN values
Name has: 2.300701714022777 % of NaN values
Transported has no NaN values


## Data processing

In [94]:
from sklearn.preprocessing import LabelEncoder

# Homeplanet
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
df['Group'] = df['Group'].astype(int)

# Cabin
df['Deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if not pd.isnull(x) else x)
df['Side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if not pd.isnull(x) else x)
df.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,Deck,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,B,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,F,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,A,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,A,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,F,S


In [95]:
from typing import Tuple

def drop_Nans(col: str, df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna(subset=col)

def fill_Nan_based_on_group(col: str, df: pd.DataFrame) -> pd.DataFrame:
    nan_map = df.isna()
    for id in range(len(df)):
        if nan_map.loc[id, col]:
            group = df.loc[id, 'Group']
            same_group_df = df.loc[df['Group'] == group]

            for ind in same_group_df.index:
                if not nan_map.loc[ind, col]:
                    df.loc[id, col] = same_group_df.loc[ind, col]
                    break        
    return df

def fill_Nan_with_median(col: str, df: pd.DataFrame, median: float = None) -> Tuple[pd.DataFrame, float]:
    if median == None:
        median = df[col].median()
    df[col] = df[col].fillna(median)
    return df, median

def normalize_col(col: str, df: pd.DataFrame, mean:float = None, std:float = None) -> Tuple[pd.DataFrame, float, float]:
    if mean == None and std == None:
        mean = df[col].mean()
        std = df[col].std()
    df[col] = (df[col] - mean) / std
    return (df, mean, std)

def fillna(col, mcv):
    col.fillna(mcv, inplace=True)
    return col


In [96]:
for i in ['HomePlanet', 'CryoSleep', 'Destination', 'Cabin']:
    df = fill_Nan_based_on_group(i, df)

df['Total_Expenses'] = sum(df[i] for i in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

# Normalize and store
medians = {}
means = {}
stds = {}
for i in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Total_Expenses']:
    df, median = fill_Nan_with_median(i, df)
    df, mean, std = normalize_col(i, df)
    medians[i] = median
    means[i] = mean
    stds[i] = std

df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,Deck,Side,Total_Expenses
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,0.711904,False,-0.333085,-0.281011,-0.283562,-0.27061,-0.262988,Maham Ofracculy,False,1,B,P,-0.52045
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,-0.334018,False,-0.168064,-0.275371,-0.241757,0.217146,-0.224192,Juanna Vines,True,2,F,S,-0.248089
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,2.03674,True,-0.267985,1.959885,-0.283562,5.695295,-0.219783,Altark Susent,False,3,A,S,3.321842
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,0.293535,False,-0.333085,0.52298,0.336832,2.687022,-0.092813,Solam Susent,False,3,A,S,1.39496
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,-0.891844,False,0.125645,-0.237145,-0.031057,0.231361,-0.261225,Willy Santantines,True,4,F,S,-0.116719


In [97]:
df = df.drop(columns=['PassengerId', 'Cabin', 'Name', 'Group'])

# Fill Nans with most common values
mcv = [df[col].value_counts().index[0] for col in df.columns]
for i,col in enumerate(df.columns):
    df[col] = fillna(df[col], mcv[i])
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side,Total_Expenses
0,Europa,False,TRAPPIST-1e,0.711904,False,-0.333085,-0.281011,-0.283562,-0.27061,-0.262988,False,B,P,-0.52045
1,Earth,False,TRAPPIST-1e,-0.334018,False,-0.168064,-0.275371,-0.241757,0.217146,-0.224192,True,F,S,-0.248089
2,Europa,False,TRAPPIST-1e,2.03674,True,-0.267985,1.959885,-0.283562,5.695295,-0.219783,False,A,S,3.321842
3,Europa,False,TRAPPIST-1e,0.293535,False,-0.333085,0.52298,0.336832,2.687022,-0.092813,False,A,S,1.39496
4,Earth,False,TRAPPIST-1e,-0.891844,False,0.125645,-0.237145,-0.031057,0.231361,-0.261225,True,F,S,-0.116719


In [98]:
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    df = df.join(pd.get_dummies(df[col], prefix=col))
    df = df.drop(columns=col)

df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Total_Expenses,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,False,0.711904,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,False,-0.520450,...,False,True,False,False,False,False,False,False,True,False
1,False,-0.334018,False,-0.168064,-0.275371,-0.241757,0.217146,-0.224192,True,-0.248089,...,False,False,False,False,False,True,False,False,False,True
2,False,2.036740,True,-0.267985,1.959885,-0.283562,5.695295,-0.219783,False,3.321842,...,True,False,False,False,False,False,False,False,False,True
3,False,0.293535,False,-0.333085,0.522980,0.336832,2.687022,-0.092813,False,1.394960,...,True,False,False,False,False,False,False,False,False,True
4,False,-0.891844,False,0.125645,-0.237145,-0.031057,0.231361,-0.261225,True,-0.116719,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,0.851361,True,-0.333085,3.992106,-0.283562,1.189104,-0.197740,False,2.638349,...,True,False,False,False,False,False,False,False,True,False
8689,True,-0.752387,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,False,-0.520450,...,False,False,False,False,False,False,True,False,False,True
8690,False,-0.194562,False,-0.333085,-0.281011,2.846835,-0.269722,-0.262988,True,0.172665,...,False,False,False,False,False,False,True,False,False,True
8691,False,0.223807,False,-0.333085,0.376344,-0.283562,0.043011,2.589428,False,1.195500,...,False,False,False,False,True,False,False,False,False,True


In [99]:
X = df.drop(columns=['Transported'])
y = df['Transported']

## Model

In [100]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [101]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

classifiers = [
    KNeighborsClassifier(),
    # SVC(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
]

parameters = [
    {'n_neighbors':[1, 5, 10]},
    # {'kernel':('linear', 'rbf'), 'C':[0.025, 0.1, 1, 10]},
    {'max_depth':[5, 10], 'n_estimators':[10, 15, 50]},
    {'n_estimators': [10, 15], 'learning_rate':[0.1, 1]}
]

for model, params in zip(classifiers, parameters):
    clf = GridSearchCV(model, params, cv=10)
    clf.fit(X_train, y_train)
    print(type(model).__name__, 'has best params:', clf.best_params_, 'with a score of', clf.best_score_*100, '%')

KNeighborsClassifier has best params: {'n_neighbors': 10} with a score of 77.88066610491747 %
RandomForestClassifier has best params: {'max_depth': 10, 'n_estimators': 50} with a score of 80.06654567453116 %
AdaBoostClassifier has best params: {'learning_rate': 1, 'n_estimators': 15} with a score of 77.46950890156425 %


In [102]:
from sklearn.metrics import precision_recall_curve, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print('Confusion Matrix: ')
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))

    print('\nAccuracy: ')
    print(accuracy_score(y_pred=y_pred, y_true=y_test) * 100)

    print('\nPrecision: ')
    print(precision_score(y_pred=y_pred, y_true=y_test) * 100)

    print('\nRecall: ')
    print(recall_score(y_pred=y_pred, y_true=y_test) * 100)

    print('\nF1 Score: ')
    print(f1_score(y_pred=y_pred, y_true=y_test) * 100)

In [103]:
model = RandomForestClassifier(max_depth=10, n_estimators=50)
model.fit(X_train, y_train)
evaluate(model, X_test, y_test)

Confusion Matrix: 
[[1023  280]
 [ 249 1056]]

Accuracy: 
79.71625766871165

Precision: 
79.04191616766467

Recall: 
80.91954022988506

F1 Score: 
79.96970844377131


In [104]:
model.fit(X, y)

# TEST


In [105]:
test_df = pd.read_csv('spaceship-titanic/test.csv')
test_ids = test_df['PassengerId']
# Homeplanet
test_df['Group'] = test_df['PassengerId'].apply(lambda x: x.split('_')[0])
test_df['Group'] = test_df['Group'].astype(int)

# Cabin
test_df['Deck'] = test_df['Cabin'].apply(lambda x: x.split('/')[0] if not pd.isnull(x) else x)
test_df['Side'] = test_df['Cabin'].apply(lambda x: x.split('/')[2] if not pd.isnull(x) else x)

for i in ['HomePlanet', 'CryoSleep', 'Destination', 'Cabin']:
    test_df = fill_Nan_based_on_group(i, test_df)

test_df['Total_Expenses'] = sum(test_df[i] for i in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

test_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,Deck,Side,Total_Expenses
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,13,G,S,0.0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,18,F,S,2832.0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,19,C,S,0.0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,21,C,S,7418.0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,23,F,S,645.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,9266,G,S,0.0
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,9269,,,1018.0
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,9271,D,P,0.0
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,9273,D,P,3203.0


In [106]:
# Normalize and store
for i in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Total_Expenses']:
    median = medians[i]
    mean = means[i]
    std = stds[i]

    test_df[i] = test_df[i].fillna(median)
    test_df, mean, std = normalize_col(i, test_df, mean, std)

    median, mean, std = None, None, None

test_df = test_df.drop(columns=['PassengerId', 'Cabin', 'Name', 'Group'])

test_df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side,Total_Expenses
0,Earth,True,TRAPPIST-1e,-0.124834,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,G,S,-0.520450
1,Earth,False,TRAPPIST-1e,-0.682659,False,-0.333085,-0.275371,-0.283562,2.237469,-0.262988,F,S,0.527549
2,Europa,True,55 Cancri e,0.154079,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,C,S,-0.520450
3,Europa,False,TRAPPIST-1e,0.642176,False,-0.333085,3.887456,-0.283562,-0.109802,0.252827,C,S,2.224626
4,Earth,False,TRAPPIST-1e,-0.612931,False,-0.317946,-0.281011,0.778298,-0.270610,-0.262988,F,S,-0.281764
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,Earth,True,TRAPPIST-1e,0.363264,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,G,S,-0.520450
4273,Earth,False,TRAPPIST-1e,0.921089,False,-0.333085,0.249761,-0.255135,-0.261726,-0.136018,,,-0.143733
4274,Mars,True,55 Cancri e,-0.124834,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,D,P,-0.520450
4275,Europa,False,,-0.124834,False,-0.333085,1.398408,-0.283562,-0.270610,0.198160,D,P,0.664840


In [107]:
# Fill Nans with most common values
for i,col in enumerate(test_df.columns):
    test_df[col] = fillna(test_df[col], mcv[i])

for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    test_df = test_df.join(pd.get_dummies(test_df[col], prefix=col))
    test_df = test_df.drop(columns=col)

test_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_Expenses,HomePlanet_Earth,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_F,Side_P,Side_S
0,True,-0.124834,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,True,...,False,False,False,False,False,True,False,False,False,True
1,False,-0.682659,False,-0.333085,-0.275371,-0.283562,2.237469,-0.262988,0.527549,True,...,False,False,False,False,True,False,False,False,False,True
2,True,0.154079,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,False,...,False,True,False,False,False,False,False,False,False,True
3,False,0.642176,False,-0.333085,3.887456,-0.283562,-0.109802,0.252827,2.224626,False,...,False,True,False,False,False,False,False,False,False,True
4,False,-0.612931,False,-0.317946,-0.281011,0.778298,-0.270610,-0.262988,-0.281764,True,...,False,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,True,0.363264,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,True,...,False,False,False,False,False,True,False,False,False,True
4273,False,0.921089,False,-0.333085,0.249761,-0.255135,-0.261726,-0.136018,-0.143733,True,...,False,False,False,False,False,False,False,True,False,False
4274,True,-0.124834,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,False,...,False,False,True,False,False,False,False,False,True,False
4275,False,-0.124834,False,-0.333085,1.398408,-0.283562,-0.270610,0.198160,0.664840,False,...,False,False,True,False,False,False,False,False,True,False


In [108]:
diff = np.setdiff1d(df.columns, test_df.columns)

for el in diff:
    if el != 'Transported':
        test_df[el] = [0] * len(test_df)

diff = np.setdiff1d(test_df.columns, df.columns)
for el in diff:
    test_df = test_df.drop(columns=el)

test_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_Expenses,HomePlanet_Earth,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,True,-0.124834,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,True,...,False,False,False,False,False,False,True,False,False,True
1,False,-0.682659,False,-0.333085,-0.275371,-0.283562,2.237469,-0.262988,0.527549,True,...,False,False,False,False,False,True,False,False,False,True
2,True,0.154079,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,False,...,False,False,True,False,False,False,False,False,False,True
3,False,0.642176,False,-0.333085,3.887456,-0.283562,-0.109802,0.252827,2.224626,False,...,False,False,True,False,False,False,False,False,False,True
4,False,-0.612931,False,-0.317946,-0.281011,0.778298,-0.270610,-0.262988,-0.281764,True,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,True,0.363264,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,True,...,False,False,False,False,False,False,True,False,False,True
4273,False,0.921089,False,-0.333085,0.249761,-0.255135,-0.261726,-0.136018,-0.143733,True,...,False,False,False,False,False,False,False,False,False,False
4274,True,-0.124834,False,-0.333085,-0.281011,-0.283562,-0.270610,-0.262988,-0.520450,False,...,False,False,False,True,False,False,False,False,True,False
4275,False,-0.124834,False,-0.333085,1.398408,-0.283562,-0.270610,0.198160,0.664840,False,...,False,False,False,True,False,False,False,False,True,False


In [109]:
X = test_df
y_pred = model.predict(X)

In [111]:
return_df = predictions = pd.DataFrame(columns=['PassengerId','Transported'])
return_df['PassengerId'] = test_ids
return_df['Transported'] = y_pred

return_df.to_csv("spaceship-titanic/my_submission.csv", index=False)
