# Imports

In [8]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# CSVs

In [9]:
# Read the competition CSVs from the working directory (test first, then train).
test_data, train_data = (pd.read_csv(path) for path in ('test.csv', 'train.csv'))

In [10]:
# Preview the first 20 training rows to see the columns and typical values.
train_data.iloc[:20]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


While looking through the data, it is important to consider which columns are useful and which are not. For example, the passengers' names most likely won't have an effect on who is transported, as most of the names are unique. All of the columns that record spending (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck) will be important because, even just looking through this data, many of the transported passengers had spent some money, with the exception of a few. The CryoSleep column suggests that those who didn't spend money still had a chance to be transported if they were in cryosleep.

While working on the feature engineering in each of the notebooks, the columns that hurt the validation score were HomePlanet, Destination, and Age. Age does seem to have some influence on who is transported, but not enough to justify using it. The best score came from splitting Cabin into its deck and side, which proved extremely useful to the regressor: it raised the validation score (1 − MAE) considerably.

# 'Preprocessing'

First, we choose which features we want to implement into our pipelines.

In [11]:
# Model inputs: the five spending columns plus Cabin and CryoSleep.
features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin', 'CryoSleep']

# Target column first, then the matching feature frames for train and test.
y = train_data['Transported']
X = train_data.loc[:, features]
test_X = test_data.loc[:, features]

# Seeded hold-out split; no test_size is passed, so the library default applies.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=42,
)

We preprocess the data, encoding the categorical columns as numbers that our regressor can work with.

In [12]:
# Work on copies so the raw train/validation/test splits stay untouched.
encoded_X_train = train_X.copy()
encoded_X_valid = val_X.copy()
encoded_test = test_X.copy()

# Split Cabin into Side (last character) and Deck (first character, kept in the
# 'Cabin' column). Side must be read before Cabin is overwritten.
for frame in (encoded_X_train, encoded_X_valid, encoded_test):
    frame['Side'] = frame['Cabin'].str[-1]
    frame['Cabin'] = frame['Cabin'].str[0]

# Categorical columns are the ones the training frame stores with object dtype.
is_object = (encoded_X_train.dtypes == 'object')
object_cols = list(is_object[is_object].index)

# Apply one-hot encoder to each column with categorical data.
# The encoder is fitted on the training split only; categories unseen during
# fitting are ignored (all-zero row) in the validation and test sets.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# The encoder returns plain arrays, so the original row index is passed back in
# to keep the one-hot rows aligned with the numeric columns during concat.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(encoded_X_train[object_cols]),
    index=encoded_X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(encoded_X_valid[object_cols]),
    index=encoded_X_valid.index,
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(encoded_test[object_cols]),
    index=encoded_test.index,
)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = encoded_X_train.drop(object_cols, axis=1)
num_X_valid = encoded_X_valid.drop(object_cols, axis=1)
num_test = encoded_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
encoded_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
encoded_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
encoded_test = pd.concat([num_test, OH_cols_test], axis=1)

# Ensure all columns have string type (the one-hot columns are integer-labelled).
# Each frame converts its OWN labels; the validation frame previously borrowed
# the training frame's labels, which would hide any column mismatch.
encoded_X_train.columns = encoded_X_train.columns.astype(str)
encoded_X_valid.columns = encoded_X_valid.columns.astype(str)
encoded_test.columns = encoded_test.columns.astype(str)

# All three frames must share one column layout before imputation and modelling.
assert list(encoded_X_valid.columns) == list(encoded_X_train.columns)
assert list(encoded_test.columns) == list(encoded_X_train.columns)

encoded_X_train

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
5623,441.0,0.0,397.0,471.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5253,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
478,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5344,0.0,29.0,317.0,434.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,14.0,2.0,144.0,610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5191,690.0,0.0,30.0,762.0,428.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5390,158.0,0.0,476.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
860,379.0,0.0,1626.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


We impute the missing values so that they do not cause errors when the model is fitted.

In [13]:
# Imputer with default settings; it fills the NaNs left in the encoded frames.
simp_impute = SimpleImputer()

# Learn the fill values from the training split only ...
imputed_X_train = pd.DataFrame(
    simp_impute.fit_transform(encoded_X_train),
    columns=encoded_X_train.columns,
    index=encoded_X_train.index,
)

# ... and reuse them unchanged for validation and test. The test set is only
# transformed, never re-fitted, so all three frames are imputed consistently
# and no test-set statistics enter the preprocessing.
imputed_X_valid = pd.DataFrame(
    simp_impute.transform(encoded_X_valid),
    columns=encoded_X_valid.columns,
    index=encoded_X_valid.index,
)
imputed_test = pd.DataFrame(
    simp_impute.transform(encoded_test),
    columns=encoded_test.columns,
    index=encoded_test.index,
)

# Building the Model

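Now we do a manual search over several tunable parameters, one at a time: each parameter is swept over a small range, scored by validation MAE, and fixed at its best value before the next parameter is tuned. The resulting settings are the ones used to predict the outcome of the test data.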

In [14]:
def get_mae(n_estimators, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest with `n_estimators` trees.

    The forest is fitted on (train_X, train_y) with the absolute-error
    criterion and a fixed random_state, then scored on (val_X, val_y).
    """
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion='absolute_error',
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Score every candidate forest size on the validation split and keep the one
# with the lowest MAE (the first such size wins a tie).
scores = {}
for estimators in range(85, 90):
    scores[estimators] = get_mae(estimators, imputed_X_train, imputed_X_valid, train_y, val_y)
best_estimator_size = min(scores, key=scores.get)

print(best_estimator_size)

88


In [15]:
def get_mae(max_features, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest limited to `max_features`.

    The tree count comes from the module-level `best_estimator_size`. The
    forest is fitted on (train_X, train_y) and scored on (val_X, val_y).
    """
    forest = RandomForestRegressor(
        n_estimators=best_estimator_size,
        max_features=max_features,
        criterion='absolute_error',
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Sweep max_features and keep the value with the lowest validation MAE.
# The loop variable is deliberately not called `features`, so the feature-name
# list defined earlier is left untouched.
scores = {}
for n_features in range(16, 20):
    scores[n_features] = get_mae(n_features, imputed_X_train, imputed_X_valid, train_y, val_y)
best_feature_size = min(scores, key=scores.get)

print(best_feature_size)

16


In [16]:
def get_mae(max_depth, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest capped at `max_depth`.

    Tree count and max_features come from the module-level
    `best_estimator_size` and `best_feature_size`. The forest is fitted on
    (train_X, train_y) and scored on (val_X, val_y).
    """
    forest = RandomForestRegressor(
        n_estimators=best_estimator_size,
        max_features=best_feature_size,
        max_depth=max_depth,
        criterion='absolute_error',
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Sweep max_depth and keep the depth with the lowest validation MAE.
scores = {}
for depths in range(10, 20):
    scores[depths] = get_mae(depths, imputed_X_train, imputed_X_valid, train_y, val_y)
best_depth_size = min(scores, key=scores.get)
print(best_depth_size)

13


In [17]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest capped at `max_leaf_nodes`.

    Tree count, max_features and max_depth come from the module-level
    `best_estimator_size`, `best_feature_size` and `best_depth_size`. The
    forest is fitted on (train_X, train_y) and scored on (val_X, val_y).
    """
    forest = RandomForestRegressor(
        max_leaf_nodes=max_leaf_nodes,
        n_estimators=best_estimator_size,
        max_features=best_feature_size,
        max_depth=best_depth_size,
        n_jobs=-1,
        criterion='absolute_error',
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Sweep max_leaf_nodes and keep the value with the lowest validation MAE.
scores = {}
for leaf_size in range(170, 180):
    scores[leaf_size] = get_mae(leaf_size, imputed_X_train, imputed_X_valid, train_y, val_y)
best_tree_size = min(scores, key=scores.get)
print(best_tree_size)

179


# Verification

Finally, we can verify if the MAE of our validation data is good enough to make a submission with.

In [19]:
# Build the forest from the four tuned hyperparameters.
ship_model = RandomForestRegressor(
    n_estimators=best_estimator_size,
    max_features=best_feature_size,
    max_depth=best_depth_size,
    max_leaf_nodes=best_tree_size,
    criterion='absolute_error',
    n_jobs=-1,
    random_state=0,
)
ship_model.fit(imputed_X_train, train_y)

# Predict the held-out validation rows.
ship_pred = ship_model.predict(imputed_X_valid)

# Report 1 - MAE on the validation split.
print(1 - mean_absolute_error(val_y, ship_pred))

0.7540248390064397


We refit the model on the full labelled dataset (the training and validation splits combined) so that the final model learns from as much data as possible.

In [20]:
# Stack the training and validation splits back together; features and targets
# are concatenated in the same order so rows stay paired positionally.
full_data = pd.concat((imputed_X_train, imputed_X_valid), axis=0)
full_valid = pd.concat((train_y, val_y), axis=0)  # the combined target, despite the name

# Refit the tuned model on every labelled row.
ship_model.fit(full_data, full_valid)

0,1,2
,n_estimators,88
,criterion,'absolute_error'
,max_depth,13
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,16
,max_leaf_nodes,179
,min_impurity_decrease,0.0
,bootstrap,True


We predict the test data and make a submission next.

In [21]:
# The regressor returns continuous scores; round each one and cast it to bool
# to get the True/False label used in the submission.
test_preds = ship_model.predict(imputed_test)
final_preds = [bool(np.round(score)) for score in test_preds]

# Submission

In [22]:
# The passenger IDs are read from the dataframe named test_data; change the
# reference below if your test data lives under a different name.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': final_preds,
})

# Name the file after the current date and time so each run gets its own file.
timestamp = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_file_name = f'submission_{timestamp}.csv'

# Write the submission without the dataframe index column.
output.to_csv(submission_file_name, index=False)

Best Score: 78.957%