# Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor as xgb

# CSVs

In [40]:
# Read both CSV files from the working directory. `train_data` supplies the
# features and the `Transported` target; `test_data` supplies the features
# and PassengerId values used for the submission.
test_data, train_data = (
    pd.read_csv(csv_name) for csv_name in ('test.csv', 'train.csv')
)

In [58]:
# Preview the first 20 training rows to check column names and spot missing values.
train_data.head(20)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


# 'Preprocessing'

In [42]:
# Model inputs: five numeric columns followed by two text columns.
numeric_feature_names = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
text_feature_names = ['Cabin', 'HomePlanet']
features = numeric_feature_names + text_feature_names

# Target first, then the matching feature matrices for train and test.
y = train_data['Transported']
X = train_data[features]
test_X = test_data[features]

# Hold out part of the labelled data for validation (train_test_split's
# default split size); the fixed seed keeps the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

The categorical columns below have no inherent ranking, so I one-hot encode them instead of using an ordinal encoding.

In [69]:
# Work on copies so the raw splits (train_X / val_X / test_X) are never
# modified and this cell gives the same result every time it is re-run.
encoded_X_train = train_X.copy()
encoded_X_valid = val_X.copy()
encoded_test = test_X.copy()

# Cabin values look like "B/0/P"; keep only the leading character so the
# column collapses to a small set of categories. Missing values stay NaN.
encoded_X_train['Cabin'] = encoded_X_train['Cabin'].str[0]
encoded_X_valid['Cabin'] = encoded_X_valid['Cabin'].str[0]
encoded_test['Cabin'] = encoded_test['Cabin'].str[0]

# Columns with dtype `object` are treated as the categorical ones.
s = (encoded_X_train.dtypes == 'object')
object_cols = list(s[s].index)

# The encoder is fitted on the training split only (see fit_transform below).
# handle_unknown='ignore' lets validation/test contain categories that were
# not seen during fitting without raising an error.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Build each one-hot frame with its source frame's index: the encoder returns
# a bare array, and without the original index the concat below would not
# line up row-for-row with the numeric columns.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(encoded_X_train[object_cols]),
    index=encoded_X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(encoded_X_valid[object_cols]),
    index=encoded_X_valid.index,
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(encoded_test[object_cols]),
    index=encoded_test.index,
)

# Remove categorical columns (they are replaced by the one-hot columns)
num_X_train = encoded_X_train.drop(object_cols, axis=1)
num_X_valid = encoded_X_valid.drop(object_cols, axis=1)
num_test = encoded_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
encoded_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
encoded_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
encoded_test = pd.concat([num_test, OH_cols_test], axis=1)

# The one-hot columns carry integer labels while the numeric columns carry
# string labels; cast every label to str so each frame has uniformly typed
# column names. Each frame is relabelled from its OWN columns, so a mismatch
# between splits is never silently papered over.
encoded_X_train.columns = encoded_X_train.columns.astype(str)
encoded_X_valid.columns = encoded_X_valid.columns.astype(str)
encoded_test.columns = encoded_test.columns.astype(str)

In [72]:
# Fill missing values with SimpleImputer's default strategy (the column mean).
simp_impute = SimpleImputer()

# Learn the fill values from the training split ONLY, then reuse them for the
# validation and test splits. Re-fitting on the test set would fill its gaps
# with test-set statistics, so the model would be scored on data prepared
# differently from the data it was trained on.
imputed_X_train = pd.DataFrame(
    simp_impute.fit_transform(encoded_X_train),
    columns=encoded_X_train.columns,
)
imputed_X_valid = pd.DataFrame(
    simp_impute.transform(encoded_X_valid),
    columns=encoded_X_valid.columns,
)
imputed_test = pd.DataFrame(
    simp_impute.transform(encoded_test),
    columns=encoded_test.columns,
)

# Building the Model

In [73]:
def get_mae(n_estimators, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest with ``n_estimators`` trees.

    A ``RandomForestRegressor`` (absolute-error criterion, ``random_state=0``,
    all cores) is fitted on ``train_X`` / ``train_y``; the mean absolute error
    of its predictions for ``val_X`` against ``val_y`` is returned.
    """
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion='absolute_error',
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Try forest sizes 85..89 and keep the one with the lowest validation MAE.
# On a tie the smallest size wins, because min() returns the first minimum
# it meets and the dict preserves insertion order.
scores = {
    n_trees: get_mae(n_trees, imputed_X_train, imputed_X_valid, train_y, val_y)
    for n_trees in range(85, 90)
}
best_estimator_size = min(scores, key=lambda n_trees: scores[n_trees])

print(best_estimator_size)

87


In [77]:
def get_mae(max_features, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest limited to ``max_features``.

    The forest size is read from the notebook-level ``best_estimator_size``.
    The model (absolute-error criterion, ``random_state=0``, all cores) is
    fitted on ``train_X`` / ``train_y`` and scored by mean absolute error on
    ``val_X`` / ``val_y``.
    """
    forest = RandomForestRegressor(
        n_estimators=best_estimator_size,
        max_features=max_features,
        criterion='absolute_error',
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Try max_features values 16..19 and keep the one with the lowest validation
# MAE (first minimum wins on a tie).
scores = {
    n_feats: get_mae(n_feats, imputed_X_train, imputed_X_valid, train_y, val_y)
    for n_feats in range(16, 20)
}
best_feature_size = min(scores, key=lambda n_feats: scores[n_feats])

print(best_feature_size)

16


In [78]:
def get_mae(max_depth, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest capped at ``max_depth``.

    Forest size and feature count are read from the notebook-level
    ``best_estimator_size`` and ``best_feature_size``. The model
    (absolute-error criterion, ``random_state=0``, all cores) is fitted on
    ``train_X`` / ``train_y`` and scored by mean absolute error on
    ``val_X`` / ``val_y``.
    """
    forest = RandomForestRegressor(
        n_estimators=best_estimator_size,
        max_features=best_feature_size,
        max_depth=max_depth,
        criterion='absolute_error',
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Try tree depths 10..19 and keep the one with the lowest validation MAE
# (first minimum wins on a tie).
scores = {
    depth: get_mae(depth, imputed_X_train, imputed_X_valid, train_y, val_y)
    for depth in range(10, 20)
}
best_depth_size = min(scores, key=lambda depth: scores[depth])
print(best_depth_size)

11


In [79]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest capped at ``max_leaf_nodes``.

    Forest size, feature count and depth are read from the notebook-level
    ``best_estimator_size``, ``best_feature_size`` and ``best_depth_size``.
    The model (absolute-error criterion, ``random_state=0``, all cores) is
    fitted on ``train_X`` / ``train_y`` and scored by mean absolute error on
    ``val_X`` / ``val_y``.
    """
    forest = RandomForestRegressor(
        n_estimators=best_estimator_size,
        max_features=best_feature_size,
        max_depth=best_depth_size,
        max_leaf_nodes=max_leaf_nodes,
        criterion='absolute_error',
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Try leaf-node caps 170..179 and keep the one with the lowest validation MAE
# (first minimum wins on a tie).
scores = {
    n_leaves: get_mae(n_leaves, imputed_X_train, imputed_X_valid, train_y, val_y)
    for n_leaves in range(170, 180)
}
best_tree_size = min(scores, key=lambda n_leaves: scores[n_leaves])
print(best_tree_size)

178


# Verification

In [54]:
# Refit a single forest with every tuned hyper-parameter, then score it on
# the held-out validation split.
ship_model = RandomForestRegressor(
    n_estimators=best_estimator_size,
    max_features=best_feature_size,
    max_depth=best_depth_size,
    max_leaf_nodes=best_tree_size,
    criterion='absolute_error',
    n_jobs=-1,
    random_state=0,
)
ship_model.fit(imputed_X_train, train_y)
ship_pred = ship_model.predict(imputed_X_valid)

# NOTE(review): this is 1 - MAE computed on the raw (unrounded) regression
# outputs. The submission rounds predictions to True/False, so this figure is
# not the accuracy of what gets submitted.
validation_mae = mean_absolute_error(val_y, ship_pred)
print(1 - validation_mae)

0.7562536824369722


In [47]:
# Score the test set with the fitted forest, then round every regression
# output to the nearest whole number and convert it to a plain Python bool,
# which is the form the submission frame is built from.
test_preds = ship_model.predict(imputed_test)
final_preds = np.round(test_preds).astype(bool).tolist()

# Submission

In [48]:
# Note that your test data needs to be in dataframe called test_data, or change it below
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': final_preds})

# Create a submission file name with date and time appended
submission_file_name = f'submission_{pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv'

# Save to csv file
output.to_csv(submission_file_name, index=False)