# Imports

In [315]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

# Getting the Data

In [316]:
# Load the labelled training set and the unlabelled test set from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [317]:
# Preview the first ten rows of the training data.
train_data.head(n=10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


# Preprocessing

First, we choose which features we want to feed into our pipelines.

In [318]:
# Raw columns used as model inputs.
features = [
    "RoomService",
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Cabin',
    'CryoSleep',
    'VIP',
]

# Target column and the matching feature frames for the train and test files.
y = train_data['Transported']
X = train_data[features]
test_X = test_data[features]

# Hold out part of the labelled data for validation; the fixed seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

In [325]:
def split_cabin(frame):
    """Return a copy of ``frame`` with ``Cabin`` split into deck and side.

    The first character of each ``Cabin`` string (e.g. ``'B'`` in ``'B/0/P'``)
    replaces the ``Cabin`` column, and the last character (``'P'``) is stored
    in a new ``Side`` column. Missing cabins stay missing in both columns.
    The input frame is not modified.
    """
    result = frame.copy()
    cabin = result['Cabin']
    result['Side'] = cabin.str[-1]
    result['Cabin'] = cabin.str[0]
    return result


# Apply the same Cabin split to every split of the data.
new_X_train = split_cabin(train_X)
new_X_valid = split_cabin(val_X)
new_test = split_cabin(test_X)

# Select numerical columns by kind rather than by an exact dtype whitelist, so a
# column is never silently dropped because its dtype is e.g. int32 or float32.
numerical_cols = new_X_train.select_dtypes(include='number').columns.tolist()

# Everything that is not numeric (strings, booleans with missing values, ...) is
# treated as categorical; original column order is preserved.
categorical_cols = [cname for cname in new_X_train.columns if cname not in numerical_cols]

# Reorder every frame the same way: categorical columns first, then numerical.
my_cols = categorical_cols + numerical_cols
new_X_train = new_X_train[my_cols].copy()
new_X_valid = new_X_valid[my_cols].copy()
new_test = new_test[my_cols].copy()

new_test

Unnamed: 0,Cabin,CryoSleep,VIP,Side,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,G,True,False,S,0.0,0.0,0.0,0.0,0.0
1,F,False,False,S,0.0,9.0,0.0,2823.0,0.0
2,C,True,False,S,0.0,0.0,0.0,0.0,0.0
3,C,False,False,S,0.0,6652.0,0.0,181.0,585.0
4,F,False,False,S,10.0,0.0,635.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
4272,G,True,False,S,0.0,0.0,0.0,0.0,0.0
4273,,False,False,,0.0,847.0,17.0,10.0,144.0
4274,D,True,False,P,0.0,0.0,0.0,0.0,0.0
4275,D,False,False,P,0.0,2680.0,0.0,0.0,523.0


Now we set up our pipelines to do the work for us and prevent data leakage.

In [323]:
# Numerical columns: fill missing values with a constant. No fill_value is
# given, so scikit-learn's default constant is used.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fitting are ignored at
# transform time instead of raising.
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', categorical_encoder),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

Now we define our model and the full processing pipeline. After that, we build a grid search, which tries every combination from a set of parameters we define in order to find the best estimator. Once the search is done, we take the best-scoring estimator and use it as our actual full model.

In [329]:
# Gradient-boosted regressor; the hyperparameters searched below override its defaults.
model = XGBRegressor()

# Chain preprocessing and the model so both are fitted together on each training fold.
ship = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Candidate hyperparameters. The `model__` prefix targets the pipeline step named 'model'.
boost_grid = {
    'model__n_estimators': [710, 725, 750, 775, 800],
    'model__max_depth': [9, 10, 11, 12],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.15],
    'model__verbosity': [0],
}

def mae_score(y, y_pred):
    """Return one minus the mean absolute error between ``y`` and ``y_pred``.

    Subtracting the error from 1 turns it into a score where larger values
    mean better predictions.
    """
    error = mean_absolute_error(y, y_pred)
    return 1 - error

# Wrap the custom metric so GridSearchCV can use it for model selection.
mae = make_scorer(mae_score) #https://stackoverflow.com/questions/47393060/how-can-i-define-my-own-scoring-strategy-sklearn-model-selection-gridsearchcv

# Search over boost_grid with 5-fold cross-validation, using all available cores.
grid = GridSearchCV(
    estimator=ship,
    param_grid=boost_grid,
    scoring=mae,
    cv=5,
    n_jobs=-1,
) #https://dev.to/anurag629/gridsearchcv-in-scikit-learn-a-comprehensive-guide-2a72

# Preprocess the training data and fit one pipeline per parameter combination and fold.
grid.fit(new_X_train, train_y)

# Report the winning parameters and their cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep='\n')

{'model__learning_rate': 0.01, 'model__max_depth': 11, 'model__n_estimators': 725, 'model__verbosity': 0}
0.7254368960857391


We can use validation data in order to get a better understanding of where our model stands when given unknown data.

In [330]:
# Predict on the held-out validation rows with the best pipeline found by the search.
ship_pred = grid.best_estimator_.predict(new_X_valid)

# Same 1 - MAE metric that was used for model selection.
print(mae_score(val_y, ship_pred))

0.7199558913707733


# Submission

Finally, we refit the best pipeline on the entire labelled dataset (training plus validation rows) and use it to predict the test data.

In [332]:
# Recombine the train and validation splits so the final model sees every labelled row.
full_data = pd.concat([new_X_train, new_X_valid])
# Despite its name, full_valid holds the targets, concatenated in the same order as full_data.
full_valid = pd.concat([train_y, val_y])

# Fit a fresh, unfitted copy of the best pipeline. Refitting grid.best_estimator_
# in place would leave it trained on the validation rows, so re-running the
# validation cell afterwards would report a leaked score.
final_model = clone(grid.best_estimator_)
final_model.fit(full_data, full_valid)

test_preds = final_model.predict(new_test)

# The regressor returns unbounded real values. Threshold at 0.5 rather than
# rounding and casting to bool: rounding maps outputs below -0.5 to -1, which is
# truthy and would be labelled True.
final_preds = (test_preds > 0.5).tolist()

In [333]:
# Pair each test passenger id with its predicted label.
# Note: the test data must be in a dataframe called test_data, or change the name below.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': final_preds,
})

# Build a file name that carries the current date and time so runs never overwrite each other.
timestamp = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
submission_file_name = f'submission_{timestamp}.csv'

# Write the submission without the index column.
output.to_csv(submission_file_name, index=False)

Best Score: 80.243%