<a href="https://www.kaggle.com/code/sabinakhadysy/spaceship-titanic-competition?scriptVersionId=143441669" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Loading data

In [None]:
# Read the competition files; PassengerId becomes the row index of each frame.
train_data = pd.read_csv(
    "/kaggle/input/spaceship-titanic/train.csv",  # data used to build the model
    index_col='PassengerId',
)
display(train_data.head())

test_data = pd.read_csv(
    "/kaggle/input/spaceship-titanic/test.csv",  # data used to create the submission
    index_col='PassengerId',
)
display(test_data.head())

# Target column that the models below learn to predict
y = train_data["Transported"]

In [None]:
# Print a concise summary of the training frame (columns, non-null counts, dtypes)
train_data.info()

# Initial model using XGBRegressor

In [None]:
# Baseline feature set: a handful of raw columns taken straight from the training frame
features_in = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Age',
    'VIP',
]
X_in = train_data.loc[:, features_in]
X_in.head()

In [None]:
# List the distinct HomePlanet values (a missing value shows up as its own entry)
X_in["HomePlanet"].unique()

In [None]:
# List the distinct Destination values (a missing value shows up as its own entry)
X_in["Destination"].unique()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# (An earlier version of this cell relied on train_test_split's default split sizes.)
train_X_in, val_X_in, train_y, val_y = train_test_split(
    X_in,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Inspect the training split: its shape, then the number of missing values per column
print(f'Shape of training data is {train_X_in.shape}')

missing_val_count_by_col = train_X_in.isna().sum()
print(f'Columns with sum of missing values:\n {missing_val_count_by_col}')


In [None]:
# Preprocessing for numerical data: fill missing values with the median learned during fit
numerical_col = ["Age"]
numerical_transformer = SimpleImputer(strategy = "median")

# Preprocessing for categorical data: one-hot encode; categories not seen
# during fit are encoded as all zeros instead of raising an error
categorical_col = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
categorical_transformer = OneHotEncoder(handle_unknown = "ignore")

# Bundle preprocessing
preprocessor = ColumnTransformer(
    transformers =[
        ('num', numerical_transformer, numerical_col),
        ('cat', categorical_transformer, categorical_col)
    ]
)

# Building the model
initial_model = XGBRegressor()

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', initial_model)
                          ])

# Preprocessing of training data and fitting the model
pipeline.fit(train_X_in, train_y)

# Preprocessing of validation data and getting predictions
preds = pipeline.predict(val_X_in)

# scikit-learn metrics take (y_true, y_pred), in that order
print(f'MAE: {mean_absolute_error(val_y, preds)}')

# The submission binarises predictions at 0.5, so also report the share of
# validation rows that fall on the correct side of that cutoff
acc_in = np.mean((preds >= 0.5) == val_y.to_numpy())
print(f'Accuracy: {acc_in:.4f}')

In [None]:
# Checking the parameters of the initial model (shown as the cell output)
initial_model

In [None]:
# Changing parameters in XGBRegressor(): logistic objective (predictions are
# probabilities between 0 and 1), fixed seed, 1000 boosting rounds
initial_model_p1 = XGBRegressor(objective="binary:logistic", random_state = 42, n_estimators = 1000)
# Training the model
# NOTE: `preprocessor` is the same object already used by `pipeline`, so this fit
# re-fits it in place (on the same training split, so `pipeline` is unaffected)
pipeline_p1 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', initial_model_p1)
                             ])
pipeline_p1.fit(train_X_in, train_y)
preds_p1 = pipeline_p1.predict(val_X_in)

# scikit-learn metrics take (y_true, y_pred), in that order
print(f'MAE p1: {mean_absolute_error(val_y, preds_p1)}')

# The submission binarises predictions at 0.5, so also report the share of
# validation rows that fall on the correct side of that cutoff
acc_p1 = np.mean((preds_p1 >= 0.5) == val_y.to_numpy())
print(f'Accuracy p1: {acc_p1:.4f}')

# Adding feature engineering

In [None]:
# Feature frame for the engineered model: every column except the target
X = train_data.drop("Transported", axis="columns")
X.head()

In [None]:
# Total amount each passenger spent across the five luxury amenities.
# `+` propagates NaN: a missing amount in any one amenity makes the whole total NaN.
X["LuxTotalPass"] = (
    X["RoomService"]
    + X["FoodCourt"]
    + X["ShoppingMall"]
    + X["Spa"]
    + X["VRDeck"]
)
X.head()

In [None]:
# Feature set for the engineered model: the baseline columns plus total luxury spending
features_FE = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Age',
    'VIP',
    'LuxTotalPass',
]
X_FE = X.loc[:, features_FE]
X_FE.head()

In [None]:
# Hold out 20% of the rows for validation, using the same sizes and seed as the baseline split
train_X_FE, val_X_FE, train_y_FE, val_y_FE = train_test_split(
    X_FE,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Inspect the training split: its shape, then the number of missing values per column
print(f'Shape of training data is {train_X_FE.shape}')

missing_val_count_by_col_FE = train_X_FE.isna().sum()
print(f'Columns with sum of missing values:\n {missing_val_count_by_col_FE}')

In [None]:
# Preprocessing for numerical data: fill missing values with the median learned during fit
numerical_col_FE = ["Age", "LuxTotalPass"]
numerical_transformer_FE = SimpleImputer(strategy = "median")

# Preprocessing for categorical data: one-hot encode; categories not seen
# during fit are encoded as all zeros instead of raising an error
categorical_col_FE = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
categorical_transformer_FE = OneHotEncoder(handle_unknown = "ignore")

# Bundle preprocessing
preprocessor_FE = ColumnTransformer(
    transformers =[
        ('num', numerical_transformer_FE, numerical_col_FE),
        ('cat', categorical_transformer_FE, categorical_col_FE)
    ]
)

# Building the model
model_FE = XGBRegressor(random_state = 42, n_estimators = 1500)

# Bundle preprocessing and modeling code in a pipeline
pipeline_FE = Pipeline(steps=[('preprocessor', preprocessor_FE),
                           ('model', model_FE)
                          ])

# Preprocessing of training data and fitting the model
pipeline_FE.fit(train_X_FE, train_y_FE)

# Preprocessing of validation data and getting predictions
preds_FE = pipeline_FE.predict(val_X_FE)

# scikit-learn metrics take (y_true, y_pred), in that order
print(f'MAE_FE: {mean_absolute_error(val_y_FE, preds_FE)}')

# The submission binarises predictions at 0.5, so also report the share of
# validation rows that fall on the correct side of that cutoff
acc_FE = np.mean((preds_FE >= 0.5) == val_y_FE.to_numpy())
print(f'Accuracy_FE: {acc_FE:.4f}')

# Generating a submission

In [None]:
# Rebuild the engineered feature on the test set; assign() returns a new frame,
# so test_data itself is left untouched.
# As with the training data, `+` propagates NaN into the total.
test_data_FE = test_data.assign(
    LuxTotalPass=(
        test_data["RoomService"]
        + test_data["FoodCourt"]
        + test_data["ShoppingMall"]
        + test_data["Spa"]
        + test_data["VRDeck"]
    )
)
display(test_data_FE.head())

# Keep only the columns the feature-engineered pipeline was fitted on
test_X_FE = test_data_FE.loc[:, features_FE]
display(test_X_FE.head())

# Making predictions with the feature-engineered pipeline
test_pred_FE = pipeline_FE.predict(test_X_FE)

In [None]:
# Convert the numeric predictions into "True"/"False" labels:
# values at or above the threshold map to "True", everything else to "False"
threshold = 0.5
binary_labels_FE = np.where(test_pred_FE >= threshold, "True", "False").tolist()
print(binary_labels_FE[0:3])

In [None]:
# Build the submission: one Transported label per row of the test index, then save it as CSV
output = pd.Series(
    binary_labels_FE,
    index=test_data.index,
    name='Transported',
).to_frame()
output.to_csv('my_submission.csv')
print("Your submission was successfully saved!")

In [None]:
# Show the submission frame as the cell output
output