<a href="https://www.kaggle.com/code/sabinakhadysy/spaceship-titanic-competition?scriptVersionId=142822599" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Initial model using XGBRegressor

In [None]:
#Reading the competition files; PassengerId becomes the row index of both frames
train_data = pd.read_csv(
    "/kaggle/input/spaceship-titanic/train.csv",
    index_col="PassengerId",
)  #rows used to build the model
display(train_data.head())

test_data = pd.read_csv(
    "/kaggle/input/spaceship-titanic/test.csv",
    index_col="PassengerId",
)  #rows used to create the submission
display(test_data.head())

#Target column the models are fitted against
y = train_data["Transported"]

In [None]:
#Getting overall information about the training data via DataFrame.info()
train_data.info()

In [None]:
#Initial feature subset and the matching columns of the training frame
features_in = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Age',
    'VIP',
]
X_in = train_data.loc[:, features_in]
X_in.head()

In [None]:
X_in['HomePlanet'].unique() #distinct home planet values present in the data

In [None]:
X_in['Destination'].unique() #distinct destination values present in the data

In [None]:
#Holding out part of the training data as a validation set
#(test_size=0.25 is what train_test_split uses when no size is given)
train_X_in, val_X_in, train_y, val_y = train_test_split(
    X_in,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
#Counting missing values per feature in the training split
print(f'Shape of training data is {train_X_in.shape}')

missing_val_count_by_col = train_X_in.isna().sum()
print(f'Columns with sum of missing values:\n {missing_val_count_by_col}')


In [None]:
#Preprocessing for numerical data: fill missing ages with the median
numerical_col = ["Age"]
numerical_transformer = SimpleImputer(strategy = "median")

#Preprocessing for categorical data: one-hot encode; categories not seen
#during fit are ignored at transform time instead of raising an error
categorical_col = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
categorical_transformer = OneHotEncoder(handle_unknown = "ignore")

#Bundle preprocessing
preprocessor = ColumnTransformer(
    transformers =[
        ('num', numerical_transformer, numerical_col),
        ('cat', categorical_transformer, categorical_col)
    ]
)

#Building the model
initial_model = XGBRegressor()

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', initial_model)
                          ])

#Preprocessing of training data and fitting the model
pipeline.fit(train_X_in, train_y)

#Preprocessing of validation data and getting predictions
preds = pipeline.predict(val_X_in)

#scikit-learn metrics take (y_true, y_pred) in that order
print(f'MAE: {mean_absolute_error(val_y, preds)}')

#MAE on raw scores does not show how many passengers end up with the right
#label, so also report accuracy at the 0.5 cut-off used for the submission
val_accuracy = np.mean((preds >= 0.5) == val_y.to_numpy())
print(f'Accuracy: {val_accuracy}')

In [None]:
#Checking parameters of initial model
initial_model

In [None]:
#Changing parameters in XGBRegressor(): logistic objective, a fixed seed
#and 1000 boosting rounds
initial_model_p1 = XGBRegressor(objective="binary:logistic", random_state = 42, n_estimators = 1000)
#Training the model
#NOTE: `preprocessor` is the same object already used by `pipeline`; fitting
#here refits it (on the same training split)
pipeline_p1 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', initial_model_p1)
                             ])
pipeline_p1.fit(train_X_in, train_y)
preds_p1 = pipeline_p1.predict(val_X_in)

#scikit-learn metrics take (y_true, y_pred) in that order
print(f'MAE: {mean_absolute_error(val_y, preds_p1)}')

#MAE on raw scores does not show how many passengers end up with the right
#label, so also report accuracy at the 0.5 cut-off used for the submission
val_accuracy_p1 = np.mean((preds_p1 >= 0.5) == val_y.to_numpy())
print(f'Accuracy: {val_accuracy_p1}')

In [None]:
#Selecting the model's feature columns from the test dataset
test_X = test_data.loc[:, features_in]
#Predicting with the re-parameterised pipeline
test_pred = pipeline_p1.predict(test_X)

In [None]:
#Mapping numeric predictions to the strings "True" / "False" at the cut-off
threshold = 0.5
binary_labels = np.where(test_pred >= threshold, "True", "False").tolist()
print(binary_labels[0:3])

In [None]:
#Building the submission table on the test PassengerId index and saving it
output = pd.Series(binary_labels, index=test_data.index, name='Transported').to_frame()
output.to_csv('my_submission.csv')
print("Your submission was successfully saved!")

In [None]:
#Displaying the submission table
output