# Spaceship Titanic - Notebook

<!-- TODO -->


Import the necessary libraries. We will use:

- `pandas` to load the data and manipulate it.
- `scikit-learn` to build the model.
<!-- TODO - `matplotlib` and `seaborn` to plot the data. -->


In [97]:
import os
import json
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    SelectFromModel,
    SelectKBest,
    f_classif,
    mutual_info_classif,
    RFE,
)
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    RobustScaler,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import check_is_fitted
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from featuretools import EntitySet, dfs

import optuna
from optuna.samplers import TPESampler, GridSampler

import signal

In [98]:
# Define constants
# Paths are built from the working directory of the kernel at the time this cell
# runs, so the notebook expects a `data/` folder next to wherever it is launched.
CURRENT_DIR = os.getcwd()

DATA_DIR = f"{CURRENT_DIR}/data"
TRAIN_DATA_FILE = f"{DATA_DIR}/train.csv"
TEST_DATA_FILE = f"{DATA_DIR}/test.csv"

# Label column to predict, and the passenger identifier column (used as the index
# in clean_data and split into group ids in create_features).
TARGET_COLUMN = "Transported"
ID_COLUMN = "PassengerId"

# Seed passed as `random_state` to the stochastic estimators defined below.
RANDOM_SEED = 42
# NOTE(review): not referenced by any cell visible here; presumably the fraction
# reserved for a validation split — confirm or remove.
VALIDATION_SIZE = 0.2

# Placeholder category used by the constant-fill imputer for categorical columns.
MISSING_VALUE = "Missing"

In [99]:
# Read the train and test CSV files (in that order) into pandas dataframes
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_FILE, TEST_DATA_FILE)
)

## Data Exploration


In [100]:
# Peek at the first rows of the training frame; display() renders it as a rich table
print("First few rows of data:")
display(train_data.head())

First few rows of data:


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [101]:
# Inspect the inferred dtypes. CryoSleep and VIP load as object rather than bool
# (presumably because they contain missing values — confirm), which is why
# clean_data converts them explicitly.
print("Data columns and types:")
print(train_data.dtypes)

Data columns and types:
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object


In [102]:
# Partition the raw columns by dtype: numeric vs. object (treated as categorical)
NUMERICAL_COLUMNS = list(train_data.select_dtypes(include=[np.number]).columns)
CATEGORICAL_COLUMNS = list(train_data.select_dtypes(include=["object"]).columns)


# Sanity check: apart from the target, every column must fall into one of the two lists
_accounted_for = set(NUMERICAL_COLUMNS) | set(CATEGORICAL_COLUMNS) | {TARGET_COLUMN}
leftover_columns = [col for col in train_data.columns if col not in _accounted_for]
assert not leftover_columns

## Clean Dataset

The train and test datasets must be cleaned in exactly the same way, so the cleaning logic lives in a single function that is applied to both.


In [103]:
def clean_data(data: pd.DataFrame):
    """Return a cleaned copy of ``data``; the input frame is left untouched.

    Steps, in order:

    1. Flag-like and id-like columns that are present are converted to the
       nullable ``Int64`` dtype. Values that cannot be parsed as numbers become
       missing (``errors="coerce"``).
    2. ``ID_COLUMN`` becomes the index.
    3. Columns that are not used as model inputs are dropped when present.

    Parameters
    ----------
    data : pd.DataFrame
        Frame that still carries ``ID_COLUMN`` as a regular column.

    Returns
    -------
    pd.DataFrame
        The cleaned frame, indexed by ``ID_COLUMN``.
    """
    nullable_int_columns = (
        "CabinNumber",
        "CryoSleep",
        "VIP",
        "Transported",
        "PassengerGroupId",
        "PassengerIntraGroupId",
    )
    discarded_columns = (
        "Name",
        "Cabin",
        "PassengerGroupId",
        "PassengerIntraGroupId",
    )

    cleaned = data.copy()

    # Convert columns to integer (with missing values)
    for name in nullable_int_columns:
        if name not in cleaned.columns:
            continue
        as_number = pd.to_numeric(cleaned[name], errors="coerce")
        cleaned[name] = as_number.astype("Int64")

    # Make PassengerId the index
    cleaned = cleaned.set_index(ID_COLUMN)

    # Drop whichever of the discarded columns actually exist
    to_drop = [name for name in discarded_columns if name in cleaned.columns]
    return cleaned.drop(columns=to_drop)

In [104]:
# train_data = clean_data(train_data)
# test_data = clean_data(test_data)

## Create Features


In [105]:
# Engineered columns that create_features appends by default
CREATED_FEATURES = [
    "AmountSpentTotal",
    "CabinDeck",
    "CabinNumber",
    "CabinSide",
    "CabinMates",
    "PassengerGroupSize",
]


def create_features(
    data: pd.DataFrame,
    **kwargs,
):
    """Derive engineered features and append the selected ones to ``data``.

    Features that can be produced:

    - ``AmountSpentTotal``: total money spent in the ship's services
      (missing amounts are skipped in the sum).
    - ``CabinDeck`` / ``CabinNumber`` / ``CabinSide``: the three ``/``-separated
      parts of ``Cabin``.
    - ``CabinMates``: number of rows of ``data`` that share the same ``Cabin``.
    - ``PassengerGroupId`` / ``PassengerIntraGroupId``: the two ``_``-separated
      parts of ``ID_COLUMN`` (not returned by default).
    - ``PassengerGroupSize``: number of rows of ``data`` in the same group.

    The count features are computed over the rows of ``data`` only, so their
    values depend on which subset of passengers is passed in.

    Parameters
    ----------
    data : pd.DataFrame
        Raw frame containing ``ID_COLUMN``, ``Cabin`` and the five spending columns.
    **kwargs
        ``selected_features`` (optional list of str): names of the engineered
        columns to append. Defaults to ``CREATED_FEATURES``. Other keyword
        arguments are ignored.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns followed by the selected
        engineered columns. ``data`` itself is not modified.
    """
    # Get from kwargs the features to return (previously the kwargs were ignored)
    selected_features = kwargs.get("selected_features")
    if selected_features is None:
        selected_features = CREATED_FEATURES

    # All derived columns are written to this scratch copy, so `data` stays intact
    new_data = data.copy()

    # Create new feature: Total money spent in the ship's service.
    # (A per-row mean of the same columns was considered and left out; the
    # original TODO judged it redundant with the total.)
    spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    new_data["AmountSpentTotal"] = data[spending_columns].sum(axis=1, skipna=True)

    # Create new features: Convert Cabin to three different columns (Deck, Number, Side)
    new_data[["CabinDeck", "CabinNumber", "CabinSide"]] = data["Cabin"].str.split(
        "/", expand=True
    )

    # Create new feature: Number of people in the same cabin
    new_data["CabinMates"] = data.groupby("Cabin")["Cabin"].transform("count")

    # Create new features: Group Id, Group Size, Intra Group Id
    new_data[["PassengerGroupId", "PassengerIntraGroupId"]] = data[ID_COLUMN].str.split(
        "_", expand=True
    )
    new_data["PassengerGroupSize"] = new_data.groupby("PassengerGroupId")[
        "PassengerGroupId"
    ].transform("count")

    # Only return the old features and the selected new ones
    return pd.concat([data, new_data[selected_features]], axis=1)

## Data Preprocessing Pipeline

### Handle Missing Values

- Impute data
- Mark as "Missing"

### Data Preprocessing

- Make Categorical Columns Numerical
  - One-Hot encoding
  - Ordinal encoding
- Scale Numerical Columns


In [106]:
# Object columns with more than this many distinct values are routed to the
# high-cardinality branch of the preprocessor; the rest go to the low-cardinality branch.
MAX_CARDINALITY = 4


def select_high_cardinality_categorical_features(df: pd.DataFrame):
    """List the object-dtype columns of ``df`` with more than ``MAX_CARDINALITY`` distinct values.

    Missing values are not counted as a distinct value.
    """
    distinct_counts = df.select_dtypes(include=["object"]).nunique()
    return [
        column
        for column, n_distinct in distinct_counts.items()
        if n_distinct > MAX_CARDINALITY
    ]


def select_low_cardinality_categorical_features(df: pd.DataFrame):
    """List the object-dtype columns of ``df`` with at most ``MAX_CARDINALITY`` distinct values.

    Missing values are not counted as a distinct value.
    """
    distinct_counts = df.select_dtypes(include=["object"]).nunique()
    return [
        column
        for column, n_distinct in distinct_counts.items()
        if n_distinct <= MAX_CARDINALITY
    ]


def select_numerical_features(df: pd.DataFrame):
    """List the names of the numeric-dtype columns of ``df``, in frame order."""
    numeric_part = df.select_dtypes(include=[np.number])
    return list(numeric_part.columns)


# Combine handling missing values and preprocessing into a single ColumnTransformer.
# Each branch gets its columns from a selector callable, which is evaluated on the
# frame passed to `fit`; columns matched by no selector are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat_low_cardinality",
            Pipeline(
                steps=[
                    # Missing values become their own "Missing" category
                    # (alternative tried: SimpleImputer(strategy="most_frequent")).
                    (
                        "impute",
                        SimpleImputer(strategy="constant", fill_value=MISSING_VALUE),
                    ),
                    # handle_unknown="ignore": a category that was absent at fit time
                    # (e.g. "Missing" when the fitting data had no gaps in a column)
                    # is encoded as all zeros instead of raising at predict time.
                    # (Alternative tried: OrdinalEncoder().)
                    ("to_num", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            select_low_cardinality_categorical_features,
        ),
        (
            "cat_high_cardinality",
            Pipeline(
                steps=[
                    # Missing values take the most frequent category (alternative
                    # tried: SimpleImputer(strategy="constant", fill_value=MISSING_VALUE)).
                    (
                        "impute",
                        SimpleImputer(strategy="most_frequent"),
                    ),
                    # Categories unseen at fit time map to -1 instead of raising
                    # at predict time. (Alternative tried: OneHotEncoder().)
                    (
                        "to_num",
                        OrdinalEncoder(
                            handle_unknown="use_encoded_value", unknown_value=-1
                        ),
                    ),
                ]
            ),
            select_high_cardinality_categorical_features,
        ),
        (
            "num",
            Pipeline(
                steps=[
                    ("impute", KNNImputer(n_neighbors=5)),
                    ("scale", StandardScaler()),
                ]
            ),
            select_numerical_features,
        ),
    ],
    remainder="passthrough",
)

# preprocessor.set_output(transform="pandas")

In [107]:
def transform_data(data: pd.DataFrame, pipeline: Pipeline) -> pd.DataFrame:
    """Fit ``pipeline`` on ``data`` and return the transformed features as a labelled frame.

    The target column is split off, the pipeline is fitted and applied with
    ``fit_transform`` (so every step, including the last, must be a transformer),
    and the resulting matrix is wrapped in a DataFrame whose column names are
    recovered from the pipeline:

    - names come from the ``"preprocessor"`` step when present, otherwise from
      the input feature columns;
    - when a ``"feature_engineering"`` step is present, its
      ``"feature_selection"`` sub-step is asked which features it kept;
    - transformer prefixes (everything up to the last ``"__"``) are stripped.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the feature columns and ``TARGET_COLUMN``.
    pipeline : Pipeline
        Transformer-only pipeline; it is fitted in place.

    Returns
    -------
    pd.DataFrame
        Transformed features plus ``TARGET_COLUMN``, on a fresh default index
        (the original index of ``data`` is not carried over).
    """
    X = data.drop(columns=[TARGET_COLUMN])
    y = data[TARGET_COLUMN]

    # Fit and transform the data using the pipeline
    data_transformed = pipeline.fit_transform(X=X, y=y)

    # Extract feature names from the preprocessor step
    if "preprocessor" in pipeline.named_steps:
        feature_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
    else:
        # Only the feature columns went through the pipeline. Using `data.columns`
        # here would also count the target and mismatch the transformed matrix.
        feature_names = X.columns

    # Extract the selected feature indices from the feature engineering step
    if "feature_engineering" in pipeline.named_steps:
        feature_selector = pipeline.named_steps["feature_engineering"].named_steps[
            "feature_selection"
        ]
        if hasattr(feature_selector, "get_support"):
            # Covers SelectFromModel, RFE, SelectKBest and any other selector
            # exposing the standard mask accessor.
            support_mask = feature_selector.get_support()
        elif hasattr(feature_selector, "support_"):
            support_mask = feature_selector.support_
        else:
            # Not a selector: assume every feature is kept
            support_mask = np.ones(len(feature_names), dtype=bool)
        selected_feature_names = [
            name for name, selected in zip(feature_names, support_mask) if selected
        ]
    else:
        selected_feature_names = feature_names

    # Remove prefixes from the column names
    selected_feature_names = [name.split("__")[-1] for name in selected_feature_names]

    # Convert the transformed data back to a DataFrame
    data_transformed_df = pd.DataFrame(data_transformed, columns=selected_feature_names)

    # `.values` aligns the target by position, since the new frame has a default index
    data_transformed_df[TARGET_COLUMN] = y.values

    return data_transformed_df

## Feature Engineering


In [108]:
# Number of cross-validation folds used by the LassoCV-based feature selector
LASSO_CV = 5

In [109]:
# Feature-selection stage: keep or drop features according to the coefficients of
# a cross-validated Lasso model fitted on the preprocessed data.
feature_engineering = Pipeline(
    [
        (
            "feature_selection",
            SelectFromModel(
                estimator=LassoCV(
                    max_iter=10000,
                    cv=LASSO_CV,
                    random_state=RANDOM_SEED,
                ),
            ),
        ),
    ]
)

## Model Training and Parameter Search


In [110]:
# Separate the training frame into the label vector and the feature matrix.
# No validation hold-out is carved out here; every training row is kept.
y_train = train_data[TARGET_COLUMN]
X_train = train_data.drop(columns=TARGET_COLUMN)

In [111]:
# End-to-end model:
# feature creation -> cleaning -> preprocessing -> feature selection -> classifier
pipeline = Pipeline(
    [
        ("create_features", FunctionTransformer(func=create_features)),
        ("clean_data", FunctionTransformer(func=clean_data)),
        ("preprocessor", preprocessor),
        ("feature_engineering", feature_engineering),
        (
            "classifier",
            # Values kept from the original cell's commented-out alternatives:
            #   learning_rate=0.013207297032220256, n_estimators=424,
            #   subsample=0.8818913987485083,
            #   min_samples_leaf=44 / 45, min_samples_split=11 / 10
            GradientBoostingClassifier(
                learning_rate=0.01,
                n_estimators=500,
                subsample=0.9,
                max_depth=5,
                criterion="friedman_mse",
                random_state=RANDOM_SEED,
            ),
        ),
    ]
)

## Best Model Evaluation with Validation Set


In [112]:
# # Evaluate all estimators in grid search with validation set
# for estimator, _ in all_estimators_with_scores:
#     pipeline.set_params(**estimator)
#     pipeline.fit(X_train, y_train)
#     score = evaluate_model(pipeline, estimator, X_val, y_val)

## Final Model Training and Submission


In [113]:
# best_pipeline = map_and_set_params(pipeline, best_params)
# No parameter-search result is applied here: best_pipeline is the very same
# object as `pipeline`, not a copy.
best_pipeline = pipeline

# Fit every step of the pipeline on the full training data
best_pipeline.fit(X_train, y_train)

In [116]:
# Evaluate the final model with 5-fold cross-validation.
# cross_val_score comes from the notebook's import cell; it fits clones of the
# pipeline, so the already-fitted best_pipeline is left unchanged.
cv_scores = cross_val_score(best_pipeline, X_train, y_train, cv=5, scoring="accuracy")

# Print the cross-validation results
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean()}")
print(f"Standard deviation of CV scores: {cv_scores.std()}")

Cross-validation scores: [0.74985624 0.74813111 0.80448534 0.82911392 0.78365938]
Mean CV score: 0.7830491976196259
Standard deviation of CV scores: 0.03131416526086161


In [114]:
# Make predictions on the test data
X_test = test_data
y_pred = best_pipeline.predict(X_test)
test_data[TARGET_COLUMN] = y_pred.astype(bool)

# Make predictions on the test set with the best model
# best_model = max(best_models.items(), key=lambda x: cross_val_score(x[1], X_train, y_train, cv=5).mean())[1]
# test_predictions = best_model.predict(test_data)
# test_data[TARGET_COLUMN] = test_predictions.astype(bool)

In [115]:
# Build the submission frame: passenger id plus predicted label, nothing else
submission_columns = [ID_COLUMN, TARGET_COLUMN]
predictions_df = test_data.reset_index().loc[:, submission_columns]

# Show the predictions
print(predictions_df)

# Write the predictions to a CSV file inside the data directory
predictions_path = f"{DATA_DIR}/predictions.csv"
predictions_df.to_csv(predictions_path, index=False)

     PassengerId  Transported
0        0013_01         True
1        0018_01        False
2        0019_01         True
3        0021_01         True
4        0023_01         True
...          ...          ...
4272     9266_02         True
4273     9269_01        False
4274     9271_01         True
4275     9273_01         True
4276     9277_01         True

[4277 rows x 2 columns]
