## Pipeline model - Spaceship Titanic Kaggle Competition

### Importing Libraries

In [91]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

### Loading and Combining Data

In [92]:
# Both competition CSVs are read from the notebook's working directory.
train_path, test_path = 'train.csv', 'test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Pull the target out of the training frame: `pop` returns the column and
# removes it, leaving train_data with feature columns only.
y = train_data.pop('Transported')

# Stack train and test under an outer index level ('train' / 'test') so the
# feature engineering below is applied to both and they can be split apart
# again with `combined.loc['train']` / `combined.loc['test']`.
combined = pd.concat(
    [train_data, test_data],
    keys=['train', 'test'],
)

### Calculating Total Spending

In [93]:
# The five on-board amenity columns that hold amounts billed to a passenger.
spend_cols = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

# Row-wise total across the amenity columns. DataFrame.sum skips NaN by
# default, so a missing amount contributes nothing to the total.
combined['TotalSpending'] = combined.loc[:, spend_cols].sum(axis='columns')

### Splitting the Cabin Column into Deck and Side

In [94]:
# Split each Cabin string on '/' into separate columns; rows with a missing
# Cabin stay NaN in every resulting column.
cabin_split = combined['Cabin'].str.split('/', expand=True)

# Keep the first piece as Deck and the third piece as Side; the middle piece
# is not used as a feature.
combined = combined.assign(
    Deck=cabin_split[0],
    Side=cabin_split[2],
)

### Selecting Features and Filling Missing Values ('Unknown' for Deck/Side, 0 Elsewhere)

In [95]:
# Columns fed to the model: raw passenger attributes plus the engineered
# Deck, Side and TotalSpending features.
features = [
    'CryoSleep', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Deck', 'Side', 'TotalSpending',
]


def _fill_missing(frame):
    """Return a copy of `frame` with no missing values.

    Missing Deck/Side entries become the string 'Unknown'; every other
    remaining missing value becomes 0.
    """
    labelled = frame.fillna({'Deck': 'Unknown', 'Side': 'Unknown'})
    return labelled.fillna(0)


# Recover the train / test rows through the outer index level of `combined`
# and apply the same fill rules to both.
X_train_full = _fill_missing(combined.loc['train', features])
X_test_full = _fill_missing(combined.loc['test', features])

### Splitting the Training Data for Validation

In [96]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y,
    test_size=0.2,
    random_state=0,
)

### Defining Categorical and Numerical Columns

In [97]:
# String-valued columns that get one-hot encoded by the preprocessor.
categorical_cols = [
    'Deck',
    'Side',
]

# Columns routed through the numerical transformer; order here determines
# their order in the preprocessed feature matrix.
numerical_cols = [
    'CryoSleep',
    'VIP',
    'TotalSpending',
    'Age',
    'Spa',
    'VRDeck',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
]

### Creating a Pipeline model

In [98]:
# Numerical branch: replace any missing value with the constant 0.
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)

# Categorical branch: fill gaps with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored instead of
# raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch. The feature frames are already
# NaN-free when they are built, so both imputers act only as a safety net.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

### Training the XGBoost Model Pipeline

In [99]:
# Gradient-boosted classifier; the seed is fixed so repeated runs match.
xgb_model = XGBClassifier(
    n_estimators=1310, learning_rate=0.05, n_jobs=4, random_state=0
)

# Preprocessing and model bundled together, so the exact same transforms are
# applied at fit time and at predict time.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb_model),
])

# Fit on the training split (fit returns the pipeline itself), then score
# the predictions on the held-out validation split.
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)
score = accuracy_score(y_valid, preds)

print('Validation Accuracy:', score)

Validation Accuracy: 0.7935595169637722


### Training on the Full Data and Writing the Submission CSV File

In [100]:
# Refit the same pipeline on every labelled row (this replaces the fit from
# the validation step), then predict the unlabelled test rows.
test_preds = my_pipeline.fit(X_train_full, y).predict(X_test_full)

# Cast the predictions to booleans for the Transported column.
test_preds_bool = test_preds.astype(bool)

# Two-column submission frame: PassengerId from the raw test file, paired
# positionally with the predictions.
output = test_data[['PassengerId']].assign(Transported=test_preds_bool)

# Timestamped file name so successive runs do not overwrite earlier submissions.
submission_file_name = f'submission_{pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv'
output.to_csv(submission_file_name, index=False)