In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

In [None]:
# Read the competition train / test tables and preview the training rows.
train_csv_path = "/kaggle/input/spaceship-titanic/train.csv"
test_csv_path = "/kaggle/input/spaceship-titanic/test.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

train_df.head()

In [None]:
# Column groups. `spend_cols` drives the spending features below; `cat_cols`
# and `binary_cols` are kept for reference only - the modelling cells pick
# categorical columns by dtype and one-hot encode them.
spend_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
cat_cols = ['HomePlanet', 'CabinSide', 'CabinDeck', 'Destination']
binary_cols = ['CryoSleep', 'VIP', 'NoSpending']

for frame in (train_df, test_df):

    #---- Split Cabin ----
    # Missing cabins get a placeholder so the three-way split always works
    # and the cabin number can be cast to int.
    frame['Cabin'] = frame['Cabin'].fillna('unknown/0/unknown')
    cabin_parts = frame['Cabin'].str.split('/')
    frame['CabinDeck'] = cabin_parts.str[0]
    frame['CabinNum'] = cabin_parts.str[1].astype(int)
    frame['CabinSide'] = cabin_parts.str[2]

    # The part of PassengerId before the underscore is used as the group key.
    frame['GroupId'] = frame['PassengerId'].str.split('_').str[0]

    #---- Spending ----
    # A missing amount is treated as no spending; NoSpending flags rows whose
    # five spend columns sum to exactly zero.
    frame[spend_cols] = frame[spend_cols].fillna(0)
    frame['NoSpending'] = frame[spend_cols].sum(axis=1).eq(0)

In [None]:
#---- Fill in HomePlanet / CryoSleep ----
# Gaps are filled with the most frequent value inside the passenger's group;
# a group with no known value at all falls back to the listed default.
for column, default in (('HomePlanet', 'Earth'), ('CryoSleep', False)):
    train_df[column] = train_df.groupby('GroupId')[column].transform(
        lambda grp, default=default: grp.fillna(next(iter(grp.mode()), default)))

#---- Fill in Age ----
# Group median first; groups without any known age are handled further down.
train_df['Age'] = train_df.groupby('GroupId')['Age'].transform(
    lambda ages: ages.fillna(ages.median()))

#remaining
# Anything the group pass could not resolve gets a dataset-wide value.
most_common_destination = train_df['Destination'].mode()[0]
train_df['Destination'] = train_df['Destination'].fillna(most_common_destination)
train_df['VIP'] = train_df['VIP'].fillna(False)
overall_median_age = train_df['Age'].median()
train_df['Age'] = train_df['Age'].fillna(overall_median_age)

#---- fix inconsistent rows in train ----
# Rows flagged as CryoSleep that nevertheless show spending are reset to
# CryoSleep = False.
asleep_but_spent = train_df['CryoSleep'].eq(True) & train_df['NoSpending'].eq(False)
train_df.loc[asleep_but_spent, 'CryoSleep'] = False

In [None]:
#---- Fill in HomePlanet ----
# Most frequent value within the passenger's own group, else 'Earth'.
test_df['HomePlanet'] = test_df.groupby('GroupId')['HomePlanet'].transform(
    lambda grp: grp.fillna(next(iter(grp.mode()), 'Earth')))

#---- Fill in CryoSleep ----
# Most frequent value within the passenger's own group, else False.
test_df['CryoSleep'] = test_df.groupby('GroupId')['CryoSleep'].transform(
    lambda grp: grp.fillna(next(iter(grp.mode()), False)))

#---- Fill in Age ----
# Group median where the group has at least one known age.
test_df['Age'] = test_df.groupby('GroupId')['Age'].transform(
    lambda ages: ages.fillna(ages.median()))

#Remaining
# Global fallbacks deliberately come from the training frame, so the test set
# is filled with the same values the training set would receive.
train_top_destination = train_df['Destination'].mode()[0]
train_median_age = train_df['Age'].median()
test_df['Destination'] = test_df['Destination'].fillna(train_top_destination)
test_df['VIP'] = test_df['VIP'].fillna(False)
test_df['Age'] = test_df['Age'].fillna(train_median_age)

#---- fix inconsistent rows in test ----
# Rows flagged as CryoSleep that nevertheless show spending are reset to
# CryoSleep = False.
asleep_but_spent_test = test_df['CryoSleep'].eq(True) & test_df['NoSpending'].eq(False)
test_df.loc[asleep_but_spent_test, 'CryoSleep'] = False

In [None]:
#---- Dropping ----
# Name is not used as a feature and Cabin has already been split into
# CabinDeck / CabinNum / CabinSide. errors='ignore' keeps the cell harmless
# if the columns are already gone.
for frame in (train_df, test_df):
    frame.drop(columns=['Name', 'Cabin'], errors='ignore', inplace=True)

In [None]:
#----Feature Engineering ----
# Adds spend-, group- and age-derived features to both frames.
#
# The HighSpender cut-off is learned from the training frame only and reused
# for the test frame, so the flag means the same thing in both; the
# imputation cells already reuse training statistics for test in the same way.
high_spend_threshold = train_df[spend_cols].sum(axis=1).median()

for df in [train_df, test_df]:
    df['TotalSpend'] = df[spend_cols].sum(axis=1)
    # +1 in the denominators below avoids division by zero.
    df['SpendPerAge'] = df['TotalSpend'] / (df['Age'] + 1)
    df['GroupSize'] = df.groupby('GroupId')['GroupId'].transform('count')
    df['CryoSleep_NoSpend'] = ((df['CryoSleep'] == True) & (df['NoSpending'] == True)).astype(int)
    df['TotalSpendPerGroup'] = df.groupby('GroupId')['TotalSpend'].transform('mean').fillna(0)
    df['HighSpender'] = (df['TotalSpend'] > high_spend_threshold).astype(int)
    # pd.cut builds right-closed bins, so without include_lowest=True an
    # Age of exactly 0 falls outside (0, 12] and would be relabelled as the
    # "out of range" bucket 5 instead of the youngest bucket 0.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0,12,18,30,50,80],
        labels=[0,1,2,3,4],
        include_lowest=True,
    )
    # Ages outside [0, 80] (or still missing) get their own category, 5.
    df['AgeGroup'] = df['AgeGroup'].cat.add_categories([5]).fillna(5)
    df['SpendPerGroup'] = df['TotalSpend'] / (df['GroupSize'] + 1)
    df['IsAlone'] = (df['GroupSize'] == 1).astype(int)
    df['AgeSpendRatio'] = df['Age'] / (df['TotalSpend'] + 1)

In [None]:
# Missing values left in each test column, largest count first.
(
    test_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Cardinality and distinct values of selected training columns.
inspected_cols = ['HomePlanet', 'CryoSleep', 'CabinSide', 'CabinNum', 'CabinDeck', 'Destination', 'VIP']
for col in inspected_cols:
    train_values = train_df[col]
    print(f'number of uniques in {col} is : ', train_values.nunique())
    print(f'uniques names in {col} is : ', train_values.unique())

In [None]:
# Cardinality and distinct values of selected test columns.
inspected_cols = ['HomePlanet', 'CryoSleep', 'CabinSide', 'CabinNum', 'CabinDeck', 'Destination', 'VIP']
for col in inspected_cols:
    test_values = test_df[col]
    print(f'number of uniques in {col} is : ', test_values.nunique())
    print(f'uniques names in {col} is : ', test_values.unique())

In [None]:
from sklearn.preprocessing import LabelEncoder

#---- Drop identifier columns ----
# Keep the test ids aside first: the submission file needs them after the
# identifier columns are removed from the feature frames.
passenger_ids = test_df['PassengerId'].copy()
id_columns = ['PassengerId', 'GroupId']
train_df = train_df.drop(columns=id_columns, errors='ignore')
test_df = test_df.drop(columns=id_columns, errors='ignore')

#---- split features ----
y = train_df['Transported']
X = train_df.drop(columns='Transported')

# Align the test frame to the training feature columns (same order); any
# column absent from test is created and filled with 0.
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [None]:
#---- Splitting ----
# One stratified shuffle split: 20% of the rows are held out for validation,
# stratified on y.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr_i, val_i = next(split.split(X, y))

X_train, y_train = X.iloc[tr_i], y.iloc[tr_i]
X_val, y_val = X.iloc[val_i], y.iloc[val_i]

In [None]:
# Sort the feature columns into categorical (object / category dtype) and
# numeric (everything else) so each group gets its own transformer.
categorical_cols, numeric_cols = [], []
for column_name in X_train.columns:
    column_dtype = X_train[column_name].dtype
    if column_dtype == 'object' or str(column_dtype).startswith('category'):
        categorical_cols.append(column_name)
    else:
        numeric_cols.append(column_name)

print("Num cols:", numeric_cols)
print("Cat cols:", categorical_cols)

In [None]:
# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time. Columns in
# neither list are dropped.
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_cols,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='drop',
)

In [None]:
#---- Model ----
# Soft-voting ensemble of a random forest and a gradient-boosting classifier,
# chained behind the shared preprocessor in a single pipeline.
RFC = RandomForestClassifier(
    n_estimators=400,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

GCB = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    random_state=42,
)

voting_members = [('RFC', RFC), ('GCB', GCB)]
ensemble = VotingClassifier(estimators=voting_members, voting='soft', n_jobs=-1)
model_pipeline = Pipeline(steps=[('PRE', preprocessor), ('CLF', ensemble)])

#---- Fit on train ----
model_pipeline.fit(X_train, y_train)

#---- Validate ----
# Accuracy on the held-out split, plus the share of each predicted class.
y_val_predict = model_pipeline.predict(X_val)
val_acc = accuracy_score(y_val, y_val_predict)
val_prediction_share = pd.Series(y_val_predict).value_counts(normalize=True)

print('Validation accuracy:', val_acc)
print('Prediction distribution (val):')
print(val_prediction_share)

In [None]:
#---- Test predictions and submission ----
# Predict on the test features (training column order), pair the result with
# the passenger ids saved earlier, and write the submission CSV.
test_features = test_df[X.columns]
test_predict = model_pipeline.predict(test_features)

submission = pd.DataFrame({'PassengerId': passenger_ids}).assign(
    Transported=test_predict.astype(bool)
)

submission.to_csv('submission.csv', index=False)
print("Wrote submission.csv (first rows):")
print(submission.head())
