# Spaceship Titanic Model

This notebook predicts which passengers were transported to an alternate dimension, using data from the [Spaceship Titanic Kaggle competition](https://www.kaggle.com/competitions/spaceship-titanic/overview).

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Single seed reused as `random_state` for the train/validation split, the
# random forest and the randomized search, so a rerun gives the same results.
SEED = 42

In [27]:
# Read the already-preprocessed train and test tables from the local data/ folder.
train_df, test_df = (
    pd.read_csv('data/train_processed.csv'),
    pd.read_csv('data/test_processed.csv'),
)

## Training a random forest classifier on the data

In [30]:
# Model inputs are every column except the passenger identifier and the label.
FEATURES = train_df.drop(columns=['PassengerId', 'Transported'], errors='ignore').columns.tolist()
X = train_df[FEATURES]
y = train_df['Transported']

# Hold out 20% of the rows for validation; stratifying on y keeps the class
# balance the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
    stratify=y,
)

# Base estimator; the search below clones it once per sampled configuration.
rf = RandomForestClassifier(random_state=SEED)

# Discrete search space: 4 * 6 * 3 * 3 = 216 possible combinations.
# Extend the lists to widen the search.
param_dist = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [1, 2, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Number of parameter settings sampled from the space above.
# This variable is what actually drives the search: it is passed as `n_iter`
# below, so changing it here changes the number of candidates evaluated.
n_iter_search = 50
rs = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=5,                     # 5-fold cross-validation per candidate
    scoring='accuracy',
    n_jobs=2,                 # use fewer workers
    pre_dispatch='2*n_jobs',  # don't dispatch more tasks than you need
    random_state=SEED,        # makes the sampled candidates reproducible
    verbose=2,
)

# Run the randomized search on the training split only; the validation split
# stays untouched for the check further down.
rs.fit(X_train, y_train)

# Report the winning configuration and its cross-validated accuracy.
print("Best parameters found:", rs.best_params_, sep="\n")
print(f"CV accuracy of best estimator: {rs.best_score_:.4f}\n")

# Keep a handle on the estimator the search selected.
best_rf = rs.best_estimator_

# Score the selected forest on the held-out validation split.
y_pred = best_rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

# Feature importances, labelled by column name and sorted from most to least
# important. The sort is chained rather than done in place so that
# `importances` is bound exactly once, to its final sorted value.
importances = pd.Series(best_rf.feature_importances_, index=FEATURES).sort_values(ascending=False)
print("\nTop 10 Features by Importance:\n", importances.head(10))

# Score the test table using the same feature columns the model was trained on.
X_test = test_df[FEATURES]
test_preds = best_rf.predict(X_test)

# Submission file: one row per test passenger, with its id and predicted label.
submission = test_df[['PassengerId']].assign(Transported=test_preds)
submission.to_csv('submission_rf.csv', index=False)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   2.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   2.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   2.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.2s
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.6s
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.6s
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.6s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.5s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.5s
[CV] END max_dept



[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   2.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   2.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.2s
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.6s
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.6s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   1.5s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=500; t