In [1]:
# Import necessary libraries
import pandas as pd
# NEW: Import the more powerful model and cross-validation tool
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# --- 1. Load the Data ---
# Read the two Kaggle Titanic splits from the competition input directory.
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

# Target vector first, then the feature frame (everything except the label).
y_train = train_df['Survived']
X_train_full = train_df.drop(columns=['Survived'])

# Work on an independent copy of the test split so test_df itself stays pristine.
X_test_full = test_df.copy()

In [2]:
# --- 2. Leak-Proof Preprocessing Pipeline ---

# We will apply the same steps to both train and test data,
# but all calculations will be based ONLY on the training data.

# --- Step A: Fill-in values derived from the training features only ---
# Most frequent Embarked value in the training split.
embarked_mode = X_train_full.loc[:, 'Embarked'].mode()[0]
# Median Fare in the training split.
fare_median = X_train_full.loc[:, 'Fare'].median()

# --- Step B: Define a preprocessing function ---
def _encode_titles(names):
    """Turn a Series of passenger names into numeric title codes.

    The title is the word that precedes the first '.' found after a space
    (e.g. "Braund, Mr. Owen" -> "Mr"). Rare titles are collapsed into
    'Other', French/alternate spellings are folded into Miss/Mrs, and the
    result is mapped to integer codes. Titles outside the mapping (or names
    with no match) become 0.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    titles = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
         'Sir', 'Jonkheer', 'Dona'],
        'Other',
    )
    titles = titles.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
    return titles.map(title_mapping).fillna(0)


def preprocess(df, fare_median, embarked_mode, train_df=None):
    """Build the model feature frame for one data split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw split to transform. Must contain the columns this function
        reads: PassengerId, Name, Sex, Age, SibSp, Parch, Ticket, Fare,
        Cabin and Embarked. It is not modified; a copy is transformed.
    fare_median : float
        Value used to fill missing Fare entries.
    embarked_mode : str
        Value used to fill missing Embarked entries.
    train_df : pandas.DataFrame, optional
        Training frame (needs 'Name' and 'Age') from which the Age
        imputation statistics are computed. When omitted, the notebook-level
        ``X_train_full`` is used, so existing three-argument calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with Fare/Embarked/Age filled, numeric 'Title' and
        'Sex' columns, a 'FamilySize' column, one-hot 'Embarked_*' columns,
        and PassengerId/Name/Ticket/Cabin/SibSp/Parch removed.
    """
    if train_df is None:
        # Same global the notebook already relied on for the Age statistics.
        train_df = X_train_full

    # Make a copy to avoid changing the original data
    X = df.copy()

    # Fill missing values using the statistics supplied by the caller
    X['Fare'] = X['Fare'].fillna(fare_median)
    X['Embarked'] = X['Embarked'].fillna(embarked_mode)

    # Feature engineering: numeric title code derived from the name
    X['Title'] = _encode_titles(X['Name'])

    # Impute Age from per-title medians of the TRAINING frame. Previously the
    # medians were taken from the frame being processed, so the test split
    # was imputed from its own statistics. Titles are cast to float on both
    # sides so integer and float codes always match during the lookup.
    train_titles = _encode_titles(train_df['Name']).astype(float)
    title_age_median = train_df['Age'].groupby(train_titles).median()
    X['Age'] = X['Age'].fillna(X['Title'].astype(float).map(title_age_median))
    # Titles with no usable training median fall back to the overall training median
    X['Age'] = X['Age'].fillna(train_df['Age'].median())

    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1

    # Convert categoricals
    X['Sex'] = X['Sex'].map({'female': 1, 'male': 0}).astype(int)
    X = pd.get_dummies(X, columns=['Embarked'], prefix='Embarked')

    # Drop columns that are not used as model features
    cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
    X = X.drop(columns=cols_to_drop)

    return X
    
# --- Step C: Run both splits through the same preprocessing ---
X_train = preprocess(X_train_full, fare_median, embarked_mode)
X_test = preprocess(X_test_full, fare_median, embarked_mode)

# Give the test frame exactly the training frame's columns, in the same order.
# One-hot columns that never occurred in the test split are created and set to 0.
train_cols = X_train.columns
test_cols = X_test.columns
missing_in_test = set(train_cols).difference(test_cols)
X_test = X_test.reindex(columns=train_cols, fill_value=0)

In [3]:
# --- 3. Model Training and Validation ---
model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=200,
    random_state=42,
)

# 5-fold accuracy on the preprocessed training frame.
# NOTE(review): X_train was imputed from the whole training split before the
# folds are formed, so each fold's fill values also reflect its held-out rows.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print(f"HONEST cross-validation scores: {cv_scores}")
print(f"HONEST average cross-validation score: {cv_scores.mean():.4f}")

# --- 4. Final Training and Submission ---
# Refit on every training row, then score the aligned test frame.
predictions = model.fit(X_train, y_train).predict(X_test)

# Two-column submission: the original passenger ids plus the integer prediction.
submission_df = test_df[['PassengerId']].assign(Survived=predictions.astype(int))
submission_df.to_csv('submission.csv', index=False)

print("\nLeak-proof submission file created. This score should be reliable.")

HONEST cross-validation scores: [0.83240223 0.79775281 0.87078652 0.80337079 0.8258427 ]
HONEST average cross-validation score: 0.8260

Leak-proof submission file created. This score should be reliable.
