In [14]:
import pandas as pd
import numpy as np
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

class Config:
    """Central collection of the settings that drive the modelling pipeline."""

    # Columns (raw and engineered) that are handed to the model.
    FEATURES_TO_USE = [
        'Pclass',
        'Sex',
        'FamilySize',
        'IsAlone',
        'Embarked',
        'Title',
        'AgeBin',
        'FareBin',
        'Deck',
    ]

    # Random-forest hyperparameters from an earlier Optuna search;
    # these are used whenever tuning is switched off.
    BEST_PARAMS = dict(
        n_estimators = 312,
        max_depth = 11,
        min_samples_leaf = 2,
        min_samples_split = 5,
        max_features = 'sqrt',
    )

    # Master switch: True runs a new Optuna search, False reuses BEST_PARAMS.
    TUNE_HYPERPARAMETERS = False

    # How many Optuna trials a search runs.
    N_TRIALS_OPTUNA = 50
    
class RandomForestPredictor:
    """Implements complete pipeline for training and predicting model."""
    def __init__(self, config):
        """Initialize model."""
        self.config = config
        self.preprocessor = None
        self.model = None

    def _clean_data(self, data):
        """Fill the missing values in columns."""
        df = data.copy()
        # Fill with the median age grouped by Pclass and Sex for more accuracy.
        df['Age'] = df.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    
        # In case any group was entirely NaN, fill remaining with the global median
        if df['Age'].isnull().any():
            df['Age'] = df['Age'].fillna(df['Age'].median())
    
        # Fill missing values with the median.
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
        # Fill the missing values with the most common port.
        df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
        return df
        
    def _apply_feature_engineering(self, data):
        """Add custom engineering features to improve predictive power of model."""
        df = data.copy()
        df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
        df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
        df['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)
        
        # Simplify titles into a few main categories
        common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
        df['Title'] = df['Title'].apply(lambda x: x if x in common_titles else 'Other')

        df['Deck'] = df['Cabin'].str[0].fillna('M')
        df['AgeBin'] = pd.cut(df['Age'], bins = [0, 18, 60, 100], labels = ['Child', 'Adult', 'Senior'])
        df['FareBin'] = pd.qcut(df['Fare'], 4, labels = ['Low', 'Medium', 'High', 'VeryHigh'])
        return df

    def _build_preprocessor(self, X):
        """Create the unfitted ColumnTransformer for the configured feature columns.

        Only the columns named in ``config.FEATURES_TO_USE`` are considered.
        Their dtypes in ``X`` decide the treatment: numeric columns are
        median-imputed and standardized, object/category columns are imputed
        with the most frequent value and one-hot encoded (categories unseen
        during fitting are ignored at transform time).
        """
        selected = X[self.config.FEATURES_TO_USE]

        # Split the selected columns by dtype.
        categorical_cols = selected.select_dtypes(include = ['object', 'category']).columns.tolist()
        numeric_cols = selected.select_dtypes(include = np.number).columns.tolist()

        numeric_steps = [
            ('imputer', SimpleImputer(strategy = 'median')),
            ('scaler', StandardScaler()),
        ]
        categorical_steps = [
            ('imputer', SimpleImputer(strategy = 'most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)),
        ]

        return ColumnTransformer([
            ('num', Pipeline(numeric_steps), numeric_cols),
            ('cat', Pipeline(categorical_steps), categorical_cols),
        ])

    def _tune_with_optuna(self, X, y):
        """Search random-forest hyperparameters with Optuna and return the best set.

        Each trial builds a pipeline from ``self.preprocessor`` and a
        RandomForestClassifier and scores it by mean accuracy over a 5-fold
        stratified cross-validation of ``X`` and ``y``. The study runs
        ``config.N_TRIALS_OPTUNA`` trials and maximizes that score.

        Args:
            X: Feature DataFrame the preprocessor can consume.
            y: Target labels aligned with ``X``.

        Returns:
            dict mapping hyperparameter name to the best value found, with the
            keys ``n_estimators``, ``max_depth``, ``min_samples_leaf``,
            ``min_samples_split`` and ``max_features``.
        """
        # The splitter does not depend on the trial, so create it once.
        # With an integer random_state every trial sees identical folds.
        strfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 1)

        def objective(trial):
            """Sample one hyperparameter set and return its mean CV accuracy."""
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'max_depth': trial.suggest_int('max_depth', 5, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 15),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            }
            model = RandomForestClassifier(**params, random_state = 1, n_jobs = -1)
            pipeline = Pipeline(steps = [
                ('preprocessor', self.preprocessor),
                ('classifier', model)
            ])
            return cross_val_score(pipeline, X, y, cv = strfold, scoring = 'accuracy').mean()

        # Seed the sampler like every other random component (seed 1) so that
        # repeated searches propose the same sequence of trials.
        study = optuna.create_study(
            direction = 'maximize',
            sampler = optuna.samplers.TPESampler(seed = 1)
        )
        study.optimize(objective, n_trials = self.config.N_TRIALS_OPTUNA, show_progress_bar = True)

        return study.best_params

    def train(self, X, y):
        """Fit the preprocessing + random-forest pipeline and print an accuracy report.

        The hyperparameters come from ``config.BEST_PARAMS`` unless
        ``config.TUNE_HYPERPARAMETERS`` is set, in which case a new Optuna
        search is run first. The fitted pipeline is stored in ``self.model``
        and the preprocessor in ``self.preprocessor``.

        Args:
            X: Cleaned and feature-engineered training DataFrame.
            y: Target labels aligned with ``X``.
        """
        cfg = self.config
        self.preprocessor = self._build_preprocessor(X)

        # Either search for hyperparameters now or reuse the stored ones.
        if cfg.TUNE_HYPERPARAMETERS:
            chosen_params = self._tune_with_optuna(X, y)
        else:
            chosen_params = cfg.BEST_PARAMS

        forest = RandomForestClassifier(**chosen_params, random_state = 1, n_jobs = -1)
        self.model = Pipeline(steps = [
            ('preprocessor', self.preprocessor),
            ('classifier', forest)
        ])

        # The final model is fitted on every available row.
        self.model.fit(X, y)

        # Print the accuracy report for the chosen hyperparameters.
        self._evaluate(X, y, chosen_params)
        
    def _evaluate(self, X, y, best_params):
        """Print train and validation accuracy measured on an 80/20 stratified split.

        A separate pipeline (new preprocessor, new random forest with
        ``best_params``) is fitted on the 80 % part only, so ``self.model`` is
        not touched. Nothing is returned; the figures are printed.

        Args:
            X: Feature DataFrame.
            y: Target labels aligned with ``X``; also used for stratification.
            best_params: Keyword arguments for RandomForestClassifier.
        """
        X_fit, X_holdout, y_fit, y_holdout = train_test_split(
            X, y, test_size = 0.2, random_state = 1, stratify = y
        )

        # Throw-away pipeline fitted only on the 80 % part.
        probe = Pipeline(steps = [
            ('preprocessor', self._build_preprocessor(X_fit)),
            ('classifier', RandomForestClassifier(**best_params, random_state = 1, n_jobs = -1))
        ])
        probe.fit(X_fit, y_fit)

        train_accuracy = accuracy_score(y_fit, probe.predict(X_fit))
        val_accuracy = accuracy_score(y_holdout, probe.predict(X_holdout))

        report_lines = [
            "\n--- Model Performance Estimate ---",
            f"Hyperparameters Used: {best_params}",
            f"Training Set Accuracy: {train_accuracy:.4f}",
            f"Validation Set Accuracy: {val_accuracy:.4f}",
        ]
        for line in report_lines:
            print(line)
    
    def predict(self, X):
        """Gives final predictions for test data."""
        return self.model.predict(X)

    def run(self, train_path = '/kaggle/input/titanic/train.csv',
            test_path = '/kaggle/input/titanic/test.csv',
            submission_path = 'submission.csv'):
        """Run the whole workflow: load data, train, predict, write the submission.

        Args:
            train_path: CSV file with the training rows; must contain the
                ``Survived`` target column.
            test_path: CSV file with the rows to predict; must contain
                ``PassengerId``.
            submission_path: Where the CSV with the columns ``PassengerId``
                and ``Survived`` is written (without the index).

        The defaults are the Kaggle Titanic locations, so calling ``run()``
        without arguments behaves as before.
        """
        # Load data
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Separate features and target
        y_train = train_df['Survived']
        X_train = train_df.drop('Survived', axis = 1)
        X_test = test_df.copy()

        # NOTE(review): train and test are cleaned and engineered independently,
        # so fill values and the Fare quartile edges are derived from each
        # set's own rows - confirm this is intended.
        X_train = self._apply_feature_engineering(self._clean_data(X_train))
        X_test = self._apply_feature_engineering(self._clean_data(X_test))

        # Train model
        self.train(X_train, y_train)

        # Make predictions on the test set
        predictions = self.predict(X_test)

        # Generate the submission file
        submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
        submission.to_csv(submission_path, index = False)

# Script entry point: build a predictor from the default Config and run it end to end.
if __name__ == '__main__':
    config = Config()
    predictor = RandomForestPredictor(config)
    predictor.run()


--- Model Performance Estimate ---
Hyperparameters Used: {'n_estimators': 312, 'max_depth': 11, 'min_samples_leaf': 2, 'min_samples_split': 5, 'max_features': 'sqrt'}
Training Set Accuracy: 0.8708
Validation Set Accuracy: 0.8212
