In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

class LogisticRegressionModel:
    """Binary logistic regression trained from scratch with gradient descent.

    The class bundles the whole Titanic workflow: missing-value imputation,
    feature engineering, scikit-learn preprocessing (scaling and one-hot
    encoding), L2-regularised batch gradient descent, decision-threshold
    tuning on a hold-out split, and prediction.

    Every statistic learned from data (imputation values, fare bin edges,
    scaler/encoder state) is fitted on training data and re-used for later
    data, so training and test rows are transformed identically.
    """

    # Titles kept as their own category; every other title becomes 'Other'.
    _COMMON_TITLES = ['Mr', 'Miss', 'Mrs', 'Master']
    # Labels of the four fare quartile bins, lowest to highest.
    _FARE_LABELS = ['Low', 'Medium', 'High', 'VeryHigh']

    def __init__(self, features, iterations, learning_rate, lambda_param):
        """Store the hyperparameters and reset all fitted state.

        Args:
            features: Column names (raw or engineered) used as model inputs.
            iterations: Number of gradient-descent steps.
            learning_rate: Step size of each gradient-descent update.
            lambda_param: L2 regularisation strength (bias is not penalised).
        """
        self.features = features
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.lambda_param = lambda_param
        self.weights = None
        self.preprocessor = None
        self.threshold = 0.5
        # Fitted state re-used when transforming data with ``fit=False``.
        self._impute_stats = None
        self._fare_bin_edges = None

    def _clean_data(self, data, fit=True):
        """Fill missing values in the Age, Fare and Embarked columns.

        Cabin is deliberately left untouched: its missing values are turned
        into a dedicated 'M' category by the Deck feature.

        Args:
            data: DataFrame with Pclass, Sex, Age, Fare and Embarked columns.
            fit: When True (default), learn the fill values from ``data`` and
                remember them. When False, re-use the values learned by the
                most recent ``fit=True`` call so that unseen data is imputed
                with training statistics.

        Returns:
            A cleaned copy of ``data``; the input frame is not modified.

        Raises:
            RuntimeError: If ``fit`` is False and nothing has been fitted yet.
        """
        df = data.copy()

        if fit:
            group_medians = df.groupby(['Pclass', 'Sex'])['Age'].median()
            stats = {'age_by_group': group_medians.to_dict()}
        elif self._impute_stats is None:
            raise RuntimeError(
                "_clean_data(fit=False) requires a previous call with fit=True.")
        else:
            stats = self._impute_stats

        # Fill Age with the median of the passenger's (Pclass, Sex) group,
        # which is more accurate than a single global median.
        age_missing = df['Age'].isna()
        for (pclass, sex), median in stats['age_by_group'].items():
            if pd.isna(median):
                continue  # the group had no known ages at fit time
            rows = age_missing & (df['Pclass'] == pclass) & (df['Sex'] == sex)
            if rows.any():
                df.loc[rows, 'Age'] = median

        if fit:
            # The global Age median is taken after the group fill; it is the
            # fallback for rows whose group had no known ages.
            ports = df['Embarked'].mode()
            stats['age'] = df['Age'].median()
            stats['fare'] = df['Fare'].median()
            stats['embarked'] = ports.iloc[0] if not ports.empty else None
            self._impute_stats = stats

        # Fallback fills: global median for Age/Fare, most common port for
        # Embarked. A fill value that could not be learned is skipped.
        for column in ('Age', 'Fare', 'Embarked'):
            fill_value = stats[column.lower()]
            if pd.notna(fill_value):
                df[column] = df[column].fillna(fill_value)

        return df

    def _apply_feature_engineering(self, data, fit=True):
        """Add engineered features that improve the model's predictive power.

        Adds FamilySize, IsAlone, Title, Deck, AgeBin, FareBin and Sex_Pclass.

        Args:
            data: Cleaned DataFrame with Parch, SibSp, Name, Cabin, Age, Fare,
                Sex and Pclass columns.
            fit: When True (default), compute the fare quartile edges from
                ``data`` and remember them. When False, bin fares with the
                remembered edges so that a given fare always maps to the same
                FareBin category.

        Returns:
            A copy of ``data`` with the extra columns; the input is unchanged.

        Raises:
            RuntimeError: If ``fit`` is False and nothing has been fitted yet.
        """
        df = data.copy()
        df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
        df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

        # The title is the word directly before the first '.' in the name;
        # rare or missing titles are collapsed into 'Other'.
        titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        df['Title'] = titles.where(titles.isin(self._COMMON_TITLES), 'Other')

        # First letter of the cabin code; 'M' marks a missing cabin.
        df['Deck'] = df['Cabin'].str[0].fillna('M')

        # include_lowest keeps an age of exactly 0 inside the 'Child' bin.
        df['AgeBin'] = pd.cut(
            df['Age'], bins=[0, 18, 60, 100],
            labels=['Child', 'Adult', 'Senior'], include_lowest=True)

        if fit:
            fare_bins, edges = pd.qcut(
                df['Fare'], 4, labels=self._FARE_LABELS, retbins=True)
            # Open the outer edges so fares outside the fitted range still
            # fall into the lowest / highest bin.
            edges = np.array(edges, dtype=float)
            edges[0], edges[-1] = -np.inf, np.inf
            self._fare_bin_edges = edges
            df['FareBin'] = fare_bins
        elif self._fare_bin_edges is None:
            raise RuntimeError(
                "_apply_feature_engineering(fit=False) requires a previous "
                "call with fit=True.")
        else:
            df['FareBin'] = pd.cut(
                df['Fare'], bins=self._fare_bin_edges, labels=self._FARE_LABELS)

        df['Sex_Pclass'] = df['Sex'] + '_' + df['Pclass'].astype(str)
        return df

    def _build_preprocessor(self, X):
        """Build the (unfitted) preprocessing pipeline for the model features.

        Numeric columns are median-imputed and standardised; object/category
        columns are mode-imputed and one-hot encoded. Categories unseen at fit
        time are encoded as all zeros.

        Args:
            X: DataFrame containing at least the columns in ``self.features``.

        Returns:
            An unfitted ``ColumnTransformer``.
        """
        # Select only the features we intend to use in the model.
        X_subset = X[self.features]

        # Find numeric and categorical columns from their dtypes.
        num_features = X_subset.select_dtypes(include=np.number).columns.tolist()
        cat_features = X_subset.select_dtypes(
            include=['object', 'category']).columns.tolist()

        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        cat_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ])
        return ColumnTransformer([
            ('num', num_pipeline, num_features),
            ('cat', cat_pipeline, cat_features),
        ])

    def _add_bias(self, X):
        """Prepend a column of ones to ``X`` so weight 0 acts as the bias.

        Args:
            X: 2-D feature matrix of shape (n_samples, n_features).

        Returns:
            Array of shape (n_samples, n_features + 1).
        """
        return np.c_[np.ones((X.shape[0], 1)), X]

    @staticmethod
    def _sigmoid(z):
        """Compute the logistic sigmoid ``1 / (1 + exp(-z))`` without overflow.

        ``exp`` is only evaluated on non-positive values, so arbitrarily large
        positive or negative inputs cannot overflow.

        Args:
            z: Scalar or array of logits.

        Returns:
            Array of the same shape as ``z`` with values in [0, 1].
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always in (0, 1]
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _generate_probabilities(self, X):
        """Return the predicted probability of the positive class per row.

        Args:
            X: Preprocessed feature matrix that already includes the bias
                column.
        """
        return self._sigmoid(np.dot(X, self.weights))

    def train(self, X, y):
        """Train the model with gradient descent and tune the threshold.

        The data is split 80/20 first; the preprocessor is then fitted on the
        training part only, so no information from the validation rows leaks
        into scaling, imputation or encoding.

        Note that the decision threshold is tuned on the validation split, so
        the reported validation accuracy is an optimistic estimate.

        Args:
            X: Feature-engineered DataFrame containing ``self.features``.
            y: Binary target (Series, list or array) aligned with ``X``.

        Returns:
            Dict with 'threshold', 'train_accuracy' and 'val_accuracy'.
        """
        targets = np.asarray(y).ravel()

        # Split before fitting any transformer for a fair evaluation.
        X_train_raw, X_val_raw, y_train, y_val = train_test_split(
            X[self.features], targets, test_size=0.2, random_state=1)

        self.preprocessor = self._build_preprocessor(X_train_raw)
        X_train = self._add_bias(self.preprocessor.fit_transform(X_train_raw))
        X_val = self._add_bias(self.preprocessor.transform(X_val_raw))

        n_samples = len(y_train)
        n_weights = X_train.shape[1]
        self.weights = np.zeros(n_weights)

        # Per-weight L2 penalty factor; index 0 is the bias and is exempt.
        penalty = np.full(n_weights, self.lambda_param / n_samples)
        penalty[0] = 0.0

        for _ in range(self.iterations):
            error = self._generate_probabilities(X_train) - y_train
            gradient = np.dot(X_train.T, error) / n_samples
            gradient += penalty * self.weights
            self.weights -= self.learning_rate * gradient

        val_probs = self._generate_probabilities(X_val)
        self.threshold = self._find_best_threshold(y_val, val_probs)
        return self._evaluate(X_train, y_train, X_val, y_val)

    def _find_best_threshold(self, y_true, y_pred):
        """Find the decision threshold that maximises accuracy.

        Candidates are 0.10, 0.11, ..., 0.89. On ties the lowest threshold
        wins; 0.5 is returned if no candidate scores above zero.

        Args:
            y_true: True binary labels.
            y_pred: Predicted probabilities of the positive class.

        Returns:
            The best threshold as a float.
        """
        best_accuracy = 0
        best_threshold = 0.5
        # Integer grid divided by 100: avoids the unreliable length and the
        # rounding drift of np.arange with a fractional step.
        for threshold in np.arange(10, 90) / 100.0:
            predictions = (y_pred >= threshold).astype(int)
            accuracy = accuracy_score(y_true, predictions)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_threshold = float(threshold)
        return best_threshold

    def _evaluate(self, X_train, y_train, X_val, y_val):
        """Print and return accuracy on the training and validation sets.

        Args:
            X_train: Preprocessed training matrix including the bias column.
            y_train: Training labels.
            X_val: Preprocessed validation matrix including the bias column.
            y_val: Validation labels.

        Returns:
            Dict with 'threshold', 'train_accuracy' and 'val_accuracy'.
        """
        accuracies = {}
        for name, (X_part, y_part) in (('train', (X_train, y_train)),
                                       ('val', (X_val, y_val))):
            probabilities = self._generate_probabilities(X_part)
            preds = (probabilities >= self.threshold).astype(int)
            accuracies[name] = accuracy_score(y_part, preds)

        print("\n--- Model Evaluation ---")
        print(f"Optimal Threshold: {self.threshold:.2f}")
        print(f"Training Accuracy: {accuracies['train']:.4f}")
        print(f"Validation Accuracy: {accuracies['val']:.4f}")

        return {
            'threshold': self.threshold,
            'train_accuracy': accuracies['train'],
            'val_accuracy': accuracies['val'],
        }

    def predict(self, X):
        """Return 0/1 predictions for feature-engineered data.

        Args:
            X: DataFrame containing ``self.features``.

        Returns:
            Integer array with one prediction per row.

        Raises:
            RuntimeError: If the model has not been trained yet.
        """
        if self.preprocessor is None or self.weights is None:
            raise RuntimeError("Model is not trained; call train() before predict().")
        X = self.preprocessor.transform(X[self.features])
        X = self._add_bias(X)
        probabilities = self._generate_probabilities(X)
        return (probabilities >= self.threshold).astype(int)

    def run(self, train_path='/kaggle/input/titanic/train.csv',
            test_path='/kaggle/input/titanic/test.csv',
            submission_path='submission.csv'):
        """Run the entire pipeline from loading data to writing a submission.

        Args:
            train_path: CSV file with the labelled training data.
            test_path: CSV file with the unlabelled test data.
            submission_path: Where the PassengerId/Survived CSV is written.
        """
        # Load data
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Separate features and target
        X_train = train_df.drop('Survived', axis=1)
        y_train = train_df['Survived']
        X_test = test_df.copy()

        # Clean data: learn fill values on train, re-use them on test
        X_train = self._clean_data(X_train, fit=True)
        X_test = self._clean_data(X_test, fit=False)

        # Feature engineering: learn fare bins on train, re-use them on test
        X_train = self._apply_feature_engineering(X_train, fit=True)
        X_test = self._apply_feature_engineering(X_test, fit=False)

        # Train model
        self.train(X_train, y_train)

        # Make predictions on the test set
        predictions = self.predict(X_test)

        # Generate the submission file
        submission = pd.DataFrame(
            {'PassengerId': test_df['PassengerId'], 'Survived': predictions})
        submission.to_csv(submission_path, index=False)

# Script entry point: set the hyperparameters and run the full pipeline.
if __name__ == '__main__':
    # Raw and engineered columns fed to the model.
    features = [
        'Pclass', 'Sex', 'FamilySize', 'IsAlone',
        'Title', 'AgeBin', 'FareBin',
    ]

    # Gradient-descent settings and L2 penalty strength.
    iterations = 20000
    learning_rate = 0.05
    lambda_param = 0.5

    predictor = LogisticRegressionModel(
        features=features,
        iterations=iterations,
        learning_rate=learning_rate,
        lambda_param=lambda_param,
    )
    predictor.run()


--- Model Evaluation ---
Optimal Threshold: 0.38
Training Accuracy: 0.8132
Validation Accuracy: 0.8101
