In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

In [3]:
data = pd.read_csv("train.csv")

display(data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Data preprocessing
def preprocess_data(df):
    # Make a copy to avoid modifying the original dataframe
    data = df.copy()

    # Extract titles from names as they might be predictive
    data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

    # Group rare titles
    title_mapping = {
        "Mr": "Mr",
        "Miss": "Miss",
        "Mrs": "Mrs",
        "Master": "Master",
        "Dr": "Officer",
        "Rev": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Mlle": "Miss",
        "Mme": "Mrs",
        "Don": "Royalty",
        "Lady": "Royalty",
        "Countess": "Royalty",
        "Jonkheer": "Royalty",
        "Sir": "Royalty",
        "Capt": "Officer",
        "Ms": "Mrs",
        "Dona": "Royalty"
    }
    data['Title'] = data['Title'].map(title_mapping)

    # Create family size feature
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # Create is_alone feature
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    # Fill missing ages with the median age of their respective title
    title_age_median = data.groupby('Title')['Age'].transform('median')
    data['Age'].fillna(title_age_median, inplace=True)

    # Fill missing embarked with most common value
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

    # Fill missing Fare with median fare
    data['Fare'].fillna(data['Fare'].median(), inplace=True)

    # Create age groups
    data['AgeBin'] = pd.qcut(data['Age'], 4, labels=False)

    # Create fare bins
    data['FareBin'] = pd.qcut(data['Fare'], 4, labels=False)

    # Select features to keep
    features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'AgeBin', 'FareBin']

    return data[features], data['Survived'] if 'Survived' in data.columns else None

# Build the ML pipeline
def build_model():
    # Define which columns are numerical and categorical
    numeric_features = ['Age', 'Fare', 'FamilySize']
    categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'AgeBin', 'FareBin']

    # Preprocessing for numerical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Create and return the full pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    return model

# Train the model and do hyperparameter tuning
def train_model(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build the model
    model = build_model()

    # Define hyperparameters to tune
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    }

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Best Model Parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    return best_model

# Make predictions on new data
def make_predictions(model, X_new):
    return model.predict(X_new)


In [6]:
 # Load the training data
train_data = data

# Preprocess the data
X, y = preprocess_data(train_data)

# Train the model
model = train_model(X, y)

# # Load test data
# test_data = load_data('test.csv')

# # Preprocess test data
# X_test, _ = preprocess_data(test_data)

# # Make predictions on test data
# predictions = make_predictions(model, X_test)

# # Create submission file
# submission = pd.DataFrame({
#     'PassengerId': test_data['PassengerId'],
#     'Survived': predictions
# })

# # Save to file
# submission.to_csv('submission.csv', index=False)
# print("Predictions saved to 'submission.csv'")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(title_age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whic

Best Model Parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Accuracy: 0.8380
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       105
           1       0.82      0.78      0.80        74

    accuracy                           0.84       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179

