In [1]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
import os

In [2]:
# Read my dataset "df_train"
df_train = pd.read_csv("train.csv")

# Drop irrelevant columns
df_train = df_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Handle missing values
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median()).astype(int)
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

# Convert categorical variables to numeric
df_train = pd.get_dummies(df_train, columns=['Sex','Embarked'], drop_first=True)

# Change the data type of the new dummy columns to integer
dummy_columns = [col for col in df_train.columns if 'Sex_' in col or 'Embarked_' in col]
df_train[dummy_columns] = df_train[dummy_columns].astype(int)

# Rearrange columns in the "df_train" dataset
df_train.columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'Survived']

In [3]:
# Split the dataset into features (X) and target (y)
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']

# Define scalers
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

# Define models
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# MLflow setup
mlflow.set_experiment("Titanic Experiment")
client = MlflowClient()

# Iterate over scalers and models
for scaler_name, scaler in scalers.items():
    # Scale the data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for model_name, model in models.items():
        with mlflow.start_run(run_name=f"{model_name} with {scaler_name}"):
            # Train the model
            model.fit(X_train_scaled, y_train)
            predictions = model.predict(X_test_scaled)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, predictions)

            # Log parameters and metrics
            mlflow.log_param("scaler", scaler_name)
            mlflow.log_param("model", model_name)
            mlflow.log_metric("accuracy", accuracy)

            # Log classification report as artifact
            report = classification_report(y_test, predictions, output_dict=True)
            report_path = f"classification_report_{model_name}_{scaler_name}.txt"
            with open(report_path, "w") as f:
                f.write(classification_report(y_test, predictions))
            mlflow.log_artifact(report_path)
            os.remove(report_path)

            # Log the model with input example and signature
            input_example = pd.DataFrame(X_train_scaled[:5], columns=X.columns)  # Create an input example
            signature = infer_signature(X_train_scaled, model.predict(X_train_scaled))  # Infer input-output signature
            model_info = mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path=f"{model_name}_model",
                input_example=input_example,
                signature=signature
            )

            # Register the model to the registry
            model_uri = model_info.model_uri
            registered_model_name = f"{model_name}_Model_Registry"
            try:
                client.create_registered_model(registered_model_name)  # Create registry entry (if not exists)
            except Exception as e:
                print(f"Model registry '{registered_model_name}' already exists. Continuing...")

            model_version = client.create_model_version(
                name=registered_model_name,
                source=model_uri,
                run_id=mlflow.active_run().info.run_id
            )

            # Add tags to indicate the model's lifecycle stage
            client.set_model_version_tag(
                name=registered_model_name,
                version=model_version.version,
                key="stage",
                value="Staging"  # Replace with "Production" or other stages
            )

            # Update metadata for better clarity
            client.update_model_version(
                name=registered_model_name,
                version=model_version.version,
                description=f"Model trained with {scaler_name}. Accuracy: {accuracy:.4f}"
            )

            print(f"Logged {model_name} with {scaler_name}: Accuracy={accuracy:.4f}")

2025/01/05 15:23:29 INFO mlflow.tracking.fluent: Experiment with name 'Titanic Experiment' does not exist. Creating a new experiment.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged Decision Tree with StandardScaler: Accuracy=0.8492


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged Random Forest with StandardScaler: Accuracy=0.8212


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged Gradient Boosting with StandardScaler: Accuracy=0.8268


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model registry 'Decision Tree_Model_Registry' already exists. Continuing...
Logged Decision Tree with MinMaxScaler: Accuracy=0.8547


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model registry 'Random Forest_Model_Registry' already exists. Continuing...
Logged Random Forest with MinMaxScaler: Accuracy=0.8324


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model registry 'Gradient Boosting_Model_Registry' already exists. Continuing...
Logged Gradient Boosting with MinMaxScaler: Accuracy=0.8268


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model registry 'Decision Tree_Model_Registry' already exists. Continuing...
Logged Decision Tree with RobustScaler: Accuracy=0.8603


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model registry 'Random Forest_Model_Registry' already exists. Continuing...
Logged Random Forest with RobustScaler: Accuracy=0.8212


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model registry 'Gradient Boosting_Model_Registry' already exists. Continuing...
Logged Gradient Boosting with RobustScaler: Accuracy=0.8268
