In [None]:
# Setting up the experiment and experiment ID

In [9]:
import mlflow

# Show which tracking server / local store the runs will be written to.
print(mlflow.get_tracking_uri())

# `create_experiment` raises if an experiment called "titanic" already exists,
# which made this cell fail on every re-run. Look the experiment up first and
# only create it when it is missing, so the cell is idempotent.
existing_experiment = mlflow.get_experiment_by_name("titanic")
if existing_experiment is None:
    exp_id = mlflow.create_experiment("titanic")
else:
    exp_id = existing_experiment.experiment_id
print(exp_id)

In [None]:
# Name of the label column: it is dropped from the feature matrix and used as `y`
# when the data is split, and it is passed as `targets` to `mlflow.evaluate`.
target_col = "Survived"  # Target Feature to be modelled

In [None]:
## Reading Data

In [None]:
import mlflow
import os
import uuid
import pandas as pd
import tempfile

# Load the training data; 'train.csv' is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

# Quick sanity check: show the first five rows.
print(df.head(n=5))

In [5]:
### Select supported columns

In [None]:
import pandas as pd

# `df` is the DataFrame loaded in the "Reading Data" cell.

# Feature columns the preprocessing pipeline knows how to handle.
supported_cols = [
    "Pclass", "Cabin", "Age", "Fare", "Ticket",
    "PassengerId", "Parch", "Embarked", "Sex", "SibSp",
]

# Keep every row, restricted to the supported columns.
df_selected = df.loc[:, supported_cols]


from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a fixed set of DataFrame columns.

    Parameters
    ----------
    columns : list
        Column labels to keep; they are passed straight to ``X[columns]``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns``.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        # Explicit check rather than `assert`: assertions are removed when Python
        # runs with -O, which would silently disable the validation.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                f"ColumnSelector expects a pandas DataFrame, got {type(X).__name__}"
            )
        return X[self.columns]

# Columns fed into the model pipeline (same list as used for `df_selected`).
supported_cols = [
    "Pclass", "Cabin", "Age", "Fare", "Ticket",
    "PassengerId", "Parch", "Embarked", "Sex", "SibSp",
]
# First pipeline step: drop everything except the supported columns.
col_selector = ColumnSelector(columns=supported_cols)


In [6]:
## Preprocessors

In [None]:
# Boolean Columns: For each column, impute missing values and then convert into ones and zeros.

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder


# No imputers are configured for the boolean columns; the list is left empty.
bool_imputers = []

# Cast to object dtype, run the (empty) imputer stage with passthrough, then
# one-hot encode while dropping the first category.
bool_pipeline = Pipeline([
    ("cast_type", FunctionTransformer(lambda frame: frame.astype(object))),
    ("imputers", ColumnTransformer(bool_imputers, remainder="passthrough")),
    ("onehot", SklearnOneHotEncoder(handle_unknown="ignore", drop="first")),
])

# "Sex" is the only column routed through the boolean pipeline.
bool_transformers = [
    ("boolean", bool_pipeline, ["Sex"]),
]


## Numerical Columns: Missing values for numerical columns are imputed with mean by default.

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# A single imputer (SimpleImputer with its default settings) covers every numeric column.
num_imputers = [
    (
        "impute_mean",
        SimpleImputer(),
        ["Age", "Fare", "Parch", "PassengerId", "Pclass", "SibSp"],
    ),
]

# Coerce values to numbers (unparseable entries become NaN), impute, then standardize.
numerical_pipeline = Pipeline([
    ("converter", FunctionTransformer(lambda frame: frame.apply(pd.to_numeric, errors='coerce'))),
    ("imputers", ColumnTransformer(num_imputers)),
    ("standardizer", StandardScaler()),
])

numerical_transformers = [
    (
        "numerical",
        numerical_pipeline,
        ["Pclass", "Age", "Fare", "PassengerId", "Parch", "SibSp"],
    ),
]


## Categorical Columns: Convert each low-cardinality categorical column into multiple binary columns through one-hot encoding.
# For each input categorical column (string or numeric), the number of output columns is equal to the number of unique values in the input column.

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Define a pipeline to apply SimpleImputer and OneHotEncoder
# Fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on categories unseen during fit.
one_hot_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
])

categorical_cols = [
    "Cabin",
    "Embarked",
    "Parch",
    "Pclass",
    "SibSp",
    "Ticket",
]

# One (name, pipeline, [column]) entry per categorical column, named after the column.
categorical_one_hot_transformers = [
    (col_name, one_hot_pipeline, [col_name]) for col_name in categorical_cols
]


In [None]:
from sklearn.compose import ColumnTransformer

# Combine the boolean, numerical and one-hot transformer specs, in that order.
transformers = [
    *bool_transformers,
    *numerical_transformers,
    *categorical_one_hot_transformers,
]

# Columns not claimed by any transformer are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="passthrough",
    sparse_threshold=1,
)

In [7]:
## Train - Validate - Test

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target column is the label.
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split the remaining 80% again 80/20 into training and validation sets
# (roughly 64% / 16% of the full data).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

# help(LogisticRegression)

In [None]:
# Model Training

In [None]:
import mlflow
from mlflow.models import Model, infer_signature, ModelSignature
from mlflow.pyfunc import PyFuncModel
from mlflow import pyfunc
import sklearn
from sklearn import set_config
from sklearn.pipeline import Pipeline

from hyperopt import hp, tpe, fmin, STATUS_OK, Trials

def _evaluate_split(pyfunc_model, features, labels, metric_prefix):
  """Evaluate the model on one data split with ``mlflow.evaluate``.

  The labels are attached to the features under the ``target_col`` column name,
  and the evaluation is run as a classifier with model explainability logging
  turned off and class 1 as the positive label.

  Args:
    pyfunc_model: the PyFunc-wrapped pipeline to evaluate.
    features: feature DataFrame of the split.
    labels: target values aligned with ``features``.
    metric_prefix: prefix prepended to every metric name (e.g. ``"val_"``).

  Returns:
    The ``metrics`` mapping of the evaluation result.
  """
  eval_result = mlflow.evaluate(
      model=pyfunc_model,
      data=features.assign(**{str(target_col): labels}),
      targets=target_col,
      model_type="classifier",
      evaluator_config={"log_model_explainability": False,
                        "metric_prefix": metric_prefix, "pos_label": 1},
  )
  return eval_result.metrics


def objective(params):
  """Hyperopt objective: train one logistic-regression pipeline and score it.

  Runs inside a new MLflow run of the experiment identified by the notebook-level
  ``exp_id``. The pipeline (column selector -> preprocessor -> classifier) is fit
  on the training split and evaluated on the training, validation and test splits.

  Args:
    params: keyword arguments forwarded to ``LogisticRegression``.

  Returns:
    dict with keys ``loss`` (negative validation ROC AUC), ``status``,
    ``val_metrics``, ``test_metrics`` (metric names with their prefix removed),
    ``model`` (the fitted pipeline) and ``run`` (the MLflow run object).
  """
  # Use the experiment ID created earlier in the notebook. The original passed the
  # literal string "exp_id", which is not a real experiment ID.
  with mlflow.start_run(experiment_id=exp_id) as mlflow_run:
    sklr_classifier = LogisticRegression(**params)

    model = Pipeline([
        ("column_selector", col_selector),
        ("preprocessor", preprocessor),
        ("classifier", sklr_classifier),
    ])

    # Enable automatic logging of input samples, metrics, parameters, and models
    mlflow.sklearn.autolog(
        log_input_examples=True,
        silent=True)

    model.fit(X_train, y_train)

    # Wrap the fitted pipeline as a PyFunc model so mlflow.evaluate can consume it.
    mlflow_model = Model()
    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.sklearn")
    pyfunc_model = PyFuncModel(model_meta=mlflow_model, model_impl=model)

    # The training-split metrics are not used below; the evaluation is still run,
    # as in the original code.
    _evaluate_split(pyfunc_model, X_train, y_train, "training_")
    sklr_val_metrics = _evaluate_split(pyfunc_model, X_val, y_val, "val_")
    sklr_test_metrics = _evaluate_split(pyfunc_model, X_test, y_test, "test_")

    # Hyperopt minimises the loss, so negate ROC AUC (higher is better) to make
    # the search prefer models with a higher validation AUC.
    loss = -sklr_val_metrics["val_roc_auc"]

    # Truncate metric key names so they can be displayed together
    sklr_val_metrics = {k.replace("val_", ""): v for k, v in sklr_val_metrics.items()}
    sklr_test_metrics = {k.replace("test_", ""): v for k, v in sklr_test_metrics.items()}

    return {
      "loss": loss,
      "status": STATUS_OK,
      "val_metrics": sklr_val_metrics,
      "test_metrics": sklr_test_metrics,
      "model": model,
      "run": mlflow_run,
    }

In [8]:
# Configure HyperParameter

In [None]:
# Fixed hyperparameters handed to the objective (and from there to
# LogisticRegression); every value is a constant, so there is nothing to search yet.
space = dict(
    C=4.825020885796637,
    penalty="l2",
    random_state=719335539,
)

In [None]:
# When widening the search space and training multiple models, switch to `SparkTrials` to parallelize
# training on Spark:
# ```
# from hyperopt import SparkTrials
# trials = SparkTrials()

# ```

In [None]:
# Run the search; results are collected in `trials`.
trials = Trials()
fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1,  # Increase this when widening the hyperparameter search space.
    trials=trials,
)

# Pull the fitted pipeline and its MLflow run out of the best trial's result dict.
best_result = trials.best_trial["result"]
model, mlflow_run = best_result["model"], best_result["run"]

# Validation and test metrics side by side, one row per split.
display(
    pd.DataFrame(
        [best_result["val_metrics"], best_result["test_metrics"]],
        index=["validation", "test"],
    )
)

# Render the pipeline as a diagram when it is shown as the cell output.
set_config(display="diagram")
model

In [None]:
## Feature importance

# SHAP is a game-theoretic approach to explain machine learning models, providing a summary plot
# of the relationship between features and model output. Features are ranked in descending order of
# importance, and impact/color describe the correlation between the feature and the target variable.

In [None]:
# SHAP plots are produced when this flag is True; set it to False to skip them.
shap_enabled = True
if shap_enabled:
    # Disable MLflow autologging before running SHAP.
    mlflow.autolog(disable=True)
    mlflow.sklearn.autolog(disable=True)
    from shap import KernelExplainer, summary_plot

    # SHAP cannot explain models using data with nulls, so both the background
    # data and the rows to explain are imputed with the per-column mode.
    mode = X_train.mode().iloc[0]

    # Background sample for the explainer (at most 100 training rows).
    # Increase the sample size to reduce variance.
    train_sample = X_train.sample(
        n=min(100, len(X_train)),
        random_state=719335539,
    ).fillna(mode)

    # Rows from the validation set to explain (at most 100).
    # Increase the sample size for more thorough results.
    example = X_val.sample(
        n=min(100, len(X_val)),
        random_state=719335539,
    ).fillna(mode)

    def predict(x):
        """Rebuild a DataFrame with the training column names and predict with the pipeline."""
        return model.predict(pd.DataFrame(x, columns=X_train.columns))

    # Kernel SHAP over the sampled validation rows.
    explainer = KernelExplainer(predict, train_sample, link="identity")
    shap_values = explainer.shap_values(example, l1_reg=False, nsamples=500)
    summary_plot(shap_values, example, class_names=model.classes_)

In [11]:
# Downloading Artifacts and Plots

In [None]:
import tempfile
import os


# Destination directory for the run's artifacts.
# "location path" is a placeholder: set it to a real directory before running.
custom_location = r"location path"

# Optionally create the destination directory up front:
# os.makedirs(custom_location, exist_ok=True)

# Fetch the artifacts of the best run into the chosen directory.
eval_path = mlflow.artifacts.download_artifacts(
    run_id=mlflow_run.info.run_id,
    dst_path=custom_location,
)

# Report where the artifacts ended up.
print("Artifact downloaded to:", eval_path)



In [None]:
# Confusion Matrix

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# Confusion-matrix image inside the downloaded artifact directory.
eval_confusion_matrix_path = os.path.join(eval_path, "training_confusion_matrix.png")

# Open the image file
img = Image.open(eval_confusion_matrix_path)

# Display the image. Hide matplotlib's pixel-coordinate axes, as the ROC-curve
# cell does: the image is a saved plot, so extra ticks around it are only noise.
plt.imshow(img)
plt.axis('off')
plt.show()


In [None]:
# ROC Curve

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# ROC-curve image inside the downloaded artifact directory (`eval_path`).
eval_roc_curve_path = os.path.join(eval_path, "training_roc_curve_plot.png")
# Read the PNG and show it with matplotlib's own axes hidden.
img = mpimg.imread(eval_roc_curve_path)
plt.imshow(img)
plt.axis('off')
plt.show()
