In [1]:
%pip install mlflow

Collecting mlflow
  Downloading mlflow-2.10.2-py3-none-any.whl (19.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=2.1.0 (from mlflow)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.0.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.6/147.6 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gunicorn<

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, average_precision_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

In [3]:
def load_split_data(train, validation, test):
    """Read the train/validation/test CSVs and pull out text and labels.

    Args:
        train: Path or buffer accepted by ``pd.read_csv`` for the training split.
        validation: Same, for the validation split.
        test: Same, for the test split.

    Returns:
        A 6-tuple ``(X_train, Y_train, X_val, Y_val, X_test, Y_test)`` where
        each ``X_*`` is the ``'text'`` column and each ``Y_*`` is the
        ``'spam'`` column of the corresponding split.
    """
    # Load all three splits up front, then peel off the feature / target
    # column of each one in train -> validation -> test order.
    frames = [pd.read_csv(source) for source in (train, validation, test)]

    selected = []
    for frame in frames:
        selected.append(frame['text'])
        selected.append(frame['spam'])

    return tuple(selected)

In [4]:
import pandas as pd
from google.colab import drive

# Keep the mount point and dataset folder in one place: every CSV path below
# is derived from DATA_DIR, so relocating the data needs a single edit.
DRIVE_MOUNT_POINT = '/content/drive'
DATA_DIR = f'{DRIVE_MOUNT_POINT}/MyDrive'

drive.mount(DRIVE_MOUNT_POINT)

# Locations of the three dataset splits consumed by load_split_data.
train = f'{DATA_DIR}/train.csv'
validation = f'{DATA_DIR}/validation.csv'
test = f'{DATA_DIR}/test.csv'

Mounted at /content/drive


In [5]:
# Read the three CSV splits, then unpack the text / spam-label column pairs.
splits = load_split_data(train, validation, test)
X_train, y_train, X_val, y_val, X_test, y_test = splits

In [10]:
def train_and_log_model(model_name, X_train, Y_train, X_val, Y_val, params=None, tags=None):
    """Train a TF-IDF text classifier, log it to MLflow and register it.

    The pipeline is fitted on the training split and scored on the validation
    split (accuracy and average precision). Parameters, metrics and the fitted
    pipeline are logged under a new MLflow run named ``model_name``; the
    logged model is then registered as a new version of the registered model
    ``model_name`` and tagged.

    Args:
        model_name: ``'random_forest'``, ``'logistic_regression'`` or ``'SVM'``.
        X_train: Training texts.
        Y_train: Training labels.
        X_val: Validation texts.
        Y_val: Validation labels.
        params: Optional keyword arguments for the classifier constructor.
            They are applied to whichever classifier is selected and logged
            as run parameters. The caller's dict is not modified.
        tags: Optional model-version tags. A ``'Created by'`` tag is always
            set and overrides any value supplied under that key. The caller's
            dict is not modified.

    Raises:
        ValueError: If ``model_name`` is not recognized. Raised before any
            MLflow run is started, so no empty run is left behind.
        mlflow.exceptions.MlflowException: If creating the registered model
            fails for any reason other than the name already existing.
    """
    # Work on private copies: the previous mutable `{}` defaults were shared
    # between calls and `tags` was mutated in place.
    classifier_params = dict(params) if params else {}
    version_tags = dict(tags) if tags else {}

    # One lazily-evaluated builder per supported model. `params` is forwarded
    # to every classifier so the logged parameters always describe the model
    # that was actually trained; explicit params override the defaults.
    pipeline_builders = {
        'random_forest': lambda: Pipeline([
            ('tfidf', TfidfVectorizer()),  # Text processing using TF-IDF
            ('classifier', RandomForestClassifier(
                **{'random_state': 42, **classifier_params})),
        ]),
        'logistic_regression': lambda: Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', LogisticRegression(
                **{'random_state': 42, **classifier_params})),
        ]),
        'SVM': lambda: Pipeline([
            ('tfidf', TfidfVectorizer()),  # Text processing using TF-IDF
            # probability=True is required for predict_proba below.
            ('classifier', SVC(
                **{'probability': True, 'random_state': 42, **classifier_params})),
        ]),
    }
    if model_name not in pipeline_builders:
        raise ValueError("Model name not recognized.")

    with mlflow.start_run(run_name=model_name) as run:
        model = pipeline_builders[model_name]()
        model.fit(X_train, Y_train)

        # Evaluate the model on the validation split.
        y_pred_val = model.predict(X_val)
        accuracy = accuracy_score(Y_val, y_pred_val)
        aucpr = average_precision_score(Y_val, model.predict_proba(X_val)[:, 1])

        # Log parameters, metrics, and the fitted pipeline (once).
        artifact_path = f"model_{model_name}"
        mlflow.log_params(classifier_params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("aucpr", aucpr)
        mlflow.sklearn.log_model(model, artifact_path)

        run_id = run.info.run_id

        # Register the logged model. Only an "already exists" error is
        # expected on re-runs; anything else is a real registry failure and
        # must not be reported as "already exists".
        client = MlflowClient()
        try:
            client.create_registered_model(model_name)
        except mlflow.exceptions.MlflowException as exc:
            if exc.error_code != "RESOURCE_ALREADY_EXISTS":
                raise
            print(f"Model {model_name} already exists in the registry.")

        model_uri = f"runs:/{run_id}/{artifact_path}"
        model_version_info = client.create_model_version(model_name, model_uri, run_id)

        version_tags['Created by'] = 'Hardik'  # Replace with your identifier
        for tag_key, tag_value in version_tags.items():
            client.set_model_version_tag(
                model_name,
                model_version_info.version,
                tag_key,
                tag_value
            )

        print(f"Model {model_name}, version {model_version_info.version} registered in the MLflow Model Registry with tags {version_tags}.")
        print(f"Model: {model_name}, Accuracy: {accuracy}, AUCPR: {aucpr}")

In [11]:
# Make this the active experiment for the runs started in the cells below.
mlflow.set_experiment(experiment_name="Email Spam Classification")

<Experiment: artifact_location='file:///content/mlruns/682272734189711862', creation_time=1708510037702, experiment_id='682272734189711862', last_update_time=1708510037702, lifecycle_stage='active', name='Email Spam Classification', tags={}>

In [12]:
# Train, evaluate, log and register each candidate model in turn.
# The label variables are the lowercase `y_train` / `y_val` produced when the
# splits were loaded; the uppercase spellings are never defined in this
# notebook and raise NameError on a fresh kernel.
model_names = ['random_forest', 'logistic_regression', 'SVM']
for model_name in model_names:
    train_and_log_model(model_name, X_train, y_train, X_val, y_val)

Model random_forest already exists in the registry.
Model random_forest, version 2 registered in the MLflow Model Registry with tags {'Created by': 'Hardik'}.
Model: random_forest, Accuracy: 0.9590723055934516, AUCPR: 0.9957490941732193
Model logistic_regression already exists in the registry.
Model logistic_regression, version 2 registered in the MLflow Model Registry with tags {'Created by': 'Hardik'}.
Model: logistic_regression, Accuracy: 0.9809004092769441, AUCPR: 0.9976031166171672
Model SVM already exists in the registry.
Model SVM, version 2 registered in the MLflow Model Registry with tags {'Created by': 'Hardik'}.
Model: SVM, Accuracy: 0.990450204638472, AUCPR: 0.9993555133291768


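Of the three models, the SVM achieves the highest validation accuracy (≈0.990) and AUCPR (≈0.999), ahead of logistic regression (≈0.981 / ≈0.998) and the random forest (≈0.959 / ≈0.996).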