In [1]:
#Importing the required functions
import pandas as pd
import numpy as np
import regex as re
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")
import random
# !pip3 install mlflow
import mlflow
import logging
from urllib.parse import urlparse
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)


**Importing train, test and validation data**

In [3]:
# Locations of the three pre-split CSV files, relative to the notebook.
path_train, path_val, path_test = './train.csv', './validation.csv', './test.csv'

# Read each split into its own DataFrame (train, validation, test — in that order).
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (path_train, path_val, path_test)
)


In [4]:
# Encode the string labels as integers: 'ham' -> 1, 'spam' -> 0.
label_codes = {'ham': 1, 'spam': 0}

for frame, label_col in ((train_df, 'y_train'), (val_df, 'y_val'), (test_df, 'y_test')):
    # Series.map turns every value that is missing from the mapping into NaN, so
    # running this cell a second time would wipe out the already-encoded labels.
    # Skip columns that hold nothing but the integer codes.
    if frame[label_col].isin(list(label_codes.values())).all():
        continue

    encoded = frame[label_col].map(label_codes)

    # Fail loudly here instead of letting NaN labels reach model fitting.
    if encoded.isna().any():
        unknown = sorted(set(frame.loc[encoded.isna(), label_col].astype(str)))
        raise ValueError(f"Unexpected labels in '{label_col}': {unknown}")

    frame[label_col] = encoded

**Bag of words**

In [5]:
# Learn the token vocabulary from the training messages only.
vectorizer = CountVectorizer().fit(train_df['X_train'])

# Mapping of each learned token to its column index in the count matrix.
bow_transformer = vectorizer.vocabulary_
vocabulary_size = len(bow_transformer)
print(vocabulary_size)

7330


In [6]:
# Turn the raw messages of every split into token-count matrices using the
# vocabulary already fitted on the training messages.
X_train, X_val, X_test = (
    vectorizer.transform(messages)
    for messages in (train_df['X_train'], val_df['X_val'], test_df['X_test'])
)

# One (n_messages, n_tokens) shape per split, printed on a single line.
print(*(counts.shape for counts in (X_train, X_val, X_test)))

(4512, 7330) (502, 7330) (558, 7330)


**Tf-idf transformer**

In [8]:
# Learn the inverse-document-frequency weights from the training counts only.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train)

In [9]:
# Re-weight the count matrix of every split with the fitted tf-idf transformer.
tfidf_X_train, tfidf_X_val, tfidf_X_test = (
    tfidf_transformer.transform(counts) for counts in (X_train, X_val, X_test)
)

# Shapes should match the count matrices: tf-idf only re-weights the entries.
print(*(weighted.shape for weighted in (tfidf_X_train, tfidf_X_val, tfidf_X_test)))

(4512, 7330) (502, 7330) (558, 7330)


**Multinomial Naive Bayes Model (tf-idf features)**

In [10]:
def MNB_model(alp):
    """Fit a Multinomial Naive Bayes classifier and score it on the test split.

    Trains on the notebook-level ``tfidf_X_train`` / ``train_df.y_train`` and
    evaluates on ``tfidf_X_test`` / ``test_df.y_test``.

    Parameters
    ----------
    alp : float
        Smoothing value forwarded to ``MultinomialNB(alpha=...)``.

    Returns
    -------
    list
        ``[fitted_model, accuracy, aupcr, confusion_matrix]``, where ``aupcr``
        is the area under the precision-recall curve for the class labelled 1.
    """
    spam_detection_model = MultinomialNB(alpha = alp).fit(tfidf_X_train, train_df.y_train)
    test_predictions = spam_detection_model.predict(tfidf_X_test)

    acc_sc = accuracy_score(test_df.y_test, test_predictions)

    # A precision-recall curve needs a continuous score per sample. Feeding it
    # the hard 0/1 predictions collapses the curve to a single threshold, so use
    # the predicted probability of the positive class (label 1) instead.
    positive_col = list(spam_detection_model.classes_).index(1)
    test_scores = spam_detection_model.predict_proba(tfidf_X_test)[:, positive_col]
    precision, recall, _ = precision_recall_curve(test_df.y_test, test_scores)

    # Area under the precision-recall curve.
    aupcr = auc(recall, precision)

    conf_m = confusion_matrix(test_df.y_test, test_predictions)

    return [spam_detection_model, acc_sc, aupcr, conf_m]

**Logistic Regression Model (tf-idf features)**

In [11]:
def Log_model(C):
    """Fit a Logistic Regression classifier and score it on the test split.

    Trains on the notebook-level ``tfidf_X_train`` / ``train_df.y_train`` and
    evaluates on ``tfidf_X_test`` / ``test_df.y_test``.

    Parameters
    ----------
    C : float
        Inverse regularisation strength forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    list
        ``[fitted_model, accuracy, aupcr, confusion_matrix]``, where ``aupcr``
        is the area under the precision-recall curve for the class labelled 1.
    """
    # Use the function argument; the previous code read a global loop variable
    # and therefore ignored whatever value the caller passed in.
    spam_detection_model_2 = LogisticRegression(C = C)
    spam_detection_model_2.fit(tfidf_X_train, train_df.y_train)
    test_predictions = spam_detection_model_2.predict(tfidf_X_test)
    acc_sc = accuracy_score(test_df.y_test, test_predictions)

    # A precision-recall curve needs a continuous score per sample. Feeding it
    # the hard 0/1 predictions collapses the curve to a single threshold, so use
    # the predicted probability of the positive class (label 1) instead.
    positive_col = list(spam_detection_model_2.classes_).index(1)
    test_scores = spam_detection_model_2.predict_proba(tfidf_X_test)[:, positive_col]
    precision, recall, _ = precision_recall_curve(test_df.y_test, test_scores)

    # Area under the precision-recall curve.
    aupcr = auc(recall, precision)

    conf_m = confusion_matrix(test_df.y_test, test_predictions)

    return [spam_detection_model_2, acc_sc, aupcr, conf_m]

**Support Vector Classifier**

In [12]:
def SVC_Model(C):
    """Fit a Support Vector Classifier and score it on the test split.

    Trains on the notebook-level ``tfidf_X_train`` / ``train_df.y_train`` and
    evaluates on ``tfidf_X_test`` / ``test_df.y_test``.

    Parameters
    ----------
    C : float
        Regularisation parameter forwarded to ``SVC(C=...)``.

    Returns
    -------
    list
        ``[fitted_model, accuracy, aupcr, confusion_matrix]``, where ``aupcr``
        is the area under the precision-recall curve for the class labelled 1.
    """
    # Use the function argument; the previous code read a global loop variable
    # and therefore ignored whatever value the caller passed in.
    spam_detection_model_3 = SVC(C = C)
    spam_detection_model_3.fit(tfidf_X_train, train_df.y_train)
    test_predictions = spam_detection_model_3.predict(tfidf_X_test)
    acc_sc = accuracy_score(test_df.y_test, test_predictions)

    # A precision-recall curve needs a continuous score per sample. SVC offers no
    # predict_proba unless built with probability=True, so rank the samples by
    # their signed distance to the separating boundary; for a two-class problem
    # larger values favour the second class in ``classes_`` (label 1 here).
    test_scores = spam_detection_model_3.decision_function(tfidf_X_test)
    precision, recall, _ = precision_recall_curve(test_df.y_test, test_scores)

    # Area under the precision-recall curve.
    aupcr = auc(recall, precision)

    conf_m = confusion_matrix(test_df.y_test, test_predictions)

    return [spam_detection_model_3, acc_sc, aupcr, conf_m]

Logging and tracking multiple runs for the spam detection model built using the Multinomial Naive Bayes Model

In [13]:
# Track one MLflow run per smoothing value for the Multinomial Naive Bayes model.
mlflow.sklearn.autolog()

md_name = "Multinomial Naive Bayes"

for i in np.arange(0.05, 2.25, 0.25):
    with mlflow.start_run(run_name = f"Model : {md_name}, Alpha : {i}"):

        # Log the hyperparameter being swept
        mlflow.log_param("Alpha", i)

        # Train and evaluate: [model, accuracy, AUPCR, confusion matrix]
        model_metrics = MNB_model(i)
        fitted_model, accuracy, aupcr, conf_m = model_metrics

        # Log evaluation metrics
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("AUPCR", aupcr)

        # Log the confusion matrix as a JSON artifact (nested lists)
        mlflow.log_dict(np.array(conf_m).tolist(), "confusion_matrix.json")

        # Store the fitted model once under "model" ...
        mlflow.sklearn.log_model(fitted_model, "model")

        # ... and once under "sklearn-model", registering a new version of it.
        # Earlier revisions logged "model" a second time (and, on non-file
        # tracking stores, registered a second version per run); those
        # duplicate uploads are dropped.
        mlflow.sklearn.log_model(
            sk_model = fitted_model,
            artifact_path = "sklearn-model",
            registered_model_name = md_name
        )

        


Successfully registered model 'Multinomial Naive Bayes'.
2023/03/21 16:48:21 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Multinomial Naive Bayes, version 1
Created version '1' of model 'Multinomial Naive Bayes'.
Registered model 'Multinomial Naive Bayes' already exists. Creating a new version of this model...
2023/03/21 16:48:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Multinomial Naive Bayes, version 2
Created version '2' of model 'Multinomial Naive Bayes'.
Registered model 'Multinomial Naive Bayes' already exists. Creating a new version of this model...
2023/03/21 16:48:46 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Multinomial Naive Bayes, version 3
Created version '3' of model 'Multinomial Nai

Logging and tracking multiple runs for the spam detection model built using the Logistic Regression Model

In [14]:
# Track one MLflow run per C value for the Logistic Regression model.
mlflow.sklearn.autolog()

md_name = "Logistic Regression"

for i in [0.1, 0.5, 1, 10, 20, 50, 100]:
    with mlflow.start_run(run_name = f"Model : {md_name}, C : {i}"):

        # Log the hyperparameter being swept
        mlflow.log_param("C", i)

        # Train and evaluate: [model, accuracy, AUPCR, confusion matrix]
        model_metrics = Log_model(i)
        fitted_model, accuracy, aupcr, conf_m = model_metrics

        # Log evaluation metrics
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("AUPCR", aupcr)

        # Log the confusion matrix as a JSON artifact (nested lists)
        mlflow.log_dict(np.array(conf_m).tolist(), "confusion_matrix.json")

        # Store the fitted model once under "model" ...
        mlflow.sklearn.log_model(fitted_model, "model")

        # ... and once under "sklearn-model", registering a new version of it.
        # Earlier revisions logged "model" a second time (and, on non-file
        # tracking stores, registered a second version per run); those
        # duplicate uploads are dropped.
        mlflow.sklearn.log_model(
            sk_model = fitted_model,
            artifact_path = "sklearn-model",
            registered_model_name = md_name
        )

        


Successfully registered model 'Logistic Regression'.
2023/03/21 16:52:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Logistic Regression, version 1
Created version '1' of model 'Logistic Regression'.
Registered model 'Logistic Regression' already exists. Creating a new version of this model...
2023/03/21 16:52:57 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Logistic Regression, version 2
Created version '2' of model 'Logistic Regression'.
Registered model 'Logistic Regression' already exists. Creating a new version of this model...
2023/03/21 16:53:09 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Logistic Regression, version 3
Created version '3' of model 'Logistic Regression'.
Registered model 'Logisti

Logging and tracking multiple runs for the spam detection model built using the Support Vector Classifier Model

In [15]:
# Track one MLflow run per C value for the Support Vector Classifier model.
mlflow.sklearn.autolog()

md_name = "Support Vector Classifier"

for i in [0.1, 0.5, 1, 10, 20, 50, 100]:
    with mlflow.start_run(run_name = f"Model : {md_name}, C : {i}"):

        # Log the hyperparameter being swept
        mlflow.log_param("C", i)

        # Train and evaluate: [model, accuracy, AUPCR, confusion matrix]
        model_metrics = SVC_Model(i)
        fitted_model, accuracy, aupcr, conf_m = model_metrics

        # Log evaluation metrics
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("AUPCR", aupcr)

        # Log the confusion matrix as a JSON artifact (nested lists)
        mlflow.log_dict(np.array(conf_m).tolist(), "confusion_matrix.json")

        # Store the fitted model once under "model" ...
        mlflow.sklearn.log_model(fitted_model, "model")

        # ... and once under "sklearn-model", registering a new version of it.
        # Earlier revisions logged "model" a second time (and, on non-file
        # tracking stores, registered a second version per run); those
        # duplicate uploads are dropped.
        mlflow.sklearn.log_model(
            sk_model = fitted_model,
            artifact_path = "sklearn-model",
            registered_model_name = md_name
        )

        


Successfully registered model 'Support Vector Classifier'.
2023/03/21 16:54:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Support Vector Classifier, version 1
Created version '1' of model 'Support Vector Classifier'.
Registered model 'Support Vector Classifier' already exists. Creating a new version of this model...
2023/03/21 16:54:35 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Support Vector Classifier, version 2
Created version '2' of model 'Support Vector Classifier'.
Registered model 'Support Vector Classifier' already exists. Creating a new version of this model...
2023/03/21 16:54:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Support Vector Classifier, version 3
Created version '3' of model 