In [2]:
# importing dependencies here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# preprocessing
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

# lemmatizing
from nltk.stem import WordNetLemmatizer

# vectorization and pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# class imbalance
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# ML models and Cross Validation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.model_selection import cross_validate

# model evaluation
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score

# sparse to dense
from sklearn.base import TransformerMixin


class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array.

    It is placed ahead of estimators that need dense input (in this notebook,
    the GaussianNB pipeline).
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()``: the latter yields a
        ``numpy.matrix``, which recent scikit-learn releases refuse as input.
        Input without a ``toarray`` method is assumed to be dense already and
        is passed through ``np.asarray``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)


# saving the model
from joblib import dump


# performance check
import time

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# reading the email data (path is relative to the notebook's working directory)
emails_csv_path = os.path.join(
    "..", "input", "email-spam-classification-dataset-csv", "emails.csv"
)
df = pd.read_csv(emails_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: '../input/email-spam-classification-dataset-csvemails.csv'

In [None]:
# preview: first five rows of the raw data
df.head(5)

## EDA

In [None]:
# checking for class imbalance
# Keep the Axes handle so the figure can be titled and labelled; a bare bar
# chart is not interpretable when the notebook is skimmed.
ax = df["spam"].value_counts().plot(kind="bar")
# The trailing semicolon keeps the list returned by ax.set() out of the cell output.
ax.set(
    title="Number of emails per class",
    xlabel="spam label (1 = spam, 0 = ham)",
    ylabel="email count",
);

* Class 1 = Spam
* Class 0 = Ham
#### The dataset is imbalanced. I will handle class imbalance in the model pipeline.

### Checking for Null Values

In [None]:
# number of missing values in each column
df.isna().sum()

### Quick summary statistics

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line.
df.info()

# Leave describe() as the cell's last expression so it gets the rich table display.
df.describe()

### Data Cleaning

In [None]:
# Build the cleaned text column from the raw email text in one chain.
#
# Every replacement passes regex=True explicitly: from pandas 2.0 on,
# Series.str.replace defaults to literal matching and raises ValueError when
# given a compiled pattern without regex=True.
df["clean_text"] = (
    df["text"]
    # converting emails into lower case
    .str.lower()
    # dropping URLs
    .str.replace(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+)([\S])*", "", regex=True)
    # dropping email addresses
    .str.replace(r"\S+@\S+", "", regex=True)
    # dropping punctuation, digits and every other non-letter character
    .str.replace(r"[^a-z\s]", " ", regex=True)
    # dropping the word "subject"; the word boundaries stop it from cutting
    # pieces out of longer words such as "subjected" or "subjective"
    .str.replace(r"\bsubject\b", "", regex=True)
)

In [None]:
# preview: first five rows, now including the clean_text column
df.head(5)

### Lemmatizing

In [None]:
# lemmatizing (stop words are dropped in this step)

# WordNetLemmatizer reads the WordNet corpus; only "stopwords" is downloaded
# elsewhere in this notebook. The call does nothing when the corpus is present.
nltk.download("wordnet", quiet=True)

t = time.time()

lemmatizer = WordNetLemmatizer()

# Read the stop-word list once and keep it as a set. Calling
# stopwords.words("english") inside the comprehension re-read the corpus for
# every single token, which dominated the run time of this cell.
english_stop_words = set(stopwords.words("english"))

# split() with no argument breaks on any whitespace run, so words separated by
# newlines or tabs are tokenised too and no empty tokens are produced.
df["clean_text"] = df["clean_text"].apply(
    lambda x: " ".join(
        lemmatizer.lemmatize(word)
        for word in x.split()
        if word not in english_stop_words
    )
)

print(f"Lemmitizing Time: {time.time() - t} seconds")

## Building ML Model

In [None]:
# features: cleaned email text; target: the spam label column
X, y = df["clean_text"].values, df["spam"].values

In [None]:
def build_model(model, X, y):
    """Fit ``model`` on a stratified 80/20 split and print its evaluation.

    Prints the imbalanced classification report for the held-out 20 %, then
    the cross-validation report computed on the full ``X`` / ``y``.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit`` and ``predict``
    X : samples passed to ``train_test_split`` and to cross-validation
    y : labels; also used to stratify the split
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    features_train, features_test, labels_train, labels_test = split

    # train on the 80 % portion, predict the held-out 20 %
    model.fit(features_train, labels_train)
    predictions = model.predict(features_test)

    # hold-out evaluation
    print(classification_report_imbalanced(labels_test, predictions))

    # cross-validated evaluation on the complete data
    cross_validation_report(model, X, y)


########################################################################################################


def build_proba_model(model, X, y):
    """Fit a probabilistic ``model`` on a stratified 80/20 split and print its evaluation.

    In addition to the imbalanced classification report, the ROC-AUC and the
    average-precision scores are printed; both are computed from the
    probabilities in column 1 of ``predict_proba``. The cross-validation
    report on the full ``X`` / ``y`` is printed last.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit``, ``predict`` and ``predict_proba``
    X : samples passed to ``train_test_split`` and to cross-validation
    y : labels; also used to stratify the split
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    features_train, features_test, labels_train, labels_test = split

    model.fit(features_train, labels_train)

    # hard labels and column-1 probabilities for the held-out rows
    predictions = model.predict(features_test)
    positive_proba = model.predict_proba(features_test)[:, 1]

    pr_score = average_precision_score(labels_test, positive_proba)
    auc_score = roc_auc_score(labels_test, positive_proba)

    # hold-out evaluation
    print(f"ROC-AUC Score: {auc_score:.2f}")
    print(f"Average Precision-Recall Score: {pr_score:.2f}")
    print(classification_report_imbalanced(labels_test, predictions))

    # cross-validated evaluation on the complete data
    cross_validation_report(model, X, y)


#####################################################################################################


def cross_validation_report(model, X, y):
    """Run 5-fold cross-validation and print the mean of each returned entry.

    ``cross_validate`` is asked to score accuracy, precision and recall; every
    key of the dictionary it returns is averaged over the folds and printed as
    ``Avg <key>``. Nothing is returned.
    """
    fold_results = cross_validate(
        model, X, y, cv=5, scoring=("accuracy", "precision", "recall")
    )

    averages = {}
    for entry_name in fold_results:
        averages[f"Avg {entry_name}"] = fold_results[entry_name].mean()

    print("Cross Validation Report:\n--------------------------------")
    for label, mean_value in averages.items():
        print(f"{label}: {mean_value}")

## Naive Bayes using Count Vectorizer

In [None]:
# Count-vector features -> random oversampling -> Multinomial Naive Bayes.
# The sampler is seeded (same seed as the train/test split) so that repeated
# runs resample identically and the reported scores are reproducible.
ct_nb = imb_make_pipeline(
    CountVectorizer(min_df=25, max_df=0.85, stop_words="english"),
    RandomOverSampler(random_state=42),
    MultinomialNB(class_prior=None, fit_prior=True),
)

build_proba_model(ct_nb, X, y)

## Naive Bayes using TF-IDF Vectorizer

In [None]:
# TF-IDF features -> random oversampling -> Multinomial Naive Bayes.
# The sampler is seeded (same seed as the train/test split) so that repeated
# runs resample identically and the reported scores are reproducible.
tf_nb = imb_make_pipeline(
    TfidfVectorizer(min_df=25, max_df=0.85, stop_words="english"),
    RandomOverSampler(random_state=42),
    MultinomialNB(class_prior=None, fit_prior=True),
)

build_proba_model(tf_nb, X, y)

## Gaussian Naive Bayes using TF-IDF Vectorizer

In [None]:
# TF-IDF features -> random oversampling -> dense conversion -> Gaussian Naive Bayes.
# DenseTransformer sits directly before GaussianNB to hand it a dense matrix.
# The sampler is seeded (same seed as the train/test split) so that repeated
# runs resample identically and the reported scores are reproducible.
tf_gaussian_nb = imb_make_pipeline(
    TfidfVectorizer(min_df=25, max_df=0.85, stop_words="english"),
    RandomOverSampler(random_state=42),
    DenseTransformer(),
    GaussianNB(),
)

build_proba_model(tf_gaussian_nb, X, y)

## SVM with Count Vectorizer

### RBF kernel with variations of C 

In [None]:
# C = 1 (default)
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="rbf", C=1),
)
build_model(svm_pipe, X, y)

In [None]:
# C = 10
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="rbf", C=10),
)
build_model(svm_pipe, X, y)

In [None]:
# C = 100
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="rbf", C=100),
)
build_model(svm_pipe, X, y)

### Linear kernel with variations of C 

In [None]:
# C = 1 (default)
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="linear", C=1),
)
build_model(svm_pipe, X, y)

In [None]:
# C = 10
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="linear", C=10),
)
build_model(svm_pipe, X, y)

In [None]:
# C = 100
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="linear", C=100),
)
build_model(svm_pipe, X, y)

### Poly kernel with variations of C

In [None]:
# C = 1, default degree = 3
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="poly", C=1, degree=3),
)
build_model(svm_pipe, X, y)

In [None]:
# C = 10
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="poly", C=10, degree=3),
)
build_model(svm_pipe, X, y)

In [None]:
# C = 100
# The oversampler is seeded so the kernel/C comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="poly", C=100, degree=3),
)
build_model(svm_pipe, X, y)

### Based on the best accuracy, precision and recall scores, the SVM with an RBF kernel and regularization parameter C = 10 is selected as the final model.

#### Before training the final model on the entire dataset, its performance is tested with different values of the gamma parameter.

In [None]:
# gamma set to the default value of "scale"
# The oversampler is seeded so the gamma comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="rbf", C=10, gamma="scale"),
)
build_model(svm_pipe, X, y)

In [None]:
# gamma set to "auto"
# The oversampler is seeded so the gamma comparison is not blurred by a
# different random resampling in every run.
svm_pipe = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="rbf", C=10, gamma="auto"),
)
build_model(svm_pipe, X, y)

### Proceeding with the SVM that uses the default gamma value ("scale") and training it on the entire dataset

In [None]:
# training the final model on the complete dataset
# The oversampler is seeded so the saved model can be rebuilt exactly.
svm_pipe_final = imb_make_pipeline(
    CountVectorizer(stop_words="english"),
    RandomOverSampler(random_state=42),
    svm.SVC(kernel="rbf", C=10, gamma="scale"),
)
svm_pipe_final.fit(X, y)

# saving the model
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
model_dir = os.path.join("..", "model")
os.makedirs(model_dir, exist_ok=True)
dump(svm_pipe_final, os.path.join(model_dir, "svm_spam_classifier.joblib"))