In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
import re

import nltk as nl
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [None]:
# Download the NLTK data packages this notebook relies on: the WordNet corpus
# (used by WordNetLemmatizer), the 'omw-1.4' package, and the stop-word lists.
nl.download('wordnet')
nl.download('omw-1.4')
nl.download('stopwords')

In [None]:
# Build the text-preprocessing helpers once, up front.
#
# The corpus reader is reached through `nl.corpus` instead of the imported
# `stopwords` name because this cell rebinds `stopwords` to the word
# collection itself; going through the package keeps the cell re-runnable
# (looking up `.words` on the already-rebound name fails on a second run).
# A set gives constant-time membership tests in clean_text's per-token filter.
stopwords = set(nl.corpus.stopwords.words('english'))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [None]:
# Load the raw e-mail data set from CSV.
# NOTE(review): absolute path (looks like a Colab upload location) -- adjust
# it when running the notebook in another environment.
dataset = pd.read_csv("/content/spam_ham_dataset.csv")

In [None]:
# Preview the first rows of the raw frame.
dataset.head()


In [None]:
# Summary of the raw frame: columns, dtypes and non-null counts.
dataset.info()


In [None]:
# Keep everything except the first two columns (selected by position).
# NOTE(review): presumably these are an index column and a textual label --
# confirm against the dataset.head() output above.
dataset_mails = dataset.drop(columns=dataset.columns[:2])


In [None]:
# Display the frame after dropping the first two columns.
dataset_mails


In [None]:
def clean_text(text):
    """Normalize one raw message into a space-separated string of word stems.

    Steps, in order:
      1. lower-case the text and collapse every run of characters outside
         ``a-z`` into a single space;
      2. discard the first space-delimited token (behavior kept from the
         original implementation; presumably it removes a leading header
         word such as "subject" -- confirm against the data);
      3. keep only tokens longer than three characters that are not in the
         module-level ``stopwords`` collection;
      4. lemmatize and then stem each surviving token individually, using the
         module-level ``lemmatizer`` and ``stemmer``.

    The previous version passed the whole accumulated string to the
    lemmatizer and stemmer on every loop iteration, so they saw one long
    multi-word "word" and the individual tokens were effectively never
    reduced, while the growing string was re-processed each time.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example a missing CSV
        field) is treated as an empty message.

    Returns
    -------
    str
        The processed tokens joined by single spaces, without a trailing
        space; an empty string when no token survives.
    """
    if not isinstance(text, str):
        return ""

    letters_only = re.sub("[^a-z]+", " ", text.lower())

    # split(" ") rather than split(): a leading space yields an empty first
    # token, and step 2 must drop exactly the same token as it always did.
    candidates = letters_only.split(" ")[1:]

    kept = [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in candidates
        if len(token) > 3 and token not in stopwords
    ]
    return " ".join(kept)

In [None]:
# Add the cleaned version of every message as a new 'text_clean' column.
dataset_mails["text_clean"] = dataset_mails["text"].apply(clean_text)


In [None]:
# Display the frame again, now including the 'text_clean' column.
dataset_mails


In [None]:
# Drop the first remaining column (by position), keeping the rest.
# NOTE(review): presumably this removes the raw 'text' column -- confirm.
dataset_mails_clean = dataset_mails.drop(columns=dataset_mails.columns[:1])


In [None]:
# Character length of each cleaned message, stored in a new 'len' column.
dataset_mails_clean['len'] = dataset_mails_clean['text_clean'].str.len()


In [None]:
# Display the modelling frame (cleaned text, label and length columns).
dataset_mails_clean


In [None]:
# Distribution of cleaned-text length for each value of 'label_num'.
# The figure size is set through rcParams, so it also applies to later plots.
plt.rcParams['figure.figsize'] = (10, 7)
sns.boxenplot(data=dataset_mails_clean, x='label_num', y='len')
plt.title('relationship between spam and text length')
plt.show()

In [None]:
# Number of messages per 'label_num' value, most frequent first.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_Class = dataset_mails_clean["label_num"].value_counts(sort=True)
count_Class.plot(kind='bar', color=["blue", "orange"])
plt.title('Bar chart')
plt.show()

In [None]:
# Same class counts as a pie chart, with whole-number percentages on the
# wedges; the default y-axis label is blanked out.
count_Class.plot(kind = 'pie',  autopct='%1.0f%%')
plt.title('Pie chart')
plt.ylabel('')
plt.show()

In [None]:
# Twenty most frequent cleaned tokens for each class, as two-column frames
# (token, count). label_num == 0 feeds the "non-spam" frame and
# label_num == 1 the "spam" frame.
#
# The frames are built with the DataFrame constructor and explicit column
# names (instead of from_dict on a list of tuples followed by a rename), so
# the expected columns exist even when a class contributes no tokens.
ham_text = dataset_mails_clean.loc[dataset_mails_clean['label_num'] == 0, "text_clean"]
count1 = Counter(" ".join(ham_text).split()).most_common(20)
df1 = pd.DataFrame(count1, columns=["words in non-spam", "count"])

spam_text = dataset_mails_clean.loc[dataset_mails_clean['label_num'] == 1, "text_clean"]
count2 = Counter(" ".join(spam_text).split()).most_common(20)
df2 = pd.DataFrame(count2, columns=["words in spam", "count_"])

In [None]:
# Bar chart of the most frequent non-spam tokens.
df1.plot.bar(legend = False)
# Despite its name, y_pos holds the bar positions along the x axis; the tick
# labels are replaced by the tokens themselves.
y_pos = np.arange(len(df1["words in non-spam"]))
plt.xticks(y_pos, df1["words in non-spam"])
plt.title('Most frequent words in non-spam messages')
plt.xlabel('words')
plt.ylabel('number')
plt.show()

In [None]:
# Bar chart of the most frequent spam tokens.
df2.plot.bar(legend = False, color = 'orange')
# Despite its name, y_pos holds the bar positions along the x axis; the tick
# labels are replaced by the tokens themselves.
y_pos = np.arange(len(df2["words in spam"]))
plt.xticks(y_pos, df2["words in spam"])
plt.title('Most frequent words in spams')
plt.xlabel('words')
plt.ylabel('number')
plt.show()

In [None]:
# Descriptive statistics of the frame's columns, split by 'label_num'.
dataset_mails_clean.groupby('label_num').describe()


In [None]:
# Model input (cleaned text) and target (numeric label).
x = dataset_mails_clean['text_clean']
y = dataset_mails_clean['label_num']

In [None]:
# Hold out 20% of the messages for testing.
# random_state pins the shuffle so every metric below is reproducible across
# kernel restarts; stratify=y keeps the class proportions identical in the
# train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Sizes of the train and test partitions.
print(x_train.shape, x_test.shape)


In [None]:
# Per-class sample counts in each partition (index = label value).
print(f"Samples per class in train {np.bincount(y_train)}")
print(f"Samples per class in test {np.bincount(y_test)}")

In [None]:
# Bag-of-words vectorizer with default settings.
bow_vec = CountVectorizer()

In [None]:
# Fit the vocabulary on the training messages only, then reuse that fitted
# vocabulary to encode the test messages.
train_bow = bow_vec.fit_transform(x_train)
test_bow = bow_vec.transform(x_test)

In [None]:
# Document-term counts as a labelled frame, for inspection only.
# The frame is built straight from the sparse matrix: calling .toarray()
# would allocate a dense (documents x vocabulary) array just to show five
# rows. Columns therefore carry pandas sparse dtypes.
cv_df = pd.DataFrame.sparse.from_spmatrix(
    train_bow, columns=bow_vec.get_feature_names_out()
)
cv_df.head()

In [None]:
# Vocabulary learned by the bag-of-words vectorizer and its size.
feature_names = bow_vec.get_feature_names_out()
print(f"Number of features: {len(feature_names)}")

In [None]:
# Use multiple classifiers and grid search for prediction
def ML_modeling(models, params, X_train, X_test, y_train, y_test, performance_metrics, cv=10):
    """Grid-search each model, evaluate it on the test split and record metrics.

    For every entry in ``models`` a ``GridSearchCV`` is fitted on the training
    data with the matching grid from ``params``; the refitted best estimator
    then predicts ``X_test``. Accuracy and macro-averaged precision, recall
    and F1 are printed and appended to ``performance_metrics``.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same names to the parameter grid for that estimator.
    X_train, X_test
        Feature matrices for fitting and for evaluation.
    y_train, y_test
        Target labels matching ``X_train`` / ``X_test``.
    performance_metrics : list
        Mutated in place: one ``[name, accuracy, precision, recall, f1]`` row
        is appended per model.
    cv : int, optional
        Cross-validation setting forwarded to ``GridSearchCV`` (default 10,
        the value that used to be hard-coded).

    Returns
    -------
    list
        The same ``performance_metrics`` object, for convenience.

    Raises
    ------
    ValueError
        If any name in ``models`` has no entry in ``params``; the message
        lists the offending names.
    """
    missing = [name for name in models if name not in params]
    if missing:
        raise ValueError('Some estimators are missing parameters: {}'.format(missing))

    for name, model in models.items():
        # error_score=0: a parameter combination whose fit raises is scored 0
        # instead of aborting the whole search.
        gs = GridSearchCV(model, params[name], cv=cv, error_score=0, refit=True)
        gs.fit(X_train, y_train)
        y_pred = gs.predict(X_test)

        # Print scores for the classifier
        accuracy_sc = accuracy_score(y_test, y_pred)
        precision_sc = precision_score(y_test, y_pred, average='macro')
        recall_sc = recall_score(y_test, y_pred, average='macro')
        f1_sc = f1_score(y_test, y_pred, average='macro')

        performance_metrics.append([name, accuracy_sc, precision_sc, recall_sc, f1_sc])
        print(name, ':', gs.best_params_)
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_sc, precision_sc, recall_sc, f1_sc))

    return performance_metrics

In [None]:
# Estimators to compare and, under the same keys, the hyper-parameter grid
# that ML_modeling hands to GridSearchCV for each of them.
models = {
    'SVM': SVC(),
    'Naive Bayes': MultinomialNB()
}

params = {
    'SVM': { 'kernel': ['linear', 'rbf'] },
    'Naive Bayes': { 'alpha': [0.5, 1], 'fit_prior': [True, False] }
}

In [None]:
%%time
# Evaluate every model on the bag-of-words features; ML_modeling appends one
# result row per model to performance_metrics_bow.
performance_metrics_bow = []
print("==============Bag of Words==============\n")
ML_modeling(models, params, train_bow, test_bow, y_train, y_test, performance_metrics_bow)

In [None]:
# Tabulate the bag-of-words results, one row per model.
metrics_bow_df = pd.DataFrame(performance_metrics_bow,columns=['Model' , 'Accuracy', 'Precision' , 'Recall', "F1 Score"])

In [None]:
# TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()


In [None]:
# Fit the TF-IDF vocabulary and weights on the training messages only, then
# encode the test messages with the fitted vectorizer.
train_tfidf = tfidf.fit_transform(x_train)
test_tfidf = tfidf.transform(x_test)

In [None]:
# TF-IDF weights as a labelled frame, for inspection only.
# The frame is built straight from the sparse matrix: calling .toarray()
# would allocate a dense (documents x vocabulary) array just to show five
# rows. Columns therefore carry pandas sparse dtypes.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    train_tfidf, columns=tfidf.get_feature_names_out()
)
tfidf_df.head()

In [None]:
# Evaluate the same models on the TF-IDF features; ML_modeling appends one
# result row per model to performance_metrics_tfidf.
print("==============TF-IDF==============\n")
performance_metrics_tfidf = []
ML_modeling(models, params, train_tfidf, test_tfidf, y_train, y_test, performance_metrics_tfidf)

In [None]:
# Tabulate the TF-IDF results, one row per model.
metrics_tfidf_df = pd.DataFrame(performance_metrics_tfidf,columns=['Model' , 'Accuracy', 'Precision' , 'Recall', "F1 Score"])

In [None]:
# Test-set metrics per model with bag-of-words features.
metrics_bow_df


In [None]:
# Test-set metrics per model with TF-IDF features.
metrics_tfidf_df
