<a href="https://colab.research.google.com/github/iranfromiran/nlp-assignments/blob/main/spam_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

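Spam classification for emails

This notebook normalizes email text, builds TF-IDF features from it, and trains a linear SVM that labels each email as spam or not spam.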

In [11]:
from bs4 import BeautifulSoup
import re
import nltk

def normalizing(myString):
    """Normalize raw email text into a single-spaced string of stemmed tokens.

    The steps run in this order: lowercase the text; replace URLs, email
    addresses, digit runs and the characters ``$``, ``!`` and ``?`` with
    placeholder words; turn the remaining punctuation into spaces; mark blank
    lines and line breaks with placeholder words; collapse whitespace; stem
    every token with the English Snowball stemmer. The placeholder words go
    through the stemmer like any other token.

    Parameters
    ----------
    myString : str
        Raw text of one email.

    Returns
    -------
    str
        The normalized text, tokens separated by single spaces.
    """
    # convert text to lowercase
    myString = myString.lower()

    # convert URLs to 'httpaddr'
    myString = re.sub(r'(http|https)://[^\s]*', r' httpaddr ', myString)

    # convert email addresses to 'emailaddr'
    myString = re.sub(r'[^\s]+@[^\s]+[.][^\s]+', r' emailaddr ', myString)

    # convert numbers to 'number'
    myString = re.sub(r'[0-9]+', r' number ', myString)

    # convert $, ! and ? to words
    myString = re.sub(r'[$]', r' dollar ', myString)
    myString = re.sub(r'[!]', r' exclammark ', myString)
    myString = re.sub(r'[?]', r' questmark ', myString)

    # convert other punctuation to whitespace (newlines are whitespace, so
    # they survive this step and are handled below)
    myString = re.sub(r'([^\w\s]+)|([_-]+)', r' ', myString)

    # Mark blank lines BEFORE single newlines: once every '\n' has been
    # replaced, the '\n\n' pattern can no longer match anything.
    myString = re.sub(r'\n\n', r' blankline ', myString)
    myString = re.sub(r'\n', r' newline ', myString)

    # collapse runs of whitespace to one space and trim the ends
    myString = re.sub(r'\s+', r' ', myString)
    myString = myString.strip(' ')

    # perform word stemming
    myStringWords = myString.split(' ')
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    stemWords = [stemmer.stem(word) for word in myStringWords]
    myString = ' '.join(stemWords)

    return myString



In [12]:
import openpyxl
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [13]:
def store(path='email.xlsx', sheet_name='Data Set', test_size=0.2, random_state=0):
    """Load labelled emails from an Excel sheet and split them into train/test sets.

    Column 1 of the sheet is read as the label and column 2 as the email text.
    Row 1 is skipped (assumed to be a header row -- confirm against the
    workbook). Rows whose label cell is empty are ignored. A label equal to
    "spam" is encoded as 1, every other label as 0. Each text is passed through
    ``normalizing`` before being stored.

    Parameters
    ----------
    path : str, optional
        Workbook to read. Defaults to 'email.xlsx'.
    sheet_name : str, optional
        Worksheet holding the data. Defaults to 'Data Set'.
    test_size : float, optional
        Passed to ``train_test_split``. Defaults to 0.2.
    random_state : int, optional
        Seed passed to ``train_test_split``. Defaults to 0.

    Returns
    -------
    tuple
        ``(xTrain, xTest, yTrain, yTest)`` as returned by ``train_test_split``.
    """
    workbook = openpyxl.load_workbook(path)
    sheet = workbook[sheet_name]
    xData = []
    yData = []

    # Stream (label, body) value pairs instead of doing three cell() lookups per row.
    for label, body in sheet.iter_rows(min_row=2, max_col=2, values_only=True):
        label = str(label)
        if label == 'None':
            # Empty label cell: the row carries no usable example.
            continue
        # An empty body cell must not become the literal word "None" in the features.
        text = '' if body is None else str(body)
        xData.append(normalizing(text))
        yData.append(1 if label == "spam" else 0)

    xTrain, xTest, yTrain, yTest = train_test_split(
        xData, yData, test_size=test_size, random_state=random_state
    )
    return xTrain, xTest, yTrain, yTest

In [14]:
# Calculating the F-score and precision, recall, matrix
def calcMetrics(xTest, yTest, model, vectorizer, pos_label=0):
    """Score a fitted model on held-out texts.

    Parameters
    ----------
    xTest : sequence of str
        Texts to classify; they are transformed with ``vectorizer``.
    yTest : sequence
        True labels for ``xTest``.
    model :
        Fitted classifier exposing ``predict``.
    vectorizer :
        Fitted vectorizer exposing ``transform``.
    pos_label : optional
        Label treated as the positive class for F1, precision and recall.
        Defaults to 0, the value that was previously hard-coded. ``store``
        encodes spam as 1, so the default scores the non-spam class; pass
        ``pos_label=1`` for spam-class metrics.

    Returns
    -------
    tuple
        ``(fScore, precision, recall, matrix)`` where ``matrix`` is the
        confusion matrix of true versus predicted labels.
    """
    xTestMatrix = vectorizer.transform(xTest)
    yTestMatrix = np.asarray(yTest)

    result = model.predict(xTestMatrix)
    matrix = confusion_matrix(yTestMatrix, result)

    # All three scores are computed against the same positive class.
    fScore = f1_score(yTestMatrix, result, pos_label=pos_label)
    precision = precision_score(yTestMatrix, result, pos_label=pos_label)
    recall = recall_score(yTestMatrix, result, pos_label=pos_label)
    return fScore, precision, recall, matrix

In [37]:
# Test new data for Spam
def test_new(emailBody, model, vectorizer):
    """Classify one raw email body and return "Spam" or "Not Spam".

    The body is normalized with ``normalizing``, transformed by ``vectorizer``
    and passed to ``model.predict``; the answer is "Spam" when the prediction
    contains the label 1.
    """
    cleaned = normalizing(emailBody)
    features = vectorizer.transform([cleaned])
    prediction = model.predict(features)
    print("Predicting...")

    return "Spam" if 1 in prediction else "Not Spam"

# Linear SVM with class weights balanced by label frequency. The fixed
# random_state makes training repeatable across kernel restarts and matches
# the seed used for the train/test split in store().
model = LinearSVC(class_weight='balanced', random_state=0)

In [26]:
# Load the labelled emails and split them into training and test sets
xTrain, xTest, yTrain, yTest = store()

# Fit the TF-IDF vocabulary on the training texts only.
# NOTE(review): max_df=75 is an integer, which scikit-learn documents as an
# absolute document count rather than a proportion -- confirm that 75 (and not
# 0.75) is what is intended here.
vectorizer = TfidfVectorizer(stop_words='english', max_df=75)
xTrainMatrix = vectorizer.fit_transform(xTrain)

# Training labels as a NumPy array for the classifier
yTrainMatrix = np.asarray(yTrain)

In [38]:

# Fit the SVM on the training features, then score it on the held-out split
model.fit(xTrainMatrix, yTrainMatrix)

metrics = calcMetrics(xTest, yTest, model, vectorizer)
fScore, precision, recall, matrix = metrics
print(*metrics)

0.9896587383660806 0.9886363636363636 0.9906832298136646 [[957   9]
 [ 11 138]]


In [39]:

# Classify one hand-written sample message with the trained model
emailBody = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. text FA to 87121 "
label = test_new(emailBody, model, vectorizer)
print("Email is: %s" % (label,))

Predicting...
Email is: Spam
