In [394]:
import nltk
import pandas as pd
import re
import math

### Data preprocessing and tokenizing

In [395]:
# Load the labelled message dataset; later cells read its "v1" (label) and
# "v2" (message text) columns.
# NOTE(review): this is a hard-coded absolute Windows path, so the notebook only
# runs where that exact file exists — consider a configurable, relative path.
data = pd.read_csv(r"C:\Users\MSI GF66\PycharmProjects\NLP\Spam or Ham\spam.csv", encoding="latin1")

In [396]:
def remove_special_characters(t):
    """Delete punctuation and symbol characters from a string.

    Every character in the regex character class below is removed outright
    (not replaced by a space), so "don't" becomes "dont". Values that are
    not strings are handed back untouched.
    """
    if not isinstance(t, str):
        return t
    return re.sub(r"[.,!@?#$%&*()+=\-_{}\[\];:'\"/\\|<>`~]", "", t)

In [397]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Hash-based copy used only for membership tests, so each token lookup is O(1)
# instead of a scan of the whole list. `stopwords` itself is left as a list.
_stopword_set = frozenset(stopwords)


def remove_stop_words(t):
    """Drop English stop words from a whitespace-delimited string.

    The text is split on whitespace, tokens that exactly match an entry of the
    NLTK English stop-word list are discarded, and the survivors are re-joined
    with single spaces. Matching is case-sensitive and exact, so callers are
    expected to lower-case the text first.

    NOTE(review): the cleaning pipeline strips apostrophes before calling this
    function, so contractions (e.g. "dont", "im") presumably no longer match
    their stop-word entries and survive — confirm whether that is intended.

    Non-string values are returned unchanged.
    """
    if isinstance(t, str):
        kept = [token for token in t.split() if token not in _stopword_set]
        return " ".join(kept)
    return t

[nltk_data] Downloading package stopwords to C:\Users\MSI
[nltk_data]     GF66\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [398]:
def remove_non_ascii(t):
    """Keep only the ASCII characters (code point below 128) of a string.

    Characters outside that range are dropped, not replaced. Non-string
    values are returned unchanged.
    """
    if not isinstance(t, str):
        return t
    ascii_chars = [ch for ch in t if ord(ch) < 128]
    return "".join(ascii_chars)

In [399]:
def data_cleaning(dataset):
    """Return the cleaned message text of `dataset` as a pandas Series.

    Takes the "v2" column, lower-cases it, and then applies to every entry,
    in this order: remove_special_characters, remove_stop_words,
    remove_non_ascii. The frame passed in is not modified.
    """
    return (
        dataset["v2"]
        .str.lower()
        .apply(remove_special_characters)
        .apply(remove_stop_words)
        .apply(remove_non_ascii)
    )

In [400]:
# Run the cleaning pipeline over the loaded frame; the result is a Series of
# cleaned message strings, printed below in pandas' abbreviated form.
cleaned_data = data_cleaning(data)

print(cleaned_data)

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u u 750 pound prize 2...
5568                            b going esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       rofl true name
Name: v2, Length: 5572, dtype: object


In [401]:
def tokenize(dataset):
    """Split text into whitespace-delimited tokens.

    Dispatches on the type of `dataset`:
      * str           -> list of tokens
      * list          -> list with one token list per element
      * pandas Series -> Series with one token list per entry
      * anything else -> returned unchanged

    Elements of a list or Series are split with their own ``.split()``.
    """
    if isinstance(dataset, str):
        return dataset.split()
    if isinstance(dataset, list):
        return [entry.split() for entry in dataset]
    if isinstance(dataset, pd.Series):
        return dataset.apply(lambda entry: entry.split())
    return dataset

In [402]:
# Tokenize every cleaned message; for a Series input the result is a Series of
# token lists.
data_tokens = tokenize(cleaned_data)

In [403]:
# Attach the token lists to the loaded frame as a new "tokens" column
# (this mutates `data` in place).
data['tokens'] = data_tokens

In [404]:
# Split the token lists by label ("v1") into two plain Python lists.
spam_messages = data.loc[data["v1"] == "spam", "tokens"].tolist()
ham_messages = data.loc[data["v1"] == "ham", "tokens"].tolist()


In [405]:
# Peek at the token list of the first ham message.
print(ham_messages[0])

['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [406]:
def build_vocab(tokenized_data):
    """Build the sorted list of distinct, purely alphabetic tokens.

    `tokenized_data` is an iterable of token sequences. A token is kept only
    if ``str.isalpha()`` is true for it, so tokens containing digits or other
    non-letters are left out of the vocabulary.
    """
    alphabetic_tokens = {
        token
        for tokens in tokenized_data
        for token in tokens
        if token.isalpha()
    }
    return sorted(alphabetic_tokens)

In [407]:
# Vocabulary over all messages (spam and ham together): sorted, alphabetic-only tokens.
vocabulary = build_vocab(data_tokens)

### Prior probabilities

In [408]:
# Count the spam-labelled rows of the frame loaded at the top of the notebook.
# The CSV is intentionally not re-read here: rebinding `data` to a fresh frame
# would drop the "tokens" column added earlier, so re-running the spam/ham
# split cell afterwards would fail with a KeyError.
spam_data_count = (data["v1"] == "spam").sum()

In [409]:
# Show the number of spam-labelled rows.
print(spam_data_count)

747


In [410]:
# Count the rows whose "v1" label is "ham".
ham_data_count = (data["v1"] == "ham").sum()

In [411]:
# Show the number of ham-labelled rows.
print(ham_data_count)

4825


In [412]:
# Class priors: each label's share of all spam + ham messages.
total_message_count = spam_data_count + ham_data_count
P_spam = spam_data_count / total_message_count
P_ham = ham_data_count / total_message_count
print(P_spam)
print(P_ham)

0.13406317300789664
0.8659368269921034


In [413]:
# Print the tokenized messages (a Series of token lists, abbreviated by pandas).
print(data_tokens)

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, 750, poun...
5568                      [b, going, esplanade, fr, home]
5569                     [pity, mood, soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
Name: v2, Length: 5572, dtype: object


### Likelihood probabilities

In [414]:
# Per-class word-count tables, keyed by class label.
# NOTE(review): the following cell rebinds `word_counts` to defaultdict-based
# tables before anything is counted, so this plain-dict version appears unused.
word_counts = {
    "spam": {},
    "ham": {}
}

In [415]:
from collections import defaultdict

# Count how often each token occurs within each class. defaultdict(int) makes
# a token's first occurrence start from zero without a membership check.
word_counts = {
    "spam": defaultdict(int),
    "ham": defaultdict(int)
}

for label, messages in (("spam", spam_messages), ("ham", ham_messages)):
    class_counts = word_counts[label]
    for message in messages:
        for word in message:
            class_counts[word] += 1


In [416]:
# Preview the first 20 (word, count) entries of the ham table, in insertion order.
print(dict(list(word_counts["ham"].items())[:20]))

{'go': 247, 'jurong': 1, 'point': 13, 'crazy': 10, 'available': 13, 'bugis': 7, 'n': 134, 'great': 100, 'world': 32, 'la': 7, 'e': 78, 'buffet': 2, 'cine': 7, 'got': 231, 'amore': 1, 'wat': 95, 'ok': 272, 'lar': 38, 'joking': 6, 'wif': 27}


In [417]:
# Preview the first 20 (word, count) entries of the spam table, in insertion order.
print(dict(list(word_counts["spam"].items())[:20]))

{'free': 216, 'entry': 26, '2': 173, 'wkly': 14, 'comp': 10, 'win': 60, 'fa': 4, 'cup': 5, 'final': 16, 'tkts': 4, '21st': 2, 'may': 7, '2005': 3, 'text': 120, '87121': 4, 'receive': 33, 'questionstd': 2, 'txt': 150, 'ratetcs': 2, 'apply': 29}


In [418]:
# Re-order each class's counts from most to least frequent. sorted() is stable,
# so words with equal counts keep their original relative order.
sorted_spam, sorted_ham = (
    dict(sorted(word_counts[label].items(), key=lambda item: item[1], reverse=True))
    for label in ("spam", "ham")
)

In [419]:
# Show the 20 most frequent spam tokens with their counts.
print(dict(list(sorted_spam.items())[:20]))

{'call': 347, 'free': 216, '2': 173, 'txt': 150, 'u': 147, 'ur': 144, 'mobile': 123, 'text': 120, '4': 119, 'claim': 113, 'stop': 113, 'reply': 101, 'prize': 92, 'get': 83, 'new': 69, 'send': 67, 'nokia': 65, 'urgent': 63, 'cash': 62, 'win': 60}


In [420]:
# Show the 20 most frequent ham tokens with their counts.
print(dict(list(sorted_ham.items())[:20]))

{'u': 972, 'im': 458, '2': 305, 'get': 303, 'ltgt': 276, 'ok': 272, 'dont': 268, 'go': 247, 'ur': 240, 'ill': 240, 'know': 232, 'got': 231, 'like': 229, 'call': 229, 'come': 224, 'good': 222, 'time': 189, 'day': 187, 'love': 185, '4': 168}


In [421]:
# Additive smoothing constant: added to every word count, and (times the
# vocabulary size) to the denominator, when the likelihoods are estimated.
alpha = 1

In [422]:
# Smoothed per-class word likelihoods:
#   P(word | class) = (count(word, class) + alpha) / (tokens in class + alpha * |vocabulary|)
total_spam_words = sum(word_counts["spam"].values())
total_ham_words = sum(word_counts["ham"].values())

vocabulary_size = len(vocabulary)

# NOTE(review): the token totals above include every token (digits too), while
# `vocabulary` holds alphabetic tokens only — confirm this mismatch is intended.
spam_denominator = total_spam_words + alpha * vocabulary_size
ham_denominator = total_ham_words + alpha * vocabulary_size

P_xi_spam = {
    word: (count + alpha) / spam_denominator
    for word, count in sorted_spam.items()
}
P_xi_ham = {
    word: (count + alpha) / ham_denominator
    for word, count in sorted_ham.items()
}


In [424]:
# Preview the first 20 likelihood entries per class (most frequent words first,
# since the tables were filled from the count-sorted dictionaries).
print(dict(list(P_xi_spam.items())[:20]))
print(dict(list(P_xi_ham.items())[:20]))

{'call': 0.016903871375139653, 'free': 0.010540632437946278, '2': 0.008451935687569826, 'txt': 0.007334725797833584, 'u': 0.007189002768737553, 'ur': 0.007043279739641521, 'mobile': 0.006023218535969301, 'text': 0.00587749550687327, '4': 0.005828921163841259, 'claim': 0.005537475105649196, 'stop': 0.005537475105649196, 'reply': 0.00495458298926507, 'prize': 0.004517413901976975, 'get': 0.004080244814688882, 'new': 0.0034002040122407345, 'send': 0.0033030553261767133, 'nokia': 0.0032059066401126926, 'urgent': 0.0031087579540486714, 'cash': 0.003060183611016661, 'win': 0.00296303492495264}
{'u': 0.02039404736952421, 'im': 0.009620624607000628, '2': 0.006413749738000419, 'get': 0.006371829805072312, 'ltgt': 0.005805910710542863, 'ok': 0.005722070844686649, 'dont': 0.0056382309788304335, 'go': 0.005198071683085307, 'ur': 0.005051351917836931, 'ill': 0.005051351917836931, 'know': 0.0048836721861245025, 'got': 0.004862712219660449, 'like': 0.004820792286732341, 'call': 0.004820792286732341, 

### Classification

In [425]:
# Bundle everything the classifier reads into a single dictionary.
model = {
    # Class priors P(spam) and P(ham).
    "prior": {"spam": P_spam, "ham": P_ham},
    # Smoothed word likelihoods P(word | class), one table per class.
    "likelihood": {"spam": P_xi_spam, "ham": P_xi_ham},
    # Sorted list of alphabetic tokens; words outside it are ignored at prediction time.
    "vocabulary": vocabulary,
    # Smoothing constant that was used to build the likelihood tables.
    "alpha": alpha
}

In [426]:
def _unseen_word_probability(likelihood, vocabulary_size, alpha):
    """Smoothed P(word | class) for a vocabulary word missing from `likelihood`.

    The table stores (count + alpha) / D for each of its n words, where
    D = N + alpha * V (N = tokens counted for the class, V = vocabulary size).
    Its values therefore sum to S = (N + alpha * n) / D, which rearranges to
    alpha / D = (1 - S) / (V - n) -- exactly the probability a zero-count word
    gets under the same smoothing. This assumes the table was built with that
    formula and the same V and alpha.
    """
    seen_words = len(likelihood)
    # fsum keeps the rounding error of the long sum small.
    recorded_mass = math.fsum(likelihood.values())

    missing_words = vocabulary_size - seen_words
    if missing_words != 0:
        estimate = (1.0 - recorded_mass) / missing_words
        if estimate > 0.0:
            return estimate

    # Degenerate table (V == n, or inconsistent values): approximate by assuming
    # the rarest recorded word was seen exactly once, i.e. (1 + alpha) / D.
    if likelihood:
        return min(likelihood.values()) * alpha / (1 + alpha)

    # Empty table and empty vocabulary: no word can reach the lookup anyway.
    return 1.0


def classifier(message, model):
    """Label `message` as "spam" or "ham" with the Naive Bayes `model`.

    Parameters
    ----------
    message : str
        Text to classify. It is lower-cased and split on whitespace; no other
        cleaning is applied here.
    model : dict
        Must provide "prior" and "likelihood" (each with "spam" and "ham"
        entries), "vocabulary" (collection of known words) and "alpha".

    Returns
    -------
    str
        "spam" if the spam log-score is strictly greater than the ham
        log-score, otherwise "ham" (ties go to "ham").

    Words outside the vocabulary are ignored. A vocabulary word missing from
    one class's table gets the smoothed zero-count probability
    alpha / (N_class + alpha * V). The previous code summed the likelihood
    *probabilities* in place of the token count N_class, which gave unseen
    words more probability than words that had been seen once.
    """
    words = message.lower().split()

    # Set membership is O(1); the stored vocabulary is a list.
    known_words = set(model["vocabulary"])
    vocabulary_size = len(model["vocabulary"])
    alpha = model["alpha"]

    spam_likelihood = model["likelihood"]["spam"]
    ham_likelihood = model["likelihood"]["ham"]
    unseen_in_spam = _unseen_word_probability(spam_likelihood, vocabulary_size, alpha)
    unseen_in_ham = _unseen_word_probability(ham_likelihood, vocabulary_size, alpha)

    # Work in log space so that many small probabilities do not underflow.
    log_spam = math.log(model["prior"]["spam"])
    log_ham = math.log(model["prior"]["ham"])

    for word in words:
        if word in known_words:
            log_spam += math.log(spam_likelihood.get(word, unseen_in_spam))
            log_ham += math.log(ham_likelihood.get(word, unseen_in_ham))

    return "spam" if log_spam > log_ham else "ham"

### Examples of prediction

In [427]:
message = "Hey, are we still meeting for coffee tomorrow?"

# Clean the message with the same steps, in the same order, as data_cleaning
# (lower-case -> special characters -> stop words -> non-ASCII).
cleaned_message = remove_non_ascii(remove_stop_words(remove_special_characters(message.lower())))

label = classifier(cleaned_message, model)

print("Prediction:", label)


Prediction: ham


In [428]:
message = "Congratulations! You won a free iPhone. Click here to claim your prize now!"

# Clean the message with the same steps, in the same order, as data_cleaning
# (lower-case -> special characters -> stop words -> non-ASCII).
cleaned_message = remove_non_ascii(remove_stop_words(remove_special_characters(message.lower())))

label = classifier(cleaned_message, model)

print("Prediction:", label)

Prediction: spam


In [429]:
message = "Hey, you won a free coffee voucher!"

# Clean the message with the same steps, in the same order, as data_cleaning
# (lower-case -> special characters -> stop words -> non-ASCII).
cleaned_message = remove_non_ascii(remove_stop_words(remove_special_characters(message.lower())))

label = classifier(cleaned_message, model)

print("Prediction:", label)


Prediction: spam
