In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

# Read Data

In [2]:
# read dataset
spam_df = pd.read_csv("spam.csv", encoding="ISO-8859-1")

# subset and rename columns
spam_df = spam_df[["v1", "v2"]]
spam_df.rename(columns={"v1": "spam", "v2": "text"}, inplace=True)

# convert spam column to binary
spam_df.spam = spam_df.spam.apply(lambda s: True if s == "spam" else False)

# lowercase everything and remove punctuation
spam_df.text = spam_df.text.apply(
    lambda t: t.lower().translate(str.maketrans("", "", string.punctuation))
)

# shuffle
spam_df = spam_df.sample(frac=1)

In [3]:
spam_df

Unnamed: 0,spam,text
246,False,i asked you to call him now ok
4979,False,so what u doing today
5567,True,this is the 2nd time we have tried 2 contact u...
678,False,cause im not freaky lol
1598,False,daddy will take good care of you
...,...,...
4901,True,free polyphonic ringtone text super to 87131 ...
3238,False,am okay will soon be over all the best
1705,False,yun ahnow ì wkg wherebtw if ì go nus sc ìï wan...
5251,False,yeah work is fine started last week all the sa...


In [4]:
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print("-------")

this is the 2nd time we have tried 2 contact u u have won the å£750 pound prize 2 claim is easy call 087187272008 now1 only 10p per minute btnationalrate
-------
you are a winner u have been specially selected 2 receive å£1000 or a 4 holiday flights inc speak to a live operator 2 claim 0871277810910pmin 18 
-------
todays voda numbers ending 7548 are selected to receive a 350 award if you have a match please call 08712300220 quoting claim code 4041 standard rates app
-------
final chance claim ur å£150 worth of discount vouchers today text yes to 85023 now savamob member offers mobile t cs savamob pobox84 m263uz å£300 subs 16
-------
from 88066 lost å£12 help
-------


In [5]:
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print("-------")

i asked you to call him now ok
-------
so what u doing today
-------
cause im not freaky lol
-------
daddy will take good care of you 
-------
i av a new number   wil u only use this oneta
-------


In [6]:
# get training set
train_spam_df = spam_df.iloc[: int(len(spam_df) * 0.7)]

# get testing set
test_spam_df = spam_df.iloc[int(len(spam_df) * 0.7) :]

In [7]:
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)

0.1317948717948718


# Create Spam Bag of Words and Non-Spam Bag of Words

In [8]:
# get all words from spam and non-spam datasets
train_spam_words = " ".join(train_spam_df[train_spam_df.spam == True].text).split(" ")
train_non_spam_words = " ".join(train_spam_df[train_spam_df.spam == False].text).split(
    " "
)

common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [9]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [10]:
train_non_spam_bow = dict()
for w in common_words:
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

In [11]:
train_non_spam_bow

{'': 0.023497617230210634,
 'brought': 4.1260082932766694e-05,
 'u': 0.013450787036081942,
 'sim': 0.00010315020733191673,
 'cash': 0.00014441029026468345,
 're': 0.00014441029026468345,
 'what': 0.0033420667175541024,
 'mobile': 0.00020630041466383346,
 'del': 2.0630041466383347e-05,
 'mp3': 2.0630041466383347e-05,
 '2000': 2.0630041466383347e-05,
 'registered': 4.1260082932766694e-05,
 'wan': 0.0008045716171889506,
 'here': 0.0015059930270459843,
 'nothing': 0.0005570111195923504,
 'match': 8.252016586553339e-05,
 'us': 0.0008252016586553338,
 'using': 0.00014441029026468345,
 'somebody': 0.00010315020733191673,
 'membership': 2.0630041466383347e-05,
 'service': 6.189012439915004e-05,
 'txt': 0.00016504033173106678,
 'pay': 0.00037134074639490026,
 'trip': 0.0002681905390629835,
 'has': 0.0012790625709157674,
 'try': 0.0005570111195923504,
 'forget': 0.0002475604975966002,
 'pass': 8.252016586553339e-05,
 'love': 0.0024549749344996183,
 'answer': 0.00020630041466383346,
 'text': 0.00

# Predict on Test Set

# $ P(\text{SPAM} | \text{"urgent please call this number"}) $
# $\propto P(\text{"urgent please call this number"} | \text{SPAM}) \times P(\text{SPAM}) $
# $= P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM})$

# Due to numerical issues, equivalently  compute:

# $log(P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM}))$
# $ = log(P(\text{"urgent"} | \text{SPAM})) + log(P(\text{"please"} | \text{SPAM})) + \dots + log(P(\text{SPAM}))$

In [12]:
def predict_text(t, verbose=False):
    # if some word doesnt appear in either spam or non-spam BOW, disregard it
    valid_words = [w for w in t if w in train_spam_bow]

    # get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]

    # print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df["word"] = valid_words
        data_df["spam_prob"] = spam_probs
        data_df["non_spam_prob"] = non_spam_probs
        data_df["ratio"] = [
            s / n if n > 0 else np.inf for s, n in zip(spam_probs, non_spam_probs)
        ]
        print(data_df)

    # calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)

    # calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(
        1 - FRAC_SPAM_TEXTS
    )

    # if verbose, report the two scores
    if verbose:
        print("Spam Score: %s" % spam_score)
        print("Non-Spam Score: %s" % non_spam_score)

    # if spam score is higher, mark this as spam
    return spam_score >= non_spam_score

In [13]:
predict_text("urgent call this number".split(), verbose=True)

     word  spam_prob  non_spam_prob      ratio
0  urgent   0.003168       0.000041  76.772539
1    call   0.019818       0.003136   6.320007
2    this   0.004467       0.003528   1.266304
3  number   0.001056       0.000887   1.190272
Spam Score: -23.966814854620573
Non-Spam Score: -28.676484779338587


True

In [14]:
predict_text("hey do you want to go a movie tonight".split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000162       0.001671  0.097211
1       do   0.001300       0.005364  0.242280
2      you   0.015838       0.026778  0.591468
3     want   0.001949       0.002269  0.858993
4       to   0.037443       0.022115  1.693080
5       go   0.001624       0.003569  0.455151
6        a   0.021280       0.015101  1.409164
7    movie   0.000162       0.000289  0.562436
8  tonight   0.000162       0.000743  0.218725
Spam Score: -58.790917173187964
Non-Spam Score: -50.467066490430874


False

In [15]:
predict_text("offer for unlimited money call now".split(), verbose=True)

        word  spam_prob  non_spam_prob      ratio
0      offer   0.001218       0.000083  14.763950
1        for   0.011371       0.007262   1.565873
2  unlimited   0.000569       0.000021  27.559373
3      money   0.000244       0.000805   0.302850
4       call   0.019818       0.003136   6.320007
5        now   0.010802       0.004023   2.685272
Spam Score: -37.45475420482497
Non-Spam Score: -43.66353333244156


True

In [16]:
predict_text("are you at class yet".split(), verbose=True)

    word  spam_prob  non_spam_prob     ratio
0    are   0.004224       0.005900  0.715828
1    you   0.015838       0.026778  0.591468
2     at   0.001706       0.005467  0.311993
3  class   0.000162       0.000578  0.281218
4    yet   0.000081       0.000660  0.123033
Spam Score: -36.1562440488276
Non-Spam Score: -28.882896426775087


False

In [17]:
predictions = test_spam_df.text.apply(lambda t: predict_text(t.split()))

In [18]:
frac_spam_messages_correctly_detected = np.sum(
    (predictions == True) & (test_spam_df.spam == True)
) / np.sum(test_spam_df.spam == True)
print("Fraction Spam Correctly Detected: %s" % frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.927038626609442


In [19]:
frac_valid_sent_to_spam = np.sum(
    (predictions == True) & (test_spam_df.spam == False)
) / np.sum(test_spam_df.spam == False)
print("Fraction Valid Messages Sent to Spam: %s" % frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.02640722724113968
