# Naive Bayes classifier for spam filtering

In [2]:
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings("ignore")

#### Dataset from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [4]:
df = pd.read_csv('../../data/sms/SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [5]:
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.shape

(5572, 2)

In [7]:
df.Label.value_counts()

ham     4825
spam     747
Name: Label, dtype: int64


### metodología

1. Construim un diccionari amb ***totes les paraules (tokens)*** dels missatges que tenim al dataset

2. Apliquem un classificador naive Bayes:
    - cada paraula del missatge és una nova font d'informació sobre la possible naturalesa del missatge
    - implícitament, estem considerant les paraules com variables independents *cosa que òbviament no és certa* (però això ja és la filosofia naive Bayes)
    - en aquest cas però, cada paraula és una ***realització*** de la mateixa variable estocàstica $X=\text{"incloure paraula en el missatge"}$

3. Definim dues funcions de versemblança per X:
    - una ens indica com de versemblant és que s'hagi inclòs una determinada paraula en un missatge NO_SPAM (ham) $P\left(X=\text{'love'}\,|Y=\text{ham}\right)$
    - una ens indica com de versemblant és que s'hagi inclòs una determinada paraula en un missatge SPAM $P\left(X=\text{'love'}\,|Y=\text{spam}\right)$
    - les dues funcions de versemblança son ***discretes*** i de ***cardinalitat igual al número de paraules que tenim al diccionari***

### preparació de les dades
- eliminar signes de puntuació
- passar tot a minúscules
- separar les paraules (***tokenize***)

In [7]:
import string

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def tokenize(message):
    for p in string.punctuation:
        message = message.replace(p, ' ')
    return message.lower().split()

In [10]:
%time
df['sms'] = df.SMS.apply(tokenize)
df.head()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


Unnamed: 0,Label,SMS,sms
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don, t, think, he, goes, to, usf, he,..."


In [11]:
print(df.iloc[100]['SMS'])
print(df.iloc[100]['sms'])

Please don't text me anymore. I have nothing else to say.
['please', 'don', 't', 'text', 'me', 'anymore', 'i', 'have', 'nothing', 'else', 'to', 'say']


### Split train and test data

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test = train_test_split(df, test_size = 0.2, random_state = 2873)

In [14]:
print(X_train.shape)
X_train.head()

(4457, 3)


Unnamed: 0,Label,SMS,sms
632,ham,Thank u!,"[thank, u]"
4343,ham,Ha! I wouldn't say that I just didn't read any...,"[ha, i, wouldn, t, say, that, i, just, didn, t..."
13,ham,I've been searching for the right words to tha...,"[i, ve, been, searching, for, the, right, word..."
3227,ham,"Rose for red,red for blood,blood for heart,hea...","[rose, for, red, red, for, blood, blood, for, ..."
4515,ham,"Sure, whenever you show the fuck up &gt;:(","[sure, whenever, you, show, the, fuck, up, gt]"


In [15]:
X_train.Label.value_counts(normalize = True)

ham     0.869195
spam    0.130805
Name: Label, dtype: float64

In [16]:
print(X_test.shape)
X_test.head()

(1115, 3)


Unnamed: 0,Label,SMS,sms
4062,ham,"Aight I've been set free, think you could text...","[aight, i, ve, been, set, free, think, you, co..."
494,ham,What class of &lt;#&gt; reunion?,"[what, class, of, lt, gt, reunion]"
1396,ham,Thats cool! I am a gentleman and will treat yo...,"[thats, cool, i, am, a, gentleman, and, will, ..."
2575,spam,Your next amazing xxx PICSFREE1 video will be ...,"[your, next, amazing, xxx, picsfree1, video, w..."
3120,ham,Stop knowing me so well!,"[stop, knowing, me, so, well]"


In [17]:
X_test.Label.value_counts(normalize = True)

ham     0.852915
spam    0.147085
Name: Label, dtype: float64

### get dictionary of words from the train dataset

In [18]:
dictionary = {word: {'ham': 0, 'spam': 0} for word in list(set(X_train['sms'].sum()))}
len(dictionary)

7884

### get word frequencies for ham/spam

In [19]:
def count_words(message):
    for word in message.sms: dictionary[word][message.Label] +=1 

_ = X_train.apply(count_words, axis = 1)

In [20]:
# check
dictionary['love']

{'ham': 153, 'spam': 9}

### prior

In [21]:
prior = X_train.Label.value_counts(normalize = True)
prior

ham     0.869195
spam    0.130805
Name: Label, dtype: float64

### likelihood functions
- we need to know the total number of counts for each type of message

In [22]:
# total number of words in HAM messages
n_ham = sum([counts['ham'] for word, counts in dictionary.items()])
n_ham

57342

In [23]:
def ham_word_likelihood(word):
    if word in dictionary:
        return dictionary[word]['ham'] /n_ham
    else:
        return 1

In [24]:
# total number of words in SPAM messages
n_spam = sum([counts['spam'] for word, counts in dictionary.items()])
n_spam

14814

In [25]:
def spam_word_likelihood(word):
    if word in dictionary:
        return dictionary[word]['spam'] /n_spam
    else:
        return 1

### Classify

In [26]:
def classify(message):
    post_ham, post_spam = prior
    for word in message:
        post_ham *= ham_word_likelihood(word)
        post_spam *= spam_word_likelihood(word)
    return 'spam' if post_spam > post_ham else ('ham' if post_ham > post_spam else '??')

In [27]:
classify(['loving']), classify(['£2000'])

('ham', 'spam')

In [28]:
classify(['secret', 'source', 'of', 'infinite', 'power'])

'ham'

### test

In [29]:
X_test['predicted'] = X_test.sms.apply(classify)

In [30]:
pd.concat((X_test.groupby('Label').predicted.value_counts().to_frame('n'), X_test.groupby('Label').predicted.value_counts(normalize = True).to_frame('%')), axis = 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%
Label,predicted,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,ham,903,0.949527
ham,??,40,0.042061
ham,spam,8,0.008412
spam,spam,117,0.713415
spam,??,41,0.25
spam,ham,6,0.036585


##### not bad but many unclassified messages ... why ???