# Naive Bayes classifier for spam filtering

In [1]:
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore")

#### Dataset from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [3]:
df = pd.read_csv('../Dades/sms/SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [4]:
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.shape

(5572, 2)

In [6]:
df.Label.value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

### metodología

1. Construim un diccionari amb ***totes les paraules (tokens)*** dels missatges que tenim al dataset

2. Apliquem un classificador naive Bayes:
    - cada paraula del missatge és una nova font d'informació sobre la possible naturalesa del missatge
    - implícitament, estem considerant les paraules com variables independents *cosa que òbviament no és certa* (però això ja és la filosofia naive Bayes)
    - en aquest cas però, cada paraula és una ***realització*** de la mateixa variable estocàstica $X=\text{"incloure paraula en el missatge"}$

3. Definim dues funcions de versemblança per X:
    - una ens indica com de versemblant és que s'hagi inclòs una determinada paraula en un missatge NO_SPAM (ham) $P\left(X=\text{'love'}\,|Y=\text{ham}\right)$
    - una ens indica com de versemblant és que s'hagi inclòs una determinada paraula en un missatge SPAM $P\left(X=\text{'love'}\,|Y=\text{spam}\right)$
    - les dues funcions de versemblança son ***discretes*** i de ***cardinalitat igual al número de paraules que tenim al diccionari***

### preparació de les dades
- eliminar signes de puntuació
- passar tot a minúscules
- separar les paraules (***tokenize***)

In [7]:
import string

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def tokenize(message):
    for p in string.punctuation:
        message = message.replace(p, ' ')
    return message.lower().split()

In [10]:
%time
df['sms'] = df.SMS.apply(tokenize)
df.head()

CPU times: total: 0 ns
Wall time: 0 ns


Unnamed: 0,Label,SMS,sms
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don, t, think, he, goes, to, usf, he,..."


In [11]:
print(df.iloc[100]['SMS'])
print(df.iloc[100]['sms'])

Please don't text me anymore. I have nothing else to say.
['please', 'don', 't', 'text', 'me', 'anymore', 'i', 'have', 'nothing', 'else', 'to', 'say']


### Hi haurà un gran gruix de paraules són dels dos i un petit seran nomès de un. Seran aquestes petites les que fan decantar la balança. 

### Split train and test data

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test = train_test_split(df, test_size = 0.2, random_state = 2873)

In [14]:
print(X_train.shape)
X_train.head()

(4457, 3)


Unnamed: 0,Label,SMS,sms
632,ham,Thank u!,"[thank, u]"
4343,ham,Ha! I wouldn't say that I just didn't read any...,"[ha, i, wouldn, t, say, that, i, just, didn, t..."
13,ham,I've been searching for the right words to tha...,"[i, ve, been, searching, for, the, right, word..."
3227,ham,"Rose for red,red for blood,blood for heart,hea...","[rose, for, red, red, for, blood, blood, for, ..."
4515,ham,"Sure, whenever you show the fuck up &gt;:(","[sure, whenever, you, show, the, fuck, up, gt]"


In [15]:
X_train.Label.value_counts(normalize = True)

ham     0.869195
spam    0.130805
Name: Label, dtype: float64

In [16]:
print(X_test.shape)
X_test.head()

(1115, 3)


Unnamed: 0,Label,SMS,sms
4062,ham,"Aight I've been set free, think you could text...","[aight, i, ve, been, set, free, think, you, co..."
494,ham,What class of &lt;#&gt; reunion?,"[what, class, of, lt, gt, reunion]"
1396,ham,Thats cool! I am a gentleman and will treat yo...,"[thats, cool, i, am, a, gentleman, and, will, ..."
2575,spam,Your next amazing xxx PICSFREE1 video will be ...,"[your, next, amazing, xxx, picsfree1, video, w..."
3120,ham,Stop knowing me so well!,"[stop, knowing, me, so, well]"


### quan mostrejem aleatoriament, el subconjunt, ha de conservar més o menys la mateixa proporció. 

In [17]:
X_test.Label.value_counts(normalize = True)

ham     0.852915
spam    0.147085
Name: Label, dtype: float64

### get dictionary of words from the train dataset

In [18]:
# Tenim  una llista amb totes, i mirem quantes n'hi ha i inicialitzem un dict a 0 per aquesta paraula.

In [19]:
# Totes lesparaules junts
X_train['sms'].sum()

['thank',
 'u',
 'ha',
 'i',
 'wouldn',
 't',
 'say',
 'that',
 'i',
 'just',
 'didn',
 't',
 'read',
 'anything',
 'into',
 'way',
 'u',
 'seemed',
 'i',
 'don',
 't',
 'like',
 '2',
 'be',
 'judgemental',
 'i',
 'save',
 'that',
 'for',
 'fridays',
 'in',
 'the',
 'pub',
 'i',
 've',
 'been',
 'searching',
 'for',
 'the',
 'right',
 'words',
 'to',
 'thank',
 'you',
 'for',
 'this',
 'breather',
 'i',
 'promise',
 'i',
 'wont',
 'take',
 'your',
 'help',
 'for',
 'granted',
 'and',
 'will',
 'fulfil',
 'my',
 'promise',
 'you',
 'have',
 'been',
 'wonderful',
 'and',
 'a',
 'blessing',
 'at',
 'all',
 'times',
 'rose',
 'for',
 'red',
 'red',
 'for',
 'blood',
 'blood',
 'for',
 'heart',
 'heart',
 'for',
 'u',
 'but',
 'u',
 'for',
 'me',
 'send',
 'tis',
 'to',
 'all',
 'ur',
 'friends',
 'including',
 'me',
 'if',
 'u',
 'like',
 'me',
 'if',
 'u',
 'get',
 'back',
 '1',
 'u',
 'r',
 'poor',
 'in',
 'relation',
 '2',
 'u',
 'need',
 'some',
 '1',
 'to',
 'support',
 '3',
 'u',
 'r

In [20]:
# Amb el set, agafem les úniques

In [21]:
# Per cada paraula unica, inicialitzem un dict a 0
dictionary = {word: {'ham': 0, 'spam': 0} for word in list(set(X_train['sms'].sum()))}
len(dictionary)

7884

In [22]:
dictionary

{'fireplace': {'ham': 0, 'spam': 0},
 'places': {'ham': 0, 'spam': 0},
 'giving': {'ham': 0, 'spam': 0},
 'comfey': {'ham': 0, 'spam': 0},
 'copies': {'ham': 0, 'spam': 0},
 'cheap': {'ham': 0, 'spam': 0},
 '087187262701': {'ham': 0, 'spam': 0},
 '81151': {'ham': 0, 'spam': 0},
 'cn': {'ham': 0, 'spam': 0},
 'settling': {'ham': 0, 'spam': 0},
 'rows': {'ham': 0, 'spam': 0},
 'breaker': {'ham': 0, 'spam': 0},
 'gbp1': {'ham': 0, 'spam': 0},
 'flatter': {'ham': 0, 'spam': 0},
 '5p': {'ham': 0, 'spam': 0},
 'r836': {'ham': 0, 'spam': 0},
 'noice': {'ham': 0, 'spam': 0},
 '08719899229': {'ham': 0, 'spam': 0},
 'rite': {'ham': 0, 'spam': 0},
 'nudist': {'ham': 0, 'spam': 0},
 'royal': {'ham': 0, 'spam': 0},
 'til': {'ham': 0, 'spam': 0},
 '2nights': {'ham': 0, 'spam': 0},
 'suprman': {'ham': 0, 'spam': 0},
 'bx420': {'ham': 0, 'spam': 0},
 'ppm': {'ham': 0, 'spam': 0},
 'into': {'ham': 0, 'spam': 0},
 'hour': {'ham': 0, 'spam': 0},
 'missin': {'ham': 0, 'spam': 0},
 '08000407165': {'ham': 0

### get word frequencies for ham/spam

In [23]:
def count_words(message):
    for word in message.sms: dictionary[word][message.Label] +=1 

_ = X_train.apply(count_words, axis = 1)

# Ho guardem en memoria però no cal. Ho podem borrar
# del(_)

In [24]:
# check
dictionary['love']

{'ham': 153, 'spam': 9}

### prior

In [25]:
prior = X_train.Label.value_counts(normalize = True)
prior

ham     0.869195
spam    0.130805
Name: Label, dtype: float64

In [26]:
prior['ham']

0.8691945254655598

Passem el diccionaria proporcio. dividim el 153 de love de NO_SPAM, per totes les paraules que estan a NO_SPAM. Suma de totes les paraules. 

### likelihood functions
- we need to know the total number of counts for each type of message

In [27]:
# total number of words in HAM messages
n_ham = sum([counts['ham'] for word, counts in dictionary.items()])
n_ham

57342

#### Com que estem a test, si després ens ve una paraula que no tenim (nova). Multiplicarem per 1 les dues (ens quedarem igual).

In [28]:
def ham_word_likelihood(word):
    if word in dictionary:
        return dictionary[word]['ham'] /n_ham
    else:
        return 1

In [29]:
# total number of words in SPAM messages
n_spam = sum([counts['spam'] for word, counts in dictionary.items()])
n_spam

14814

In [30]:
def spam_word_likelihood(word):
    if word in dictionary:
        return dictionary[word]['spam'] /n_spam
    else:
        return 1

### Classify

In [31]:
prior

ham     0.869195
spam    0.130805
Name: Label, dtype: float64

In [32]:
def classify(message):
    post_ham, post_spam = prior # és una tupla
    for word in message:
        post_ham *= ham_word_likelihood(word)
        post_spam *= spam_word_likelihood(word)
    return 'spam' if post_spam > post_ham else ('ham' if post_ham > post_spam else '??')

In [33]:
classify(['loving']), classify(['£2000'])

('ham', 'spam')

In [34]:
classify(['secret', 'source', 'of', 'infinite', 'power'])

'ham'

### test

In [35]:
X_test['predicted'] = X_test.sms.apply(classify)

In [36]:
X_test.head()

Unnamed: 0,Label,SMS,sms,predicted
4062,ham,"Aight I've been set free, think you could text...","[aight, i, ve, been, set, free, think, you, co...",ham
494,ham,What class of &lt;#&gt; reunion?,"[what, class, of, lt, gt, reunion]",ham
1396,ham,Thats cool! I am a gentleman and will treat yo...,"[thats, cool, i, am, a, gentleman, and, will, ...",ham
2575,spam,Your next amazing xxx PICSFREE1 video will be ...,"[your, next, amazing, xxx, picsfree1, video, w...",ham
3120,ham,Stop knowing me so well!,"[stop, knowing, me, so, well]",ham


In [37]:
pd.concat((X_test.groupby('Label').predicted.value_counts().to_frame('n'), X_test.groupby('Label').predicted.value_counts(normalize = True).to_frame('%')), axis = 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%
Label,predicted,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,ham,903,0.949527
ham,??,40,0.042061
ham,spam,8,0.008412
spam,spam,117,0.713415
spam,??,41,0.25
spam,ham,6,0.036585


##### not bad but many unclassified messages ... why ???

- És perque hi han paraules que són 100% ham i llavors quan evaulem per spam és 0.

### Per evitar / suavitzar les funcions de likelyhood a partir de mostres que no són suficientment representatives.

Es pot fer laplacian smoothing &rarr; És posar un 1 on hi ha el 0 i sumar +1 a totes les altres.

És el que hi ha al següent `Notebook nb47.2`

### Ho podem fer inicialitzat el diccionari a 1.

In [38]:
def classify_3(message):
    post_ham, post_spam = prior # és una tupla
    for word in message:
        post_ham *= ham_word_likelihood(word)
        post_spam *= spam_word_likelihood(word)    
    if post_spam > post_ham:
        return 'spam'
    else:
        if post_ham > post_spam:
            'ham'
        else:
            print(message)
            return '??'

In [39]:
def classify_2(message):
    post_ham, post_spam = prior # és una tupla
    for word in message:
        post_ham *= ham_word_likelihood(word)
        post_spam *= spam_word_likelihood(word)
        print(f'paraula: {word}')
        print(f'ham: {ham_word_likelihood(word)} i total: {post_ham}')
        print(f'spam: {spam_word_likelihood(word)} i total: {post_spam}')
    
    if post_spam > post_ham:
        return 'spam'
    else:
        if post_ham > post_spam:
            'ham'
        else:
            print(message)
            return '??'

classify_2(X_test.loc[2370]['sms'])


paraula: a
ham: 0.01482334065780754 i total: 0.012884366548877363
spam: 0.01977858782233023 i total: 0.002587147565721006
paraula: boy
ham: 0.00040110215897596875 i total: 5.167947239792462e-06
spam: 0.0 i total: 0.0
paraula: loved
ham: 0.0001395137944264239 i total: 7.209999288190104e-10
spam: 0.0 i total: 0.0
paraula: a
ham: 0.01482334065780754 i total: 1.068762755913918e-11
spam: 0.01977858782233023 i total: 0.0
paraula: gal
ham: 0.00010463534581981794 i total: 1.1183036056439448e-15
spam: 0.0 i total: 0.0
paraula: he
ham: 0.00303442502877472 i total: 3.3934084507350002e-18
spam: 0.0 i total: 0.0
paraula: propsd
ham: 1.743922430330299e-05 i total: 5.917841112509156e-23
spam: 0.0 i total: 0.0
paraula: bt
ham: 0.0002790275888528478 i total: 1.6512409368376844e-26
spam: 0.0008100445524503848 i total: 0.0
paraula: she
ham: 0.0023194168323392977 i total: 3.8299160231490365e-29
spam: 0.0 i total: 0.0
paraula: didnt
ham: 0.0003662237103693628 i total: 1.4026060564007144e-32
spam: 0.0 i tot

'??'

In [40]:
dictionary = {word: {'ham': 1, 'spam': 1} for word in list(set(X_train['sms'].sum()))}
n_ham = sum([counts['ham'] for word, counts in dictionary.items()])
_ = X_train.apply(count_words, axis = 1)
X_test['predicted2'] = X_test.sms.apply(classify_3)

In [41]:
pd.concat((X_test.groupby('Label')['predicted2'].value_counts().to_frame('n'), X_test.groupby('Label')['predicted2'].value_counts(normalize = True).to_frame('%')), axis = 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%
Label,predicted2,Unnamed: 2_level_1,Unnamed: 3_level_1
spam,spam,41,1.0


### Abans:

In [42]:
pd.concat((X_test.groupby('Label').predicted.value_counts().to_frame('n'), X_test.groupby('Label').predicted.value_counts(normalize = True).to_frame('%')), axis = 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%
Label,predicted,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,ham,903,0.949527
ham,??,40,0.042061
ham,spam,8,0.008412
spam,spam,117,0.713415
spam,??,41,0.25
spam,ham,6,0.036585
