In [219]:
import pandas as pd

sms_spam = pd.read_csv('aima-data/SMSSpamCollection.csv', sep='\t',
header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [220]:
# Randomize the dataset
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [221]:
# After cleaning
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
training_set['SMS'] = training_set['SMS'].str.lower()

In [222]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
   for word in sms:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [223]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
   for word in sms:
      word_counts_per_sms[word][index] += 1

In [224]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,neekunna,cheap,brilliantly,1120,proof,ntt,prsn,leh,choosing,prevent,...,subtoitles,showing,tell,regard,informed,forwarded,elaborating,types,innu,listen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [225]:
training_set_clean = pd.concat([word_counts, training_set["Label"]], axis=1)
training_set_clean.head()

Unnamed: 0,neekunna,cheap,brilliantly,1120,proof,ntt,prsn,leh,choosing,prevent,...,showing,tell,regard,informed,forwarded,elaborating,types,innu,listen,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [226]:
training_set_clean.to_csv("aima-data/CleanSMSSpamCollection.csv", header=False, index=False)

In [227]:
from dataset import *

dataset = DataSet(name="CleanSMSSpamCollection")

In [228]:
from learningModels import *

nBD = NaiveBayesLearner(dataset, continuous=False)

In [229]:
test_set['SMS'] = test_set['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
test_set['SMS'] = test_set['SMS'].str.lower()

In [230]:
test_set['SMS'] = test_set['SMS'].str.split()

In [231]:
word_counts_per_sms_test = {unique_word: [0] * len(test_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(test_set['SMS']):
   for word in sms:
      if word in vocabulary:
         word_counts_per_sms_test[word][index] += 1

In [232]:
word_counts_test = pd.DataFrame(word_counts_per_sms_test)
word_counts_test.head()

Unnamed: 0,neekunna,cheap,brilliantly,1120,proof,ntt,prsn,leh,choosing,prevent,...,subtoitles,showing,tell,regard,informed,forwarded,elaborating,types,innu,listen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [233]:
test_set_clean = pd.concat([word_counts_test, test_set["Label"]], axis=1)
test_set_clean.head()

Unnamed: 0,neekunna,cheap,brilliantly,1120,proof,ntt,prsn,leh,choosing,prevent,...,showing,tell,regard,informed,forwarded,elaborating,types,innu,listen,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [235]:
tp = 0
tn = 0
fp = 0
fn = 0

for i in test_set_clean.iterrows():
    row = list(i[1])
    actual = row[-1]
    predicted = nBD(row[:-1])

    if (actual == "spam" and predicted == "ham"):
        fp += 1
    if (actual == "ham" and predicted == "ham"):
        tp += 1
    if (actual == "spam" and predicted == "spam"):
        tn += 1
    if (actual == "ham" and predicted == "spam"):
        fn += 1

In [239]:
from tabulate import tabulate

def list_cm(cm,classes):     #función para generar de una forma más visual la matriz de confusión
      row_0 =['','Valor','Verdadero','', '']
      row_1 =['-',classes[0],classes[1]]
      row_2 =[classes[0],cm[0],cm[2]]
      row_3 =[classes[1],cm[1],cm[3]]
      table = zip(row_0,row_1, row_2, row_3)
      headers = ['', '', 'Valor', 'Predicho', '']  
      return print(tabulate(table, headers=headers, floatfmt=".4f")) 

In [240]:
list_cm([tp,fn,fp,tn],['ham','spam'])

                 Valor    Predicho
---------  ----  -------  ----------
           -     ham      spam
Valor      ham   966      1
Verdadero  spam  35       112


In [241]:
acc = (tp+tn)/(tp+tn+fp+fn)
pre = (tp)/(tp+fp)
rec = (tp)/(tp+fn)
f1 = 2 * (pre*rec)/(pre+rec)
spe = (tn)/(tn+fp)

print('Accuracy: {}'.format(acc))
print('Precision: {}'.format(pre))
print('Recall: {}'.format(rec))
print('F1: {}'.format(f1))
print('Specificity: {}'.format(f1))

Accuracy: 0.9676840215439856
Precision: 0.965034965034965
Recall: 0.9989658738366081
F1: 0.9817073170731707
Specificity: 0.9817073170731707
