In [1]:
import pandas as pd

sms_spam = pd.read_csv('aima-data/SMSSpamCollection.csv', sep='\t',
header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Randomize the dataset
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [3]:
# After cleaning
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
training_set['SMS'] = training_set['SMS'].str.lower()

In [4]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
   for word in sms:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [5]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
   for word in sms:
      word_counts_per_sms[word][index] += 1

In [6]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,bong,83332,woould,txting,again,someone,mandan,forgt,meetins,muah,...,cinema,rhythm,heaven,select,singles,innu,thought,abdomen,trips,la1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
training_set_clean = pd.concat([word_counts, training_set["Label"]], axis=1)
training_set_clean.head()

Unnamed: 0,bong,83332,woould,txting,again,someone,mandan,forgt,meetins,muah,...,rhythm,heaven,select,singles,innu,thought,abdomen,trips,la1,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [8]:
training_set_clean.to_csv("aima-data/CleanSMSSpamCollection.csv", header=False, index=False)

In [9]:
from dataset import *

dataset = DataSet(name="CleanSMSSpamCollection")

In [10]:
from learningModels import *

nBD = NaiveBayesLearner(dataset, continuous=False)

In [229]:
test_set['SMS'] = test_set['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
test_set['SMS'] = test_set['SMS'].str.lower()

In [230]:
test_set['SMS'] = test_set['SMS'].str.split()

In [231]:
word_counts_per_sms_test = {unique_word: [0] * len(test_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(test_set['SMS']):
   for word in sms:
      if word in vocabulary:
         word_counts_per_sms_test[word][index] += 1

In [232]:
word_counts_test = pd.DataFrame(word_counts_per_sms_test)
word_counts_test.head()

Unnamed: 0,neekunna,cheap,brilliantly,1120,proof,ntt,prsn,leh,choosing,prevent,...,subtoitles,showing,tell,regard,informed,forwarded,elaborating,types,innu,listen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [233]:
test_set_clean = pd.concat([word_counts_test, test_set["Label"]], axis=1)
test_set_clean.head()

Unnamed: 0,neekunna,cheap,brilliantly,1120,proof,ntt,prsn,leh,choosing,prevent,...,showing,tell,regard,informed,forwarded,elaborating,types,innu,listen,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [235]:
tp = 0
tn = 0
fp = 0
fn = 0

for i in test_set_clean.iterrows():
    row = list(i[1])
    actual = row[-1]
    predicted = nBD(row[:-1])

    if (actual == "spam" and predicted == "ham"):
        fp += 1
    if (actual == "ham" and predicted == "ham"):
        tp += 1
    if (actual == "spam" and predicted == "spam"):
        tn += 1
    if (actual == "ham" and predicted == "spam"):
        fn += 1

In [27]:
from tabulate import tabulate

def list_cm(cm,classes):     #función para generar de una forma más visual la matriz de confusión
      row_0 =['','Valor','Verdadero','', '']
      row_1 =['-',classes[0],classes[1]]
      row_2 =[classes[0],cm[0],cm[2]]
      row_3 =[classes[1],cm[1],cm[3]]
      table = zip(row_0,row_1, row_2, row_3)
      headers = ['', '', 'Valor', 'Predicho', '']  
      return print(tabulate(table, headers=headers, floatfmt=".4f")) 

In [40]:
list_cm([tp,fn,fp,tn],['ham','spam'])

                 Valor    Predicho
---------  ----  -------  ----------
           -     ham      spam
Valor      ham   896      71
Verdadero  spam  16       131


In [241]:
acc = (tp+tn)/(tp+tn+fp+fn)
pre = (tp)/(tp+fp)
rec = (tp)/(tp+fn)
f1 = 2 * (pre*rec)/(pre+rec)
spe = (tn)/(tn+fp)

print('Accuracy: {}'.format(acc))
print('Precision: {}'.format(pre))
print('Recall: {}'.format(rec))
print('F1: {}'.format(f1))
print('Specificity: {}'.format(f1))

Accuracy: 0.9676840215439856
Precision: 0.965034965034965
Recall: 0.9989658738366081
F1: 0.9817073170731707
Specificity: 0.9817073170731707


In [35]:
'''
    Leer los mensajes contenidos en archivos .sms en la carpeta tests/unsorted
'''
import os # Necesario para interactuar con el sistema de archivos

files_df = pd.DataFrame() # Dataframe para guardar los datos de los SMS
filenames = pd.DataFrame() # Dataframe para guardar los nombres de archivos asociados a cada SMS

directory = "tests/unsorted" # Directorio base de SMS no clasificados

idx = 0 # Índice de lectura de SMS

for filename in os.listdir(directory): # Iterar a través de los archivos del directorio
    f = os.path.join(directory, filename) # Determinar la ruta completa del archivo
    if os.path.isfile(f): # Verfificar si la ruta es un archivo
        file = open(f, "r") # Abrir el archivo
        
        messagefile = pd.DataFrame({"Label": filename.split("_")[1].split(".")[0], "SMS": file.read()}, index=[idx]) # Crear la entrada del archhivo leído
        files_df = pd.concat([files_df, messagefile]) # Agregar la entrada al dataframe
        
        filenames = pd.concat([filenames, pd.DataFrame({"filename": f}, index=[idx])]) # Agregar la entrada con la ruta del archivo
        idx += 1 # Aumentar el índice
        file.close() # Cerrar el archivo
                

In [22]:
files_df['SMS'] = files_df['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
files_df['SMS'] = files_df['SMS'].str.lower()


In [23]:
files_df['SMS'] = files_df['SMS'].str.split()

In [24]:
files_word_counts_per_sms_test = {unique_word: [0] * len(files_df['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(files_df['SMS']):
   for word in sms:
      if word in vocabulary:
         files_word_counts_per_sms_test[word][index] += 1

In [25]:
files_word_counts_test = pd.DataFrame(files_word_counts_per_sms_test)
files_word_counts_test.head()

Unnamed: 0,bong,83332,woould,txting,again,someone,mandan,forgt,meetins,muah,...,cinema,rhythm,heaven,select,singles,innu,thought,abdomen,trips,la1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
files_test_set_clean = pd.concat([files_word_counts_test, files_df["Label"]], axis=1)
files_test_set_clean.head()

Unnamed: 0,bong,83332,woould,txting,again,someone,mandan,forgt,meetins,muah,...,rhythm,heaven,select,singles,innu,thought,abdomen,trips,la1,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [34]:
import shutil # Manipulación de archivos

# Métricas
tp = 0
tn = 0
fp = 0
fn = 0


for i in files_test_set_clean.iterrows():
    row = list(i[1])
    actual = row[-1]
    predicted = nBD(row[:-1])
    
    filename_series = filenames.iloc[i[0]] # Recuperar entrada del dataframe de rutas de archivo 
    fullpath = filename_series['filename'] # Recuperar la ruta del archivo
    raw_filename = fullpath.split("/")[2]  # Recuperar el nombre del archivo
    
    '''
        En este condicional se verifica el tipo de clasificación determinada
        y se realizan los siguientes procesos.
            1. Determinar la ruta de destino
            2. Crear el directorio de destino si no existe
            3. Copiar el archivo original a la ruta de destino.
    '''
    if predicted == "spam":
        destination_path = f"tests/sorted/spam/{raw_filename}"
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        shutil.copy(fullpath, destination_path)
    else:
        destination_path = f"tests/sorted/ham/{raw_filename}"
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        shutil.copy(fullpath, destination_path)
        

    if (actual == "spam" and predicted == "ham"):
        fp += 1
    if (actual == "ham" and predicted == "ham"):
        tp += 1
    if (actual == "spam" and predicted == "spam"):
        tn += 1
    if (actual == "ham" and predicted == "spam"):
        fn += 1

In [37]:
list_cm([tp,fn,fp,tn],['ham','spam'])

                 Valor    Predicho
---------  ----  -------  ----------
           -     ham      spam
Valor      ham   896      71
Verdadero  spam  16       131


In [39]:
acc = (tp+tn)/(tp+tn+fp+fn)
pre = (tp)/(tp+fp)
rec = (tp)/(tp+fn)
f1 = 2 * (pre*rec)/(pre+rec)
spe = (tn)/(tn+fp)

print('Accuracy: {}'.format(acc))
print('Precision: {}'.format(pre))
print('Recall: {}'.format(rec))
print('F1: {}'.format(f1))
print('Specificity: {}'.format(f1))

Accuracy: 0.921903052064632
Precision: 0.9824561403508771
Recall: 0.9265770423991727
F1: 0.9536987759446515
Specificity: 0.9536987759446515


In [None]:
raw_message = "Show ur colours! Euro 2004 2-4-1 Offer! Get an England Flag & 3Lions tone on ur phone! Click on the following service message for info!"
message_df = pd.DataFrame({"Label": "spam", "SMS": raw_message}, index=[0]) # Crear el dataframe
message_df['SMS'] = message_df['SMS'].str.replace('\W', ' ', regex=True) # Removes punctuation
message_df['SMS'] = message_df['SMS'].str.lower()
message_df['SMS'] = message_df['SMS'].str.split()

In [None]:
word_counts_per_sms_test = {unique_word: [0] * len(message_df['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(message_df['SMS']):
   for word in sms:
      if word in vocabulary:
         word_counts_per_sms_test[word][index] += 1