<a href="https://colab.research.google.com/github/jeraldflowers/Classifier-NLTK/blob/main/Accuracy_Classifier_Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Improving the Accuracy of the email Classifier

In [None]:
import os
import nltk
import random

from nltk import word_tokenize
from nltk.collocations import *
import pandas as pd

nltk.download('punkt')

!git clone https://github.com/pachocamacho1990/datasets
!unzip datasets/email/plaintext/corpus1.zip
!unzip datasets/email/plaintext/corpus2.zip
!unzip datasets/email/plaintext/corpus3.zip

## Functions for Load the Datasets

In [5]:
import pandas as pd

# GET TEXT AND LABELS

def get_text_labels_from_folders(folderBase, folderLabels):
  data = []
  labels = []
  for folderLabel in folderLabels:
    for file in os.listdir('{}/{}'. format(folderBase, folderLabel)):
      with open('{}/{}/{}'.format(folderBase, folderLabel, file), encoding='latin-1') as f:
        data.append(f.read())
        labels.append(folderLabel)
  return data, labels

def set_label_num(label_str):
  if label_str == 'spam':
    return 1
  else:
    return 0

dataCorpus1, labelsCorpus1 = get_text_labels_from_folders('corpus1', ['spam', 'ham'])
dataCorpus2, labelsCorpus2 = get_text_labels_from_folders('corpus2', ['spam', 'ham'])
dataCorpus3, labelsCorpus3 = get_text_labels_from_folders('corpus3', ['spam', 'ham'])

data = dataCorpus1 + labelsCorpus2 + labelsCorpus3
labels = labelsCorpus1 +  labelsCorpus2 + labelsCorpus3

dataframe = pd.DataFrame({'text': data,
                          'labels': labels})

dataframe = dataframe.sample(frac = 1)
dataframe['tokens'] = dataframe['text'].apply(lambda x: word_tokenize(x))
dataframe['labels_num'] = dataframe['labels'].apply(lambda x: set_label_num(x))

## Functions to Filter Words

In [6]:
def filter_words_by_threshold(text_tokenized, threshold=3):
  words = []
  words = [word for word in text_tokenized if len(word) > threshold]
  return words

def get_n_grams_collocations_from_words(words, freq_filter=10, n_best=10, n_gram_measure=nltk.collocations.BigramAssocMeasures()):
  finder = BigramCollocationFinder.from_words(words)
  finder.apply_freq_filter(freq_filter)
  email_spam_collocations = finder.nbest(n_gram_measure.pmi, n_best)
  return email_spam_collocations



## Get the Most Common Collocations and Words in the Spam Datasets

In [17]:
_, spamCorpus1 = get_text_labels_from_folders('corpus1', ['spam'])
_, spamCorpus2 = get_text_labels_from_folders('corpus2', ['spam'])
_, spamCorpus3 = get_text_labels_from_folders('corpus3', ['spam'])

spamCorpuses = spamCorpus1 + spamCorpus2 + spamCorpus3

filtered_words = []

for text in spamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words  

email_spam_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
all_spam_words = nltk.FreqDist([w for w in filtered_words])
top_spam_words = all_spam_words.most_common(200)

## Get the Most Common Collocations and Words in the Ham Datasets

In [16]:
_, hamCorpus1 = get_text_labels_from_folders('corpus1', ['ham'])
_, hamCorpus2 = get_text_labels_from_folders('corpus2', ['ham'])
_, hamCorpus3 = get_text_labels_from_folders('corpus3', ['ham'])

hamCorpuses = hamCorpus1 + hamCorpus2 + hamCorpus3

filtered_words = []

for text in hamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

email_ham_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
all_ham_words = nltk.FreqDist([w for w in filtered_words])
top_ham_words = all_ham_words.most_common(200)

## Filter Repeated Words of Most Common Words in Spam and Ham

In [12]:
top_ham_words_iterator = top_ham_words

for word in top_ham_words_iterator:
  if word in top_ham_words and word in top_spam_words:
    top_ham_words.remove(word)
    top_spam_words.remove(word)

## Get Most Commons Words of all Dataset

In [None]:
filtered_words = []

for text in data:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

all_words = nltk.FreqDist([w for w in filtered_words])
top_words = all_words.most_common(200)
top_words

## Get the Attributes

In [28]:
def document_attributes(document):
  document_words = set(document)
  atrib = {}
  for word in top_words:
    atrib['contains({})'.format(word)] = (word in document_words)

  for word in top_spam_words:
    atrib['contains_spam_words({})'.format(word)] = (word in document_words)

  for word in top_ham_words:
    atrib['contains_ham_words({})'.format(word)] = (word in document_words)

  for word in document_words:
    has_spam_word = False
    has_ham_word = False
    for bigram_position_0, bigram_position_1 in email_spam_collocations:
      if word == bigram_position_0 or word == bigram_position_1:
        has_spam_word = True
        break
    for bigram_position_0, bigram_position_1 in email_ham_collocations:
      if word == bigram_position_0 or word == bigram_position_1:
        has_ham_word = True
        break

  atrib['spam_word({})'.format(word)] = has_spam_word
  atrib['ham_word({})'.format(word)] = has_ham_word

  filtered_words = filter_words_by_threshold(document)
  bigrams = get_n_grams_collocations_from_words(filtered_words, n_best=10, freq_filter=5)

  for i in range(len(bigrams)):
    atrib['bigram_collocation({})'.format(i)] = bigrams[i]

  return atrib

## Separate Training and Test Dataset

In [29]:
fset = [(document_attributes(text), labels) for text, labels in zip(dataframe['tokens'], dataframe['labels_num'].values)]

random.shuffle(fset)
print(len(fset))

train, test = fset[:13078], fset[13078:]

16347


## Train and Calculate Accuracy

In [30]:
classifier = nltk.NaiveBayesClassifier.train(train)

print(nltk.classify.accuracy(classifier, test))

0.9400428265524625
