In [None]:
import math
import os

## Preparación del corpus de emails

In [None]:
!git clone https://github.com/pachocamacho1990/datasets

In [None]:
! unzip datasets/email/plaintext/corpus1.zip

In [None]:
os.listdir('corpus1/spam')

In [None]:
# Setting Environment

data = []
classes = []
# Reading Spam Data
for file in os.listdir('corpus1/spam'):
  with open('corpus1/spam/'+file, encoding='latin-1') as f:
    data.append(f.read())
    classes.append('spam')
# Reading Ham Data 
for file in os.listdir('corpus1/ham'):
  with open('corpus1/ham/'+file, encoding='latin-1') as f:
    data.append(f.read())
    classes.append('ham')
len(data)

5172

## Building Naive Bayes Model

---

## Spacy Tokenizer

* Documentation: https://spacy.io/api/tokenizer
* ¿How The Tokenizer Works? https://spacy.io/usage/linguistic-features#how-tokenizer-works

In [None]:
#@title Spacy Tokenizer Libraries
from spacy.tokenizer import Tokenizer
# English Vocabulary
from spacy.lang.en import English

# English - Natural Language Processing 
nlp = English()
# Tokenizer Instance Set-Up
tokenizer = Tokenizer(nlp.vocab)

In [None]:
print([t.text for t in tokenizer(data[0])])

['Subject:', 'confidence', 'is', 'back', '\n', 'hello', ',', '\n', 'my', 'boyfriend', 'began', 'having', 'problems', 'with', 'erections', '(', 'he', "'", 's', 'older', ')', '\n', 'and', 'i', 'suggested', 'he', 'look', 'into', 'vlagrra', 'softtabs', '.', '\n', 'boy', ',', 'am', 'i', 'glad', 'he', 'did', '!', '\n', 'the', 'first', 'time', 'he', 'tried', 'it', ',', 'one', '50', 'mg', 'piil', 'did', 'nothing', 'so', 'he', 'took', 'another', 'and', 'that', 'was', 'a', 'mistake', '.', 'three', 'hours', 'later', 'he', 'was', 'still', 'rock', 'hard', 'and', 'had', 'come', 'multiple', 'times', '(', 'so', 'had', 'i', ')', '!', '!', 'since', 'then', 'a', 'single', '50', 'mg', 'dose', 'does', 'ithe', 'first', 'of', 'these', 'was', 'that', 'he', 'should', 'have', 'been', 'brought', 'to', 'trial', 'att', 'very', 'well', '-', '-', 'he', "'", 's', 'now', 'good', 'for', 'almost', '2', 'hours', 'of', 'good', 'hard', 'sex', 'that', 'leaves', 'both', 'of', 'us', 'worn', 'out', '.', '\n', '-', 'bobbie', ',

### Algorithm Main class

Remember the most probable class is given by (in logarithmic computation space):


$$\hat{c} = {\arg \max}_{(c)}\log{P(c)}
 +\sum_{i=1}^n
\log{ P(f_i \vert c)}
$$

To avoid outliers, we will use Laplace smoothing like so:

$$
P(f_i \vert c) = \frac{C(f_i, c)+1}{C(c) + \vert V \vert}
$$

being $\vert V \vert$ the vocabulary length of our training set.

In [None]:
# Numerical Manipulation
import numpy as np

# Naive Bayes Classifier
class NaiveBayesClassifier():
  nlp = English()
  tokenizer = Tokenizer(nlp.vocab)
  
  def tokenize(self, doc):
    return  [t.text.lower() for t in tokenizer(doc)]

  def word_counts(self, words):
    wordCount = {}
    for w in words: 
      if w in wordCount.keys():
        wordCount[w] += 1
      else:
        wordCount[w] = 1
    return wordCount

  def fit(self, data, classes):
    n = len(data)
    self.unique_classes = set(classes)
    self.vocab = set()
    self.classCount = {} #C(c)
    self.log_classPriorProb = {} #P(c)
    self.wordConditionalCounts = {} #C(w|c)
    # Class Counter 
    for c in classes:
      if c in self.classCount.keys():
        self.classCount[c] += 1
      else:
        self.classCount[c] = 1
    # P(c) Calculation
    for c in self.classCount.keys():
      self.log_classPriorProb[c] = math.log(self.classCount[c]/n)
      self.wordConditionalCounts[c] = {}
    # C(w|c) Calculation
    for text, c in zip(data, classes):
      counts = self.word_counts(self.tokenize(text))
      for word, count in counts.items():
        if word not in self.vocab:
          self.vocab.add(word)
        if word not in self.wordConditionalCounts[c]:
          self.wordConditionalCounts[c][word] = 0.0
        self.wordConditionalCounts[c][word] += count

  def predict(self, data):
    results = []
    for text in data:
      words = set(self.tokenize(text))
      scoreProb = {}
      for word in words: 
        if word not in self.vocab: continue # Ignoring New Words 
        # P(w|c) Laplacian Smoother
        for c in self.unique_classes:
          log_wordClassProb = math.log(
              (self.wordConditionalCounts[c].get(word, 0.0)+1)/(self.classCount[c]+len(self.vocab)))
          scoreProb[c] = scoreProb.get(c, self.log_classPriorProb[c]) + log_wordClassProb
      arg_maxprob = np.argmax(np.array(list(scoreProb.values())))
      results.append(list(scoreProb.keys())[arg_maxprob])
    return results

### Scikit Learn Utilities
* `train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

* `accuracy_score`: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

* `precision_score`: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html

* `recall_score`: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

In [None]:
# Model Selection - Train & Test Split
from sklearn.model_selection import train_test_split
# Getting Metrics: Accuracy - Precision & Recall Score
from sklearn.metrics import accuracy_score, precision_score, recall_score
data_train, data_test, classes_train, classes_test = train_test_split(data, classes, test_size=0.10, random_state=42)

In [None]:
# Naive Bayes Classifier Invokation
classifier = NaiveBayesClassifier()
# Applying NB on Models
classifier.fit(data_train, classes_train)

In [None]:
# Checking In Predictions
clases_predict = classifier.predict(data_test)

In [None]:
# Gettign Accuracy Score
accuracy_score(classes_test, clases_predict)

0.8397683397683398

In [None]:
# Precision Score
precision_score(classes_test, clases_predict, average=None, zero_division=1)

array([0.81390135, 1.        ])

In [None]:
# Recall Score
recall_score(classes_test, clases_predict, average=None, zero_division=1)

array([1.        , 0.46451613])