### IMPORTS

In [None]:
import glob
import math
import operator
import os
import time
from collections import defaultdict, Counter

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.naive_bayes import MultinomialNB

### DATA IMPORT
The following cells load and unzip the training data directly from source.

In [None]:
# get data from source
!wget -nv "http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz"

2020-09-22 00:36:44 URL:http://www2.aueb.gr/users/ion/data/lingspam_public.tar.gz [11564714/11564714] -> "lingspam_public.tar.gz" [1]


In [None]:
# extract the archive into the current working directory
!tar -xf lingspam_public.tar.gz

### FUNCTIONS
This is where the functions are constructed.

In [None]:
def increment_nb_model(fname, model, vocab):
  """Add the token counts of one email to a class model and to the vocabulary.

  Only the third line of the file (index 2) is used as the message text. It is
  split on single spaces and every resulting token is counted, once per
  occurrence, in both ``model`` and ``vocab``. Both mappings are updated in
  place; nothing is returned.
  """
  with open(fname) as handle:
    body = handle.readlines()[2]  # third line holds the text to tokenize
  for token in body.split(" "):
    for counts in (model, vocab):  # class-specific counts first, then overall counts
      counts[token] += 1

In [None]:
def train_nb_model(alpha=1, train_glob="/content/lingspam_public/lemm_stop/part1/*"):
  """Train a multinomial Naive Bayes spam/ham model with additive smoothing.

  Args:
    alpha: smoothing constant added to every word count.
    train_glob: glob pattern selecting the training emails. A file whose base
      name starts with "spms" is counted as spam, every other file as ham.

  Returns: logpriors(dict): log prior probability per class, keyed "ham" / "spam"
          loglikelihood(dict): dict (class-level) of dict of word log likelihoods
          vocab(set): every token seen in the training emails

  Raises:
    ValueError: if the pattern does not yield at least one spam and one ham
      email, because the class priors would be undefined.
  """
  vocab_dict = defaultdict(int) # overall token counts
  spam_dict = defaultdict(int) # spam token counts
  ham_dict = defaultdict(int) # ham token counts
  loglikelihood = defaultdict(lambda: defaultdict(float))  # class -> word -> log likelihood
  spamct, hamct = 0, 0  # document counters for the priors

  # Count all pertinent emails and features
  for path in glob.glob(train_glob):
    # Decide the class from the base name, not from a fixed character offset
    # that is only valid for one particular directory layout.
    if os.path.basename(path).startswith("spms"):
      increment_nb_model(path, spam_dict, vocab_dict)
      spamct += 1
    else:
      increment_nb_model(path, ham_dict, vocab_dict)
      hamct += 1

  # Without both classes the priors would hit a division by zero or log(0)
  if spamct == 0 or hamct == 0:
    raise ValueError(
        "need at least one spam and one ham training email, found {} spam and {} ham "
        "for pattern {!r}".format(spamct, hamct, train_glob))

  # Calculate Priors and Stats
  total_docs = spamct + hamct
  spam_prior = math.log(spamct / total_docs) # spam prior in log form
  ham_prior = math.log(hamct / total_docs) # ham prior in log form
  logpriors = {"ham":ham_prior, "spam":spam_prior}
  count_dict = {'spam':spam_dict, 'ham':ham_dict}
  token_ct_dict = {'spam':sum(spam_dict.values()), 'ham':sum(ham_dict.values())}
  vocab = list(vocab_dict.keys()) # entire vocabulary
  vocab_len = len(vocab)

  # Calculate individual log probs
  for x in ['spam', 'ham']:
    den = token_ct_dict[x] + (vocab_len * alpha)  # same for every word of a class
    for wordtype in vocab:
      num = count_dict[x][wordtype] + alpha
      loglikelihood[x][wordtype] = math.log(num / den)

  return logpriors, loglikelihood, set(vocab)


In [None]:
def nb_model_predict(fname, logpriors, loglikelihood, vocab, tags=('spam','ham')):
  """Predict the class label of one email with a trained Naive Bayes model.

  Args:
    fname: path of the email; only its third line (index 2) is scored.
    logpriors: mapping class -> log prior.
    loglikelihood: mapping class -> {word -> log likelihood}.
    vocab: container of known words; tokens not in it are ignored.
    tags: classes to score, in order. On an exact tie the earlier one wins.

  Returns:
    The tag with the highest total log probability.
  """
  # Read and tokenize once; the text does not depend on the class being scored.
  with open(fname) as email:
    body = email.readlines()[2]
  known_words = [word for word in body.split(" ") if word in vocab]

  pred = {}
  for group in tags:
    # Start from the prior and add likelihoods in token order.
    pred[group] = sum((loglikelihood[group][word] for word in known_words),
                      logpriors[group])
  return max(pred, key=pred.get)

    

### WORKFLOW
Execution takes place here.

In [None]:
# Train Model
# Uses train_nb_model's default smoothing (alpha=1) and reports the wall-clock time of the call.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model() # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))


Time to Train: 0.074 seconds


In [None]:
# First Eval module: not through sklearn
# Spam is the positive class; the gold label comes from the file's base name.
TP, FP, TN, FN = 0,0,0,0
for fname in glob.glob("/content/lingspam_public/lemm_stop/part10/*"):
  pred = nb_model_predict(fname, logpriors, loglikelihood, vocab)
  if os.path.basename(fname).startswith("spm"): # SPAM
    if pred == "spam": # True Positives
      TP += 1
    elif pred == "ham": # False Negatives
      FN += 1
    else: 
      print("ERROR: Spam predicted as something other than Spam or Ham")
  else:  # HAM
    if pred == "ham": # True Negatives
      TN += 1
    elif pred == "spam": # False Positives
      FP += 1 
    else:
      print("ERROR: Ham predicted as something other than Spam or Ham")

# Guard every ratio: with no spam predictions or no spam emails the denominator
# is zero, and the metric is reported as 0.0 instead of crashing the cell.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
fmeasure = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
print("Precision: {}\tRecall: {}\t\tF1:{}\n".format(round(precision,3),round(recall,3),round(fmeasure,4)))
print("Confusion Mat. | Positive Prediction \t| Negative Prediction")
print("Positive Class | True Positives {}\t| False Negatives {}".format(TP, FN))
print("Negative Class | False Positives {}\t| True Negatives {}".format(FP, TN))

Precision: 0.754	Recall: 0.878		F1:0.8113

Confusion Mat. | Positive Prediction 	| Negative Prediction
Positive Class | True Positives 43	| False Negatives 6
Negative Class | False Positives 14	| True Negatives 228


Evaluation: What are the Precision, Recall, and F-score of the classifier?

In [None]:
def run_predictions(test_glob="/content/lingspam_public/lemm_stop/part10/*"):
  """Evaluate the current Naive Bayes model on held-out emails and print the scores.

  Reads the module-level ``logpriors``, ``loglikelihood`` and ``vocab`` (whatever
  the most recent training cell assigned), so retrain before calling.

  Args:
    test_glob: glob pattern selecting the emails to score. A file whose base
      name starts with "spm" is treated as gold spam, every other file as ham.

  Prints per-class precision, recall, F-score and support (ham first, then
  spam), followed by the elapsed time. Returns None.
  """
  t2 = time.time()
  preds = []
  gold = []
  for fname in glob.glob(test_glob):
    preds.append(nb_model_predict(fname, logpriors, loglikelihood, vocab))
    # Label from the base name so directory names can never affect the gold label
    gold.append("spam" if os.path.basename(fname).startswith("spm") else "ham")

  y_true = np.array(gold)
  y_pred = np.array(preds)
  stats = precision_recall_fscore_support(y_true, y_pred, average=None, labels=['ham', 'spam'])
  names = ["Precision", "Recall\t", "Fscore\t", "Support\t"]
  print(" \t\tHam\t\tSpam")
  for name, stat in zip(names, stats):
    print("{}\t{}".format(name, stat))
  print("Time to Test: {} seconds".format(round(time.time()-t2, 3)))



### Experimentation with Smoothing


In [None]:
# Train Model with alpha=1 smoothing
# Reassigns the module-level logpriors/loglikelihood/vocab, which run_predictions() reads when scoring.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model(alpha=1) # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))
run_predictions()


Time to Train: 0.091 seconds
 		Ham		Spam
Precision	[0.97435897 0.75438596]
Recall		[0.94214876 0.87755102]
Fscore		[0.95798319 0.81132075]
Support		[242  49]
Time to Test: 0.121 seconds


In [None]:
# Train Model with alpha=0.5 smoothing
# Reassigns the module-level logpriors/loglikelihood/vocab, which run_predictions() reads when scoring.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model(alpha=0.5) # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))
run_predictions() 

Time to Train: 0.076 seconds
 		Ham		Spam
Precision	[0.97435897 0.75438596]
Recall		[0.94214876 0.87755102]
Fscore		[0.95798319 0.81132075]
Support		[242  49]
Time to Test: 0.105 seconds


In [None]:
# Train Model with alpha=0.1 smoothing
# Reassigns the module-level logpriors/loglikelihood/vocab, which run_predictions() reads when scoring.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model(alpha=0.1) # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))
run_predictions() 

Time to Train: 0.071 seconds
 		Ham		Spam
Precision	[0.97435897 0.75438596]
Recall		[0.94214876 0.87755102]
Fscore		[0.95798319 0.81132075]
Support		[242  49]
Time to Test: 0.114 seconds


Changing the alpha from 1 to .9 or .8, or anything down to .1, had no effect on system performance. However, taking it below .1 began to improve performance on both categories.

In [None]:
# Train Model with alpha=0.01 smoothing
# Reassigns the module-level logpriors/loglikelihood/vocab, which run_predictions() reads when scoring.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model(alpha=0.01) # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))
run_predictions() 

Time to Train: 0.076 seconds
 		Ham		Spam
Precision	[0.97468354 0.7962963 ]
Recall		[0.95454545 0.87755102]
Fscore		[0.96450939 0.83495146]
Support		[242  49]
Time to Test: 0.119 seconds


At .001, Recall on Spam and Precision on Ham get worse, while everything else (including F1) improves slightly.

In [None]:
# Train Model with alpha=0.001 smoothing
# Reassigns the module-level logpriors/loglikelihood/vocab, which run_predictions() reads when scoring.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model(alpha=0.001) # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))
run_predictions() 

Time to Train: 0.081 seconds
 		Ham		Spam
Precision	[0.96326531 0.86956522]
Recall		[0.97520661 0.81632653]
Fscore		[0.96919918 0.84210526]
Support		[242  49]
Time to Test: 0.119 seconds


Interestingly, I was expecting the performance of alpha=0.0055 to fall between that of alpha=0.01 and alpha=0.001, but its F1 is lower than both. So, I think I would stick with .001 in practice. It's the only parameter value that brings all recall and precision values above 80%.

In [None]:
# Train Model with alpha=0.0055 smoothing
# Reassigns the module-level logpriors/loglikelihood/vocab, which run_predictions() reads when scoring.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model(alpha=0.0055) # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))
run_predictions() 

Time to Train: 0.074 seconds
 		Ham		Spam
Precision	[0.97058824 0.79245283]
Recall		[0.95454545 0.85714286]
Fscore		[0.9625     0.82352941]
Support		[242  49]
Time to Test: 0.11 seconds


In [None]:
# Train Model with alpha=0.0001 smoothing
# Reassigns the module-level logpriors/loglikelihood/vocab, which run_predictions() reads when scoring.
t1 = time.time()
logpriors, loglikelihood, vocab = train_nb_model(alpha=0.0001) # train model
print("Time to Train: {} seconds".format(round(time.time()-t1, 3)))
run_predictions() 

Time to Train: 0.085 seconds
 		Ham		Spam
Precision	[0.95951417 0.88636364]
Recall		[0.97933884 0.79591837]
Fscore		[0.96932515 0.83870968]
Support		[242  49]
Time to Test: 0.103 seconds


Classifier comparison with sklearn's implementation. 

In [None]:
# Process emails
def process_email(fname):
  """Return the third line of an email file with surrounding whitespace removed."""
  with open(fname) as handle:
    third_line = handle.readlines()[2]  # index 2: the line used as the message text
  return third_line.strip()

# Collect the text and label of every training email in the part1 folder
train_texts = []
train_labels = []
for fname in glob.glob("/content/lingspam_public/lemm_stop/part1/*"):    # iterate through the pertinent folder
  # Label from the base name instead of a character offset tied to this exact directory
  is_spam = os.path.basename(fname).startswith("spms")
  train_labels.append("spam" if is_spam else "ham")
  train_texts.append(process_email(fname))

# Count Vectorizer Step (CountVectorizer is imported in the imports cell)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_texts)

# Train MnNB Classifier (MultinomialNB is imported in the imports cell)
clf = MultinomialNB().fit(X_train_counts, train_labels)

In [None]:
# Class distribution of the training labels (number of "ham" vs "spam" emails)
Counter(train_labels)

Counter({'ham': 241, 'spam': 48})

In [None]:
# Process Test Set
test_texts = []
test_labels = []
for fname in glob.glob("/content/lingspam_public/lemm_stop/part10/*"):
  # Label from the base name so directory names can never affect the gold label
  is_spam = os.path.basename(fname).startswith("spms")
  test_labels.append("spam" if is_spam else "ham")
  test_texts.append(process_email(fname))

# Transform Features with the vocabulary fitted on the training set
X_new_counts = count_vect.transform(test_texts)

# Make predictions with Trained Multinomial NB from SKLearn
predicted = clf.predict(X_new_counts)

y_true = np.array(test_labels)
y_pred = np.array(predicted)


In [None]:
# Pass the label order explicitly. classification_report matches display names
# to labels by position, and list(set(...)) has no guaranteed order, so the
# ham and spam rows could be mislabelled from one run to the next.
print(classification_report(y_true, y_pred, labels=["ham", "spam"]))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       242
        spam       1.00      0.94      0.97        49

    accuracy                           0.99       291
   macro avg       0.99      0.97      0.98       291
weighted avg       0.99      0.99      0.99       291

