<a href="https://colab.research.google.com/github/leonistor/ml-manning/blob/master/04-machine-learning/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes (za eazy way)

## Prepare data files

In [0]:
# prepare emails data

# is 'prepare data files' already executed?
data_ready = True

In [0]:
# https://svaderia.github.io/articles/downloading-and-unzipping-a-zipfile/
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

zipurl = "https://github.com/leonistor/ml-manning/raw/master/04-machine-learning/emails.zip"
# use if zipped files not in folders
tmp_destination = ""

if not data_ready:
  with urlopen(zipurl) as zip_response:
    with ZipFile(BytesIO(zip_response.read())) as zfile:
      zfile.extractall("/tmp/" + tmp_destination)

data_ready = True

In [4]:
import glob
path = "/tmp/emails/"
files = [f for f in glob.glob(path + "**/*", recursive=True)]
# for f in files:
  # print(f)
print("files and folders count: ", len(files))

files and folders count:  3002


## Import data

In [5]:
import io
import os
import numpy

from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
  for root, dirnames, filenames in os.walk(path):
    for filename in filenames:
      path = os.path.join(root, filename)

      inBody = False
      lines = []
      f = io.open(path, 'r', encoding='latin1')
      for line in f:
        if inBody:
          lines.append(line)
        elif line == '\n':
          inBody = True
      f.close()
      message = '\n'.join(lines)
      yield path, message

def dataFrameFromDirectory(path, classification):
  rows = []
  index = []
  for filename, message in readFiles(path):
    rows.append({'message': message, 'class': classification})
    index.append(filename)
  return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('/tmp/emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('/tmp/emails/ham', 'ham'))

data.head()

Unnamed: 0,message,class
/tmp/emails/spam/00173.e10eb62e2c7808674c43d6a5e9e08a1c,"<html>\n\n<body bgColor=""#CCCCCC"" topmargin=1 ...",spam
/tmp/emails/spam/00270.5dcd9ce3be2992222b9038d7bf75a23a,"Dear Partner to be,\n\n\n\nFirst, I must apolo...",spam
/tmp/emails/spam/00405.3163fff27ff95b91afd656f0025c6a83,<html>\n\n<head>\n\n</head>\n\n<center>\n\n<h1...,spam
/tmp/emails/spam/00256.edd9bfb44729edf3c4f177814fd8c9e1,This is a Multipart MIME message. Since your m...,spam
/tmp/emails/spam/00281.db28f3aab77ff478279d8de20d572b42,\n\n<html><body><center>\n\n\n\n<table bgcolor...,spam


## The fun part

In [6]:
vectorizer = CountVectorizer()
# words -> tokens and count (Learn the vocabulary dictionary and return term-document matrix.)
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
# classifier.fit(X, y)
# X : {array-like, sparse matrix} of shape (n_samples, n_features)
    # Training vectors, where n_samples is the number of samples and  
    # n_features is the number of features.  
# y : array-like of shape (n_samples,)
    # Target values.
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# try out with some examples of body text:
examples = [
  'Free Viagra now',
  'Hi, Bob, how about a game of golf tommorrow?',
  'Enlarge your penis today'
  ]
# tokenize examples
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham', 'spam'], dtype='<U4')

## Activity: Train/ Test check accuracy

In [13]:
# https://www.kaggle.com/dilip990/spam-ham-detection-using-naive-bayes-classifier
all_predictions = classifier.predict(vectorizer.transform(data['message'].values))

from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(data['class'].values, all_predictions))
print(confusion_matrix(data['class'].values, all_predictions))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      2500
        spam       0.99      0.86      0.92       500

    accuracy                           0.97      3000
   macro avg       0.98      0.93      0.95      3000
weighted avg       0.98      0.97      0.97      3000

[[2495    5]
 [  71  429]]
