**Spam Classifier using Naive Bayes**

We are importing the spam and not spam files and creating a dataframe. The dataframe consists of a class column which helps to identify the type of the mail.

In [0]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer #to find count of a word
from sklearn.naive_bayes import MultinomialNB #used for naive bayes classification 

def readFiles(path):
  for root, dirnames, filenames in os.walk(path): #os.walk() used to input the files from the directory
    for filename in filenames:
      path = os.path.join(root, filename)
      inBody=False
      lines=[]
      f=io.open(path, 'r', encoding='latin1')
      for line in f: #to check if the line is message or body.. omitting the body
        if inBody:
          lines.append(line)
        elif line == '\n':
          inBody=True
      f.close()
      message = '\n'.join(lines)
      yield path, message

def dataFrameFromDirectory(path, classification):
  rows=[]
  index=[]
  for filename, message in readFiles(path):  #used to read the files ffrom the directory
    rows.append({'message': message, 'class':classification}) #maps message and classification
    index.append(filename) #index for the data dataframe will be the file name of the file containing the email
  return DataFrame(rows, index=index)

data = DataFrame({'message':[], 'class':[]}) #Dataframe object contains message and class of emails (spam or not spam)
data = data.append(dataFrameFromDirectory('/content/drive/My Drive/ML Course materials/emails/spam','spam'))#appending spam mails from the file
data = data.append(dataFrameFromDirectory('/content/drive/My Drive/ML Course materials/emails/not_spam','not spam'))#appending not spam mails from the file


In [3]:
data.head()

Unnamed: 0,message,class
/content/drive/My Drive/ML Course materials/emails/spam/00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
/content/drive/My Drive/ML Course materials/emails/spam/00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
/content/drive/My Drive/ML Course materials/emails/spam/00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
/content/drive/My Drive/ML Course materials/emails/spam/00004.eac8de8d759b7e74154f142194282724,##############################################...,spam
/content/drive/My Drive/ML Course materials/emails/spam/00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,spam


Using CountVectorizer word count in each mail is found and we can use it in the Naive Bayes classifier. 

In [12]:
vectorizer = CountVectorizer() 
counts = vectorizer.fit_transform(data['message'].values) #get the value of the message column in data and find the count of each words

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets) #training the model using the number of times a particular word has occured in the message and the type of the message

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
##Sample test

test = ['Credit Card free with loans upto 100000','Buy one get one free free Hurry up', 'Hi,I wont be available tomorrow can you take care?', 'Baby I love you a lot', "Monthly subscription at just 999"]
test_counts = vectorizer.transform(test)
predictions = classifier.predict(test_counts)
predictions

array(['spam', 'not spam', 'not spam', 'not spam', 'not spam'],
      dtype='<U8')