In [18]:
import io
import os

import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    """Yield ``(file_path, body)`` for every file found under ``path``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    read as latin1 text; everything up to and including the first blank line
    is treated as the header and skipped, and the remaining lines form the
    body.

    Parameters
    ----------
    path : str
        Root directory to search.

    Yields
    ------
    tuple of (str, str)
        The full path of the file and its body text. The body is an empty
        string when the file contains no blank line.
    """
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Separate local name so the ``path`` argument is not overwritten.
            file_path = os.path.join(root, filename)

            inBody = False
            lines = []

            # ``with`` guarantees the handle is closed even if reading raises.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if inBody:
                        lines.append(line)
                    elif line == '\n':  # first blank line ends the header
                        inBody = True

            # Each line still carries its own '\n', so joining with '\n'
            # leaves a blank line between body lines. This is kept as-is so
            # the produced messages stay identical to earlier runs.
            message = '\n'.join(lines)
            yield file_path, message
                        

def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file found under ``path``.

    Each row has a 'message' column (the text yielded by ``readFiles``) and a
    'class' column set to ``classification``. The row index is the file path
    yielded by ``readFiles``.
    """
    # Materialise the generator once so both comprehensions see the same items.
    found = list(readFiles(path))
    file_paths = [file_path for file_path, _ in found]
    records = [{'message': body, 'class': classification} for _, body in found]
    return DataFrame(records, index=file_paths)
                        

# Root folder of the e-mail corpus; it holds one sub-folder per class.
# Edit this single constant to run the notebook against another location.
EMAILS_DIR = '/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails'

# DataFrame.append was removed in pandas 2.0, so the per-class frames are
# combined with a single concat call. Spam rows come first, then ham rows.
# reindex keeps the 'message' / 'class' columns present even when no files
# were found in either folder.
data = pd.concat([
    dataFrameFromDirectory(os.path.join(EMAILS_DIR, 'spam'), 'spam'),
    dataFrameFromDirectory(os.path.join(EMAILS_DIR, 'ham'), 'ham'),
]).reindex(columns=['message', 'class'])

In [19]:
# Preview the first rows to check that messages and class labels were loaded.
data.head()

Unnamed: 0,message,class
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00093.ca4edc32d2ff8e1dbb5f9c0b15ec435b,Get your favorite Poker action at http://www.m...,spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00101.5a24bf3ba3962442179b1a0325a1d1cb,<html>\n\n<head>\n\n<title>Digital Publishing ...,spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00202.d5b52386f66bd36cd1508319c82cf671,"Me and my friends have this brand new idea, a ...",spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00418.6321175c76411371c109eafc99563d2c,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00132.0ead3e293c6c41cbffb69670e8b85ae7,"As seen on NBC, CBS, CNN, and even Oprah! The ...",spam


In [20]:
# CountVectorizer tokenizes every message and counts how often each word occurs.
# Those word counts are the features passed to the MultinomialNB classifier.
# After fit() is called below, the classifier is a trained spam filter.

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# NOTE(review): this prints the sparse count matrix entry by entry, which produces long output.
print(counts)
# Train the model:

classifier = MultinomialNB()

# Class labels taken from the 'class' column, in the same row order as `counts`.
targets = data['class'].values

# Last expression of the cell, so the fitted estimator's repr is displayed.
classifier.fit(counts, targets)



  (0, 26143)	1
  (0, 60804)	1
  (0, 23874)	1
  (0, 43422)	6
  (0, 9394)	1
  (0, 11640)	2
  (0, 28855)	3
  (0, 59298)	3
  (0, 38161)	2
  (0, 38953)	2
  (0, 43193)	2
  (0, 9871)	1
  (0, 46257)	2
  (0, 42370)	1
  (0, 25222)	2
  (0, 11364)	1
  (0, 53220)	3
  (0, 58986)	2
  (0, 24806)	3
  (0, 37658)	1
  (0, 40892)	1
  (0, 32663)	1
  (0, 25396)	1
  (0, 9233)	1
  (0, 40674)	1
  :	:
  (2999, 2495)	4
  (2999, 36111)	1
  (2999, 55963)	1
  (2999, 36934)	3
  (2999, 476)	2
  (2999, 49842)	2
  (2999, 5091)	1
  (2999, 396)	1
  (2999, 35317)	1
  (2999, 1770)	1
  (2999, 43609)	2
  (2999, 37652)	2
  (2999, 11409)	2
  (2999, 43854)	2
  (2999, 42424)	7
  (2999, 61529)	1
  (2999, 44578)	1
  (2999, 42425)	1
  (2999, 45936)	1
  (2999, 18210)	1
  (2999, 43547)	1
  (2999, 51824)	1
  (2999, 37662)	1
  (2999, 1194)	1
  (2999, 2271)	1


MultinomialNB()

In [21]:
# Try the trained model on two hand-written example messages.

examples = ['Free Viagra now !!!', "Hi, Bob, how about a free viagra today"]

# transform() (not fit_transform()) encodes the examples with the vocabulary
# already learned from the training messages.
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
print(predictions)

['spam' 'ham']
