In [25]:
import io
import os

import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    """Yield ``(file_path, body)`` for every file found under ``path``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    read as latin1 text; everything up to and including the first blank
    line (the header section) is discarded and the remaining lines form the
    body.

    Parameters
    ----------
    path : str
        Root directory to scan.

    Yields
    ------
    tuple of (str, str)
        The full path of the file and its body text. The body is ``''``
        when the file contains no blank line.

    Notes
    -----
    Body lines keep their trailing newline and are additionally joined with
    ``'\\n'``, so consecutive lines end up separated by an empty line. This
    is kept as-is so existing results stay unchanged.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the `path` argument is not overwritten.
            file_path = os.path.join(root, filename)

            # `with` closes the handle even if reading raises part-way through.
            with io.open(file_path, 'r', encoding='latin1') as f:
                # Consume the header: everything up to the first blank line.
                for line in f:
                    if line == '\n':
                        break
                # Whatever is left in the file is the message body.
                lines = list(f)

            yield file_path, '\n'.join(lines)
                        

def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file found under ``path``.

    Each row holds the file's body in the ``'message'`` column and the
    given ``classification`` in the ``'class'`` column; the row index is
    the file path yielded by ``readFiles``.
    """
    found = list(readFiles(path))
    records = [
        {'message': body, 'class': classification}
        for _file_path, body in found
    ]
    labels = [file_path for file_path, _body in found]
    return DataFrame(records, index=labels)
                        

# Root folder of the e-mail corpus; the code below reads its `spam` and `ham`
# sub-folders. Change this one constant to run the notebook elsewhere.
EMAILS_DIR = '/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# labelled frames are combined with a single pd.concat call instead.
# The row index (file paths) is preserved, as it was with append.
data = pd.concat([
    dataFrameFromDirectory(os.path.join(EMAILS_DIR, 'spam'), 'spam'),
    dataFrameFromDirectory(os.path.join(EMAILS_DIR, 'ham'), 'ham'),
])

In [26]:
# Preview the first rows to check that messages and labels loaded as expected.
data.head()

Unnamed: 0,message,class
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00093.ca4edc32d2ff8e1dbb5f9c0b15ec435b,Get your favorite Poker action at http://www.m...,spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00101.5a24bf3ba3962442179b1a0325a1d1cb,<html>\n\n<head>\n\n<title>Digital Publishing ...,spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00202.d5b52386f66bd36cd1508319c82cf671,"Me and my friends have this brand new idea, a ...",spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00418.6321175c76411371c109eafc99563d2c,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
/home/igor/Desktop/Projects/Python/Machine Learning/Datasets/DataScience-Python3/emails/spam/00132.0ead3e293c6c41cbffb69670e8b85ae7,"As seen on NBC, CBS, CNN, and even Oprah! The ...",spam


In [27]:
# CountVectorizer tokenizes each message into words and counts how often
# each word occurs; fit_transform learns the vocabulary from the messages
# and returns the resulting word-count matrix.
# Those counts, together with the class labels, are passed to a
# MultinomialNB classifier; once fit() returns, the spam filter is trained.

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)


# Train the model: the targets are the 'class' labels, one per message.

classifier = MultinomialNB()

targets = data['class'].values

classifier.fit(counts, targets)



MultinomialNB()

In [31]:
# Test the model on two hand-written messages.

examples = ['Free Viagra now !!!', "Hi, Bob, how about a game of golf tommorrow?"]

# Use transform (not fit_transform) so the examples are encoded with the
# vocabulary already fitted on the training messages.
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
print(predictions)

['spam' 'ham']
