In [1]:
import os
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

NEWLINE = '\n'

HAM = 'ham'
SPAM = 'spam'

SOURCES = [
    ('data/spam',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/hard_ham',    HAM),
]
    
SOURCES_ALL = [
    ('data/spam',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/hard_ham',    HAM),
    ('data/beck-s',      HAM),
    ('data/farmer-d',    HAM),
    ('data/kaminski-v',  HAM),
    ('data/kitchen-l',   HAM),
    ('data/lokay-m',     HAM),
    ('data/williams-w3', HAM),
    ('data/BG',          SPAM),
    ('data/GP',          SPAM),
    ('data/SH',          SPAM)
]

SKIP_FILES = {'cmds'}

In [5]:
def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1") # modified for Python 2.7 MP
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

In [6]:
def build_data_frame(path, classification):
    rows = []
    index = [] 
    # create list with text and classification
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

In [9]:
data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
# reorder ham and spam, because it was originally come from the same folder
data = data.reindex(numpy.random.permutation(data.index))
print("after",data)

after                                                    class  \
data/easy_ham\01245.eb8e87560f382001583084d77b0...   ham   
data/easy_ham\02269.ee87871a0b0364173f13977f058...   ham   
data/spam\00344.17882edad13c2c761e6d8d99eef5a346    spam   
data/easy_ham\01870.2c9f4adcd1eec7ae3607d76bbf9...   ham   
data/easy_ham\02468.3ac182b2833f74c850d4bcaf63f...   ham   
data/easy_ham\00425.0ba16e840d94d629f8a3881b4e0...   ham   
data/easy_ham\01046.f0371dba9ae76787d5541e73a09...   ham   
data/easy_ham\00666.9f288224f19ca69b2663b5b9a85...   ham   
data/easy_ham\02036.ab9f23457f950a124c5b78449cc...   ham   
data/easy_ham\00563.8c8efdf5034a0ba771e48494fd2...   ham   
data/easy_ham\00911.dcbdde154d9f25c1afe32f4b8f5...   ham   
data/easy_ham\02402.7f699b32ac5965e5086d82b4199...   ham   
data/easy_ham\00127.c1981aeb12ebd22536f12f0a044...   ham   
data/easy_ham\00575.c4b32c1dc29245ce6298f5abf2f...   ham   
data/easy_ham\01407.5388b24c7941469cb0164922cf6...   ham   
data/easy_ham\01438.3bdd05f78df18d

In [15]:
vectorizer =  CountVectorizer(ngram_range=(1, 2))
dataa = vectorizer.fit_transform(data['text'].values)
print(dataa)

IndexError: invalid index

In [10]:
pipeline = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('classifier',         MultinomialNB())
])

Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])


In [None]:
k_fold = KFold(n=len(data), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values.astype(str)

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

In [None]:
print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)