In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import pickle

In [3]:
def load_enron_data(path):
    """Yield one record per email file found directly inside ``path``.

    Args:
        path: Directory that holds one email per file.

    Yields:
        dict: ``{'filename': <entry name>, 'content': <file text>}``, with the
        text decoded as latin1.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the resulting DataFrame) reproducible across runs.
    for filename in sorted(os.listdir(path)):
        # The context manager closes each handle as soon as it has been read,
        # instead of leaking one open file descriptor per email.
        with open(os.path.join(path, filename), 'r', encoding='latin1') as email_file:
            content = email_file.read()
        yield {
            'filename': filename,
            'content': content,
        }

In [4]:
# Read every spam email into a frame and label all of its rows as spam.
spam_df = pd.DataFrame(
    load_enron_data('data/enron1/spam/')
).assign(is_spam=True)

In [5]:
# Read every legitimate ("ham") email into a frame and label its rows as not spam.
ham_df = pd.DataFrame(
    load_enron_data('data/enron1/ham')
).assign(is_spam=False)

In [6]:
# Stack spam and ham rows into one training frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it both source frames keep their own
# 0-based labels, so every label would appear twice and .loc lookups would be
# ambiguous.
email_df = pd.concat([spam_df, ham_df], axis=0, ignore_index=True)

Pipeline of data: take a Python object, save it as a pickle, and then load it elsewhere (websites, etc.).
The Pipeline class from sklearn can train a model on data and make predictions on new data. The pickle contains whatever the model has learned, so it can be reused later (pickling can be done on many different Python objects). Pipelines should not be built solely around accuracy, as that metric does not really capture the business case or whether the model will be able to function in the real world.

In [7]:
# Build each stage once and keep a name for it, so the fitted vectorizer and
# classifier can be inspected after training. Previously these two objects were
# created but never used, while the Pipeline built a second, identical pair.
vec = TfidfVectorizer(stop_words='english')
model = MultinomialNB()

# The pipeline feeds raw email text through TF-IDF vectorization and then into
# the Naive Bayes classifier, so callers only ever pass plain strings.
pipeline = Pipeline([
    ('vectorizer', vec),
    ('model', model)
])

# Last expression of the cell: fit returns the pipeline, which the notebook displays.
pipeline.fit(email_df['content'], email_df['is_spam'])

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...e,
        vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [8]:
# Sanity-check the fitted pipeline on two hand-written phrases:
# the first should look like spam, the second should not.
probe_texts = ['cheap viagra', 'meeting lunch energy']
pipeline.predict_proba(probe_texts)

array([[0.05751643, 0.94248357],
       [0.97106817, 0.02893183]])

In [9]:
# Persist the fitted pipeline so it can be reloaded outside this notebook.
# NOTE: only unpickle this file from a trusted location; loading a pickle
# can execute arbitrary code.
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)