Distinguishing between email types using Naive Bayes 

In [23]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# List every newsgroup category available in the dataset.
# subset='train' is fetch_20newsgroups' default, spelled out here for clarity.
emails = fetch_20newsgroups(subset='train')
labels = emails.target_names
print(labels)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [25]:
# Training split, restricted to the two categories the classifier will separate.
# shuffle=True with a fixed random_state keeps the document order reproducible.
train_emails = fetch_20newsgroups(
    subset='train',
    categories=['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'],
    shuffle=True,
    random_state=108,
)

In [26]:
# Confirm that only the two requested categories were loaded.
print(train_emails.target_names)

['comp.sys.ibm.pc.hardware', 'rec.sport.hockey']


In [27]:
# Held-out test split for the same two categories, used only for evaluation.
# shuffle=True with a fixed random_state keeps the document order reproducible.
test_emails = fetch_20newsgroups(
    subset='test',
    categories=['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'],
    shuffle=True,
    random_state=108,
)

In [28]:
# Turn each email into a vector of word counts; the vectorizer's vocabulary is
# the set of words the model should look out for in an email.
# The vocabulary is learned from the training emails only: fitting the
# vectorizer on the test emails as well would leak information about the
# held-out data into the model's feature space. Words that occur only in the
# test emails are simply ignored by transform().
counter = CountVectorizer()
train_counts = counter.fit_transform(train_emails.data)

# Encode the test emails with the vocabulary learned from the training set.
test_counts = counter.transform(test_emails.data)

In [29]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# fit() returns the estimator itself, so construction and fitting are chained.
classifier = MultinomialNB().fit(train_counts, train_emails.target)

# Report the model's accuracy on the held-out test emails.
print(
    classifier.score(
        test_counts,
        test_emails.target,
    )
)

0.9974715549936789


This model distinguishes between emails about hockey and emails about PC hardware with over 99% accuracy on the test set. Other pairs of email categories can be compared the same way: pick any two names from the list of target names and pass them as the `categories` argument in both the `train_emails` and `test_emails` calls to `fetch_20newsgroups`.