#### FOUNDATIONS OF MACHINE LEARNING: SUPERVISED LEARNING

<br>

# Email Similarity

<hr>

In [4]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

### Exploring the Data

In [19]:
emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'])

In [22]:
print(emails.target_names)

['rec.sport.baseball', 'rec.sport.hockey']


In [23]:
print(emails.data[5])

From: mmb@lamar.ColoState.EDU (Michael Burger)
Subject: More TV Info
Distribution: na
Nntp-Posting-Host: lamar.acns.colostate.edu
Organization: Colorado State University, Fort Collins, CO  80523
Lines: 36

United States Coverage:
Sunday April 18
  N.J./N.Y.I. at Pittsburgh - 1:00 EDT to Eastern Time Zone
  ABC - Gary Thorne and Bill Clement

  St. Louis at Chicago - 12:00 CDT and 11:00 MDT - to Central/Mountain Zones
  ABC - Mike Emerick and Jim Schoenfeld

  Los Angeles at Calgary - 12:00 PDT and 11:00 ADT - to Pacific/Alaskan Zones
  ABC - Al Michaels and John Davidson

Tuesday, April 20
  N.J./N.Y.I. at Pittsburgh - 7:30 EDT Nationwide
  ESPN - Gary Thorne and Bill Clement

Thursday, April 22 and Saturday April 24
  To Be Announced - 7:30 EDT Nationwide
  ESPN - To Be Announced


Canadian Coverage:

Sunday, April 18
  Buffalo at Boston - 7:30 EDT Nationwide
  TSN - ???

Tuesday, April 20
  N.J.D./N.Y. at Pittsburgh - 7:30 EDT Nationwide
  TSN - ???

Wednesday, April 21
  St. Louis a

In [26]:
print(emails.target[5])
#since the index is 1, that means the email is equal to rec.sport.hockey

1


### Making the Training and Test Sets

In [28]:
train_emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'], subset = 'train', shuffle = True, random_state = 108)
test_emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'], subset = 'test', shuffle = True, random_state = 108)

### Counting Words

In [31]:
counter = CountVectorizer()
counter.fit(train_emails.data + test_emails.data)

CountVectorizer()

In [33]:
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

### Making a Naive Bayes Classifier

In [35]:
classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target) #train_emails.target are the labels

MultinomialNB()

In [36]:
print(classifier.score(test_counts, test_emails.target))
#this means that the classifier was pretty accurate when classifying baseball and hockey emails

0.9723618090452262


### Testing Other Datasets

In [38]:
train_emails = fetch_20newsgroups(categories = ['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'], subset = 'train', shuffle = True, random_state = 108)
test_emails = fetch_20newsgroups(categories = ['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'], subset = 'test', shuffle = True, random_state = 108)

In [39]:
counter.fit(train_emails.data + test_emails.data)
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

In [40]:
classifier.fit(train_counts, train_emails.target)

MultinomialNB()

In [41]:
print(classifier.score(test_counts, test_emails.target))
# the classifier was very accurate when classifying the different categories, most likely because they were from different categories

0.9974715549936789


<hr>

In [50]:
train_emails = fetch_20newsgroups(categories = ['comp.graphics', 'sci.electronics', 'comp.windows.x'], subset = 'train', shuffle = True, random_state = 108)
test_emails = fetch_20newsgroups(categories = ['comp.graphics', 'sci.electronics', 'comp.windows.x'], subset = 'test', shuffle = True, random_state = 108)

In [51]:
counter.fit(train_emails.data + test_emails.data)
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

In [52]:
classifier.fit(train_counts, train_emails.target)

MultinomialNB()

In [53]:
print(classifier.score(test_counts, test_emails.target))
# using 3 labels, the classifier was not as accurate as the other datasets

0.8598130841121495
