In [1]:
import os
import codecs # helps with different text encodings

In [2]:
def read_in(folder):
  files = os.listdir(folder)
  a_list = []
  for a_file in files:
    if not a_file.startswith("."):
      f = codecs.open(folder + a_file, "r", encoding = "ISO-8859-1", errors="ignore") #
      a_list.append(f.read())
      f.close()
  return a_list

[DataSet](http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/index.html)

In [3]:
import google.colab
uploader = google.colab.files.upload()

Saving enron1.tar.gz to enron1.tar.gz


In [4]:
import tarfile

file_name = "enron1.tar.gz"

with tarfile.open(file_name, "r:gz") as tar:
    tar.extractall("extracted_data")
    print("Extraction complete!")

Extraction complete!


In [5]:
spam_list = read_in("/content/extracted_data/enron1/spam/")
ham_list = read_in("/content/extracted_data/enron1/ham/")

print(len(spam_list))
print(len(ham_list))

print(spam_list[0])
print(ham_list[0])

1500
3672
Subject: cant find you on msn . . .
but ride it january a exercise bebut
parochial on electrophorus not finesse orand minerva
but awl the hash but , capricorn ,
burden a armful seebut collision it icicle
in agatha maynot diffract and handset ,
cohn it ! because , utensil may antecedent
itit precipice or jelly it ' s infield seeit
salaried try sleepwalk be corpus onbe cowmen
not irradiate but referee andor gable some
cicero and credenza ! .
if you wanna , raw move cleeqing here
thank you ,
georgette neff
acourtesy manager

Subject: re : meter # : 1266 ; august 2000 / allocation exception
lauri
there have been some fluctuations @ dupont meter # 1266 - i believe conoco
transport is only business there - see below
is the transport set up to take these small swings ?
lee
from : robert e lloyd 08 / 21 / 2000 02 : 34 pm
to : lee l papayoti / hou / ect @ ect
cc : gary a hanks / hou / ect @ ect , daren j farmer / hou / ect @ ect , pat
clynes / corp / enron @ en

In [6]:
import random

all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]

random.seed(42)
random.shuffle(all_emails)

print (f"Dataset size = {str(len(all_emails))} emails")

Dataset size = 5172 emails


In [7]:
# Splitting

import nltk
from nltk import word_tokenize

nltk.download('punkt_tab')

def tokenize(input):
  word_list = []
  for word in word_tokenize(input):
    word_list.append(word)
  return word_list

input = "What's the best way to split a sentence into words?"
print(tokenize(input))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['What', "'s", 'the', 'best', 'way', 'to', 'split', 'a', 'sentence', 'into', 'words', '?']


In [8]:
# Feature Extraction

def get_features(text):
  features = {}
  word_list = [word for word in word_tokenize(text.lower())]

  for word in word_list:
    features[word] = True
  return features

all_features = [(get_features(email), label) for (email, label) in all_emails]

print(get_features("Participate In Our New Lottery NOW!"))
print(len(all_features))
print(len(all_features[0][0]))
print(len(all_features[99][0]))

{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}
5172
122
74


In [9]:
from nltk import NaiveBayesClassifier, classify

def train(features, proportion):
  train_size = int(len(features) * proportion)
  train_set, test_set = features[:train_size], features[train_size:]
  print (f"Training set size = {str(len(train_set))} emails")
  print (f"Test set size = {str(len(test_set))} emails")
  classifier = NaiveBayesClassifier.train(train_set)
  return train_set, test_set, classifier

train_set, test_set, classifier = train(all_features, 0.8)

Training set size = 4137 emails
Test set size = 1035 emails


In [10]:
def evaluate(train_set, test_set, classifier):
  print (f"Accuracy on the training set = {str(classify.accuracy(classifier, train_set))}")
  print (f"Accuracy of the test set = {str(classify.accuracy(classifier, test_set))}")
  classifier.show_most_informative_features(50)
evaluate(train_set, test_set, classifier)

Accuracy on the training set = 0.95987430505197
Accuracy of the test set = 0.957487922705314
Most Informative Features
               forwarded = True              ham : spam   =    198.5 : 1.0
                     hou = True              ham : spam   =    185.5 : 1.0
                    2004 = True             spam : ham    =    161.4 : 1.0
            prescription = True             spam : ham    =    124.5 : 1.0
                     ect = True              ham : spam   =    115.0 : 1.0
                    pain = True             spam : ham    =     95.6 : 1.0
                    spam = True             spam : ham    =     87.5 : 1.0
                     sex = True             spam : ham    =     85.9 : 1.0
                featured = True             spam : ham    =     77.9 : 1.0
             medications = True             spam : ham    =     76.3 : 1.0
                  differ = True             spam : ham    =     73.1 : 1.0
                  weight = True             spam : ham  

In [11]:
# Searching

from nltk.text import Text

def concordance(data_list, search_word):
  for email in data_list:
    word_list = [word for word in word_tokenize(email.lower())]
    text_list = Text(word_list)
    if search_word in word_list:
      text_list.concordance(search_word)

print ("STOCKS in HAM:")
concordance(ham_list, "stocks")
print ("\n\nSTOCKS in SPAM:")
concordance(spam_list, "stocks")

STOCKS in HAM:
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ad my portfolio is diversified into stocks that have lost even more money than
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files


STOCKS in SPAM:
Displaying 2 of 2 matches:
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 4 of 4 matches:
hree days . play of the week tracks stocks on downward trends , foresees botto
mark is our uncanny ability to spot stocks that have bottomed - out and antici
ound and upward trend . most of the stocks we track rebound and peak within ju
om third party . investing in penny stocks is high risk and you should seek pr
Displaying 3 of

In [12]:
test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"]
test_ham_list = ["See the minutes from the last meeting attached", "Investors are coming to our office on Monday"]

test_emails = [(email_content, "spam") for email_content in test_spam_list]
test_emails += [(email_content, "ham") for email_content in test_ham_list]

new_test_set = [(get_features(email), label) for (email, label) in test_emails]
evaluate(train_set, new_test_set, classifier) #D

Accuracy on the training set = 0.95987430505197
Accuracy of the test set = 1.0
Most Informative Features
               forwarded = True              ham : spam   =    198.5 : 1.0
                     hou = True              ham : spam   =    185.5 : 1.0
                    2004 = True             spam : ham    =    161.4 : 1.0
            prescription = True             spam : ham    =    124.5 : 1.0
                     ect = True              ham : spam   =    115.0 : 1.0
                    pain = True             spam : ham    =     95.6 : 1.0
                    spam = True             spam : ham    =     87.5 : 1.0
                     sex = True             spam : ham    =     85.9 : 1.0
                featured = True             spam : ham    =     77.9 : 1.0
             medications = True             spam : ham    =     76.3 : 1.0
                  differ = True             spam : ham    =     73.1 : 1.0
                  weight = True             spam : ham    =     73.1 :

In [13]:
for email in test_spam_list:
  print (email)
  print (classifier.classify(get_features(email)))

for email in test_ham_list:
  print (email)
  print (classifier.classify(get_features(email)))

Participate in our new lottery!
spam
Try out this new medicine
spam
See the minutes from the last meeting attached
ham
Investors are coming to our office on Monday
ham


In [19]:
del input

In [20]:
while True:
  email = input("Type in your email here (or press 'Enter'): ")
  if len(email)==0:
    break
  else:
    prediction = classifier.classify(get_features(email))
    print (f"This email is likely {prediction}\n")

Type in your email here (or press 'Enter'): I am a stock broker.
This email is likely spam

Type in your email here (or press 'Enter'): I will help you.
This email is likely ham

Type in your email here (or press 'Enter'): This is my Buisness.
This email is likely ham

Type in your email here (or press 'Enter'): I play Cricket.
This email is likely spam

Type in your email here (or press 'Enter'): Buy these shoes.
This email is likely spam

Type in your email here (or press 'Enter'): CLick here.
This email is likely spam

Type in your email here (or press 'Enter'): 


In [1]:
# Spliting word and seperating punctuations (algo)

text = 'Define which data represents "ham" class and which data represents "spam" class forthe machine learning algorithm.'

delimiters = ['"', "."]

words = []

current_word = ""

for char in text:
  if char==" ":
    if not current_word=="":
      words.append(current_word)
      current_word = ""
  elif char in delimiters:
    if current_word=="":
      words.append(char)
    else:
      words.append(current_word)
      words.append(char)
      current_word = ""
  else:
    current_word += char

print(words)

['Define', 'which', 'data', 'represents', '"', 'ham', '"', 'class', 'and', 'which', 'data', 'represents', '"', 'spam', '"', 'class', 'forthe', 'machine', 'learning', 'algorithm', '.']


## Enron2

In [21]:
import google.colab
uploader = google.colab.files.upload()

Saving enron2.tar.gz to enron2.tar.gz


In [22]:
file_name = "enron2.tar.gz"

with tarfile.open(file_name, "r:gz") as tar:
    tar.extractall("extracted_data")
    print("Extraction complete!")

Extraction complete!


In [23]:
spam_list = read_in("/content/extracted_data/enron2/spam/")
ham_list = read_in("/content/extracted_data/enron2/ham/")

print(len(spam_list))
print(len(ham_list))

print(spam_list[0])
print(ham_list[0])

1496
4361
Subject: fantastic investors info
maisonette international enterprises ltd ( maen )
a soiid hoiding of companies with constant revenue generating
businesses , offering unique products and services to the genera | public and
professionais .
current price : 0 . o 9
is this an undiscovered gem that is positioned to go higher ? review
exactly what this company does . does it sound new and exciting to you ?
watch this one trade tuesday .
breaking news ! !
maisonette home products , ltd . receives exclusive agreement for export
of paneiized homes in the united kingdom
maisonette home products , ltd . , the canadian subsidiary of maisonette
international enterprises ltd . ( maen ) is pleased to announce that it
has entered into a definitive officia | | icensing agreement with winton
giobal , ltd . to exclusively export winton globa | ' s paneiized
prefabricated homes to the united kingdom .
under the terms of the agreement , maisonette will act as exclusive
agent f

In [24]:
all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]

random.seed(42)
random.shuffle(all_emails)

print (f"Dataset size = {str(len(all_emails))} emails")

Dataset size = 5857 emails


In [31]:
all_features = [(get_features(email), label) for (email, label) in all_emails]

print(len(all_features))
print(len(all_features[0][0]))
print(len(all_features[99][0]))

5857
53
178


In [32]:
train_set, test_set, classifier = train(all_features, 0.8)

Training set size = 4685 emails
Test set size = 1172 emails


In [33]:
evaluate(train_set, test_set, classifier)

Accuracy on the training set = 0.9970117395944503
Accuracy of the test set = 0.9906143344709898
Most Informative Features
                   vince = True              ham : spam   =    522.6 : 1.0
                     ect = True              ham : spam   =    294.1 : 1.0
                     hou = True              ham : spam   =    275.0 : 1.0
                      cc = True              ham : spam   =    240.2 : 1.0
                 shirley = True              ham : spam   =    138.6 : 1.0
                identity = True             spam : ham    =    129.3 : 1.0
               forwarded = True              ham : spam   =    123.4 : 1.0
                     713 = True              ham : spam   =    120.1 : 1.0
                     oem = True             spam : ham    =     98.7 : 1.0
                     php = True             spam : ham    =     94.8 : 1.0
                thousand = True             spam : ham    =     92.9 : 1.0
                   logos = True             spam : ha

## Enron1-6

In [35]:
enron1 = google.colab.files.upload()

Saving enron1.tar.gz to enron1.tar (1).gz


In [36]:
enron2 = google.colab.files.upload()

Saving enron2.tar.gz to enron2.tar (1).gz


In [37]:
enron3 = google.colab.files.upload()

Saving enron3.tar.gz to enron3.tar.gz


In [38]:
enron4 = google.colab.files.upload()

Saving enron4.tar.gz to enron4.tar.gz


In [39]:
enron5 = google.colab.files.upload()

Saving enron5.tar.gz to enron5.tar.gz


In [40]:
enron6 = google.colab.files.upload()

Saving enron6.tar.gz to enron6.tar.gz


In [41]:
for i in range(1,7):
  file_name = f"enron{i}.tar.gz"

  with tarfile.open(file_name, "r:gz") as tar:
      tar.extractall("extracted_data")
      print(f"{file_name} : Extraction complete!")

enron1.tar.gz : Extraction complete!
enron2.tar.gz : Extraction complete!
enron3.tar.gz : Extraction complete!
enron4.tar.gz : Extraction complete!
enron5.tar.gz : Extraction complete!
enron6.tar.gz : Extraction complete!


In [42]:
spam_list = []
ham_list = []

for i in range(1,7):
  spam_list += read_in(f"/content/extracted_data/enron{i}/spam/")
  ham_list += read_in(f"/content/extracted_data/enron{i}/ham/")

In [43]:
all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]

random.seed(42)
random.shuffle(all_emails)

print (f"Dataset size = {str(len(all_emails))} emails")

Dataset size = 33716 emails


In [44]:
all_features = [(get_features(email), label) for (email, label) in all_emails]

print(len(all_features))
print(len(all_features[0][0]))
print(len(all_features[99][0]))

33716
54
94


In [45]:
train_set, test_set, classifier = train(all_features, 0.8)

Training set size = 26972 emails
Test set size = 6744 emails


In [46]:
evaluate(train_set, test_set, classifier)

Accuracy on the training set = 0.9906940530920955
Accuracy of the test set = 0.9856168446026097
Most Informative Features
                   enron = True              ham : spam   =   4144.9 : 1.0
                     php = True             spam : ham    =    485.8 : 1.0
                    meds = True             spam : ham    =    344.3 : 1.0
                     xls = True              ham : spam   =    328.2 : 1.0
                 stinson = True              ham : spam   =    315.7 : 1.0
                crenshaw = True              ham : spam   =    308.0 : 1.0
                     713 = True              ham : spam   =    260.4 : 1.0
                   corel = True             spam : ham    =    259.6 : 1.0
              scheduling = True              ham : spam   =    243.2 : 1.0
                     eol = True              ham : spam   =    237.5 : 1.0
                  louise = True              ham : spam   =    232.2 : 1.0
              macromedia = True             spam : ha