In [2]:
try:
    import urlextract
except:
    print('Exception: installing urlextract library\n')
    import sys
    !{sys.executable} -m pip install urlextract
    import urlextract

In [3]:
urlextract.__path__

['D:\\anacondaz\\lib\\site-packages\\urlextract']

In [4]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

## Подключение и загрузка данных

In [5]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
DATA_PATH = os.path.join('datasets','spam')
HAM_URL = DOWNLOAD_ROOT + '20030228_easy_ham.tar.bz2'
SPAM_URL = DOWNLOAD_ROOT + '20030228_spam.tar.bz2'

def fetch_data(ham_url = HAM_URL, spam_url = SPAM_URL, data_path = DATA_PATH):
    os.makedirs(data_path, exist_ok=True)
    for filename, url in (('ham.tar.bz2', ham_url),('spam.tar.bz2', spam_url)):
        tar_path = os.path.join(data_path, filename)
        urllib.request.urlretrieve(url, tar_path)
        data_tar = tarfile.open(tar_path)
        data_tar.extractall(path=data_path)
        data_tar.close()

In [6]:
DATA_PATH

'datasets\\spam'

In [7]:
fetch_data()

In [8]:
SPAM_DIR = os.path.join(DATA_PATH,'spam')
HAM_DIR = os.path.join(DATA_PATH, 'easy_ham')
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name)>20]
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name)>20]

# listdir - return The list is in arbitrary order, that's why we sorted it.

In [9]:
print(len(spam_filenames), len(ham_filenames), sep='\n')

500
2500


In [10]:
spam_filenames[1]

'00002.d94f1b97e48ed3b553b3508d116e6a09'

## Parse Email

In [11]:
import email
import email.policy

def load_email(is_spam, filename, spam_path = DATA_PATH):
    directory = 'spam' if is_spam else 'easy_ham'
    with open(os.path.join(spam_path, directory, filename), 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
    
# BytesParser converts bytes(in which the email is encoded) to characters.

In [12]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [13]:
ham_emails[1] #an object, returned by parser.

<email.message.EmailMessage at 0x1dbcb1b2308>

### examples

In [14]:
ham_emails[1].get_content() #return big string???

"Martin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n Mount Athos monastic community, was ideal for the patriotic sculpture. \n \n As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n museum, a restored amphitheatre and car park for admiring crowds are\nplanned\n---------------------\nSo is this mountain limestone or granite?\nIf it's limestone, it'll weather pretty fast.\n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ \n\n\n\n"

In [15]:
ham_emails[1].get_content().strip() #clear \n and ' '

"Martin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n Mount Athos monastic community, was ideal for the patriotic sculpture. \n \n As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n museum, a restored amphitheatre and car park for admiring crowds are\nplanned\n---------------------\nSo is this mountain limestone or granite?\nIf it's limestone, it'll weather pretty fast.\n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/"

In [16]:
print(ham_emails[1].get_content())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ 






In [17]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [18]:
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


### emails structure

In [19]:
def get_email_structure(email):
    if isinstance(email, str): #Return type of email. (here compared to a string)
        return email
    payload = email.get_payload() #Get List of Message-objects.
    if isinstance(payload, list):
        return 'multipart({})'.format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type() #text/plain

In [20]:
from collections import Counter #Counter of unique words in list

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [44]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [22]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

### what about email headers?

In [46]:
#spam_emails[0].items()

In [23]:
for header, value in spam_emails[0].items():
    print(header,":", value,'\n')

Return-Path : <12a1mailbot1@web.de> 

Delivered-To : zzzz@localhost.spamassassin.taint.org 

Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT) 

Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST) 

Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100 

From : 12a1mailbot1@web.de 

Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900 

To : dcek1a1@netsgo.com 

Subject : Life Insurance - Why Pay More? 

Date : Wed, 21 Aug 2002 20:31:57 -1600 

MIME-Version : 1.0 

Message-ID : <0103c1042001882DD_IT7@dd_it7> 

Content-Type : text/html; charset="iso-8859-1" 

Cont

In [24]:
spam_emails[0]['Subject']

'Life Insurance - Why Pay More?'

## Our sets.

In [25]:
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

## Parsing HTML

### We should use here BeautifulSoup library......

In [48]:
import re #Regular expressions (Regex)
from html import unescape #Convert all named and numeric character references (&gt etc.) to unicode characters.

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I) #I-flag - deepreciate register.
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I) #S-flag means that '.' - can be any character.
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [53]:
html_spam_emails = [email for email in X_train[y_train==1]
                   if get_email_structure(email) == 'text/html']
sample_html_spam = html_spam_emails[4]
print(sample_html_spam.get_content().strip()[:1000], '...')

<html>
<head>
</head>
  <body background="http://64.70.215.158/images//Gauze.jpg">
   
<div align="Center">  
<center>  
<table width="90%" bgcolor="#ffffff" border="0">
    <tbody>
    <tr>
      <td width="100%">                     
      <hr color="#0000ff">               
      <table width="100%" border="0">
          <tbody>
          <tr>
            <td width="100%" colspan="2"><font size="4" color="#0000FF"><b>The 
            Famous</b></font><b><font color="#0000ff" size="4">
            eBay Marketing e-Course...</font></b></td>
           </tr>
          <tr>
            <td valign="Top" align="Left" width="15%" height="90"><br>
                           
            <p> </p>
             </td>
            <td valign="Top" align="Left" width="85%">                  
       
            <center>               
            <p></p>
            </center>
             
            <center>               
            <p></p>
            </center>
             
            <tab

In [54]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], '...')


            The
            Famous
            eBay Marketing e-Course...
                   
                  Learn To
 Sell With the Complete eBay
Auction Marketing e-Course
                  Here's
                  YOUR Chance To Join The Online
                  Selling Revolution And EARN A FULL TIME  INCOME!
 Our eBay                    Marketing e-Course will show you how to create
HUGE profits selling on eBay!
                   Do you sell on eBay? If so, you could be making up
to $100,000 per month.
                  This is no hype and no scam. Receiving over 1.5 billion page views per
month, eBay is the ULTIMATE venue for selling virtually anything and making
huge profits with almost no effort. But you have to know what to sell and
how to sell. That's where I come in.
                  As a leading expert in internet marketing and the owner of several profitable
auction-based businesses, the manual that I have written provides easy to
understand and detailed instruction

In [55]:
def email_to_text(email): #We just return plain text, no matter what...
    html = None
    for part in email.walk(): #Itaerate on each path of email structure's part
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: #in case of encoding issues
            content = str(part.get_payload()) #str from list?
        if ctype == "text/plain":
            return content
        else:
            html = content
        if html:
            return html_to_plain_text(html)

In [56]:
print(email_to_text(sample_html_spam)[:100],' ...')


            The
            Famous
            eBay Marketing e-Course...
                   
       ...


### Stemming

In [57]:
import nltk

In [58]:
stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
    print(word, "->", stemmer.stem(word))

Computations -> comput
Computation -> comput
Computing -> comput
Computed -> comput
Compute -> comput
Compulsive -> compuls


### Replacing URLs

In [33]:
url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


## Transformer

In [59]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [60]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'you': 14, 'to': 13, 'the': 11, 'number': 9, 'hyperlink': 7, 'and': 7, 'of': 7, 'e': 7, 'market': 6, 'from': 6, 'thi': 6, 'mail': 6, 'a': 5, 'in': 5, 'for': 5, 'our': 5, 'technolog': 4, 's': 4, 'if': 4, 'or': 4, 'receiv': 4, 'be': 4, 'remov': 4, 'sale': 3, 'busi': 3, 'one': 3, 'it': 3, 'easi': 3, 'will': 3, 'not': 3, 'pleas': 3, 'is': 3, 'list': 3, 'that': 2, 'relev': 2, 'advertis': 2, 'articl': 2, 'magazin': 2, 'as': 2, 'fast': 2, 'compani': 2, 'wire': 2, 'pc': 2, 'world': 2, 'asia': 2, 'street': 2, 'intellig': 2, 'inc': 2, 'bank': 2, 'all': 2, 'today': 2, 'on': 2, 'your': 2, 'with': 2, 'supplier': 2, 'resourc': 2, 'need': 2, 'do': 2, 'find': 2, 'are': 2, 'have': 2, 'special': 2, 'offer': 2, 'ha': 2, 'been': 2, 'sent': 2, 'feder': 2, 'messag': 2, 'resid': 2, 'immedi': 2, 'click': 2, 'new': 1, 'page': 1, 'final': 1, 'newsfe': 1, 'deliv': 1, 'current': 1, 'such': 1, 'locat': 1, 'free': 1, 'sign': 1, 'up': 1, 'read': 1, 'use': 1, 'time': 1, 'provid': 1, 'internet': 1, 'mo

## Convert words into vectors

In [61]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, vocabulary_size=3000):
        self.vocabulary_size = vocabulary_size
        
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [62]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.intc'>'
	with 32 stored elements in Compressed Sparse Row format>

In [63]:
X_few_vectors.toarray()

array([[281,  13,  11,  14,   7,   5,   7,   9,   5,   5,   0],
       [469,  18,  21,   8,  13,  15,  12,   4,  10,   6,  19],
       [352,  11,  13,  10,  12,  12,   7,  24,   8,   4,   5]],
      dtype=int32)

In [64]:
vocab_transformer.vocabulary_

{'to': 1,
 'the': 2,
 'you': 3,
 'and': 4,
 'for': 5,
 'of': 6,
 'number': 7,
 'a': 8,
 'in': 9,
 'i': 10}

## Pipeline

In [65]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] .................................... , score=0.991, total=   0.2s
[CV]  ................................................................
[CV] .................................... , score=0.979, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.988, total=   0.2s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished


0.9858333333333333

In [67]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 98.77%
Recall: 96.39%
