In [12]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

## Подключение и загрузка данных

In [19]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
DATA_PATH = os.path.join('datasets','spam')
HAM_URL = DOWNLOAD_ROOT + '20030228_easy_ham.tar.bz2'
SPAM_URL = DOWNLOAD_ROOT + '20030228_spam.tar.bz2'

def fetch_data(ham_url = HAM_URL, spam_url = SPAM_URL, data_path = DATA_PATH):
    os.makedirs(data_path, exist_ok=True)
    for filename, url in (('ham.tar.bz2', ham_url),('spam.tar.bz2', spam_url)):
        tar_path = os.path.join(data_path, filename)
        urllib.request.urlretrieve(url, tar_path)
        data_tar = tarfile.open(tar_path)
        data_tar.extractall(path=data_path)
        data_tar.close()

In [50]:
DATA_PATH

'datasets\\spam'

In [20]:
fetch_data()

In [21]:
SPAM_DIR = os.path.join(DATA_PATH,'spam')
HAM_DIR = os.path.join(DATA_PATH, 'easy_ham')
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name)>20]
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name)>20]

In [22]:
print(len(spam_filenames), len(ham_filenames), sep='\n')

500
2500


## Parse Email

In [24]:
import email
import email.policy

def load_email(is_spam, filename, spam_path = DATA_PATH):
    directory = 'spam' if is_spam else 'easy_ham'
    with open(os.path.join(spam_path, directory, filename), 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [27]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [28]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [29]:
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [32]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return 'multipart({})'.format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [33]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [34]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [35]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [36]:
for header, value in spam_emails[0].items():
    print(header,":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [37]:
spam_emails[0]['Subject']

'Life Insurance - Why Pay More?'

In [38]:
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [39]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [40]:
html_spam_emails = [email for email in X_train[y_train==1]
                   if get_email_structure(email) == 'text/html']
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], '...')

<html>

<head>
<meta http-equiv="Content-Language" content="en-us">
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Would You Like to Save up to 80</title>
</head>

<body bgcolor="#000000">

<div align="center">
  <center>

<table border="3" width="469" height="83" bgcolor="#000000" bordercolor="#0000FF">
  <tr>
    <td width="100%" height="48" align="center" bordercolor="#FFFFFF"><font face="Tahoma" color="#FFFFFF" size="5"> Would You Like to
      <b> Save </b>up to<b> </b></font><b><font face="Tahoma" size="5" color="#FFFFCC">80%</font></b><font face="Tahoma" color="#FFFFFF" size="5">
      on <b>Printer</b>, </font><b><font face="Tahoma" size="5" color="#FF3300">Fax</font></b><font face="Tahoma" color="#FFFFFF" size="5">
      &amp; </font><b><font face="Tahoma" size="5" color="#FFFFCC"> Copier</font></b><font face="Tahoma" color="#FFF

In [41]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], '...')


     Would You Like to
       Save up to 80%
      on Printer, Fax
      &  Copier Supplies?
      On
        Brands Like ->
      EPSON
      Canon
      HEWLETT
        PACKARD
      Lexmark
      &
        more!
      100%
        Quality Satisfaction Guarantee or Your Money Back!
      FREE
        Same Day shipping on all US Orders*
      We'll
        beat ANY Price on the Internet - GUARANTEED!**
       HYPERLINK Click
        Here to ORDER NOW!
      OR
        Call us Toll-Free at 1-800-758-8084!
 
 
      *Free Shipping only on
        orders of $40 or more.**We beat any online retailer's price by 5%.
        Call us with the URL (Website) advertising the lower price and once we
        verify the price, we will beat it by 5%! (Must be same manufacturer)
      You
        are receiving this special offer because you have provided permission to
        receive email communications regarding special online promotions or
        offers. If you feel you have received this messag

In [42]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: #in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
        if html:
            return html_to_plain_text(html)

In [43]:
print(email_to_text(sample_html_spam)[:100],' ...')


     Would You Like to
       Save up to 80%
      on Printer, Fax
      &  Copier Supplies?
        ...


In [44]:
import nltk

In [45]:
stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
    print(word, "->", stemmer.stem(word))

Computations -> comput
Computation -> comput
Computing -> comput
Computed -> comput
Compute -> comput
Compulsive -> compuls


In [46]:
import urlextract

ModuleNotFoundError: No module named 'urlextract'