In [2]:
import os
import tarfile

import nltk
import pandas as pd

nltk.download('punkt')

from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest

import en_core_web_sm
nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to /home/kerch007/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
downloads = os.path.join(os.environ['HOME'] + "/Downloads")
enron_dir = os.path.join(downloads, 'Enron emails')
enron_files = ['enron1.tar.gz', 'enron2.tar.gz', 'enron3.tar.gz',
               'enron4.tar.gz', 'enron5.tar.gz', 'enron6.tar.gz']

def extract_emails(fname):
    rows = []
    tfile = tarfile.open(fname, encoding="latin-1", errors='ignore')
    for member in tfile.getmembers():
        if 'ham' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'message': row, 'class': 'ham'})
        if 'spam' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'message': row, 'class': 'spam'})
    tfile.close()
    return pd.DataFrame(rows)
data = pd.DataFrame({'message': [], 'class': []})
for file in enron_files:
    unzipped_file = extract_emails(os.path.join(enron_dir, file))
    data = data.append(unzipped_file)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [4]:
data['message'] = data['message'].apply(lambda x: x.decode('latin-1'))
data['class'] = data['class'].map({'spam':1, 'ham':0})

In [5]:
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'

def cleanup_text(docs, logging=False):
    texts = []
    stopwords = nltk.corpus.stopwords.words('english')
    doc = nlp(docs, disable=['parser', 'ner'])
    tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
    tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
    tokens = ' '.join(tokens)
    texts.append(tokens)
    return pd.Series(texts)

data_10 = data.sample(n=10)

data_10['message'] = data_10['message'].apply(lambda x: cleanup_text(x, False))
data_10

Unnamed: 0,class,message
966,0,subject enron cost saving guideline effective ...
3880,1,subject guarantee satisfaction great price
5863,1,subject hello mr albert william . chief audito...
4930,1,subject glad madde move hello welcome phar coo...
5690,1,subject fw fyi dear homeowner complete review ...
3279,1,subject press release new year eve central ele...
2361,0,subject sharad houston visit sharad expect get...
53,0,subject ua 4 meter 1441 11 97 falfurrias howar...
4658,1,subject surf charset big 5 market exhibition o...
3189,0,subject suemar berryman well inlet gulf plain ...


In [6]:
def get_text_summary(txt):
    sents = sent_tokenize(txt)
    word_sent = word_tokenize(txt.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation))

    word_sent=[word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    words = nlargest(15, freq, key=freq.get)
    summary = ' '.join(words)
    return(summary)

data_10['summary'] = data_10['message'].apply(get_text_summary)

data_10

Unnamed: 0,class,message,summary
966,0,subject enron cost saving guideline effective ...,enron travel service cost please gss informati...
3880,1,subject guarantee satisfaction great price,subject guarantee satisfaction great price
5863,1,subject hello mr albert william . chief audito...,account fund company 3 14 contract 5 chief aud...
4930,1,subject glad madde move hello welcome phar coo...,l subject glad madde move hello welcome phar c...
5690,1,subject fw fyi dear homeowner complete review ...,offer mortgage low rate go subject fw fyi dear...
3279,1,subject press release new year eve central ele...,e de com www um que seu techno dj para os uma ...
2361,0,subject sharad houston visit sharad expect get...,amc option houston function sharad get exotica...
53,0,subject ua 4 meter 1441 11 97 falfurrias howar...,contract 97 089 ect meter 11 transport show al...
4658,1,subject surf charset big 5 market exhibition o...,com www tex po e website surf industrial china...
3189,0,subject suemar berryman well inlet gulf plain ...,berryman well subject suemar inlet gulf plain ...
