In [None]:
from __future__ import print_function
import re
class EmailParser:
    def __init__(self):
        self.reg_empty_line = re.compile('\n\n')
        self.reg_subject = re.compile('^Subject: (.*)$')
        self.reg_mime_tag = re.compile(r'^-+([\w\.=]+)-*$')
        self.reg_content_type = re.compile(r'^Content-Type: ([a-z/]+);?$')
        self.reg_charset = re.compile(r'charset=(\S*)')
        self.reg_html = re.compile('<.*?>', re.DOTALL)
    def __call__(self, lines):
        is_content = False
        mime_tag = None
        mime_opening = 0
        content = ""
        content_type = ""
        subject = ""
        find_content_type = 0
        lines = [ line.strip() for line in lines ]
        charset = "ascii"
        for i, line in enumerate(lines):
            if is_content:
#                print(line.strip(), mime_tag, mime_opening)
                if re.match(self.reg_mime_tag, line):
                    tag = re.match(self.reg_mime_tag, line).group(1)
                    mime_tag = tag
                    mime_opening = 1
                elif re.match(self.reg_content_type, line):
                    content_type = re.match(self.reg_content_type, line).group(1)
                    #content += content_type + ' '
                    find_content_type = 2
                elif line.startswith('This is a multi-part message in MIME format'):
                    pass
                else:
                    if line == '':
                        mime_opening = 0
                    elif mime_opening == 0 and re.match(r'^image', content_type) is None:
                        content += line + '\n' 
                        
                if find_content_type > 0 and re.search(self.reg_charset, line):
                    charset = re.search(self.reg_charset, line).group(1)
                    #print(lines[i-3:i+1], charset)
                find_content_type = max(find_content_type - 1, 0)
                
            if re.match(self.reg_lines, line):
                is_content = True
            elif re.match(self.reg_subject, line):
                subject = re.match(self.reg_subject, line).group(1).strip()
        content = re.sub(self.reg_html, '', content)
        try:
            return subject.decode(charset), content.decode(charset)
        except LookupError as e:
            charset='windows-1252'
        except UnicodeDecodeError as e:
            charset='windows-1252'
        return subject.decode(charset, 'ignore'), content.decode(charset, 'ignore')

In [None]:
import re
#content_types = defaultdict(lambda : 0)
def decode(s, charset):
    try:
        return s.decode(charset)
    except (UnicodeDecodeError, LookupError) as e:
        re_charset = re.compile(r'charset="(\S+?)"', re.MULTILINE)
        charset = re.search(re_charset, s)
        if charset:
            charset = charset.group(1)
            try:
                return s.decode(charset)
            except (UnicodeDecodeError, LookupError) as e:
                pass
        return s.decode('windows-1252', 'ignore')
    
def fix_line_break(texts, vocab):
    linebreak = re.compile(r'(\w*)=\n(\w*)')
    boundary = 0
    ret = ""
    for match in linebreak.finditer(texts):
        word = match.group(1) + match.group(2)
        ret += texts[boundary:match.start(0)]
        if match.group(1) != '' and match.group(2) != '' and word in vocab:
            ret += word
        else:
            ret += match.group(1) + ' ' + match.group(2)
        boundary = match.end(0)
    ret += texts[boundary:]
    return ret

def fix_coding(texts):
    def substitute(m):
        return chr(int(m.group(1), 16))
    return re.sub(re.compile(r'=([A-Fa-f0-9]{2})=?'), substitute, texts)

def parse(texts):
    texts = fix_coding(texts)
    p = texts.find('\n\n')
    body = texts[p + 2 : ]
    re_mime_id = re.compile(r'--\S{10,}\n(\n|(.*?)\n\n)', re.DOTALL)
    re_content_type = re.compile(r'Content-Type: ([a-z/]+)')
    re_charset = re.compile(r'charset=(\S+)$', re.MULTILINE)
    re_text_type = re.compile(r'text|message')
    re_html = re.compile('<.*?>', re.DOTALL)
    re_subject = re.compile('^Subject: (.*)$', re.MULTILINE)
    
    subject = re.search(re_subject, texts[ : p])
    if subject:
        subject = subject.group(1)
    else:
        subject = ""
    
    p = 0
    parsed = u""
    charset = 'ascii'
    content_type = "text"
    for matcher in re_mime_id.finditer(body):
        target = body[p : matcher.start(0)]
        if re.search(re_text_type, content_type):
            parsed += decode(target, charset)
        p = matcher.end(0)
        if matcher.group(2) and re.search(re_content_type, matcher.group(2)):
            content_type = re.search(re_content_type, matcher.group(2)).group(1)
            content_types[content_type] += 1
        else:
            content_type = "text"
        if matcher.group(2) and re.search(re_charset, matcher.group(2)):
            charset = re.search(re_charset, matcher.group(2)).group(1)
        else:
            charset = 'ascii'
    target = body[p : ]
    if re.search(re_text_type, content_type):
        parsed += decode(target, charset)
    parsed = re.sub(re_html, '', parsed)
    return subject.decode('windows-1252', 'ignore') + u' ' + parsed

In [None]:
def main():
    parsed_dataset = dict()
    for k in dataset:
        texts = dataset[k]['text']
        parsed_dataset[k] = {
            'text' : parse(texts),
            'label' : dataset[k]['label']
        } 
    return parsed_dataset

In [None]:
import tarfile
import re
from tqdm import tqdm_notebook

def parse_dataset(parser, sample=0):
    file_id = re.compile(r'inmail\.(\d+)')
    labels = dict()
    with open('full/index') as f:
        for line in f.readlines():
            label, member = line.split()
            labels[ int(re.search(file_id, member).group(1)) ] = label 
    datasets = dict()
    n = 0
    with tarfile.open('trec07p.tgz') as files:
        for member in tqdm_notebook(files):
            m = re.search(file_id, member.name)
            if m:
                fid = int(m.group(1))
                label = labels[fid]
                f = files.extractfile(member)
                datasets[fid] = {
                    'text' : f.read(),
                    'label' : label
                }
                f.close()
                n += 1
                if sample and n > sample:
                    break
    return datasets

In [None]:
!mkdir trec07p
dataset = parse_dataset(None, sample=0)
parsed_dataset = main()
# with open('trec07p/full_encoded.json', 'w') as fo:
#     json.dump(parsed_dataset, fo)

In [None]:
from collections import defaultdict
import nltk
freq = defaultdict(lambda : 0)
for i in tqdm_notebook(range(1, 10000)):
    for token in nltk.word_tokenize(parsed_dataset[unicode(i)]['text']):
        freq[token.lower()] += 1

In [1]:
import re
import json
from tqdm import tqdm_notebook
# with open('full_encoded.json', 'r') as f:
#     parsed_dataset = json.load(f)

In [None]:
new_line = re.compile(r'\s*\n\s*')
white_space = re.compile(r'[^\S\n]+')
css_re = re.compile(r'.*{\s*\S+\s*:[\s\S]*}')
space_re = re.compile(r'&nbsp;')
def clean_text(text):
    text = text.lower()
    text = fix_line_break(text, freq)
    text = re.sub(css_re, ' ', text)
    text = re.sub(space_re, ' ', text)
    text = re.sub(new_line, '\n', text)
    text = re.sub(white_space, ' ', text) 
    return text

In [None]:
clean_data = []
labels = []
for k in tqdm_notebook(range(1, len(parsed_dataset) + 1) ):
    text = parsed_dataset[unicode(k)]['text']
    text = clean_text(text)
    clean_data.append(text)
    labels.append(parsed_dataset[unicode(k)]['label'])    
clean_dataset = {'texts' : clean_data, "labels" : labels }

In [11]:
# with open('full_clean.csv', 'w') as fo:
#     json.dump({'texts' : clean_data, "labels" : labels } , fo)

In [1]:
# import json
# with open('full_clean.csv', 'r') as f:
#     dataset = json.load(f)

In [37]:
import numpy as np 
import random
import json

def split_dataset(dataset, seed=233, shuffle=True):
    subsets = [ ('train', 0.9), ('test', 0.1) ]
    k = next(dataset.iterkeys())
    n = len(dataset[k])
    if shuffle:
        np.random.seed(seed)
        indices = np.random.permutation(n)
    else:
        indices = range(n)
    s = 0
    
    for subset_name, fraction in subsets:
        m = int(n * fraction)
        print(subset_name, m)
        sub_indices = indices[s : s + m]
        print(subset_name, s, s + m)
        s += m
        subset = dict()
        for k in dataset:
            subset[k] = [dataset[k][i]  for i in sub_indices ] 

        with open('trec07p/{}.json'.format(subset_name), 'w') as fo:
            json.dump(subset, fo)

In [38]:
split_dataset(clean_dataset, shuffle=False)

('train', 56017)
('train', 0, 56017)
('test', 6224)
('test', 56017, 62241)


In [3]:
import itertools
import re

In [4]:
def unique_dataset(dataset):
    stext = sorted(range(len(dataset['texts'])),key=lambda x : dataset['texts'][x])
    select_indices = []
    for k, index_iter in itertools.groupby(stext, key=lambda x : dataset['texts'][x]):
        select_indices.append(next(index_iter))
    unique_dataset = dict()
    for k in dataset:
        unique_dataset[k] = [ dataset[k][i] for i in select_indices ]
    return unique_dataset

In [33]:
import numpy as np
from tqdm import tqdm_notebook
def further_clean(texts, n=0):
    #css_re = re.compile(r'.*{[\s\S]*:[\s\S]*}')
    #css_re = re.compile(r'this is a multi-part message in mime format.|.*{\s*\S+\s*:[\s\S]*}')
    css_re = re.compile(r'.*{\s*\S+\s*:[\s\S]*}')
    space_re = re.compile(r'&nbsp;')
    return [re.sub(space_re, ' ', re.sub(css_re, '', text) ) for text in tqdm_notebook(texts)]