# Extract

The following snippets of code iterate over several e-mail HTTP responses
from the NIST TREC, SPAM dataset, extracts the sender, content, and word
count.

# Setup

This first block sets up several necessary constans and imports used 
throughout this notebook.

In [71]:
from pathlib import PurePath, Path
from html.parser import HTMLParser

# A dummy class definition to serve as a namespace
# for data constants.
class DataStruct():
    pass

# The assumed directory structure is as follows:
#
# data
# -- TREC
# -- -- trec05p-1
# -- -- trec06p
# -- -- trec07p
# Extract.ipynb
#
data = DataStruct()
data.root = Path("data").resolve()

data.trec  = data.root.joinpath("TREC")
data.trec5 = data.trec.joinpath("trec05p-1")
data.trec6 = data.trec.joinpath("trec06p")
data.trec7 = data.trec.joinpath("trec07p")
data.trec_list = [
    data.trec5,
    data.trec6,
    data.trec7,
]

In [72]:
def ensure_registered_trec_directory(path: PurePath, errmsg: str):
    if not path in data.trec_list:
        raise RuntimeError(errmsg)


def trec_load_index(path: PurePath):
    """
    Loads in the index for the given TREC dataset.
    
    :return
        A tuple containing a list of the ham and spam
        classifications for each file, i.e., (ham, spam),
        where each list contains a Path object pointing
        to the labeled file.
    """
    ensure_registered_trec_directory(path, "Cannot load index for an unknown TREC directory")

    index_dir = path.joinpath("full")
    
    # This may also need to be fixed if any trec
    # directory isn't structured as expected of
    # 05 or 06.
    index_file = index_dir.joinpath("index")
    
    spam = set()
    ham = set()
    
    with index_file.open(mode="rb") as handle:
        for line in handle:
            info = line.decode("utf-8").split(' ')
            index_pointer = index_dir.joinpath(info[1]).resolve()
            if "spam" == info[0]:
                spam.add(index_pointer)
            else:
                ham.add(index_pointer)

    return (ham, spam)

        
def trec_iterate(path: PurePath, corpus_limit: int = None, target_limit: int = None):
    """
    Iterates over all or a specified subset of the TREC
    data referenced by the path(s), yielding a tuple of
    the dataset index (see trec_load_index(...)), 
    corpus directory, and the target file as Path objects,
    i.e., (corpus_dir, target_file).
    
    The index is only loaded once, don't worry.
    
    The expected usage is
    
        for index, corpus, target in trec_iterate(...):
            ...
    
    :param path
        The desired path of the specified TREC directory
        registered with data.trec_list. This parameter MUST
        be one of the pasts listed in data.trec_list.
    
    :param corpus_limit
        The maximum number of directories in the path/data/...
        directory to traverse. The traversal goes in sorted
        order by the directorys' lexicographical ordering.
        A negative number is treated as 0.
    
    :param target_limit
        The maximum number of files to iterate over in each
        iteration of a corpus directory. The traversal goes
        in sorted order by the files' lexicographical ordering.
        A negative number is treated as 0.
        
    Better explained, the number of total files iterated will
    be at most the product of corpus_limit * target_limit.
    """
    ensure_registered_trec_directory(path, "Cannot iterate over an unknown TREC directory")
    
    loaded_index = trec_load_index(path)
    
    # data_path should not be confused with the data
    # global variable defined in the first python cell.
    data_path = path.joinpath("data")
    
    for corpus in sorted(data_path.iterdir()):
        # Create a copy of the current target_limit
        # to make sure we have the original value for
        # reference.
        local_target_limit = target_limit
        
        # ------------------------------------
        # TODO: Fix this hack by properly defining
        # each TREC dataset instead of assuming
        # directory structure.
        if path is data.trec7:
            if local_target_limit is not None:
                if local_target_limit <= 0:
                    break
                local_target_limit -= 1
            yield (loaded_index, data_path, corpus)
            continue
        # -----------------------------------
        
        # Ensure that we have not reached the
        # corpus directory limit yet.
        if corpus_limit is not None:
            if corpus_limit <= 0:
                break
            corpus_limit -= 1
        
        for target in sorted(corpus.iterdir()):
            
            # Again, ensure we have not reached the
            # target file limit yet.
            if local_target_limit is not None:
                if local_target_limit <= 0:
                    break
                local_target_limit -= 1
                
            yield (loaded_index, corpus, target)


def trec_iterate_all(*paths, corpus_limit: int = None, target_limit: int = None):
    """
    See trec_iterate(...) above for more detailed documentation.
    
    :param paths
        A collection of paths to TREC, SPAM directories to 
        be processed.
    """
    for path in paths:
        yield from trec_iterate(path, corpus_limit, target_limit)

In [73]:
class MyHTMLParser(HTMLParser):
    
    def __init__ (self):
        super(MyHTMLParser, self).__init__()
        self.current_email = ""
        
    def handle_starttag(self, tag, attrs):
        pass
    
    def handle_endtag(self, tag):
        pass
    
    def handle_data(self, data):
        self.current_email += data + " "
        
    def error(self, message):
        print('---------------')
        print('ERROR: ' + message)
        print('---------------')


def filter_message_headers(encoded_message):
    new_encoded_message = []
    i = 0
    while i < len(encoded_message):
        sline = encoded_message[i].decode("latin-1")
        if "-----Original Message-----" in sline:
            while i < len(encoded_message) and len(encoded_message[i].decode("latin-1").strip()) != 0:
                i += 1
        if i < len(encoded_message):
            new_encoded_message.append(encoded_message[i])
        i += 1
    return new_encoded_message


def extract_metadata(encoded_message):
    sender = ""
    content_type = ""
    boundary = None
    body_index = None
    
    # [0] = body start
    # [-1] = file end
    split_indices = []
    
    for i, line in enumerate(encoded_message):
        sline = line.decode('latin-1')
        if str.encode("From: ") in line:
            sender = sline[6:].strip()
        if content_type == '' and str.encode("Content-Type: ") in line:
            content_type = sline[14:sline.find(';')]
        if 'boundary' in sline and sline.find('=') != -1:
            boundary = sline[sline.find('=')+1:][1:-2]
            continue
        if boundary and boundary in sline:
            split_indices.append(i)
        if not body_index and sline.strip() == "":
            body_index = i
    if len(split_indices) == 0:
        split_indices.append(body_index)
    split_indices.append(len(encoded_message)-1)
    return (sender, content_type, boundary, split_indices)

In [74]:
for index, corpus, target in trec_iterate_all(data.trec5, data.trec6, data.trec7):
    with target.open(mode="rb") as handle:
        encoded_message = filter_message_headers(handle.readlines())
        sender, content_type, boundary, split_indices = extract_metadata(encoded_message)
        charset = "latin-1"
        body = ""
        
        if len(split_indices) > 2:
            for i in range(len(split_indices) - 1):
                x = split_indices[i]
                y = split_indices[i + 1]
                j = x + 1
                while j < y:
                    line = encoded_message[j].decode(charset).strip()
                    if len(line) == 0:
                        break
                    j += 1
                print("--- EMAIL: --- ", target, is_spam)
                parser = MyHTMLParser()
                parser.feed(b"".join(encoded_message[j+1:y]).decode(charset))
                body = parser.current_email
        else:
            print("--- EMAIL: --- ", target, is_spam)
            try:
                parser = MyHTMLParser()   
                parser.feed(b"".join(encoded_message[split_indices[0]:split_indices[1]]).decode(charset))
                body = parser.current_email
            except:
                body = ""

# OLDER
#         if index_search[int(corpus)][int(collection)]:
#             sender_spam.append(sender)
#             body_spam.append(body)
#         else:
#             sender_ham.append(sender)
#             body_ham.append(body)
        

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/000 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/001 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/002 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/003 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/004 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/005 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/006 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/007 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/008 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/009 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/000/010 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/025 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/026 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/026 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/027 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/028 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/028 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/029 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/029 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/030 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/031 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/031 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/277 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/278 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/278 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/278 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/279 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/279 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/279 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/280 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/280 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/280 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/001/281 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/220 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/220 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/220 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/221 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/222 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/223 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/224 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/225 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/226 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/226 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/002/227 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/128 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/129 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/130 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/130 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/131 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/132 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/133 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/134 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/134 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/135 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/003/135 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/139 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/140 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/004/141 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/113 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/114 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/115 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/116 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/117 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/118 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/119 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/120 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/121 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/122 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/005/123 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/088 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/088 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/088 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/088 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/088 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/088 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/088 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/089 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/089 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/090 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/006/090 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/115 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/115 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/116 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/117 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/118 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/118 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/119 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/119 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/119 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/120 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/007/121 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/128 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/129 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/130 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/131 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/132 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/133 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/134 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/135 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/136 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/136 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/008/136 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/119 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/120 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/121 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/122 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/123 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/124 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/125 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/126 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/127 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/128 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/009/129 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/137 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/138 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/139 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/140 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/140 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/141 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/142 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/142 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/142 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/143 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/010/144 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/121 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/121 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/122 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/122 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/123 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/124 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/125 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/126 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/127 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/128 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/011/129 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/150 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/151 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/152 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/153 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/154 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/155 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/156 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/157 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/157 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/158 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/012/159 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/116 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/117 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/117 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/118 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/119 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/119 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/120 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/120 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/121 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/121 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/013/122 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/049 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/050 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/050 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/050 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/051 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/051 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/052 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/053 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/053 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/054 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/014/055 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/058 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/059 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/059 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/060 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/061 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/062 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/063 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/064 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/065 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/065 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/015/066 False
--- EMAIL:

--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/051 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/052 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/053 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/053 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/054 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/055 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/056 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/057 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/058 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/058 False
--- EMAIL: ---  /home/max/Code/MachineLearning/nlp/data/TREC/trec05p-1/data/016/058 False
--- EMAIL:

KeyboardInterrupt: 