In [3]:
# load a few useful libs
import tarfile
import gc
import email
import json
import re
import io
import os
import glob
import pandas

In [4]:
message_save_dir = "./datasets/processed/messages/"
metadata_save_dir = "./datasets/processed/metadata/"

# Warning: Next cell may take too much time to be run!

In [None]:
# let's iterate over the dataset file without opening it up on the file system

counter = 0
with tarfile.open("./datasets/sample_enron_mail_20150507.tar.gz") as dataset_file:
    for file in dataset_file.getmembers():
        try:
            file_content = dataset_file.extractfile(file).read()
            parsed_message = email.message_from_string(file_content.decode("utf-8"))
            metadata = dict(parsed_message.items())
            print(metadata['To'])
            metadata['To'] = re.sub(r'[\n\r\t ]+', '', metadata['To']).split(",")
            print(metadata['To'])
            message = parsed_message.get_payload()
            
            # write out the message
            with io.open(os.path.join(message_save_dir, metadata['Message-ID']+".txt"), "w") as message_datafile:
                message_datafile.write(message)
            
             # write out the message metadata(as json)
            with io.open(os.path.join(metadata_save_dir, metadata['Message-ID']+".json"), "w") as metadata_file:
                json.dump(metadata, metadata_file)
                
            #print(str(counter) + " " + file.name + " processed!")
            
        except Exception as error:
            print(error)
        finally:
            counter += 1
            
# let gc!
gc.collect()

# Warning: Run next cell just once!

In [7]:
# don't run this cell multiple times
metadata_files = glob.glob(os.path.join(metadata_save_dir, "*.json"))

In [12]:
counter = 0
graph = dict()
for metadata_file in metadata_files:
    if counter < 100:
        print(metadata_file)
        counter += 1
    else:
        continue
    with io.open(metadata_file) as metadata:
        metadata = json.load(metadata)
        if metadata['From'] in graph:
            graph[metadata['From']].extend(metadata['To'])
        else:
            graph[metadata['From']] = (metadata['To'])

./datasets/processed/metadata/<14334439.1075853798834.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<15336750.1075842783088.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<28459290.1075862331133.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<24352658.1075844263182.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<23832863.1075849650001.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<26261130.1075841969911.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<23492487.1075860517873.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<767522.1075847211999.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<10004.1075857920365.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<9580800.1075857796783.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<20903400.1075860953063.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<6676511.1075845203578.JavaMail.evans@thyme>.json
./datasets/processed/metadata/<7042

In [33]:
with io.open("./datasets/processed/enron_graph.tsv", "w") as enron_graph:
    enron_graph.write("source\ttarget\tweight\n")
    for k, v in graph.items():
        source = k
        targets = [(target, v.count(target)) for target in v]
        for pair in targets:
            target, weight = pair
            enron_graph.write(source+"\t"+target+"\t"+str(weight)+"\n")

In [9]:
enron_graph = pandas.read_csv("./datasets/processed/enron_graph.tsv", sep="\t")

In [10]:
enron_graph.head(10)

Unnamed: 0,source,target,weight
0,chris.germany@enron.com,elizabeth.hernandez@enron.com,1
1,chris.germany@enron.com,brenda.fletcher@enron.com,1
2,chris.germany@enron.com,scott.goodell@enron.com,1
3,eric.gillaspie@enron.com,gerald.nemec@enron.com,1
4,stanley.horton@enron.com,rod.hayslett@enron.com,1
5,jgallagher@epsa.org,acomnes@enron.com,1
6,jgallagher@epsa.org,bhawkin@enron.com,1
7,jgallagher@epsa.org,carin.nersesian@enron.com,1
8,jgallagher@epsa.org,christi.l.nicolay@enron.com,1
9,jgallagher@epsa.org,donna.fulton@enron.com,1


In [12]:
groupby_people = enron_graph.groupby(by=["source", "target"])

# unique emails(a.k.a graph nodes)
groupby_people.count().reset_index()['source']

0             andrea.ring@enron.com
1               bill.rapp@enron.com
2               bill.rapp@enron.com
3               bill.rapp@enron.com
4               bill.rapp@enron.com
5              brad.mckay@enron.com
6             brant.reves@enron.com
7             brant.reves@enron.com
8             brant.reves@enron.com
9           brian.redmond@enron.com
10          brian.redmond@enron.com
11          brian.redmond@enron.com
12          brian.redmond@enron.com
13          bryan.garrett@enron.com
14           bryant@cheatsheets.net
15               c..giron@enron.com
16         charles.weldon@enron.com
17          charles.yeung@enron.com
18          charles.yeung@enron.com
19          charles.yeung@enron.com
20          charles.yeung@enron.com
21           chris.foster@enron.com
22          chris.germany@enron.com
23          chris.germany@enron.com
24          chris.germany@enron.com
25        christi.nicolay@enron.com
26        christi.nicolay@enron.com
27        christi.nicolay@en

In [6]:
 metadata['To']

['wise.counsel@lpl.com']