In [1]:
import mailbox
from datetime import datetime
from datetime import timedelta
import re

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# apply matplotlib's built-in 'ggplot' style sheet to figures drawn after this point
matplotlib.style.use('ggplot')

In [2]:
# load emails from file
# create=False: mailbox.mbox defaults to create=True, which would silently
# create a new, empty mailbox if this path were wrong or the export missing.
# With create=False a missing file raises mailbox.NoSuchMailboxError instead.
mbox = mailbox.mbox('/srv/zooniverse/raw_data/emails/zooniverse.mbox', create=False)

In [98]:
# compiled regex for getting date sent from email dump
# ('.' does not cross a newline, so this captures one "Date: ..." line)
date_match = re.compile(r'Date: .*')

# extract the initial send date from the message body
def parse_date(message):
    """Return the send time found on the first ``Date: `` line of ``message``.

    Everything up to and including the first ``', '`` on that line is
    discarded (presumably the weekday -- verify against the dump), and the
    remainder is parsed with the format ``'%b %d, %Y at %I:%M %p'``.

    Parameters
    ----------
    message : str
        Text of one email part, as produced by ``as_string()``.

    Returns
    -------
    datetime.datetime
        Naive datetime parsed from the line.

    Raises
    ------
    ValueError
        If no ``Date: `` line is present, the line has no ``', '`` separator,
        or the remaining text does not match the expected format.
    """
    # only the first occurrence is used, so stop scanning at the first hit
    found = date_match.search(message)
    if found is None:
        raise ValueError("no 'Date: ' line found in message")

    pieces = found.group(0).split(', ', 1)
    if len(pieces) < 2:
        raise ValueError("unexpected 'Date: ' line: %r" % found.group(0))

    # '.*' also swallows a trailing '\r' (CRLF line endings) or spaces, which
    # strptime would reject as unconverted data, so trim before parsing
    date_string = pieces[1].strip()
    return datetime.strptime(date_string, '%b %d, %Y at %I:%M %p')

# add the gap between consecutive emails as a 'timedelta' column
def create_timedelta(df):
    """Order the rows by ``date`` and attach the gap to the preceding row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column. The caller's frame is not modified;
        ``sort_values`` returns a new frame and the column is added to that.

    Returns
    -------
    pandas.DataFrame
        The date-sorted rows with a ``timedelta`` column (``date`` minus the
        previous row's ``date``), excluding the first and the last row.
    """
    ordered = df.sort_values('date')
    previous_date = ordered['date'].shift(1)
    ordered['timedelta'] = ordered['date'] - previous_date
    # The first row has no predecessor, so its timedelta is missing.
    # NOTE(review): the slice also discards the LAST row, whose timedelta is
    # defined -- confirm that is intended and not an off-by-one.
    return ordered.iloc[1:-1]

In [97]:
# compiled regex for getting the name of the project from the url embedded in the email
# group 1: path segment right after /projects/, group 2: the project slug,
# group 3: the terminator ('?' before a query string, or the closing '>').
# Raw string so '\?' is a regex escape rather than an invalid string escape;
# the dots in the host name are escaped so they only match literal dots.
# DOTALL lets the slug span a line break inserted in the middle of the URL.
project_match = re.compile(r'<https://www\.zooniverse\.org/projects/(.*?)/(.*?)(\?|>)', flags=re.DOTALL)

# extract projects named by the email
def extract_project_names(message):
    """Return the set of project names linked from ``message``.

    Each ``<https://www.zooniverse.org/projects/<segment>/<slug>...>`` link
    contributes its slug, cleaned up as follows: dashes become spaces,
    ``'=\\n'`` sequences (line breaks inside the URL) are removed, and a
    ``/talk`` suffix is dropped.

    Parameters
    ----------
    message : str
        Text of one email part.

    Returns
    -------
    set of str
        Distinct cleaned project names; empty if the message has no links.
    """
    # Search the argument itself, not a global: the previous version read a
    # notebook-level ``body`` variable and so ignored what was passed in.
    return {
        link.group(2).replace('-', ' ').replace('=\n', '').replace('/talk', '')
        for link in project_match.finditer(message)
    }

In [99]:
# walk the mailbox once, collecting each email's send date and the set of
# projects it links to; the two lists stay aligned, one entry per message
date_list, project_list = [], []

for m in mbox:
    # get_payload(0) picks the first sub-part of the message
    # (presumably every message in the dump is multipart -- verify)
    body = m.get_payload(0).as_string()
    date, project = parse_date(body), extract_project_names(body)
    date_list += [date]
    project_list += [project]

In [100]:
# one row per email (send date + set of linked projects), passed straight
# through create_timedelta to obtain the 'timedelta' column
df = create_timedelta(
    pd.DataFrame({'date': date_list, 'project': project_list})
)

In [115]:
def expand(df):
    """Return a long-format copy of ``df`` with one row per (email, project).

    The ``project`` column is expected to hold an iterable of names per row
    (sets, in this notebook). Every other column is repeated for each name.
    Rows whose iterable is empty are kept once, with a missing ``project``.

    The flattening is done with plain iteration rather than a row-wise
    ``apply`` followed by ``stack``: that route pads shorter rows with NaN and
    depends on ``stack`` dropping the padding again, and it builds one Series
    per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``project`` column of iterables. Not modified.

    Returns
    -------
    pandas.DataFrame
        The non-``project`` columns followed by a scalar ``project`` column.
        The index is the 0-based row position in ``df``, repeated for rows
        that name several projects.
    """
    df = df.reset_index(drop=True)

    # flatten to two parallel lists: source row position and project name
    positions = []
    names = []
    for position, projects in enumerate(df['project']):
        for name in projects:
            positions.append(position)
            names.append(name)

    # explicit dtypes keep the join well-defined even when nothing was found
    expanded = pd.Series(
        names,
        index=pd.Index(positions, dtype='int64'),
        name='project',
        dtype=object,
    )
    # left join on the row position: rows without projects survive with NaN
    return df.drop('project', axis=1).join(expanded)


In [116]:
# write the long-format table (one row per email/project pair) to disk,
# leaving the DataFrame index out of the file
expand(df).to_csv(
    '/srv/zooniverse/tables/emails.csv',
    index=False,
)