In [1]:
import mailbox
from datetime import datetime
from datetime import timedelta
import re

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# switch matplotlib to its built-in 'ggplot' style sheet for the plots that follow
matplotlib.style.use('ggplot')

In [2]:
# compiled regex for getting date sent from email dump
match = re.compile('Date: .*')

# extract the initial send date from the message body
def parse_date(message):
    """Return the send date found in the first ``Date: ...`` line of ``message``.

    The matched line is expected to look like
    ``Date: Mon, Feb 18, 2019 at 3:04 PM``: everything up to and including
    the first ``', '`` is discarded and the remainder is parsed with the
    format ``'%b %d, %Y at %I:%M %p'``.

    Parameters
    ----------
    message : str
        Text to search for the ``Date: `` line.

    Returns
    -------
    datetime.datetime
        Naive datetime; the format string parses no timezone information.

    Raises
    ------
    IndexError
        If no ``Date: `` line is present, or the line contains no ``', '``.
    ValueError
        If the text after the first ``', '`` does not match the format.
    """
    found = match.search(message)
    if found is None:
        # IndexError keeps the exception type of a plain findall(...)[0] lookup
        raise IndexError('no "Date: " line found in message')

    # '.' also matches '\r', so with CRLF line endings the capture ends in a
    # carriage return that strptime rejects; strip surrounding whitespace.
    date_string = found.group(0).split(', ', 1)[1].strip()

    # NOTE(review): %b and %p are locale-dependent; assumes an English locale.
    return datetime.strptime(date_string, '%b %d, %Y at %I:%M %p')

# create a new column of timedeltas between rows
def create_timedelta(df):
    """Sort ``df`` by ``'date'`` and add the gap to the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'date'`` column. It is not modified; ``sort_values``
        returns a new frame and the new column is added to that one.

    Returns
    -------
    pandas.DataFrame
        The date-sorted rows with an extra ``'timedelta'`` column holding
        ``date - previous date``. The earliest row is dropped because it has
        no predecessor (its timedelta would be NaT). The original index
        labels are kept.
    """
    df = df.sort_values('date')
    df['timedelta'] = df['date'] - df['date'].shift(1)
    # Only the first row lacks a timedelta. Slicing with [1:-1] also threw
    # away the most recent row, which has a perfectly valid value.
    # .copy() hands back an independent frame so callers can add columns to it
    # without pandas warning about assignment on a slice.
    return df.iloc[1:].copy()

def create_email_df(mbox_path):
    """Build a DataFrame of email send dates from an mbox file.

    For every message in the mailbox, the first part of its payload is
    rendered to text and passed to ``parse_date``. The collected dates are
    put in a single ``'date'`` column and run through ``create_timedelta``.

    Parameters
    ----------
    mbox_path : str
        Path to an existing mbox file.

    Returns
    -------
    pandas.DataFrame
        Whatever ``create_timedelta`` returns for the ``'date'`` frame.

    Raises
    ------
    mailbox.NoSuchMailboxError
        If ``mbox_path`` does not exist.
    """
    # create=False: the default (create=True) silently creates an empty mbox
    # file at a mistyped path instead of reporting the problem.
    mbox = mailbox.mbox(mbox_path, create=False)
    try:
        # iterate through email dump and extract all send dates
        # get_payload(0) takes the first MIME part — assumes every message is
        # multipart; TODO confirm against the dump
        date_list = [parse_date(m.get_payload(0).as_string()) for m in mbox]
    finally:
        # release the file handle the mailbox keeps open
        mbox.close()

    # create a dataframe with timedeltas
    df = pd.DataFrame({'date': date_list})
    return create_timedelta(df)

In [3]:
# read the three inputs: the email mailbox plus the classification and project tables
email_df = create_email_df(
    '/srv/zooniverse/raw_data/emails/zooniverse.mbox'
)
classification_df = pd.read_csv(
    '/srv/zooniverse/tables/all_classifications_table_02-18-19.csv'
)
project_df = pd.read_csv(
    '/srv/zooniverse/tables/all_projects_table_02-18-19.csv'
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def get_weeks(x):
    """Return a running week number for the date or datetime ``x``.

    Weeks run Monday to Sunday, matching ISO weeks. All days of one ISO week
    map to the same integer and consecutive weeks map to consecutive
    integers, including across year boundaries. The values are meant for
    comparing and offsetting weeks, not for recovering the ISO year or week.

    ``year * 52 + week`` is not used because ISO years can have 53 weeks:
    2015-W53 and 2016-W01 would both give 104833.

    Parameters
    ----------
    x : datetime.date or datetime.datetime (including pandas.Timestamp)
        Any object providing ``toordinal()``.

    Returns
    -------
    int
        Number of whole Monday-based weeks since 0001-01-01.
    """
    # toordinal() counts days with 0001-01-01 == 1, and that day is a Monday,
    # so shifting by one and floor-dividing by 7 yields Monday-aligned weeks.
    return (x.toordinal() - 1) // 7

# keep only the rows flagged as belonging to an "official" project
classification_df = classification_df[classification_df['panoptes_api_official_project'].eq(1)]
project_df = project_df[project_df['panoptes_api_official_project'].eq(1)]

# keep only projects that also have classification data: flagged in the
# panoptes API and in at least one of the two dumps
project_df = project_df[
    project_df['panoptes_api'].eq(1)
    & (project_df['ouroboros_dump'].eq(1) | project_df['panoptes_dump'].eq(1))
]

# parse the classification timestamps into datetimes
classification_df['created_at'] = pd.to_datetime(classification_df['created_at'])

# tag every email and every classification with its week number
email_df['iso_week'] = email_df['date'].apply(get_weeks)
classification_df['iso_week'] = classification_df['created_at'].apply(get_weeks)

# Run to only use Panoptes Classifications

In [5]:
# only use panoptes classifications: keep rows whose 'panoptes_dump' flag is 1
classification_df = classification_df.loc[classification_df['panoptes_dump'] == 1]

# Run to only use emails during classification window

In [16]:
# keep emails sent before the last week that has classifications
# (strict '<': that final week itself is excluded; no lower bound is applied)
email_df = email_df[email_df['iso_week'].lt(classification_df['iso_week'].max())]

In [27]:
# attach each classification's project name, then count classifications
# per (project name, week) pair
classifications_by_week = (
    classification_df
    .merge(
        project_df[['panoptes_project_name','panoptes_project_id']],
        on='panoptes_project_id',
        how='left',
    )
    .groupby(['panoptes_project_name','iso_week'])['iso_week']
    .size()
    .to_frame('classification_count')
    .reset_index()
)

In [28]:
# number of weeks used to offset each email's week
window = 1
# the week `window` weeks before each email's week
start = (email_df['iso_week'] - window).to_frame('start')
# the week `window - 1` weeks after each email's week (the email's own week when window == 1)
end = (email_df['iso_week'] + (window - 1)).to_frame('end')

In [34]:
# keep the weekly counts whose week is one of the 'start' weeks or one of the 'end' weeks
windowed_classifications_by_week = classifications_by_week[
    classifications_by_week['iso_week'].isin(start['start'])
    | classifications_by_week['iso_week'].isin(end['end'])
]

In [38]:
# set newsletter = 0 on the rows whose week is one of the 'start' weeks;
# rows that do not match are left untouched by this statement
# NOTE(review): this writes to classifications_by_week, not to
# windowed_classifications_by_week built just above — confirm that is intended
classifications_by_week.loc[(classifications_by_week['iso_week'].isin(start['start'])),'newsletter'] = 0