In [98]:
import mailbox
from datetime import datetime
from datetime import timedelta
import re

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
matplotlib.style.use('ggplot')

from scipy import stats

In [2]:
# compiled regex for getting date sent from email dump
match = re.compile('Date: .*')

# extract the initial send date from the message body
def parse_date(message):
    date_string = re.findall(match, message)[0].split(', ',1)[1]
    date_time = datetime.strptime(date_string,'%b %d, %Y at %I:%M %p')
    return date_time

# create a new column of timedeltas between rows
def create_timedelta(df):
    df = df.sort_values('date')
    df['timedelta'] = df['date'].subtract(df['date'].shift(1))
    return df[1:-1]

def create_email_df(mbox_path):
    # load emails from file
    mbox = mailbox.mbox(mbox_path)
    
    # iterate through email dump and extract all send dates
    date_list = []

    for m in mbox:
        body = m.get_payload(0).as_string()
        date = parse_date(body)
        date_list.append(date)
        
    # create a dataframe with timedeltas
    df = pd.DataFrame({'date':date_list})
    df = create_timedelta(df)

    return df

In [3]:
# load all data
email_df = create_email_df('/srv/zooniverse/raw_data/emails/zooniverse.mbox')
classification_df = pd.read_csv('/srv/zooniverse/tables/all_classifications_table_02-18-19.csv')
project_df = pd.read_csv('/srv/zooniverse/tables/all_projects_table_02-18-19.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def get_weeks(x):
    return x.isocalendar()[0]*52+x.isocalendar()[1]

# only use "official" projects and classifications
classification_df = classification_df.loc[classification_df['panoptes_api_official_project'] == 1]
project_df = project_df.loc[(project_df['panoptes_api_official_project'] == 1)]

# only include projects that also have classification data
project_df = project_df.loc[((project_df['panoptes_api'] == 1) & (project_df['ouroboros_dump'] == 1)) | (project_df['panoptes_dump'] == 1) & (project_df['panoptes_api'] == 1)]

# convert classification timestamp to datetime
classification_df['created_at'] = pd.to_datetime(classification_df['created_at'])

# create iso_week column for email and classification dfs
email_df['iso_week'] = email_df['date'].apply(get_weeks)
classification_df['iso_week'] = classification_df['created_at'].apply(get_weeks)

# add project name to classification df
classification_df = classification_df.merge(project_df[['panoptes_project_name','panoptes_project_id']],on='panoptes_project_id',how='left')

# Run to only use Panoptes Classifications

In [5]:
# only use panoptes classifications
classification_df = classification_df.loc[classification_df['panoptes_dump'] == 1]

# Run to only use emails during classification window

In [44]:
# remove emails sent later than panoptes classification data
email_df = email_df.loc[email_df['iso_week'] < classification_df['iso_week'].max()]

In [45]:
# remove emails from subsequent weeks
email_df = email_df.loc[~email_df['iso_week'].isin(email_df['iso_week'].add(1))]

In [108]:
def ttest(classification_df, email_df, window=1):

    # group classifications by project and iso_week
    classifications_by_week = classification_df.groupby(['panoptes_project_id','iso_week'])['iso_week'].size()
    # add 0s for missing weeks
    classifications_by_week = classifications_by_week.reindex(pd.MultiIndex.from_product(classifications_by_week.index.levels, names=classifications_by_week.index.names),fill_value=0)
    classifications_by_week = classifications_by_week.to_frame('classification_count').reset_index()
    
    # create dfs with start and end iso_weeks
    start = email_df['iso_week'].subtract(window).to_frame('start')
    end = email_df['iso_week'].add(window-1).to_frame('end')
    windowed_classifications_by_week_start = classifications_by_week.loc[(classifications_by_week['iso_week'].isin(start['start']))]
    windowed_classifications_by_week_end = classifications_by_week.loc[(classifications_by_week['iso_week'].isin(end['end']))]
    
    # merge start and end week classification counts into single df
    windowed_classifications_by_week_end['iso_week'] = windowed_classifications_by_week_end['iso_week'].subtract(1)
    windowed_classifications_by_week = windowed_classifications_by_week_start.merge(windowed_classifications_by_week_end,on=['panoptes_project_id','iso_week'])
    
    # only include weeks where there was at least one classification before or after the newsletter
    windowed_classifications_by_week = windowed_classifications_by_week.loc[(windowed_classifications_by_week['classification_count_x'] > 0) | (windowed_classifications_by_week['classification_count_y'] > 0)]
    
    ttest = stats.ttest_ind(windowed_classifications_by_week['classification_count_x'].values,
                            windowed_classifications_by_week['classification_count_y'].values,
                            equal_var = False)
    
    print('statistic: {0}\np value: {1})'.format(ttest.statistic,ttest.pvalue))
    

# Run T-Test
perform a for the difference in means between classifications per project the week before the newsletter and the week of the newsletter

In [109]:
ttest(classification_df,email_df)

statistic: -1.3321141370774052
p value: 0.18337514041076183)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
