In [65]:
from datetime import datetime
from datetime import timedelta

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
matplotlib.style.use('ggplot')

from sklearn.linear_model import LinearRegression

# Load and Clean Data

In [2]:
email_df = pd.read_csv('/srv/zooniverse/tables/emails.csv')
category_df = pd.read_csv('/srv/zooniverse/tables/project_categories_3-12-19.csv')
project_df = pd.read_csv('/srv/zooniverse/tables/all_projects_table_02-18-19.csv')

In [3]:
# only use "official" projects and classifications
project_df = project_df.loc[(project_df['panoptes_api_official_project'] == 1)]

# only include panoptes projects that also have classification data
project_df = project_df.loc[(project_df['panoptes_dump'] == 1) & (project_df['panoptes_api'] == 1)]

# convert email timestamp to datetime
email_df['date'] = pd.to_datetime(email_df['date'])

#lowercase project names
email_df['project'] = email_df['project'].str.lower()
category_df['project_name'] = category_df['project_name'].str.lower()
project_df['panoptes_project_name'] = project_df['panoptes_project_name'].str.lower()

In [4]:
# get names of projects for which we have email, category, and classification data
overlapping_projects = category_df.loc[(category_df['project_name'].str.lower().isin(project_df['panoptes_project_name'].str.lower())) & (category_df['project_name'].str.lower().isin(email_df['project'].str.lower()))][['project_name']]

In [5]:
# load classification df
classification_df = pd.read_csv('/srv/zooniverse/tables/all_classifications_table_02-18-19.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# only use "official" projects and classifications
classification_df = classification_df.loc[classification_df['panoptes_api_official_project'] == 1]

# add name column to classification_df
classification_df = classification_df.merge(project_df[['panoptes_project_name','panoptes_project_id']],on='panoptes_project_id')

# online use projects with email, category, and classification data
classification_df = classification_df.loc[classification_df['panoptes_project_name'].str.lower().isin(overlapping_projects['project_name'].str.lower())]

# convert classification timestamp to datetime
classification_df['created_at'] = pd.to_datetime(classification_df['created_at'])

In [7]:
def get_weeks(x):
    return x.isocalendar()[0]*52+x.isocalendar()[1]

classification_df['iso_week'] = classification_df['created_at'].apply(get_weeks)
email_df['iso_week'] = email_df['date'].apply(get_weeks)

In [8]:
def count_classifications_per_week(classification_df):
    # grouby weeks and count classifications per week
    classifications_by_week = classification_df.groupby(['panoptes_project_name','iso_week'])['iso_week'].size().to_frame('classifications')
    # reindex to fill missing values
    classifications_by_week = classifications_by_week.reindex(pd.MultiIndex.from_product(classifications_by_week.index.levels, names=classifications_by_week.index.names))
    classifications_by_week.reset_index(inplace=True)
    # switch NaN to 0
    classifications_by_week['classifications'] = classifications_by_week['classifications'].fillna(0)
    return classifications_by_week

# grouby weeks and count classifications per week
classifications_by_week = count_classifications_per_week(classification_df)

In [9]:
# remove emails sent later than panoptes classification data
email_df = email_df.loc[email_df['iso_week'] < classification_df['iso_week'].max()]

In [10]:
# remove duplicate email
email_df = email_df.drop_duplicates(subset=['project','iso_week'])

# Don't run this

In [45]:
# remove emails from subsequent weeks
email_df = email_df.loc[~email_df['iso_week'].isin(email_df['iso_week'].add(1))]

# Transform Data

In [11]:
# log transform classifications
log_classifications_by_week = classifications_by_week.copy()
log_classifications_by_week['classifications'] = np.log1p(classifications_by_week['classifications'])

In [99]:
def clean_set(x):
    return x[1:-1].replace("'","").replace(' ','').replace(',','|')

def split_set(x):
    return x.split('|')

def remove_zeros(df):
    df = df.loc[df.groupby('panoptes_project_name')['classifications'].transform('sum') > 0]
    return df

def filter_by_category(classifications_by_week, email_df, category_df, bandwidth):
    
    # make sure the column names match and merge w/ categories
    email_category_df = add_email_offset(email_df).merge(category_df.rename(columns={'project_name':'project'}),on='project')
    classification_category_df = classifications_by_week.merge(category_df.rename(columns={'project_name':'panoptes_project_name'}),on='panoptes_project_name')

    # turn the category column into a list
    email_category_df['categories'] = email_category_df['categories'].apply(clean_set).apply(split_set)

    result = pd.DataFrame()
    
    #iterate through all emails in the email df
    for email in email_category_df.itertuples():
        #iterate trhough each week in the bandwidth
        for offset in range(-bandwidth,bandwidth):
            # get the number of classifcations for each project for a given week
            classification_chunk = classification_category_df.loc[classification_category_df['iso_week'] == email.iso_week + offset]
            # calculate the relative week, create rel_week column
            classification_chunk['rel_week'] = offset
            # calculate whether the newsletter has been sent, create newsletter column
            if offset < 0:
                classification_chunk['newsletter'] = 0
            else:
                classification_chunk['newsletter'] = 1
            # create targeted column
            classification_chunk.loc[classification_chunk['panoptes_project_name'] == email.project,'targeted'] = 1
            classification_chunk['targeted'].fillna(0,inplace=True)
            # iterate through all the targeted project's catefories
            for category in email.categories:
                # create targeted_topic colunm
                classification_chunk.loc[classification_chunk['categories'].str.contains(category),'targeted_topic'] = 1
            classification_chunk['targeted_topic'].fillna(0,inplace=True)            
            
            result = result.append(classification_chunk)
        
        #remove all project weeks where 0 classifications were made over the entire bandwidth
        result = remove_zeros(result)
        
    #result = result.merge(result['panoptes_project_name'].str.get_dummies(), left_index=True, right_index=True)
    return result

In [102]:
rdd_df = filter_by_category(classifications_by_week=log_classifications_by_week,
                            email_df=email_df,
                            category_df=category_df,
                            bandwidth=1)
rdd_df.to_csv('/srv/zooniverse/pre_regression/4-10-19.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html

In [103]:
rdd_df = filter_by_category(classifications_by_week=log_classifications_by_week,
                            email_df=email_df,
                            category_df=category_df,
                            bandwidth=2)
rdd_df.to_csv('/srv/zooniverse/pre_regression/bandwodth_2_4-10-19.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html

In [104]:
rdd_df = filter_by_category(classifications_by_week=log_classifications_by_week,
                            email_df=email_df,
                            category_df=category_df,
                            bandwidth=3)
rdd_df.to_csv('/srv/zooniverse/pre_regression/bandwodth_3_4-10-19.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html