In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
matplotlib.style.use('ggplot')

import requests
import json

In [2]:
# specify data directory and file
data_dir = '/srv/zooniverse/raw_data/panoptes/all-panoptes-classifications-2017-01-30.csv'
# read csv file of panoptes classifications
classification_df = pd.read_csv(data_dir)

  interactivity=interactivity, compiler=compiler, result=result)


In [115]:
# specify data directory and file
ouroboros_data_file = '/srv/zooniverse/tables/ouroboros_project_table_03-02-17.csv'
# read csv file of panoptes classifications
ouroboros_classification_df = pd.read_csv(ouroboros_data_file)

In [102]:
# create a project dataframe that contains the total number of classifications per project
project_df = classification_df.groupby('project_id').size().to_frame('panoptes_dump_classification_count')
# get the workflow IDs for each project and store as a list
project_df = project_df.merge(classification_df.groupby('project_id')['workflow_id'].unique().to_frame('panoptes_dump_workflows'),left_index=True,right_index=True)
# reset index
project_df = project_df.reset_index()
# create 'panoptes_dump' column
project_df['panoptes_dump'] = 1
# rename project_id field for merge later
project_df = project_df.rename(columns={'project_id':'panoptes_project_id'})

In [105]:
# set the API endpoint
base_url = r'https://panoptes.zooniverse.org/api/projects'
# set necessary headers for zooniverse API
headers = {
    'Accept':'application/vnd.api+json; version=1',
    'Content-Type':'application/json'
}
params = {}
api_result_df = pd.DataFrame()
while True:
    # send and recieve HTTP request to API endpoint
    r = requests.get(base_url,
                    params=params,
                    headers=headers)
    # convert the result to JSON
    api_result = r.json()
    
    # iterate through each project in a page of API results
    for project in api_result['projects']:
        # collect relivant fields from json
        api_result_dict = {
            'panoptes_project_id':project['id'],
            'panoptes_project_name':project['display_name'],
            'panoptes_migrated':project['migrated'],
            'panoptes_description':project['description'],
            'panoptes_live':project['live'],
            'panoptes_launch_date':project['launch_date'],
            'panoptes_completeness':project['completeness'],
            'panoptes_api_subject_count':project['subjects_count'],
            'panoptes_api_classificaitons_count':project['classifiers_count'],
        }
        # collect the workflow information if it exists
        # this is useful for comparing dump results against API results
        if 'workflows' in project['links']:
            api_result_dict['panoptes_api_workflows'] = project['links']['workflows']
        else:
            api_result_dict['panoptes_api_workflows'] = None
        # collect the project roles if they're listed
        if 'project_roles' in project['links']:
            api_result_dict['panoptes_api_roles'] = project['links']['project_roles']
        else:
            api_result_dict['panoptes_api_roles'] = None
        # add the project data to the projects dataframe
        api_result_df = api_result_df.append(pd.DataFrame([api_result_dict]))
    
    # if there is another page of search results, add that href to the next query
    # else return the API result
    if api_result['meta']['projects']['next_href']:
        params['page'] = api_result['meta']['projects']['next_href'].split('=')[-1]
        print(api_result['meta']['projects']['next_href'])
    else:
        break
        
# convert the project ID from a string to a numeric field
api_result_df['panoptes_project_id'] = pd.to_numeric(api_result_df['panoptes_project_id'])
# create a panoptes_api field
api_result_df['panoptes_api'] = 1
    

/projects?page=2
/projects?page=3
/projects?page=4
/projects?page=5
/projects?page=6
/projects?page=7
/projects?page=8
/projects?page=9
/projects?page=10
/projects?page=11
/projects?page=12
/projects?page=13
/projects?page=14
/projects?page=15
/projects?page=16
/projects?page=17
/projects?page=18
/projects?page=19
/projects?page=20
/projects?page=21
/projects?page=22
/projects?page=23
/projects?page=24
/projects?page=25


In [128]:
joined_df = api_result_df.merge(project_df,on='panoptes_project_id',how='outer')

In [116]:
ouroboros_classification_df = ouroboros_classification_df.rename(columns={'_id':'ouroboros_mongo_id',
                                            'panoptes_id':'panoptes_project_id',
                                            'activated_subjects_at':'ouroboros_meta_activated_subjects_at',
                                            'classification_count':'ouroboros_meta_classification_count',
                                            'complete_count':'ouroboros_meta_complete_count',
                                            'created_at':'ouroboros_meta_created_at',
                                            'display_name':'ouroboros_project_name',
                                            'panoptes_id':'panoptes_project_id',
                                            'user_count':'ouroboros_meta_user_count'})
ouroboros_classification_df['ouroboros_dump'] = 1
ouroboros_classification_df = ouroboros_classification_df[['ouroboros_mongo_id',
                                                           'ouroboros_meta_activated_subjects_at',
                                                           'ouroboros_meta_classification_count',
                                                           'ouroboros_meta_complete_count',
                                                           'ouroboros_meta_created_at',
                                                           'ouroboros_project_name',
                                                           'panoptes_project_id', 
                                                           'ouroboros_meta_user_count',
                                                           'ouroboros_dump']]

In [129]:
joined_df = joined_df.merge(ouroboros_classification_df,on='panoptes_project_id',how='outer')

In [134]:
len(joined_df.loc[(joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 1)])

58

In [140]:
# set NaN values to 0
joined_df.loc[joined_df['panoptes_dump'].isnull(),'panoptes_dump'] = 0
joined_df.loc[joined_df['panoptes_api'].isnull(),'panoptes_api'] = 0
joined_df.loc[joined_df['ouroboros_dump'].isnull(),'ouroboros_dump'] = 0

In [141]:
# drop non-uniques base on panoptes_project_id
joined_df = joined_df.drop_duplicates(subset='panoptes_project_id',keep='first')

In [155]:
print('panoptes dump: {0}'.format(len(joined_df.loc[joined_df['panoptes_dump'] == 1])))
print('panoptes api: {0}'.format(len(joined_df.loc[joined_df['panoptes_api'] == 1])))
print('ouroboros dump: {0}\n'.format(len(joined_df.loc[joined_df['ouroboros_dump'] == 1])))
print('panoptes dump + api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))
print('panoptes dump + ouroboros dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['ouroboros_dump'] == 1)])))
print('ouroboros dump + panoptes api: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 1)])))
print('only panoptes dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 0)])))
print('only panoptes api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 0)])))
print('only ouroboros dump: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 1)])))

panoptes dump: 750
panoptes api: 373
ouroboros dump: 30

panoptes dump + api: 234
panoptes dump + ouroboros dump: 0
ouroboros dump + panoptes api: 29

only panoptes dump: 516
only panoptes api: 110
only ouroboros dump: 1



In [159]:
print('panoptes dump: {0}'.format(len(joined_df.loc[joined_df['panoptes_dump'] == 1])/len(joined_df)))
print('panoptes api: {0}'.format(len(joined_df.loc[joined_df['panoptes_api'] == 1])/len(joined_df)))
print('ouroboros dump: {0}\n'.format(len(joined_df.loc[joined_df['ouroboros_dump'] == 1])/len(joined_df)))
print('panoptes dump + api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 1)])/len(joined_df)))
print('panoptes dump + ouroboros dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['ouroboros_dump'] == 1)])/len(joined_df)))
print('ouroboros dump + panoptes api: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 1)])/len(joined_df)))
print('only panoptes dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 0)])/len(joined_df)))
print('only panoptes api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 0)])/len(joined_df)))
print('only ouroboros dump: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 1)])/len(joined_df)))

panoptes dump: 0.8426966292134831
panoptes api: 0.4191011235955056
ouroboros dump: 0.033707865168539325

panoptes dump + api: 0.26292134831460673
panoptes dump + ouroboros dump: 0.0
ouroboros dump + panoptes api: 0.03258426966292135

only panoptes dump: 0.5797752808988764
only panoptes api: 0.12359550561797752
only ouroboros dump: 0.0011235955056179776



In [163]:
# specify data directory and file
ouroboros_classification_file = '/srv/zooniverse/tables/ouroboros_classification_table_03-09-17.csv'
# read csv file of panoptes classifications
ouroboros_classification_df = pd.read_csv(ouroboros_classification_file)

  interactivity=interactivity, compiler=compiler, result=result)


In [164]:
len(ouroboros_classification_df)

124878223

In [165]:
ouroboros_classification_df

Unnamed: 0.1,Unnamed: 0,_id,subject_ids,created_at,user_name,tutorial,project_name,project_id
0,0,ObjectId(51cda78539ea5359880157dd),"[{""$oid"":""51b6eb102d5d3ad242002224""}]",2013-06-28T15:12:43.000Z,not-logged-in-28963be9ac14771d8e69c078e6297c79,,worms,ObjectId(51c9bba83ae7407725000001)
1,1,ObjectId(51d0983b501e7e2e3a005a30),"[{""$oid"":""51b7b87c2d5d3ad242003bca""}]",2013-06-30T20:40:42.000Z,tinkapuppy,,worms,ObjectId(51c9bba83ae7407725000001)
2,2,ObjectId(51d1c18239ea532e4b018547),"[{""$oid"":""51bbd2e22d5d3ad242009e81""}]",2013-07-01T17:52:54.000Z,salomehuiyi,,worms,ObjectId(51c9bba83ae7407725000001)
3,3,ObjectId(51d29cc9501e7e1b43007306),"[{""$oid"":""519a3eb2447b5e2c3d000018""}]",2013-07-02T09:24:34.000Z,not-logged-in-9af3f0a1c12f3d1f08053849a8669f96,True,worms,ObjectId(51c9bba83ae7407725000001)
4,4,ObjectId(51d31c0939ea530c6000a1cc),"[{""$oid"":""519a3eb2447b5e2c3d000018""}]",2013-07-02T18:29:29.000Z,heathv,True,worms,ObjectId(51c9bba83ae7407725000001)
5,5,ObjectId(51d3252939ea530c6000a736),"[{""$oid"":""51ba14fc2d5d3ad242007f86""}]",2013-07-02T19:10:25.000Z,not-logged-in-79f41632578c8aa60516184b83af5d8b,,worms,ObjectId(51c9bba83ae7407725000001)
6,6,ObjectId(51d325ca501e7e1b4300c470),"[{""$oid"":""51b6e8172d5d3ad2420021c9""}]",2013-07-02T19:11:06.000Z,not-logged-in-79f41632578c8aa60516184b83af5d8b,,worms,ObjectId(51c9bba83ae7407725000001)
7,7,ObjectId(51d3faaa39ea530c60010b79),"[{""$oid"":""51bec1a22d5d3ad24200de28""}]",2013-07-03T10:21:25.000Z,bumishness,,worms,ObjectId(51c9bba83ae7407725000001)
8,8,ObjectId(51d416f639ea530c60011c7e),"[{""$oid"":""51bcd5d92d5d3ad24200a736""}]",2013-07-03T12:22:09.000Z,aexbrown,,worms,ObjectId(51c9bba83ae7407725000001)
9,9,ObjectId(51d41849501e7e1b430149c2),"[{""$oid"":""51b608db2d5d3ad24200052d""}]",2013-07-03T12:23:42.000Z,aexbrown,,worms,ObjectId(51c9bba83ae7407725000001)
