In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
matplotlib.style.use('ggplot')

import requests
import json

In [7]:
# specify data directory and file
data_dir = '/srv/zooniverse/raw_data/panoptes/all-panoptes-classifications-2017-01-30.csv'
# read csv file of panoptes classifications
classification_df = pd.read_csv(data_dir)

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# specify data directory and file
ouroboros_data_file = '/srv/zooniverse/tables/ouroboros_project_table_03-02-17.csv'
# read csv file of panoptes classifications
ouroboros_classification_df = pd.read_csv(ouroboros_data_file)

In [9]:
# create a project dataframe that contains the total number of classifications per project
project_df = classification_df.groupby('project_id').size().to_frame('panoptes_dump_classification_count')
# get the workflow IDs for each project and store as a list
project_df = project_df.merge(classification_df.groupby('project_id')['workflow_id'].unique().to_frame('panoptes_dump_workflows'),left_index=True,right_index=True)
# reset index
project_df = project_df.reset_index()
# create 'panoptes_dump' column
project_df['panoptes_dump'] = 1
# rename project_id field for merge later
project_df = project_df.rename(columns={'project_id':'panoptes_project_id'})

In [6]:
# set the API endpoint
base_url = r'https://panoptes.zooniverse.org/api/projects'
# set necessary headers for zooniverse API
headers = {
    'Accept':'application/vnd.api+json; version=1',
    'Content-Type':'application/json'
}
params = {}
api_result_df = pd.DataFrame()
while True:
    # send and recieve HTTP request to API endpoint
    r = requests.get(base_url,
                    params=params,
                    headers=headers)
    # convert the result to JSON
    api_result = r.json()
    # iterate through each project in a page of API results
    for project in api_result['projects']:
        # collect relivant fields from json
        api_result_dict = {
            'panoptes_project_id':project['id'],
            'panoptes_project_name':project['display_name'],
            'panoptes_migrated':project['migrated'],
            'panoptes_description':project['description'],
            'panoptes_live':project['live'],
            'panoptes_launch_date':project['launch_date'],
            'panoptes_completeness':project['completeness'],
            'panoptes_api_subject_count':project['subjects_count'],
            'panoptes_api_classificaitons_count':project['classifiers_count'],
        }
        # collect the workflow information if it exists
        # this is useful for comparing dump results against API results
        if 'workflows' in project['links']:
            api_result_dict['panoptes_api_workflows'] = project['links']['workflows']
        else:
            api_result_dict['panoptes_api_workflows'] = None
        # collect the project roles if they're listed
        if 'project_roles' in project['links']:
            api_result_dict['panoptes_api_roles'] = project['links']['project_roles']
        else:
            api_result_dict['panoptes_api_roles'] = None
        if project['launch_approved']:
            api_result_dict['panoptes_api_official_project'] = 1
        else:
            api_result_dict['panoptes_api_official_project'] = 0
        # add the project data to the projects dataframe
        api_result_df = api_result_df.append(pd.DataFrame([api_result_dict]))
    
    # if there is another page of search results, add that href to the next query
    # else return the API result
    if api_result['meta']['projects']['next_href']:
        params['page'] = api_result['meta']['projects']['next_href'].split('=')[-1]
        print(api_result['meta']['projects']['next_href'])
    else:
        break
        
# convert the project ID from a string to a numeric field
api_result_df['panoptes_project_id'] = pd.to_numeric(api_result_df['panoptes_project_id'])
# create a panoptes_api field
api_result_df['panoptes_api'] = 1
    

/projects?page=2
/projects?page=3
/projects?page=4
/projects?page=5
/projects?page=6
/projects?page=7
/projects?page=8
/projects?page=9
/projects?page=10
/projects?page=11
/projects?page=12
/projects?page=13
/projects?page=14
/projects?page=15
/projects?page=16
/projects?page=17
/projects?page=18
/projects?page=19
/projects?page=20
/projects?page=21
/projects?page=22
/projects?page=23
/projects?page=24
/projects?page=25


In [10]:
joined_df = api_result_df.merge(project_df,on='panoptes_project_id',how='outer')

In [11]:
ouroboros_classification_df = ouroboros_classification_df.rename(columns={'_id':'ouroboros_mongo_id',
                                            'panoptes_id':'panoptes_project_id',
                                            'activated_subjects_at':'ouroboros_meta_activated_subjects_at',
                                            'classification_count':'ouroboros_meta_classification_count',
                                            'complete_count':'ouroboros_meta_complete_count',
                                            'created_at':'ouroboros_meta_created_at',
                                            'display_name':'ouroboros_project_name',
                                            'panoptes_id':'panoptes_project_id',
                                            'user_count':'ouroboros_meta_user_count'})
ouroboros_classification_df['ouroboros_dump'] = 1
ouroboros_classification_df = ouroboros_classification_df[['ouroboros_mongo_id',
                                                           'ouroboros_meta_activated_subjects_at',
                                                           'ouroboros_meta_classification_count',
                                                           'ouroboros_meta_complete_count',
                                                           'ouroboros_meta_created_at',
                                                           'ouroboros_project_name',
                                                           'panoptes_project_id', 
                                                           'ouroboros_meta_user_count',
                                                           'ouroboros_dump']]

In [12]:
joined_df = joined_df.merge(ouroboros_classification_df,on='panoptes_project_id',how='outer')

In [13]:
len(joined_df.loc[(joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 1)])

58

In [14]:
# set NaN values to 0
joined_df.loc[joined_df['panoptes_dump'].isnull(),'panoptes_dump'] = 0
joined_df.loc[joined_df['panoptes_api'].isnull(),'panoptes_api'] = 0
joined_df.loc[joined_df['ouroboros_dump'].isnull(),'ouroboros_dump'] = 0

In [15]:
# drop non-uniques base on panoptes_project_id
joined_df = joined_df.drop_duplicates(subset='panoptes_project_id',keep='first')

In [16]:
print('panoptes dump: {0}'.format(len(joined_df.loc[joined_df['panoptes_dump'] == 1])))
print('panoptes api: {0}'.format(len(joined_df.loc[joined_df['panoptes_api'] == 1])))
print('ouroboros dump: {0}\n'.format(len(joined_df.loc[joined_df['ouroboros_dump'] == 1])))
print('panoptes dump + api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))
print('panoptes dump + ouroboros dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['ouroboros_dump'] == 1)])))
print('ouroboros dump + panoptes api: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 1)])))
print('only panoptes dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 0)])))
print('only panoptes api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 0)])))
print('only ouroboros dump: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 1)])))

panoptes dump: 750
panoptes api: 377
ouroboros dump: 30

panoptes dump + api: 236
panoptes dump + ouroboros dump: 0
ouroboros dump + panoptes api: 29

only panoptes dump: 514
only panoptes api: 112
only ouroboros dump: 1



In [17]:
print('panoptes dump: {0}'.format(len(joined_df.loc[joined_df['panoptes_dump'] == 1])/len(joined_df)))
print('panoptes api: {0}'.format(len(joined_df.loc[joined_df['panoptes_api'] == 1])/len(joined_df)))
print('ouroboros dump: {0}\n'.format(len(joined_df.loc[joined_df['ouroboros_dump'] == 1])/len(joined_df)))
print('panoptes dump + api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 1)])/len(joined_df)))
print('panoptes dump + ouroboros dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['ouroboros_dump'] == 1)])/len(joined_df)))
print('ouroboros dump + panoptes api: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 1)])/len(joined_df)))
print('only panoptes dump: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 0)])/len(joined_df)))
print('only panoptes api: {0}'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 1) & (joined_df['ouroboros_dump'] == 0)])/len(joined_df)))
print('only ouroboros dump: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_dump'] == 0) & (joined_df['panoptes_api'] == 0) & (joined_df['ouroboros_dump'] == 1)])/len(joined_df)))

panoptes dump: 0.8408071748878924
panoptes api: 0.4226457399103139
ouroboros dump: 0.033632286995515695

panoptes dump + api: 0.2645739910313901
panoptes dump + ouroboros dump: 0.0
ouroboros dump + panoptes api: 0.032511210762331835

only panoptes dump: 0.5762331838565022
only panoptes api: 0.12556053811659193
only ouroboros dump: 0.0011210762331838565



In [30]:
print('official_projects: {0}'.format(len(joined_df.loc[joined_df['panoptes_api_official_project'] == 1])))
print('non-official_projects: {0}'.format(len(joined_df.loc[joined_df['panoptes_api_official_project'] == 0])))
print('no result: {0}\n'.format(len(joined_df.loc[joined_df['panoptes_api_official_project'].isnull()])))

print('PANOPTES DUMP + API')
print('official_projects: {0}'.format(len(joined_df.loc[(joined_df['panoptes_api_official_project'] == 1) & (joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))
print('non-official_projects: {0}'.format(len(joined_df.loc[(joined_df['panoptes_api_official_project'] == 0) & (joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))
print('no result: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_api_official_project'].isnull()) & (joined_df['panoptes_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))

print('OUROBOROS_DUMP + API')
print('official_projects: {0}'.format(len(joined_df.loc[(joined_df['panoptes_api_official_project'] == 1) & (joined_df['ouroboros_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))
print('non-official_projects: {0}'.format(len(joined_df.loc[(joined_df['panoptes_api_official_project'] == 0) & (joined_df['ouroboros_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))
print('no result: {0}\n'.format(len(joined_df.loc[(joined_df['panoptes_api_official_project'].isnull()) & (joined_df['ouroboros_dump'] == 1) & (joined_df['panoptes_api'] == 1)])))

official_projects: 61
non-official_projects: 316
no result: 515

PANOPTES DUMP + API
official_projects: 38
non-official_projects: 198
no result: 0

OUROBOROS_DUMP + API
official_projects: 19
non-official_projects: 10
no result: 0



In [35]:
joined_df.loc[joined_df['panoptes_api_official_project'] == 1]['panoptes_project_name']

2                                 Planet Hunters
15                         Planet Four: Terrains
17                         Whales as Individuals
19                                   Old Weather
20                              Snapshots at Sea
22                                    Galaxy Zoo
33                           Operation War Diary
36                              Radio Galaxy Zoo
38                            Snapshot Serengeti
40                                 Bat Detective
45                                   Planet Four
46                                Cyclone Center
64                                Worm Watch Lab
66                               Plankton Portal
71                                Galaxy Zoo: 3D
74                                  Condor Watch
78                                   Chimp & See
80                              Orchid Observers
82                                Disk Detective
83                                Science Gossip
87                  

In [163]:
# specify data directory and file
ouroboros_classification_file = '/srv/zooniverse/tables/ouroboros_classification_table_03-09-17.csv'
# read csv file of panoptes classifications
ouroboros_classification_df = pd.read_csv(ouroboros_classification_file)

  interactivity=interactivity, compiler=compiler, result=result)
