# Basic classification stats

These script is heavily inspired from https://github.com/zooniverse/Data-digging/blob/master/scripts_GeneralPython/basic_classification_processing.py

In [67]:
import numpy as np 
import pandas as pd
import json
from datetime import date

In [68]:
def gini(list_of_values):
    sorted_list = sorted(list_of_values)
    height, area = 0, 0
    for value in sorted_list:
        height += value
        area += height - value / 2.
    fair_area = height * len(list_of_values) / 2
    return (fair_area - area) / fair_area

Space Fluff, at the time of Beta, has 3 workflows: 
    - 'Classify!'
    - 'Classify on the go!'
    - 'Hardcore version!'
These are all based on simple multiple choice questions. 

In [69]:
workflow_classify = '~/Desktop/SUNDIAL/images/beta_classify-classifications.csv'
workflow_on_the_go = '~/Desktop/SUNDIAL/images/beta_classify-on-the-go-classifications.csv'
workflow_hardcore = '~/Desktop/SUNDIAL/images/beta_classify-hardcore-edition-classifications.csv'
workflow_all = '~/Desktop/SUNDIAL/images/beta_space-fluff-classifications.csv'

In [70]:
classifications_classify = pd.read_csv(workflow_classify)
classifications_on_the_go = pd.read_csv(workflow_on_the_go)
classifications_hardcore = pd.read_csv(workflow_hardcore)
classifications_all = pd.read_csv(workflow_all)

If you want to select a certain period of time, you can use this snippets here:

In [71]:
## To select between two dates
#classifications['created_at'] = pd.to_datetime(classifications['created_at'])
#classifications[(classifications['created_at'] > pd.Timestamp(date(2020,10,20)))& (classifications['created_at'] < pd.Timestamp(date(2020,10,21)))]

## To select after a certain date

#classifications['created_at'] = pd.to_datetime(classifications['created_at'])
#classifications = classifications[classifications['created_at'] > pd.Timestamp(date(2020,10,20))]

## Remember to turn the data type back to strings

#classifications['created_at'] = str(classifications['created_at'])

In our case, we want to select the classifications made during the Beta Phase. 

In [72]:
#for all classifications: 32229
classifications_all = classifications_all[32239:]
#for 'Classify!': 6295
classifications_classify = classifications_classify[6295:]
#for 'Classify on the go!': 19989
classifications_on_the_go = classifications_on_the_go[19989:]
#for 'Classify: hardcore edition!': 5945
classifications_hardcore = classifications_hardcore[5945:]

In [73]:
# grab the subject counts 
n_subj_tot_all  = len(classifications_all.subject_data.unique())
by_subject_all = classifications_all.groupby('subject_data')
subj_class_all = by_subject_all.created_at.aggregate('count')

n_subj_tot_classify  = len(classifications_classify.subject_data.unique())
by_subject_classify = classifications_classify.groupby('subject_data')
subj_class_classify = by_subject_classify.created_at.aggregate('count')

n_subj_tot_on_the_go  = len(classifications_on_the_go.subject_data.unique())
by_subject_on_the_go = classifications_on_the_go.groupby('subject_data')
subj_class_on_the_go = by_subject_on_the_go.created_at.aggregate('count')

n_subj_tot_hardcore  = len(classifications_hardcore.subject_data.unique())
by_subject_hardcore = classifications_hardcore.groupby('subject_data')
subj_class_hardcore = by_subject_hardcore.created_at.aggregate('count')

In [74]:
# basic stats on how classified the subjects are
subj_class_mean_all = np.mean(subj_class_all)
subj_class_med_all  = np.median(subj_class_all)
subj_class_min_all  = np.min(subj_class_all)
subj_class_max_all  = np.max(subj_class_all)

subj_class_mean_classify = np.mean(subj_class_classify)
subj_class_med_classify  = np.median(subj_class_classify)
subj_class_min_classify  = np.min(subj_class_classify)
subj_class_max_classify  = np.max(subj_class_classify)

subj_class_mean_on_the_go = np.mean(subj_class_on_the_go)
subj_class_med_on_the_go  = np.median(subj_class_on_the_go)
subj_class_min_on_the_go  = np.min(subj_class_on_the_go)
subj_class_max_on_the_go  = np.max(subj_class_on_the_go)

subj_class_mean_hardcore = np.mean(subj_class_hardcore)
subj_class_med_hardcore  = np.median(subj_class_hardcore)
subj_class_min_hardcore  = np.min(subj_class_hardcore)
subj_class_max_hardcore  = np.max(subj_class_hardcore)

In [75]:
all_users_all = classifications_all.user_name.unique()
by_user_all = classifications_all.groupby('user_name')

all_users_classify = classifications_classify.user_name.unique()
by_user_classify = classifications_classify.groupby('user_name')

all_users_on_the_go = classifications_on_the_go.user_name.unique()
by_user_on_the_go = classifications_on_the_go.groupby('user_name')

all_users_hardcore = classifications_hardcore.user_name.unique()
by_user_hardcore = classifications_hardcore.groupby('user_name')

In [76]:
# get total classification and user counts for all classifications
n_class_tot_all = len(classifications_all)
n_users_tot_all = len(all_users_all)

unregistered_all = [q.startswith("not-logged-in") for q in all_users_all]
n_unreg_all = sum(unregistered_all)
n_reg_all   = n_users_tot_all - n_unreg_all

In [77]:
# get total classification and user counts for Classify
n_class_tot_classify = len(classifications_classify)
n_users_tot_classify = len(all_users_classify)

unregistered_classify = [q.startswith("not-logged-in") for q in all_users_classify]
n_unreg_classify = sum(unregistered_classify)
n_reg_classify = n_users_tot_classify - n_unreg_classify

In [78]:
# get total classification and user counts for on the go
n_class_tot_on_the_go = len(classifications_on_the_go)
n_users_tot_on_the_go = len(all_users_on_the_go)

unregistered_on_the_go = [q.startswith("not-logged-in") for q in all_users_on_the_go]
n_unreg_on_the_go = sum(unregistered_on_the_go)
n_reg_on_the_go = n_users_tot_on_the_go - n_unreg_on_the_go

In [79]:
# get total classification and user counts for hardcore
n_class_tot_hardcore = len(classifications_hardcore)
n_users_tot_hardcore = len(all_users_hardcore)

unregistered_hardcore = [q.startswith("not-logged-in") for q in all_users_hardcore]
n_unreg_hardcore = sum(unregistered_hardcore)
n_reg_hardcore = n_users_tot_hardcore - n_unreg_hardcore

In [80]:
nclass_byuser_all = by_user_all.created_at.aggregate('count')
nclass_byuser_ranked_all = nclass_byuser_all.copy()
nclass_byuser_ranked_all.sort_values(ascending=False)

user_name
Liava                                 766
not-logged-in-5bab565a8aafe38c711e    404
ktarkin                               322
Vdeboer75                             322
laura.fishman                         290
tomburgerpie                          289
konakid                               232
VHualde                               228
Sarcocyne                             197
Teddy5                                181
tubby123                              179
KJDL80                                144
petyerakne                            137
KLIMCAK-62                            135
not-logged-in-0983e811370809f65a36    119
keefc                                 117
not-logged-in-83a71e0a4ea08030a206    102
not-logged-in-4f90b10725a8588860d4     96
Teymur_Saif                            90
Cathybarnett                           78
teolo                                  75
sespeight                              75
rohit_sharma2                          66
Bbllee75                

In [81]:
nclass_byuser_classify = by_user_classify.created_at.aggregate('count')
nclass_byuser_ranked_classify = nclass_byuser_classify.copy()
nclass_byuser_ranked_classify.sort_values(ascending=False)

user_name
Vdeboer75                             322
laura.fishman                         290
ktarkin                               259
Teddy5                                181
tubby123                              179
petyerakne                            137
konakid                               124
not-logged-in-4eb4d569344adaa54ddc     96
Cathybarnett                           78
teolo                                  75
sespeight                              75
rohit_sharma2                          66
KLIMCAK-62                             62
aly_ka                                 52
not-logged-in-67f9aa364eb7e02ea1a0     44
not-logged-in-88b36899aaf1c3fa2f53     40
not-logged-in-cf3f5277ece3f9dab6bb     40
not-logged-in-18556654332e361106fa     39
not-logged-in-8620100def420c662441     38
DrPZ                                   35
not-logged-in-37af1954548a8dbc45e6     35
not-logged-in-735207b258f8e4aa0a2f     33
nooneofconsequence                     32
not-logged-in-e8338e5aab

In [82]:
nclass_byuser_hardcore = by_user_hardcore.created_at.aggregate('count')
nclass_byuser_ranked_hardcore = nclass_byuser_hardcore.copy()
nclass_byuser_ranked_hardcore.sort_values(ascending=False)

user_name
Liava                                 756
tomburgerpie                          289
VHualde                               228
konakid                               108
Bbllee75                               65
ktarkin                                61
not-logged-in-a15062d2763746fb0e91     26
rcuthomas                              25
not-logged-in-efe7e5716b04258b63ac     24
KJDL80                                 21
not-logged-in-0586f2014ecd2a1a26cf     19
MonkeyDragonCat                        16
TheSpacers                             15
KLIMCAK-62                             14
Storm3of5                              13
not-logged-in-c85f4fb9fa67267797cb     12
Omniua                                 12
Sonia_B                                12
not-logged-in-a6bbd75a0340497d1e27      8
theBourneian                            7
jai                                     6
not-logged-in-6a7e73ee14866ef44abe      6
not-logged-in-e881e180a99d7d187b18      5
not-logged-in-991df6a305

In [89]:
nclass_byuser_on_the_go = by_user_on_the_go.created_at.aggregate('count')
nclass_byuser_ranked_on_the_go = nclass_byuser_on_the_go.copy()
nclass_byuser_ranked_on_the_go.sort_values(ascending=False)

user_name
not-logged-in-b838129367183a677b7d    398
Sarcocyne                             197
keefc                                 117
not-logged-in-9987848cc389e26ce2a5    116
KJDL80                                105
Teymur_Saif                            90
kaila-jackson                          70
not-logged-in-4330b85304bd7da4fad5     61
KLIMCAK-62                             59
not-logged-in-23d3c0cec3d8f2d07e34     57
Farra                                  57
not-logged-in-46c35853b7242a4822dc     55
EWILMART                               53
not-logged-in-6b57feed7aad08f1ae2b     49
Tiger-009                              47
not-logged-in-fdb6b61d1296c43dee6d     47
smj                                    47
not-logged-in-86b6666310d0bb80cfdb     46
not-logged-in-880a18cb1f45b8ceb91e     46
not-logged-in-d1cc26bcc9862349f9e8     44
existentialpenguin                     39
not-logged-in-e41ec4c833bd8a781ee3     36
S0rCi3r                                32
not-logged-in-5fcc38452a

In [84]:
# very basic stats
nclass_med_all = np.median(nclass_byuser_all)
nclass_mean_all = np.mean(nclass_byuser_all)

nclass_med_classify = np.median(nclass_byuser_classify)
nclass_mean_classify = np.mean(nclass_byuser_classify)

nclass_med_on_the_go = np.median(nclass_byuser_on_the_go)
nclass_mean_on_the_go = np.mean(nclass_byuser_on_the_go)

nclass_med_hardcore = np.median(nclass_byuser_hardcore)
nclass_mean_hardcore = np.mean(nclass_byuser_hardcore)

# Gini coefficient - see the comments above the gini() function for more notes
nclass_gini_all = gini(nclass_byuser_all)
nclass_gini_classify = gini(nclass_byuser_classify)
nclass_gini_on_the_go = gini(nclass_byuser_on_the_go)
nclass_gini_hardcore = gini(nclass_byuser_hardcore)

In [85]:
print("\nOverall:\n\n",n_class_tot_all,"classifications of",n_subj_tot_all,"subjects by",n_users_tot_all,"classifiers,")
print(n_reg_all,"registered and",n_unreg_all,"unregistered.\n")
print("That's %.2f classifications per subject on average (median = %.1f)." % (subj_class_mean_all, subj_class_med_all))
print("The most classified subject has ",subj_class_max_all,"classifications; the least-classified subject has",subj_class_min_all,".\n")
print("Median number of classifications per user:",nclass_med_all)
print("Mean number of classifications per user: %.2f" % nclass_mean_all)
print("\nTop 10 most prolific classifiers:\n",nclass_byuser_ranked_all.head(10))
print("\n\nGini coefficient for classifications by user: %.2f\n" % nclass_gini_all)


Overall:

 8161 classifications of 521 subjects by 326 classifiers,
109 registered and 217 unregistered.

That's 15.66 classifications per subject on average (median = 14.0).
The most classified subject has  35 classifications; the least-classified subject has 2 .

Median number of classifications per user: 7.0
Mean number of classifications per user: 25.03

Top 10 most prolific classifiers:
 user_name
ArnavRaju         11
AstroRobert        6
Barbanzi          13
Bbllee75          65
BeeQueen           3
BlueFlamingSky    22
Budgieye          12
CG1980             5
Cathybarnett      78
Colin2davies       7
Name: created_at, dtype: int64


Gini coefficient for classifications by user: 0.74



In [86]:
print("\nOverall:\n\n",n_class_tot_classify,"classifications of",n_subj_tot_classify,"subjects by",n_users_tot_classify,"classifiers,")
print(n_reg_classify,"registered and",n_unreg_classify,"unregistered.\n")
print("That's %.2f classifications per subject on average (median = %.1f)." % (subj_class_mean_classify, subj_class_med_classify))
print("The most classified subject has ",subj_class_max_classify,"classifications; the least-classified subject has",subj_class_min_classify,".\n")
print("Median number of classifications per user:",nclass_med_classify)
print("Mean number of classifications per user: %.2f" % nclass_mean_classify)
print("\nTop 10 most prolific classifiers:\n",nclass_byuser_ranked_classify.head(10))
print("\n\nGini coefficient for classifications by user: %.2f\n" % nclass_gini_classify)


Overall:

 4003 classifications of 336 subjects by 246 classifiers,
63 registered and 183 unregistered.

That's 11.91 classifications per subject on average (median = 12.0).
The most classified subject has  20 classifications; the least-classified subject has 5 .

Median number of classifications per user: 6.0
Mean number of classifications per user: 16.27

Top 10 most prolific classifiers:
 user_name
ArnavRaju         11
BlueFlamingSky    22
Budgieye           8
CG1980             5
Cathybarnett      78
DrPZ              35
FEWSG              3
Faraday12         12
GreenLizard        5
IRR                9
Name: created_at, dtype: int64


Gini coefficient for classifications by user: 0.70



In [87]:
print("\nOverall:\n\n",n_class_tot_on_the_go,"classifications of",n_subj_tot_on_the_go,"subjects by",n_users_tot_on_the_go,"classifiers,")
print(n_reg_on_the_go,"registered and",n_unreg_on_the_go,"unregistered.\n")
print("That's %.2f classifications per subject on average (median = %.1f)." % (subj_class_mean_on_the_go, subj_class_med_on_the_go))
print("The most classified subject has ",subj_class_max_on_the_go,"classifications; the least-classified subject has",subj_class_min_on_the_go,".\n")
print("Median number of classifications per user:",nclass_med_on_the_go)
print("Mean number of classifications per user: %.2f" % nclass_mean_on_the_go)
print("\nTop 10 most prolific classifiers:\n",nclass_byuser_ranked_on_the_go.head(10))
print("\n\nGini coefficient for classifications by user: %.2f\n" % nclass_gini_on_the_go)


Overall:

 2362 classifications of 336 subjects by 87 classifiers,
40 registered and 47 unregistered.

That's 7.03 classifications per subject on average (median = 7.0).
The most classified subject has  14 classifications; the least-classified subject has 2 .

Median number of classifications per user: 8.0
Mean number of classifications per user: 27.15

Top 10 most prolific classifiers:
 user_name
AstroRobert       6
Barbanzi         13
BeeQueen          3
Colin2davies      7
Davydoo          16
Dusty_Diva       22
EWILMART         53
Farra            57
KJDL80          105
KLIMCAK-62       59
Name: created_at, dtype: int64


Gini coefficient for classifications by user: 0.68



In [88]:
print("\nOverall:\n\n",n_class_tot_hardcore,"classifications of",n_subj_tot_hardcore,"subjects by",n_users_tot_hardcore,"classifiers,")
print(n_reg_hardcore,"registered and",n_unreg_hardcore,"unregistered.\n")
print("That's %.2f classifications per subject on average (median = %.1f)." % (subj_class_mean_hardcore, subj_class_med_hardcore))
print("The most classified subject has ",subj_class_max_hardcore,"classifications; the least-classified subject has",subj_class_min_hardcore,".\n")
print("Median number of classifications per user:",nclass_med_hardcore)
print("Mean number of classifications per user: %.2f" % nclass_mean_hardcore)
print("\nTop 10 most prolific classifiers:\n",nclass_byuser_ranked_hardcore.head(10))
print("\n\nGini coefficient for classifications by user: %.2f\n" % nclass_gini_hardcore)


Overall:

 1806 classifications of 336 subjects by 49 classifiers,
27 registered and 22 unregistered.

That's 5.38 classifications per subject on average (median = 5.0).
The most classified subject has  15 classifications; the least-classified subject has 1 .

Median number of classifications per user: 4.0
Mean number of classifications per user: 36.86

Top 10 most prolific classifiers:
 user_name
Bbllee75                65
Budgieye                 4
Davinelulinvega          3
KJDL80                  21
KLIMCAK-62              14
Liava                  756
MonkeyDragonCat         16
Mtfd2222                 2
Nelllythetardigrade      4
Omniua                  12
Name: created_at, dtype: int64


Gini coefficient for classifications by user: 0.84

