In [91]:
import pandas as pd
from io import BytesIO
import requests
import random

def fetch_data(url, skiprows=0, timeout=30):
    """Download a CSV document over HTTP and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV resource.
    skiprows : int, optional
        Forwarded to ``pandas.read_csv``; number of leading rows to skip.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on an unresponsive host.

    Returns
    -------
    pandas.DataFrame
        Parsed table, using the first CSV column as the index.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # CSV parser, which would yield a confusing parse error or a junk frame.
    r.raise_for_status()

    return pd.read_csv(BytesIO(r.content), index_col=0, skiprows=skiprows)

# Question catalog: fetched with gid=0, skipping the first 2 rows of the CSV.
catalog = fetch_data("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=0&format=csv", 2)

# Two further exports of the same spreadsheet URL (different gid values),
# combined later into a single table of completed questions.
completed_sets = fetch_data("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=1791033131&format=csv")
completed_suppl = fetch_data("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=399306386&format=csv")

In [92]:
categories = catalog.keys()[7:]

# Per-column replacement maps for every column from position 3 onward:
# missing cells are filled with False, and cells holding 'x' become True.
repl_nan = dict.fromkeys(catalog.keys()[3:], False)
repl_checked = dict.fromkeys(catalog.keys()[3:], 'x')

scrubbed_cat = (
    catalog
    .fillna(value=repl_nan)
    .replace(to_replace=repl_checked, value=True)
)

In [93]:
# Columns at positions 3..6 of the scrubbed catalog.
categorical_cols = scrubbed_cat.columns[3:7]

# For each of those columns, count the cells that compare equal to True.
ctotals = {
    col: (scrubbed_cat[col] == True).sum()
    for col in categorical_cols
}
ctotals

{'AR': 239, 'ALG': 216, 'DA': 116, 'GEO': 132}

In [94]:
# Columns from position 7 up to (but excluding) the last four.
skillset_cols = scrubbed_cat.columns[7:-4]

# For each of those columns, count the cells that compare equal to True.
stotals = {
    col: (scrubbed_cat[col] == True).sum()
    for col in skillset_cols
}
stotals

{'ARG': 29,
 'FAD': 40,
 'PCT': 55,
 'NP': 45,
 'DAP': 36,
 'EAR': 41,
 'EAF': 44,
 'IAV': 44,
 'FFS': 59,
 'ROW': 30,
 'RAT': 43,
 'CT': 38,
 'DST': 31,
 'PROB': 47,
 'PRS': 22,
 'CC': 26,
 'TRI': 43,
 'CG': 24,
 'MG': 18}

In [95]:
# The last four columns of the scrubbed catalog.
misc_cols = scrubbed_cat.columns[-4:]

# For each of those columns, count the cells that compare equal to True.
mtotals = {
    col: (scrubbed_cat[col] == True).sum()
    for col in misc_cols
}
mtotals

{'WP': 42, '2WP': 25, 'VC': 20, 'AQ': 42}

In [96]:
# Stack both completion tables, drop fully identical rows, then index the
# result by its 'Global Id' column.
completed_full = (
    pd.concat([completed_sets, completed_suppl], ignore_index=False, sort=False)
    .drop_duplicates()
    .set_index('Global Id')
)

# Right join on the index: every completed row is kept and the catalog flags
# are attached to it. Column names present on both sides get the
# '_left' (catalog) / '_right' (completed) suffixes.
j = scrubbed_cat.join(
    completed_full,
    how='right',
    lsuffix='_left',
    rsuffix='_right',
    sort=False,
)

# Keep only rows whose completed-side 'Local Id' is present.
# NOTE(review): in the rendered output some rows show 'Local Id_left' and
# 'Local Id_right' disagreeing (e.g. Global Id 503) - confirm the 'Global Id'
# values recorded in the completion sheets.
completed_meta = j.loc[j['Local Id_right'].notna()]
completed_meta[50:100]

Unnamed: 0_level_0,Local Id_left,Source,Type,AR,ALG,DA,GEO,ARG,FAD,PCT,...,TRI,CG,MG,WP,2WP,VC,AQ,Local Id_right,Completed,Correct
Global Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
474,18.17,M,QC,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,18.17,True,True
480,18.23,M,QC,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,18.23,,True
489,19.03,M,MC,False,True,False,False,False,False,False,...,False,False,False,True,False,True,False,19.03,True,False
503,19.17,M,MC,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,20.05,True,True
508,20.02,M,MA,False,True,False,False,False,False,False,...,False,False,False,True,False,False,False,20.02,True,True
511,20.05,M,NE,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,21.12,True,True
525,20.19,M,MC,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,20.19,True,True
538,20.32,M,MA,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,20.32,True,False
554,21.12,M,QC,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,21.26,True,True
568,21.26,M,NE,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,25.16,True,True


In [107]:
def key_based_accuracy(completed_meta, keys):
    """Compute accuracy separately for each flag column in ``keys``.

    Parameters
    ----------
    completed_meta : pandas.DataFrame
        Table with a 'Correct' column and one column per entry of ``keys``.
    keys : iterable of str
        Column names to evaluate. For each one, only rows whose value in that
        column equals True are considered.

    Returns
    -------
    tuple of (dict, dict)
        ``wrong`` maps each key to the DataFrame of selected rows whose
        'Correct' value equals False. ``accuracy`` maps each key to
        ``(count, fraction)``, where ``count`` is the number of selected rows
        and ``fraction`` is the share of them with 'Correct' equal to True
        (NaN when no row is selected).
    """
    wrong = {}
    accuracy = {}
    for cat in keys:
        df = completed_meta[completed_meta[cat] == True]

        # Count rows directly. The previous ``df.count()[0]`` counted non-null
        # cells of whichever column happened to be first and relied on
        # positional integer indexing of a label-indexed Series, which pandas
        # has deprecated.
        count = len(df)
        correct = int((df['Correct'] == True).sum())

        wrong[cat] = df[df['Correct'] == False]
        # An empty selection has no defined accuracy; report NaN explicitly.
        accuracy[cat] = (count, correct / count if count else float('nan'))

    return (wrong, accuracy)

In [109]:
# Show only the accuracy dict (element 1 of the returned tuple) for the categorical columns.
key_based_accuracy(completed_meta, categorical_cols)[1]

{'AR': (24, 0.8333333333333334),
 'ALG': (33, 0.8787878787878788),
 'DA': (12, 0.75),
 'GEO': (12, 1.0)}

In [99]:
# Show only the accuracy dict (element 1 of the returned tuple) for the skill-set columns.
key_based_accuracy(completed_meta, skillset_cols)[1]

{'ARG': (2, 1.0),
 'FAD': (5, 0.8),
 'PCT': (4, 0.75),
 'NP': (4, 0.75),
 'DAP': (5, 0.8),
 'EAR': (7, 0.8571428571428571),
 'EAF': (10, 0.8),
 'IAV': (8, 0.875),
 'FFS': (7, 1.0),
 'ROW': (3, 1.0),
 'RAT': (6, 0.8333333333333334),
 'CT': (5, 0.8),
 'DST': (4, 0.75),
 'PROB': (3, 0.6666666666666666),
 'PRS': (4, 1.0),
 'CC': (1, 1.0),
 'TRI': (3, 1.0),
 'CG': (2, 1.0),
 'MG': (2, 1.0)}

In [100]:
# Show only the accuracy dict (element 1 of the returned tuple) for the last four flag columns.
key_based_accuracy(completed_meta, misc_cols)[1]

{'WP': (11, 0.8181818181818182),
 '2WP': (4, 1.0),
 'VC': (2, 0.5),
 'AQ': (5, 0.8)}

In [101]:
def type_based_accuracy(completed_meta, types=('QC', 'MC', 'NE', 'MA')):
    """Compute accuracy separately for each value of the 'Type' column.

    Parameters
    ----------
    completed_meta : pandas.DataFrame
        Table with 'Type' and 'Correct' columns.
    types : iterable of str, optional
        'Type' values to report on, in output order. Defaults to the four
        codes this function has always reported.

    Returns
    -------
    dict
        Maps each type to ``(count, fraction)``, where ``count`` is the number
        of rows of that type and ``fraction`` is the share of them with
        'Correct' equal to True (NaN when there are no rows of that type).
    """
    accuracy = {}
    for typ in types:
        df = completed_meta[completed_meta['Type'] == typ]

        # Count rows directly. The previous ``df.count()[0]`` counted non-null
        # cells of whichever column happened to be first and relied on
        # positional integer indexing of a label-indexed Series, which pandas
        # has deprecated.
        count = len(df)
        correct = int((df['Correct'] == True).sum())

        # A type with no rows has no defined accuracy; report NaN explicitly.
        accuracy[typ] = (count, correct / count if count else float('nan'))
    return accuracy

In [102]:
# Show (count, accuracy) for each question type.
type_based_accuracy(completed_meta)

{'QC': (27, 0.9259259259259259),
 'MC': (32, 0.8125),
 'NE': (17, 0.8823529411764706),
 'MA': (2, 0.5)}