In [216]:
import pandas as pd
from io import BytesIO
import requests
from collections import Counter

def fetch_data(url, skiprows=0, timeout=30):
    """Download a CSV document over HTTP and load it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV resource.
    skiprows : int, optional
        Forwarded to ``pd.read_csv`` as ``skiprows``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block forever on an unresponsive host.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, with its first column used as the index.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error so an error page is never parsed as CSV.
    r.raise_for_status()

    return pd.read_csv(BytesIO(r.content), index_col=0, skiprows=skiprows)

# Location of the CSV export that is loaded as the raw practice-set table.
PRACTICE_SETS_URL = "https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=1907623426&format=csv"

# Raw table, exactly as downloaded (no rows skipped).
practice_sets = fetch_data(PRACTICE_SETS_URL, skiprows=0)
practice_sets.head()

Unnamed: 0_level_0,Set Id,Local Id,Source,Type,AR,ALG,DA,GEO,ARG,FAD,...,PRS,CC,TRI,CG,MG,WP,2WP,VC,AQ,DI
Global Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1.01,ETS Prep,QC,,,,x,,,...,,,x,,,,,,,
2,1,1.02,ETS Prep,QC,x,,,,,x,...,,,,,,,,,,
3,1,1.03,ETS Prep,QC,x,,,,,,...,,,,,,,,,,
4,1,1.04,ETS Prep,QC,,,,x,,,...,,x,,,,,,,,
5,1,1.05,ETS Prep,QC,,x,,,,,...,,,,,,,,,,


In [220]:
# Column labels are taken from the raw `practice_sets` frame. The previous
# version read them from `scrubbed_sets`, which is only created at the bottom
# of this cell, so the cell failed with a NameError on a fresh kernel.
# `fillna`/`replace` do not alter the column labels, so the result is the same.
categories = practice_sets.keys()[4:]

# Columns whose missing cells become False and whose 'x' marks become True.
# NOTE(review): this slice starts at position 3, one column earlier than
# `categories` (position 4) — confirm that including that column is intended.
flag_cols = practice_sets.keys()[3:]
repl_nan = {col: False for col in flag_cols}
repl_checked = {col: 'x' for col in flag_cols}

# Build a new frame; `practice_sets` itself is left untouched.
scrubbed_sets = practice_sets.fillna(value=repl_nan).replace(repl_checked, True)
scrubbed_sets.head()

Unnamed: 0_level_0,Set Id,Local Id,Source,Type,AR,ALG,DA,GEO,ARG,FAD,...,PRS,CC,TRI,CG,MG,WP,2WP,VC,AQ,DI
Global Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1.01,ETS Prep,QC,False,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,False
2,1,1.02,ETS Prep,QC,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,1,1.03,ETS Prep,QC,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,1.04,ETS Prep,QC,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,False,False
5,1,1.05,ETS Prep,QC,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [221]:
# Group the questions by practice set.
gb = scrubbed_sets.groupby('Set Id')
# Per set: (flagged rows / counted rows) for every category, scaled by 20,
# then averaged across all sets.
cat_avgs = (
    gb[categories].sum()
    .div(gb[categories].count())
    .mul(20)
    .mean()
)
cat_avgs[categories]

AR      7.300
ALG     7.300
DA      5.125
GEO     4.125
ARG     1.100
FAD     0.450
PCT     2.650
NP      1.575
DAP     0.800
EAR     1.125
EAF     3.175
IAV     1.500
FFS     0.925
ROW     0.825
RAT     1.275
CT      0.800
DST     0.450
PROB    1.325
PRS     0.600
CC      0.700
TRI     1.550
CG      1.025
MG      0.500
WP      1.700
2WP     0.550
VC      0.225
AQ      0.000
DI      3.050
dtype: float64

This will calculate the count of each question type for each practice set we have data for, along with overall average and variance.

Note: the final result is normalized to a fraction of a 20-question test so that it is directly comparable to our catalog percentages.

In [272]:
# Category columns grouped by practice set and question Type.
gb2 = scrubbed_sets.groupby(['Set Id', 'Type'])[categories]
# Divisor for each grouped row, in row order: 25 for the first 8 rows and
# 20 for the following 24 (original note: "8 sets, 4 rows per set").
qs_in_set = [25] * 8 + [20] * 24
# Rows counted per (Set Id, Type), rescaled to a 20-question basis.
qs_per_set = gb2.count()['AR'].div(qs_in_set, axis='index').mul(20)
# Further per-Type summaries, currently disabled:
#qs_per_set.groupby('Type').mean()
#qs_per_set.groupby('Type').var()
#qs_per_set.groupby('Type').max() - qs_per_set.groupby('Type').min()
# Mean per Type, converted back from the 20-question scale to a fraction.
target_weights = qs_per_set.groupby('Type').mean().div(20)
target_weights.to_dict()

{'MA': 0.0875, 'MC': 0.44000000000000006, 'NE': 0.095, 'QC': 0.3775}

In [231]:
# Split the column labels into three consecutive groups by position:
# positions 4-7, positions 8 up to the last five, and the last five.
# (The original cell also evaluated the first two groups as bare expressions;
# those displayed nothing, because only a cell's last expression is rendered.)
categorical_cols, skillset_cols, misc_cols = (
    scrubbed_sets.columns[4:8],
    scrubbed_sets.columns[8:-5],
    scrubbed_sets.columns[-5:],
)
misc_cols

Index(['WP', '2WP', 'VC', 'AQ', 'DI'], dtype='object')

In [235]:
def key_based_liklihood(practice_sets, keys):
    """Count how often each flag column is set and its share of all rows.

    Parameters
    ----------
    practice_sets : pandas.DataFrame
        Table holding one column per entry of ``keys``.
    keys : iterable
        Column labels to evaluate.

    Returns
    -------
    dict
        Maps each key to a ``(count, share)`` tuple. ``count`` is the number
        of non-null entries in the frame's first column among the rows where
        the key column equals True; ``share`` is ``count`` divided by the
        number of non-null entries in the first column of the whole frame.
    """
    # The denominator does not depend on the key, so compute it once.
    # `.iloc[0]` selects the first column's count by position; plain `[0]` on
    # a label-indexed Series is a deprecated positional lookup in pandas.
    # A frame without columns has nothing to count (and any key lookup below
    # would raise KeyError anyway).
    total_count = practice_sets.count().iloc[0] if practice_sets.shape[1] else 0

    likelihoods = {}
    for cat in keys:
        flagged_rows = practice_sets[practice_sets[cat] == True]
        key_count = flagged_rows.count().iloc[0]
        likelihoods[cat] = (key_count, key_count / total_count)

    return likelihoods

In [269]:
# Count and share for each column in `categorical_cols`.
cbreakdown = key_based_liklihood(scrubbed_sets, categorical_cols)
# NOTE(review): this ranking (descending by share) is neither assigned nor
# displayed, so the cell output below is the unsorted dict.
sorted(cbreakdown.items(), key=lambda item: item[1][1], reverse=True)
cbreakdown

{'AR': (63, 0.37058823529411766),
 'ALG': (62, 0.36470588235294116),
 'DA': (43, 0.2529411764705882),
 'GEO': (35, 0.20588235294117646)}

In [270]:
# Count and share for each column in `skillset_cols`.
sbreakdown = key_based_liklihood(scrubbed_sets, skillset_cols)
# NOTE(review): this ranking (descending by share) is neither assigned nor
# displayed, so the cell output below is the unsorted dict.
sorted(sbreakdown.items(), key=lambda item: item[1][1], reverse=True)
sbreakdown

{'ARG': (9, 0.052941176470588235),
 'FAD': (4, 0.023529411764705882),
 'PCT': (23, 0.13529411764705881),
 'NP': (13, 0.07647058823529412),
 'DAP': (7, 0.041176470588235294),
 'EAR': (10, 0.058823529411764705),
 'EAF': (27, 0.1588235294117647),
 'IAV': (13, 0.07647058823529412),
 'FFS': (8, 0.047058823529411764),
 'ROW': (7, 0.041176470588235294),
 'RAT': (11, 0.06470588235294118),
 'CT': (7, 0.041176470588235294),
 'DST': (4, 0.023529411764705882),
 'PROB': (11, 0.06470588235294118),
 'PRS': (5, 0.029411764705882353),
 'CC': (6, 0.03529411764705882),
 'TRI': (13, 0.07647058823529412),
 'CG': (9, 0.052941176470588235),
 'MG': (4, 0.023529411764705882)}

In [271]:
# Count and share for each column in `misc_cols`.
mbreakdown = key_based_liklihood(scrubbed_sets, misc_cols)
# NOTE(review): this ranking (descending by share) is neither assigned nor
# displayed, so the cell output below is the unsorted dict.
sorted(mbreakdown.items(), key=lambda item: item[1][1], reverse=True)
mbreakdown

{'WP': (15, 0.08823529411764706),
 '2WP': (5, 0.029411764705882353),
 'VC': (2, 0.011764705882352941),
 'AQ': (0, 0.0),
 'DI': (26, 0.15294117647058825)}