In [214]:
import pandas as pd
from io import BytesIO
import requests
import random

In [215]:
def fetch_data(url, skiprows=0, timeout=30):
    """Download a CSV document over HTTP and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV resource to download.
    skiprows : int, optional
        Forwarded to ``pandas.read_csv``; number of leading rows to skip
        before the header row. Defaults to 0.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using the first CSV column as the index.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # CSV parser, which would yield a nonsensical frame or a confusing
    # parse error far from the real cause.
    response.raise_for_status()

    return pd.read_csv(BytesIO(response.content), index_col=0, skiprows=skiprows)

# Every table is fetched as a CSV export of a Google Sheets tab. Each entry
# pairs the tab's export URL with the number of leading rows to skip; the
# entries are listed in the same order as the names they are unpacked into.
_sheet_exports = (
    ("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=0&format=csv", 2),  # catalog
    ("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=1340046923&format=csv", 0),  # data_interp_catalog
    ("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=1791033131&format=csv", 0),  # completed_sets
    ("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=399306386&format=csv", 0),  # completed_suppl
    ("https://docs.google.com/spreadsheets/d/1JyGlqmLg9k7UubOw-V_CC8McRZxn2PtknvsrMIxvLGk/export?gid=998252116&format=csv", 0),  # gen_sets
)

catalog, data_interp_catalog, completed_sets, completed_suppl, gen_sets = [
    fetch_data(export_url, skiprows=rows_to_skip)
    for export_url, rows_to_skip in _sheet_exports
]

In [216]:
# Columns from position 3 onward are treated as check-mark columns; the
# subset from position 7 onward is kept separately as `categories`.
_flag_columns = catalog.columns[3:]
categories = catalog.columns[7:]

# Per-column replacement maps: what to put in empty cells, and which cell
# value counts as "checked".
repl_nan = dict.fromkeys(_flag_columns, False)
repl_checked = dict.fromkeys(_flag_columns, 'x')

# Empty cells become False, then 'x' cells become True.
scrubbed_cat = catalog.fillna(value=repl_nan).replace(repl_checked, True)

In [217]:
# Split the scrubbed catalog's columns into three positional groups:
# positions 3-6, positions 7 up to (not including) the last four, and the
# last four columns.
_catalog_columns = scrubbed_cat.columns
categorical_cols = _catalog_columns[3:7]
skillset_cols = _catalog_columns[7:-4]
misc_cols = _catalog_columns[-4:]

In [218]:
# Keep the generated-set rows whose 'Test Taken' value is anything other
# than True (blank cells included).
_not_taken = gen_sets['Test Taken'].ne(True)
incomplete_sets = gen_sets.loc[_not_taken]
incomplete_sets

Unnamed: 0_level_0,Set Id,Local Id,Test Taken
Global Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
251,4,12.03,
55,4,7.06,
780,4,29.11,
133,4,9.21,
523,4,20.17,
761,4,28.16,
586,4,22.07,
606,4,22.27,
283,4,12.35,
439,4,17.07,


In [219]:
# Stack the main and supplemental completion tables, keeping their original
# index, then drop rows that are repeated verbatim (drop_duplicates compares
# column values only, not the index).
_completion_tables = [completed_sets, completed_suppl]
completed_full = pd.concat(_completion_tables, ignore_index=False, sort=False)
completed_full = completed_full.drop_duplicates()
completed_full

Unnamed: 0_level_0,Set Id,Local Id,Completed,Correct,Date,Unnamed: 4
Global Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
807,1.0,30.20,True,True,,
722,1.0,27.19,True,True,,
696,1.0,26.18,True,True,,
675,1.0,25.19,False,True,,
663,1.0,25.07,True,True,,
612,1.0,23.02,True,True,,
583,1.0,22.04,True,True,,
525,1.0,20.19,True,True,,
489,1.0,19.03,True,False,,
457,1.0,17.25,True,True,,


In [220]:
# A question counts as "used" if it appears in a completed set or in a
# generated set that has not been taken yet.
_used_sources = [completed_full, incomplete_sets]
used_qs = pd.concat(objs=_used_sources, ignore_index=False, sort=False)
used_qs

Unnamed: 0_level_0,Set Id,Local Id,Completed,Correct,Date,Unnamed: 4,Test Taken
Global Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
807,1.0,30.20,True,True,,,
722,1.0,27.19,True,True,,,
696,1.0,26.18,True,True,,,
675,1.0,25.19,False,True,,,
663,1.0,25.07,True,True,,,
612,1.0,23.02,True,True,,,
583,1.0,22.04,True,True,,,
525,1.0,20.19,True,True,,,
489,1.0,19.03,True,False,,,
457,1.0,17.25,True,True,,,


In [221]:
# Left-join the catalog with the used questions on the index. Both frames
# carry a 'Local Id' column, so the suffixes tell the two copies apart.
j = scrubbed_cat.join(other=used_qs, how='left', sort=False, lsuffix='_left', rsuffix='_right')

# Catalog rows with no value in the right-hand 'Local Id' are the ones that
# are still available.
_no_usage_record = j['Local Id_right'].isna()
remaining_qs = j.loc[_no_usage_record]
remaining_qs

Unnamed: 0_level_0,Local Id_left,Source,Type,AR,ALG,DA,GEO,ARG,FAD,PCT,...,VC,AQ,DI,Set Id,Local Id_right,Completed,Correct,Date,Unnamed: 4,Test Taken
Global Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,7.01,M,QC,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
51,7.02,M,QC,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
53,7.04,M,NE,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
54,7.05,M,QC,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
56,7.07,M,QC,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
62,7.13,M,QC,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
63,7.14,M,MC,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
68,7.19,M,NE,True,False,False,False,True,False,False,...,False,False,False,,,,,,,
70,7.21,M,MC,True,True,False,False,True,False,False,...,False,False,False,,,,,,,
71,7.22,M,NE,True,True,False,False,True,False,False,...,False,False,False,,,,,,,


In [222]:
def sample_questions(remaining_qs, count_per_key):
    used = set()
    samples = {}
    for k in count_per_key:
        filtered = remaining_qs[remaining_qs[k] == True]
        sample = filtered.sample(count_per_key[k])['Local Id_left']
        rows = list(tuple(zip(sample.index, sample)))
        
        to_remove = []
        to_add = []
        for row in rows:
            if row[0] in used:
                to_remove.append(row)
                
                i = 0
                while i < 3:
                    resampled = filtered.sample(1)['Local Id_left']
                    r = resampled.iloc[0].index
                    if r not in used:
                        to_add.append((r.index, r['Local Id_left']))
        
        for r in to_remove:
            rows.remove(r)
        rows += to_add
        
        samples[k] = sample
        for r in rows:
            used.add(r[0])
            
    return samples

def flatten_and_shuffle(samples):
    """Merge per-key samples into one randomly ordered list of pairs.

    Parameters
    ----------
    samples : dict
        Maps a key to a Series; only the values are used.

    Returns
    -------
    list of tuple
        One ``(index label, value)`` pair per Series entry, shuffled in
        place with the ``random`` module.
    """
    pairs = [
        (label, value)
        for picked in samples.values()
        for label, value in zip(picked.index, picked)
    ]
    random.shuffle(pairs)

    return pairs

In [228]:
# Number of questions to draw for each catalog column. Entries are kept in
# their original order because sampling walks the dict in insertion order.
sample_sizes = {
    'TRI': 4, 'EAF': 4, 'PROB': 4, 'PCT': 4, 'WP': 4,
    'VC': 1,
    'ARG': 3,
    'DST': 2, '2WP': 2,
    'CG': 1, 'AQ': 1,
}

sample_qs = sample_questions(remaining_qs=remaining_qs, count_per_key=sample_sizes)
shuffled = flatten_and_shuffle(samples=sample_qs)
shuffled

[(500, 19.14),
 (593, 22.14),
 (743, 27.4),
 (269, 12.21),
 (76, 7.27),
 (738, 27.35),
 (302, 12.54),
 (93, 8.16),
 (762, 28.17),
 (638, 23.28),
 (107, 8.3),
 (709, 27.06),
 (414, 16.1),
 (56, 7.07),
 (101, 8.24),
 (260, 12.12),
 (455, 17.23),
 (417, 16.13),
 (51, 7.02),
 (639, 23.29),
 (108, 8.31),
 (607, 22.28),
 (725, 27.22),
 (413, 16.09),
 (452, 17.2),
 (644, 23.34),
 (627, 23.17),
 (429, 16.25),
 (819, 30.32),
 (280, 12.32)]

In [229]:
# Sanity check: True if any sampled id already appears in the used questions'
# index. Membership is tested directly, not the truthiness of the matching
# ids, so a matching id of 0 is still reported.
any(global_id in used_qs.index for global_id, _ in shuffled)

False

In [230]:
# NOTE(review): absolute, machine-specific output location; consider making
# it relative to a configurable directory so the notebook runs elsewhere.
_output_csv = r'C:\Users\bille\Documents\git\gre_analysis\sup_gen.csv'

# One row per sampled question, written without the positional row index.
df = pd.DataFrame(data=shuffled, columns=['Global Id', 'Local Id'])
df.to_csv(path_or_buf=_output_csv, index=False)