### Combine scores from volunteers and turkers

In [1]:
import pandas as pd

In [2]:
unneeded_cols = ['worker2', 'worker3', 'worker4', 'worker5', 'n.workers', 'worker.thresh', 'precision', 'recall', 'specificity', 'f.measure']
option_defaults = {'binary': True, 'remove.low.severity': False}

def set_defaults(df):
    for key in option_defaults.keys():
        df[df[key] == option_defaults[key]]
        del df[key]
    return df

In [3]:
vol_scores = pd.read_csv('../../data/raw/vol-accuracy.csv')
vol_scores.insert(2, 'type', 'volunteer')
vol_scores = vol_scores.rename(
    columns={'condition.id': 'condition_id', 'worker1': 'worker_id'}
)

# check that multiuser sessions aren't included
assert len(vol_scores[vol_scores['n.workers'] != 1]) == 0

# remove excess columns
for col in unneeded_cols:
    del vol_scores[col]

# remove column only in volunteers
del vol_scores['prob.no.prob']

# get scores using defaualt options
vol_scores = set_defaults(vol_scores)

vol_scores.head()

Unnamed: 0,condition_id,worker_id,type,granularity,label.type,true.pos,false.pos,true.neg,false.neg
0,70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,5_meter,Problem,27,26,186,17
1,70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,10_meter,Problem,27,15,76,12
2,70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,street,Problem,17,0,0,0
3,72,9501513f-3822-4921-861e-8f1440dee102,volunteer,5_meter,Problem,22,58,161,7
4,72,9501513f-3822-4921-861e-8f1440dee102,volunteer,10_meter,Problem,21,44,59,3


In [4]:
turk_scores = pd.read_csv('../../data/raw/turk-accuracy.csv')
turk_scores.insert(2, 'type', 'turker')
turk_scores = turk_scores.rename(
    columns={'condition.id': 'condition_id', 'worker1': 'worker_id'}
)

# check that multiuser sessions aren't included
assert len(turk_scores[turk_scores['n.workers'] != 1]) == 0

# remove excess columns
for col in unneeded_cols:
    del turk_scores[col]
    
# get scores with default options
turk_scores = set_defaults(turk_scores)

# select turk_scores without severity threshold
turk_scores = turk_scores[turk_scores['low.severity.thresh'].isna()]
del turk_scores['low.severity.thresh']
    
turk_scores.head()

Unnamed: 0,condition_id,worker_id,type,granularity,label.type,true.pos,false.pos,true.neg,false.neg
0,70,A3PPRVK6XK6GP5,turker,5_meter,Problem,20,10,202,24
1,70,A3PPRVK6XK6GP5,turker,10_meter,Problem,19,6,85,20
2,70,A3PPRVK6XK6GP5,turker,street,Problem,10,0,0,7
3,72,A1TNQU9L5L62PL,turker,5_meter,Problem,21,70,149,8
4,72,A1TNQU9L5L62PL,turker,10_meter,Problem,19,44,59,5


In [5]:
# check that columns are the same
assert len(set(vol_scores.columns) - set(turk_scores.columns)) == 0
assert len(set(turk_scores.columns) - set(vol_scores.columns)) == 0

In [6]:
# combine
scores = vol_scores.append(turk_scores, ignore_index=True)
scores.reindex()
scores.head()

Unnamed: 0,condition_id,worker_id,type,granularity,label.type,true.pos,false.pos,true.neg,false.neg
0,70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,5_meter,Problem,27,26,186,17
1,70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,10_meter,Problem,27,15,76,12
2,70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,street,Problem,17,0,0,0
3,72,9501513f-3822-4921-861e-8f1440dee102,volunteer,5_meter,Problem,22,58,161,7
4,72,9501513f-3822-4921-861e-8f1440dee102,volunteer,10_meter,Problem,21,44,59,3


In [7]:
def combine_label_types(df):
        
    g = df.groupby(by=['condition_id', 'worker_id', 'type', 'granularity'])
    return g.sum()
    
scores_comb = combine_label_types(scores)
scores_comb.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,true.pos,false.pos,true.neg,false.neg
condition_id,worker_id,type,granularity,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,10_meter,116,86,1564,54
70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,5_meter,112,136,3262,74
70,01232fef-5a19-4435-8be6-c0da3b38cabd,volunteer,street,76,18,140,4
70,A1DATRS7IUV9B3,turker,10_meter,120,110,1564,99
70,A1DATRS7IUV9B3,turker,5_meter,95,151,3264,132
70,A1DATRS7IUV9B3,turker,street,126,55,136,48
70,A2U9OI2A0C2DG7,turker,10_meter,172,301,1414,47
70,A2U9OI2A0C2DG7,turker,5_meter,153,348,3100,74
70,A2U9OI2A0C2DG7,turker,street,161,172,136,13
70,A2X3QCJK0H18T8,turker,10_meter,102,407,1370,117


In [8]:
scores_comb.to_csv('../../data/interim/processing/scores-comb.csv')