In [210]:
import pandas as pd
import os 
# instances = pd.read_csv("../../../2_generate_instances/idiom_eng_instance.csv")

In [211]:
# Result CSVs, one per batch, listed in batch order (batch 1 first).
batch_result_paths = [
    "batch_1_summary/batch_1_result_5051395.csv",
    "batch_2_summary/batch_2_result_5051730.csv",
    "batch_3_summary/batch_3_result_5052140_ver2.csv",
    "batch_4_summary/batch_4_result_5053217.csv",
]
batches = [pd.read_csv(path) for path in batch_result_paths]
# Keep the individual names available alongside the list.
batch1, batch2, batch3, batch4 = batches

## Configuration

In [235]:
# Batch to summarise; used as a 1-based position into `batches`.
batch_num = 4
# Output folder for this batch. The trailing slash lets it be prepended
# directly to a file name.
slash = f"batch_{batch_num}_summary/"
# When True, each summary table is also written to a CSV under `slash`.
to_csv = True

# Number of consecutive result rows (worker assignments) grouped into one HIT.
NUM_ASSIGNMENTS = 3
# Number of problems per HIT (the `_0` .. `_4` column suffixes).
NUM_PROBLEMS = 5
# Placeholder idiom value; instances carrying it are dropped from the summary.
NA = 'NOT_AVAILABLE: SELECT_A_RANDOM_CHOICE'

In [236]:
# Pick the configured batch (batch_num is 1-based).
batch = batches[batch_num - 1]

# When the results carry an `Answer.TimeMe` column, use it as the work time.
# Batches without that column keep their existing `WorkTimeInSeconds`.
# An explicit membership test replaces the former bare `except`, so unrelated
# errors are no longer silently swallowed.
if 'Answer.TimeMe' in batch.columns:
    batch['WorkTimeInSeconds'] = batch['Answer.TimeMe']

# Create the output folder if needed; an already-existing folder is fine, any
# other failure (e.g. permissions) is now reported instead of ignored.
os.makedirs(slash, exist_ok=True)

## Original
- Retains the original batch result format as much as possible
- Includes the `Answer.feedback` column when `batch_num >= 3`

In [237]:
# Column list derived from NUM_PROBLEMS instead of being spelled out by hand:
# the four input fields of each problem, problem by problem, followed by the
# answer column of each problem.
original_cols = [
    f"Input.{field}_{i}"
    for i in range(NUM_PROBLEMS)
    for field in ("idiom", "meaning", "instance", "label")
] + [f"Answer.{i}_gold" for i in range(NUM_PROBLEMS)]

# The feedback column is only selected for batch 3 and later.
if batch_num >= 3:
    original_cols.append('Answer.feedback')

batch_original = batch[original_cols]
if to_csv:
    batch_original.to_csv(slash + f'batch_{batch_num}_original.csv', index=False)

## Instance

In [238]:
# Task-input columns: these are expected to be identical for every assignment
# of the same HIT, so only one copy per HIT is kept after grouping.
no_agg_cols = [
    f"Input.{field}_{i}"
    for i in range(NUM_PROBLEMS)
    for field in ("idiom", "meaning", "instance", "label")
]
answer_cols = [f"Answer.{i}_gold" for i in range(NUM_PROBLEMS)]
worker_meta_cols = ["WorkerId", "WorkTimeInSeconds", "AssignmentId", "HITId", "HITTypeId"]
batch_instance = batch[no_agg_cols + answer_cols + worker_meta_cols]

# Rows are grouped by position: every NUM_ASSIGNMENTS consecutive rows are
# treated as the assignments of one HIT.
hit_group = batch_instance.index // NUM_ASSIGNMENTS

# Guard that positional assumption. If the inputs differ inside a group, the
# rows are not aligned to HITs and the answers of different HITs would be
# merged silently, so fail loudly instead.
mixed_inputs = (
    batch_instance.groupby(hit_group)[no_agg_cols]
    .nunique(dropna=False)
    .gt(1)
    .any(axis=1)
)
if mixed_inputs.any():
    raise ValueError(
        f"{int(mixed_inputs.sum())} group(s) of {NUM_ASSIGNMENTS} consecutive rows "
        "contain differing task inputs; rows are not aligned to HITs"
    )

# Collect every column into a per-HIT list (one entry per assignment).
batch_instance = batch_instance.groupby(
    hit_group
).agg(lambda values: list(values)).reset_index(drop=True)

# The task inputs are the same for all assignments, so keep the first entry only.
batch_instance[no_agg_cols] = batch_instance[no_agg_cols].apply(
    lambda col: col.map(lambda values: values[0])
)

def columns_to_list(row):
    """Gather the per-problem columns of one HIT row into list-valued fields.

    Adds five entries to ``row`` (``idiom``, ``meaning``, ``instance``,
    ``label`` and ``gold``), each a list with one element per problem
    (``NUM_PROBLEMS`` elements), read from the ``Input.<field>_<i>`` and
    ``Answer.<i>_gold`` columns. Returns the same ``row`` object.
    """
    problem_ids = range(NUM_PROBLEMS)
    for field in ('idiom', 'meaning', 'instance', 'label'):
        row[field] = [row[f'Input.{field}_{k}'] for k in problem_ids]
    row['gold'] = [row[f'Answer.{k}_gold'] for k in problem_ids]
    return row
# Build the list-valued per-problem fields on every HIT row ...
batch_instance = batch_instance.apply(columns_to_list, axis=1)
# ... then unpack them so that each problem gets a row of its own.
per_problem_fields = ['idiom', 'meaning', 'instance', 'label', 'gold']
batch_instance = batch_instance.explode(per_problem_fields)
batch_instance = batch_instance.reset_index(drop=True)

def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Ties are resolved in favour of the element that appears first in ``lst``.
    This keeps the result reproducible: picking the maximum over a ``set``
    depends on set iteration order, which for strings can change from one
    interpreter run to the next.

    Elements must be hashable. Raises ``ValueError`` when ``lst`` is empty.
    """
    # Single counting pass; dicts keep insertion order, so the first-seen
    # element is also the first candidate `max` encounters.
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # `max` returns the first maximal key, i.e. the earliest element among ties.
    return max(counts, key=counts.get)
def aggregate(row, fast_seconds=20):
    """Summarise the assignments of one problem instance.

    Expects ``row`` to hold per-assignment lists under ``WorkerId``,
    ``WorkTimeInSeconds``, ``AssignmentId``, ``HITId``, ``HITTypeId`` and
    ``gold`` (one entry per assignment), plus the scalar ``label``.

    Adds to ``row`` and returns it:
      - ``assignment_<i>``: dict with the assignment's metadata, its ``gold``
        answer and a ``bad`` flag;
      - ``gold_<i>``: the answer given in assignment ``i``;
      - ``maxVote``: the most common answer (via ``most_common``);
      - ``review_noAgreement``: True when every assignment gave a different answer;
      - ``review_badExists``: True when at least one assignment is flagged ``bad``;
      - ``review_maxVoteGptDiff``: True when ``maxVote`` differs from ``label``.

    ``fast_seconds`` is the work-time threshold (inclusive) at or below which
    an assignment that disagrees with the majority is flagged ``bad``.
    """
    golds = [row['gold'][i] for i in range(NUM_ASSIGNMENTS)]

    for i in range(NUM_ASSIGNMENTS):
        row[f'assignment_{i}'] = {
            'WorkerId': row['WorkerId'][i],
            'WorkTimeInSeconds': row['WorkTimeInSeconds'][i],
            'AssignmentId': row['AssignmentId'][i],
            'HITId': row['HITId'][i],
            'HITTypeId': row['HITTypeId'][i],
            'gold': golds[i],
        }
        row[f'gold_{i}'] = golds[i]

    row['maxVote'] = most_common(golds)

    # No two assignments agree when the number of distinct answers equals
    # the number of assignments.
    row['review_noAgreement'] = len(set(golds)) == NUM_ASSIGNMENTS

    # An assignment is "bad" when it was completed quickly AND disagrees with
    # the majority answer. Coerced to a plain bool so the flag is rendered
    # consistently when the dict is written out.
    bad_flags = []
    for i in range(NUM_ASSIGNMENTS):
        is_bad = bool(
            row['WorkTimeInSeconds'][i] <= fast_seconds and golds[i] != row['maxVote']
        )
        row[f'assignment_{i}']['bad'] = is_bad
        bad_flags.append(is_bad)
    row['review_badExists'] = any(bad_flags)

    row['review_maxVoteGptDiff'] = row['maxVote'] != row['label']

    return row

batch_instance = batch_instance.apply(aggregate, axis=1)

# Output columns, with the per-assignment ones derived from NUM_ASSIGNMENTS so
# they stay in step with what `aggregate` produces.
# NOTE(review): `review_badExists` is computed by `aggregate` but is not part
# of this selection — confirm whether it should be exported as well.
instance_output_cols = (
    ['idiom', 'meaning', 'instance', 'label']
    + [f'assignment_{i}' for i in range(NUM_ASSIGNMENTS)]
    + [f'gold_{i}' for i in range(NUM_ASSIGNMENTS)]
    + ['maxVote', 'review_noAgreement', 'review_maxVoteGptDiff']
)
batch_instance = batch_instance[instance_output_cols]

# Drop instances whose idiom is the NA placeholder.
batch_instance = batch_instance[batch_instance['idiom'] != NA].reset_index(drop=True)
if to_csv:
    batch_instance.to_csv(slash + f'batch_{batch_num}_instance.csv', index=False)

In [239]:
# Per-worker tallies, accumulated over every (instance, assignment) pair.
# NOTE(review): `batch_instance` holds one row per problem instance, so
# `numHIT` and `total_time` are counted per instance rather than per HIT (the
# same work time is added once for each instance of a HIT) — confirm this is
# the intended unit.
worker_stats = {}
for _, instance_row in batch_instance.iterrows():
    for slot in range(NUM_ASSIGNMENTS):
        assignment = instance_row[f'assignment_{slot}']
        stats = worker_stats.setdefault(assignment['WorkerId'], {
            'numHIT': 0,
            'total_time': 0,
            'gold_maxvote_diff': 0,
            'gold_gpt_diff': 0,
        })
        stats['numHIT'] += 1
        stats['total_time'] += assignment['WorkTimeInSeconds']
        stats['gold_maxvote_diff'] += 1 if assignment['gold'] != instance_row['maxVote'] else 0
        stats['gold_gpt_diff'] += 1 if assignment['gold'] != instance_row['label'] else 0

# Passing the columns explicitly keeps the frame well-formed even when there
# are no instances at all.
batch_worker = pd.DataFrame.from_dict(
    worker_stats,
    orient='index',
    columns=['numHIT', 'total_time', 'gold_maxvote_diff', 'gold_gpt_diff'],
)

# Derived per-worker averages, computed once from the final totals.
batch_worker['time_per_HIT'] = batch_worker['total_time'] / batch_worker['numHIT']
batch_worker['mean_gold_maxvote_diff'] = batch_worker['gold_maxvote_diff'] / batch_worker['numHIT']
batch_worker['mean_gold_gpt_diff'] = batch_worker['gold_gpt_diff'] / batch_worker['numHIT']

# The worker IDs live in the index; write it out (as a `WorkerId` column) so
# the rows of the CSV can be attributed to workers.
if to_csv:
    batch_worker.to_csv(slash + f'batch_{batch_num}_worker.csv', index_label='WorkerId')
batch_worker

Unnamed: 0,numHIT,total_time,gold_maxvote_diff,gold_gpt_diff,time_per_HIT,mean_gold_maxvote_diff,mean_gold_gpt_diff
A1HKYY6XI2OHO1,240,6935.575,29,51,28.898229,0.120833,0.2125
A2A6FH0F7LD9ND,228,7497.891,59,84,32.885487,0.258772,0.368421
AURYD2FH3FUOQ,255,8053.835,27,52,31.583667,0.105882,0.203922
A2LMQ4497NMK3S,315,45685.93,60,85,145.034698,0.190476,0.269841
AKQAI78JTXXC9,110,8058.795,10,27,73.261773,0.090909,0.245455
A1P3Z24Y6GRNVA,185,8522.08,44,56,46.065297,0.237838,0.302703
A1QUQ0TV9KVD4C,223,5934.17,25,60,26.610628,0.112108,0.269058
A26UIS59SY4NM6,85,13752.615,14,28,161.795471,0.164706,0.329412
A3CGQOJC28OVGN,88,2498.274,19,22,28.389477,0.215909,0.25
AX8NXTT8QMGHC,210,10529.61,26,51,50.141,0.12381,0.242857


In [240]:
from statsmodels.stats.inter_rater import aggregate_raters
from statsmodels.stats.inter_rater import fleiss_kappa
import random
# Fix the seed of Python's global RNG; `aggregate_golds_exclude_worker` draws
# from it when it has to drop a random assignment.
random.seed(42)
# One column per assignment, derived from NUM_ASSIGNMENTS so the selection
# matches the columns the aggregation helpers iterate over.
golds = batch_instance[[f'assignment_{i}' for i in range(NUM_ASSIGNMENTS)]]

def aggregate_golds(row):
    """Replace each ``assignment_<i>`` dict in ``row`` with its ``gold`` answer.

    Returns the same ``row`` object, now holding one answer per assignment.
    """
    for column in (f'assignment_{k}' for k in range(NUM_ASSIGNMENTS)):
        row[column] = row[column]['gold']
    return row

def aggregate_golds_exclude_worker(row, EXCLUDED_WORKER ='A1AXQM54WTRGA5'):
    """Return the answers of one instance with a single assignment left out.

    The assignment of ``EXCLUDED_WORKER`` is dropped when that worker took
    part; otherwise one assignment is dropped at random (global ``random``
    state). The remaining ``NUM_ASSIGNMENTS - 1`` answers are returned, in
    their original order, as ``fixed_assignment_0`` .. ``fixed_assignment_<n-2>``.
    """
    assignments = [row[f'assignment_{k}'] for k in range(NUM_ASSIGNMENTS)]
    worker_ids = [assignment['WorkerId'] for assignment in assignments]

    # Position of the assignment to drop: the excluded worker's (first match),
    # or a random one when that worker did not take part.
    try:
        dropped = worker_ids.index(EXCLUDED_WORKER)
    except ValueError:
        dropped = random.randint(0, NUM_ASSIGNMENTS - 1)

    kept_golds = [
        assignment['gold']
        for position, assignment in enumerate(assignments)
        if position != dropped
    ]
    for slot, gold in enumerate(kept_golds):
        row[f'fixed_assignment_{slot}'] = gold

    return row[[f'fixed_assignment_{i}' for i in range(NUM_ASSIGNMENTS - 1)]]

# Reduce every assignment dict to the answer it carries.
golds = golds.apply(aggregate_golds, axis=1)
# Alternative kept for reference: leave one assignment out per instance.
# golds = golds.apply(aggregate_golds_exclude_worker, axis=1)

# `aggregate_raters` returns a pair; only its first element is used here and
# passed on to `fleiss_kappa`.
rating_counts, _rating_categories = aggregate_raters(golds)
golds = rating_counts
kappa = fleiss_kappa(golds, method='fleiss')

kappa

0.5247389943196729

In [241]:
# Disabled by default: this branch only runs when the condition is edited by
# hand. It collects the instances a given worker took part in, passes the
# workers of those instances to a project-local qualification helper, and
# writes the affected instances to a CSV.
if False:
    import sys
    # Make the project-local MTurk helpers importable.
    sys.path.append('../../mturk_workspace/')
    sys.path.append('../../mturk_workspace/utils/qualifications/')
    from assign_workers_to_qualification_list import main as assign_workers

    def exclude(row, EXCLUDED_WORKER ='A1AXQM54WTRGA5'):
        """Set ``row['excluded']`` to whether ``EXCLUDED_WORKER`` holds any assignment of ``row``."""
        worker_ids = [row[f'assignment_{i}']['WorkerId'] for i in range(NUM_ASSIGNMENTS)]
        row['excluded'] = EXCLUDED_WORKER in worker_ids
        return row

    # Keep only the instances in which the excluded worker took part.
    batch_exclude = batch_instance.apply(exclude, axis=1).reset_index(drop=True)
    batch_exclude = batch_exclude[batch_exclude['excluded']]
    # Collect the IDs of ALL workers on those instances (not only the excluded one).
    worker_ids = set()
    for i in range(len(batch_exclude)):
        row = batch_exclude.iloc[i]
        for j in range(NUM_ASSIGNMENTS):
            worker_ids.add(row[f'assignment_{j}']['WorkerId'])
    # Build the frame handed to the helper: one row per worker with a fixed
    # integer value and qualification type id.
    # NOTE(review): what `assign_workers` does with this frame is not visible
    # here — presumably it assigns the qualification on MTurk; confirm before enabling.
    worker_ids = pd.DataFrame(list(worker_ids), columns=['worker_id'])
    worker_ids['integer_value'] = -1
    worker_ids['qualification_type_id'] = '3AWVFG4SGD9V9ROD3J1XD1PIYMHBRJ'
    assign_workers(df=worker_ids)
    # NOTE(review): the output file name is fixed to "batch3-ver2" regardless
    # of `batch_num`, and is written to the working directory rather than `slash`.
    batch_exclude[["idiom", "meaning", 
                "instance", "label"]].to_csv('idiom-usage-classification-batch3-ver2.csv', index=False)