In [1]:
import pandas as pd
import numpy as np
# Load the raw results table; the cells below read from and add columns to this frame.
df = pd.read_csv("hit_result.csv")

In [2]:
def f(row):
    """Store ``[row['gold_0'], row['gold_1']]`` under ``row['gold']``.

    The mapping passed in is modified in place and is also the return value,
    which is what lets it be used with ``DataFrame.apply(..., axis=1)``.
    """
    first_label, second_label = row['gold_0'], row['gold_1']
    row['gold'] = [first_label, second_label]
    return row
# Run f on every row (axis=1) so each row gains a 'gold' list built from gold_0 and gold_1.
df = df.apply(f, axis=1)

In [3]:
import ast
import pandas as pd
import numpy as np

# --- Per-worker disagreement statistics -------------------------------------
# Inputs: `df` with string-encoded list columns 'TimeMe' and 'WorkerId', a
# 'label' column, and the 'gold' list column built in the previous cell.
# Output: `result`, one row per WorkerId with submission counts, disagreement
# counts/percentages and mean time.


def _parse_list(value):
    """Parse ``value`` with ``ast.literal_eval`` if it is a string, else return it unchanged.

    The pass-through keeps this cell re-runnable: after the first run the
    columns already hold lists, and ``ast.literal_eval`` would raise on them.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


def _labels_disagree(labels):
    """Return 1 if any entry of ``labels`` differs from the first one, else 0."""
    return int(any(label != labels[0] for label in labels[1:]))


# Convert TimeMe and WorkerId columns from string to list (TimeMe entries become floats).
# `df` is updated in place because later cells iterate over the parsed WorkerId lists.
df['TimeMe'] = df['TimeMe'].apply(lambda x: [float(i) for i in _parse_list(x)])
df['WorkerId'] = df['WorkerId'].apply(_parse_list)

# Flag worker disagreement per original row *before* exploding, so the flag is
# carried onto every exploded row automatically. This avoids reshaping the
# exploded frame and does not depend on row order or on exactly two workers.
# Then explode so that each row has a single worker ID, gold, and time value;
# DataFrame.explode with a column list requires the lists in a row to have
# matching lengths and raises otherwise instead of silently misaligning.
df_exp = (
    df.assign(workerDisagreement=df['gold'].apply(_labels_disagree))
    .explode(['gold', 'WorkerId', 'TimeMe'])
)

# explode leaves object dtype behind; make the times numeric before averaging.
df_exp['TimeMe'] = df_exp['TimeMe'].astype(float)

# Create a new column for label disagreement (this worker's gold vs. the row's label)
df_exp['labelDisagreement'] = (df_exp['gold'] != df_exp['label']).astype(int)

# Finally, group by worker ID and calculate the required statistics
result = df_exp.groupby('WorkerId').agg(
    num_submission=('label', 'size'),
    num_workerDisagreement=('workerDisagreement', 'sum'),
    num_labelDisagreement=('labelDisagreement', 'sum'),
    avg_time=('TimeMe', 'mean')
).reset_index()

result['percent_workerDisagreement'] = result['num_workerDisagreement'] / result['num_submission']
result['percent_labelDisagreement'] = result['num_labelDisagreement'] / result['num_submission']

In [4]:
# Save the per-worker statistics table; index=False leaves the row index out of the CSV.
result.to_csv('perWorkerDisagreement.csv', index=False)

In [5]:
def filter_rows(df, worker_ids):
    """Return the rows of ``df`` whose 'WorkerId' list contains any of ``worker_ids``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'WorkerId' column whose values are iterables of worker IDs.
    worker_ids : container
        Worker IDs to look for (membership is tested with ``in``).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the matching rows, with an extra
        'WorkerIdPosition' column giving the position of the matching worker
        inside that row's 'WorkerId' list. If several workers in a row match,
        the position of the last one is kept.

    The input frame is not modified: the helper column is added to a copy.
    """
    positions = []
    for workers in df['WorkerId']:
        position = None
        for i, worker in enumerate(workers):
            if worker in worker_ids:
                # No break on purpose: a later match overrides an earlier one.
                position = i
        positions.append(position)

    out = df.copy()
    # Built positionally (one entry per row) so duplicate index labels cannot
    # cause one row's match to be written onto another row. dtype=object keeps
    # the positions as ints next to None instead of coercing them to floats.
    out['WorkerIdPosition'] = pd.Series(positions, index=out.index, dtype=object)
    # Keep only rows where a matching worker was found
    return out[out['WorkerIdPosition'].notna()]


# Keep only the rows whose WorkerId list includes one of these three hard-coded worker IDs.
filtered_df = filter_rows(df, ['A110KENBXU7SUJ', 'A2A6FH0F7LD9ND', 'AKQAI78JTXXC9'])

In [6]:
# Save the filtered rows; index=False leaves the row index out of the CSV.
filtered_df.to_csv('result_problematic.csv', index=False)

In [7]:
# Rows of df are consumed as consecutive pairs: positions 0, 2, 4, ... feed the
# "*_figurative" columns and positions 1, 3, 5, ... feed the "*_literal" columns.
# Both halves are re-indexed from 0 so that the two members of a pair line up.
first_of_pair = df.iloc[::2].reset_index(drop=True)
second_of_pair = df.iloc[1::2].reset_index(drop=True)

paired_df = pd.DataFrame({
    # idiom and meaning are taken from the first row of each pair only
    # (assumes both rows of a pair share them)
    'idiom': first_of_pair['idiom'],
    'meaning': first_of_pair['meaning'],
    'intended_figurative': first_of_pair['instance'],
    'intended_literal': second_of_pair['instance'],
    'maxVote_figurative': first_of_pair['maxVote'],
    'maxVote_literal': second_of_pair['maxVote'],
    'gold_figurative': first_of_pair['gold'],
    'gold_literal': second_of_pair['gold'],
})

# A pair is 'well_formed' when its two maxVote values are exactly one
# 'figurative' and one 'literal', in either order.
vote_first = paired_df['maxVote_figurative']
vote_second = paired_df['maxVote_literal']
votes_as_intended = (vote_first == 'figurative') & (vote_second == 'literal')
votes_swapped = (vote_first == 'literal') & (vote_second == 'figurative')
paired_df['well_formed'] = votes_as_intended | votes_swapped

In [17]:
# Read the second CSV and keep just the join key plus the 'intended_ambiguous' column.
amb_df = pd.read_csv("gen_final_chatgpt.csv").loc[:, ['idiom', 'intended_ambiguous']]

# Inner join on 'idiom': only idioms present in both tables survive.
merged_df = paired_df.merge(amb_df, how='inner', on='idiom')

# Write the combined per-idiom table without the row index.
merged_df.to_csv("per_idiom_dataset.csv", index=False)