In [4]:
import os 
import pandas as pd
import numpy as np

# Directory of the group-mention categorization annotations. The path is
# relative, so it is resolved against the notebook's working directory.
# Both the input file and the export folder used below are built from it.
data_path = '../../../data/annotations/group_mention_categorization'

In [80]:
# Read the consolidated post-review annotations (tab-separated file).
# Later cells use the columns: mention_id, text, mention, attribute, category, label.
fp = os.path.join(data_path, 'consolidated_annotations_post_review.tsv')
df = pd.read_csv(fp, sep='\t')

In [81]:
# Build a wide table with one row per mention and one column per attribute.
# Each cell lists every category that was labelled "Yes" for that
# (mention_id, attribute) pair, joined with "; ".
wider = (
    df.loc[df['label'] == 'Yes']
    .groupby(['mention_id', 'attribute'])['category']
    .agg(lambda cats: '; '.join(cats.to_list()))
    .reset_index()
    .pivot_table(index='mention_id', columns='attribute', values='category', aggfunc='first')
    .reset_index()
)
# Drop the leftover "attribute" label on the column axis.
wider.columns.name = None

# Re-attach text and mention for each mention_id. The left join keeps mentions
# without any "Yes" category (their attribute columns are NaN).
cols = ['mention_id', 'text', 'mention']
wider = df.loc[:, cols].drop_duplicates().merge(wider, how='left', on='mention_id')

In [82]:
import re
def get_cases(keywords: list[str]) -> pd.DataFrame:
    """Collect the mentions that contain any of the given keywords.

    The search is a case-insensitive substring match on the ``mention``
    column of the notebook-level ``wider`` frame.

    Args:
        keywords: Strings to look for. Each one is matched literally (regex
            metacharacters are escaped). An empty list produces an empty
            pattern, which matches every non-missing mention.

    Returns:
        A new DataFrame holding the matching rows of ``wider``, sorted by
        ``mention`` and re-indexed from 0, with three extra columns
        (``comment``, ``add codings``, ``remove codings``) filled with
        empty strings.
    """
    # Escape each keyword so it is searched as plain text, then OR them together.
    pattern = '|'.join(re.escape(k) for k in keywords)
    # na=False: rows whose mention is missing count as "no match". Without it
    # the mask would contain NaN and boolean indexing would raise ValueError.
    is_match = wider['mention'].str.contains(
        pattern, flags=re.IGNORECASE, regex=True, na=False
    )
    # Boolean indexing + sort + reset_index already yield a fresh frame, so
    # adding columns below never touches `wider`.
    out = wider[is_match].sort_values('mention').reset_index(drop=True)
    for col in ('comment', 'add codings', 'remove codings'):
        out[col] = ''
    return out

In [95]:
# Maps a sheet name (the comma-joined keywords) to the DataFrame returned by
# get_cases() for those keywords. Filled by the cells below and written to an
# Excel workbook, one sheet per entry, in the final cell.
sheets = {}

In [96]:
# Cases mentioning workers/employees: store them as a sheet and copy the
# same frame to the clipboard.
keywords = ['worker', 'employee']
sheet_name = ', '.join(keywords)
sheets[sheet_name] = get_cases(keywords)
sheets[sheet_name].to_clipboard()

In [97]:
# Cases mentioning "societ..." (society, societal, ...): store them as a sheet.
keywords = ['societ']
sheets[', '.join(keywords)] = get_cases(keywords)
# Copy the frame that was just stored rather than calling get_cases() a
# second time; this matches the 'worker, employee' cell and avoids
# recomputing the same result.
sheets[', '.join(keywords)].to_clipboard()

In [98]:
# Write every collected keyword sheet into one Excel workbook for manual review.
dest = os.path.join(data_path, 'social-group-mention-categorization-expert-consolidation')
os.makedirs(dest, exist_ok=True)
fn = 'annotations_to_review_wordwise.xlsx'
fp = os.path.join(dest, fn)
with pd.ExcelWriter(fp, engine='openpyxl') as writer:
    # The loop variable must not be named `df`: that would overwrite the
    # notebook-level annotations frame and break re-running earlier cells.
    for sn, sheet_df in sheets.items():
        # Excel sheet names may not contain \ / ? * [ ] : and are limited to
        # 31 characters. ':' and '/' keep their previous replacements; the
        # other forbidden characters become '-'.
        safe_sn = sn.replace(':', '-').replace('/', ', ')
        for bad_char in '\\?*[]':
            safe_sn = safe_sn.replace(bad_char, '-')
        # NOTE(review): names that differ only after the 31st character would
        # collide after truncation; not expected for short keyword lists.
        safe_sn = safe_sn[:31]
        sheet_df.to_excel(writer, sheet_name=safe_sn, index=False)