In [4]:
import os 
import pandas as pd
import numpy as np

# Directory with the group-mention categorization annotations.
# Relative path: it is resolved against the kernel's current working directory.
data_path = '../../../data/annotations/group_mention_categorization'

In [5]:
# Load the consolidated annotations; the file is tab-separated, which is
# the default delimiter of `pd.read_table`.
fp = os.path.join(data_path, 'consolidated_annotations.tsv')
df = pd.read_table(fp)

In [3]:
# List the distinct attribute/category combinations present in the annotations.
df['attribute_combination'].unique()

array(['economic: class membership', 'economic: ecology of group',
       'economic: education level', 'economic: employment status',
       'economic: income/wealth/economic status',
       'economic: occupation/profession', 'economic: other',
       'non-economic: age', 'non-economic: crime',
       'non-economic: ethnicity', 'non-economic: family',
       'non-economic: gender/sexuality', 'non-economic: health',
       'non-economic: nationality', 'non-economic: other',
       'non-economic: place/location', 'non-economic: religion',
       'non-economic: shared values/mentalities', 'stance: ',
       'universal: '], dtype=object)

In [3]:
def get_cases(df, categories: list[str]):
    """Build a one-row-per-mention table for reviewing the given combinations.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format annotations. The columns read here are ``mention_id``,
        ``text``, ``mention``, ``attribute``, ``category``,
        ``attribute_combination`` and ``label``.
    categories : list[str]
        Values of ``attribute_combination`` to focus on.

    Returns
    -------
    pandas.DataFrame
        One row per mention that has at least one focal combination labelled
        ``'Yes'``. Columns are ``mention_id``, ``text``, ``mention``, then one
        column per focal combination that was labelled ``'Yes'`` somewhere
        (``'Yes'`` or ``''``), then one column per ``attribute`` value holding
        the ``'; '``-joined categories labelled ``'Yes'`` for that mention
        (``''`` when there are none).
    """
    # Rows for the requested combinations and the mentions they belong to.
    selected = df.loc[df['attribute_combination'].isin(categories)]
    selected_ids = selected['mention_id'].unique()

    def _yes_categories(group):
        # Categories labelled 'Yes' within one (mention_id, attribute) group.
        return group['category'][group['label'] == 'Yes'].to_list()

    def _join_categories(values):
        # Non-empty lists become a '; '-separated string, anything else ''.
        if isinstance(values, list) and len(values) > 0:
            return '; '.join(values)
        return ''

    # Context columns: all 'Yes' categories of the selected mentions, per
    # attribute, using only rows that actually carry a category.
    with_category = df[df['category'].notnull()]
    of_selected = with_category[with_category['mention_id'].isin(selected_ids)]
    grouped = of_selected.groupby(['mention_id', 'attribute'])[['category', 'label']]
    per_attribute = grouped.apply(_yes_categories).reset_index()
    # `reset_index` names the unnamed result column 0.
    per_attribute['categories'] = per_attribute[0].apply(_join_categories)
    del per_attribute[0]
    wide_attrs = per_attribute.pivot(
        index='mention_id',
        columns='attribute',
        values='categories',
    ).reset_index()
    wide_attrs.columns.name = None

    # Focal columns: one indicator column per requested combination,
    # restricted to rows labelled 'Yes'.
    confirmed = selected[selected['label'] == 'Yes'].drop_duplicates()
    wide_focal = confirmed.pivot_table(
        index=['mention_id', 'text', 'mention'],
        columns='attribute_combination',
        values='label',
        aggfunc='first',
        fill_value='',
    ).reset_index()
    wide_focal.columns.name = None

    return wide_focal.merge(wide_attrs, on='mention_id', how='left')

In [4]:
# Groups of attribute combinations to review together. Each group becomes one
# sheet whose name is the '; '-joined list of its combinations.
category_groups = [
    ["economic: education level", "economic: occupation/profession"],
    ["economic: occupation/profession", "economic: ecology of group"],
    ["economic: class membership", "economic: employment status", "economic: income/wealth/economic status", "economic: occupation/profession"],
    ["non-economic: family", "non-economic: age", "non-economic: gender/sexuality"],
    ["non-economic: nationality", "non-economic: ethnicity"],
    ["economic: occupation/profession", "non-economic: health"],
    ["economic: ecology of group", "non-economic: shared values/mentalities"],
    ["non-economic: age", "economic: education level"],
]

# One review table per group, keyed by sheet name. A single loop replaces the
# previously copy-pasted cells; keys, order and values are unchanged.
sheets = {}
for categories in category_groups:
    sn = '; '.join(categories)
    sheets[sn] = get_cases(df, categories)

In [13]:
# Write all review tables into one Excel workbook, one sheet per category group.
dest = os.path.join(data_path, 'social-group-mention-categorization-expert-consolidation')
os.makedirs(dest, exist_ok=True)
fn = 'annotations_to_review_conceptually.xlsx'
fp = os.path.join(dest, fn)
with pd.ExcelWriter(fp, engine='openpyxl') as writer:
    # Dedicated loop names: iterating as `for sn, df in ...` would rebind the
    # global `df` (the annotations frame) to the last sheet, so re-running any
    # earlier `get_cases(df, ...)` cell would silently use the wrong data.
    for sheet_key, sheet_df in sheets.items():
        # Excel does not allow ':' or '/' in sheet names, so substitute them.
        sheet_name = sheet_key.replace(':', '-').replace('/', ', ')
        # NOTE(review): Excel limits sheet names to 31 characters and several of
        # these names are longer. Confirm the workbook opens in the reviewers'
        # application; naive truncation would make some names collide.
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

