In [1]:
from types import SimpleNamespace

# Notebook configuration: all input locations live on one namespace object.
# Every path is relative (note the leading '../').
args = SimpleNamespace(
    # folder of the annotation round processed by this notebook
    data_path='../../../data/annotations/group_mention_categorization/social-group-mention-categorization-round02',
    # tab-separated file with the mention texts
    text_data_file='../../../data/intermediate/social_group_mentions_ranked.tsv',
    # YAML file describing the group attributes
    attributes_file='../../../data/annotations/group_mention_categorization/group_attributes_v2.yaml',
)

# Disabled options, kept for reference (they are not set on `args`):
# args.text_col = 'text'
# args.text_id_col = 'text_id'
# args.mention_col = 'mention'
# args.mention_id_col = 'mention_nr'
# args.mention_id_format = '{text_id}-{mention_id}'

In [2]:
import os
import yaml
import pandas as pd

In [3]:
def _category_label_map(frame, q_id):
    """Return a ``{q_category: label}`` dict for the rows of `frame` whose q_id equals `q_id`."""
    rows = frame.loc[frame['q_id'] == q_id, ['q_category', 'label']]
    return dict(zip(rows['q_category'], rows['label']))


# Read the attribute ontology. The encoding is explicit so that decoding does not
# depend on the platform's default locale.
with open(args.attributes_file, 'r', encoding='utf-8') as f:
    # NOTE(review): FullLoader can build more than plain data types. If this file
    # only holds plain mappings/lists/strings, yaml.safe_load is the safer choice.
    ontology = yaml.load(f, Loader=yaml.FullLoader)

# One row per (question, attribute). `q_category` is the 1-based position of the
# attribute inside the question's `attributes` mapping.
attributes = pd.DataFrame([
    {
        'q_id': question,
        'q_category': position,
        'label': attribute,
    }
    for question, spec in ontology['social_group'].items()
    for position, attribute in enumerate(spec['attributes'].keys(), start=1)
])

# Both replacements are literal substitutions, so the regex engine is switched off
# explicitly instead of relying on the pandas-version-dependent default.
attributes['q_id'] = attributes['q_id'].str.replace('non_', 'non-', regex=False)
attributes['label'] = attributes['label'].str.replace('<i>Other attribute</i>', 'other', regex=False)

# the category codes are kept as strings
attributes['q_category'] = attributes['q_category'].astype(str)

# lookup tables: category code -> label
econ_attributes_map = _category_label_map(attributes, 'economic_attributes')
nonecon_attributes_map = _category_label_map(attributes, 'non-economic_attributes')
data_quality_map = {'1': 'has_formatting_issue', '2': 'has_translation_issue'}

In [4]:
# Load the reviewed examples of this annotation round.
fp = os.path.join(args.data_path, 'parsed', 'reviewed_examples.tsv')
consolidated = pd.read_csv(fp, sep='\t')

# Apply the expert decision to disagreement cases: wherever a decision is
# recorded it replaces the response; other rows keep their response.
consolidated['response'] = consolidated['response'].mask(
    consolidated['decision'].notna(),
    consolidated['decision'],
)
consolidated['response'] = consolidated['response'].str.title()

# missing categories become empty strings
consolidated['category'] = consolidated['category'].fillna('')

In [5]:
# Load the individual annotations of this round.
fp = os.path.join(args.data_path, 'parsed', 'annotations.tsv')
annotations = pd.read_csv(fp, sep='\t')

# Replace missing values by sentinels (-1 / empty string) before grouping:
# groupby drops rows whose key is NaN by default.
annotations['q_category'] = annotations['q_category'].fillna(-1).astype(int)
annotations['category'] = annotations['category'].fillna('')

# keep only the three attribute questions
out = annotations[annotations['q_id'].isin(['universal_attributes', 'economic_attributes', 'non-economic_attributes'])]

# Group on every column except the last two and collect the distinct non-missing
# responses of each group into one '; '-separated string.
# NOTE(review): assumes the last two columns are the annotator identifier and
# `response` -- confirm against the file's column order.
cols = out.columns[:-2].tolist()
out = (
    out.groupby(cols)
    .agg({'response': lambda x: '; '.join(x.dropna().unique().tolist())})
    .reset_index()
)
# more than one distinct response in a group shows up as the '; ' separator
out['coder_disagreement'] = out['response'].str.contains('; ', regex=False)

# Order rows by mention, then by question (universal < economic < non-economic),
# then by category code; the helper sort key is dropped again afterwards.
out = (
    out.assign(q_vals=out['q_id'].map({'universal_attributes': 1, 'economic_attributes': 2, 'non-economic_attributes': 3}))
    .sort_values(by=['mention_id', 'q_vals', 'q_category'], ascending=True)
    .drop(columns='q_vals')
)

# Keep only mentions without any disagreement. `not` is used instead of `~`:
# bitwise inversion is only a logical negation for numpy booleans, while on a
# plain Python bool it yields -2 / -1.
aggreement_cases = (
    out.groupby('mention_id')
    .filter(lambda g: not g['coder_disagreement'].any())
    .drop(columns='coder_disagreement')
)

In [7]:
# number of distinct mentions in: all annotations, agreement cases, reviewed cases
tuple(
    frame['mention_id'].nunique()
    for frame in (annotations, aggreement_cases, consolidated)
)

(150, 55, 95)

In [8]:
# Stack the agreement cases on top of the reviewed cases, restricted to the
# same columns in the same order, and renumber the rows.
out = pd.concat(
    [
        aggreement_cases,
        consolidated.loc[:, aggreement_cases.columns],
    ],
    ignore_index=True,
)

In [9]:
# distinct mentions in the combined frame
out['mention_id'].nunique()

150

In [10]:
# rows per question
out.q_id.value_counts()

q_id
non-economic_attributes    1650
economic_attributes        1050
universal_attributes        150
Name: count, dtype: int64

In [12]:
# response distribution for the universal-attributes question
out.loc[out['q_id'].eq('universal_attributes'), 'response'].value_counts()

response
No     134
Yes     16
Name: count, dtype: int64

In [13]:
# category x response counts for the economic attributes
(
    out.loc[out['q_id'].eq('economic_attributes'), ['category', 'response']]
    .value_counts(sort=False)
    .unstack()
)

response,No,Yes
category,Unnamed: 1_level_1,Unnamed: 2_level_1
class membership,140.0,10.0
ecology of group,144.0,6.0
education level,141.0,9.0
employment status,132.0,18.0
income/wealth/economic status,129.0,21.0
occupation/profession,114.0,36.0
other,150.0,


In [14]:
# category x response counts for the non-economic attributes
(
    out.loc[out['q_id'].eq('non-economic_attributes'), ['category', 'response']]
    .value_counts(sort=False)
    .unstack()
)

response,No,Yes
category,Unnamed: 1_level_1,Unnamed: 2_level_1
age,138,12
crime,141,9
ethnicity,146,4
family,144,6
gender/sexuality,147,3
health,139,11
nationality,135,15
other,145,5
place/location,146,4
religion,147,3


In [15]:
# sanity check (disabled): count the rows whose mention is not a substring of its text
# out.apply(lambda r: r.mention not in r.text, axis=1).sum()

0

In [16]:
# Read the mention texts, build the mention identifier as '<text_id>-<mention_nr>',
# and keep only the identifier plus the surrounding-text columns.
texts = (
    pd.read_csv(args.text_data_file, sep='\t')
    .assign(mention_id=lambda d: d['text_id'].astype(str) + '-' + d['mention_nr'].astype(str))
    [['mention_id', 'prev_texts', 'next_texts']]
)

In [17]:
# Attach the text columns to every annotation row. The merge uses the columns
# the two frames have in common as keys. `validate='many_to_one'` makes pandas
# raise if those keys are not unique in `texts`, which would otherwise silently
# duplicate annotation rows.
df = out.merge(texts, how='left', indicator=True, validate='many_to_one')

# every annotation row must have found its text row
assert (df['_merge'] == 'both').all(), 'some annotation rows have no matching row in the text data'

In [18]:
# the response column becomes the label column
df = df.rename(columns={'response': 'label'})

In [19]:
# final column order of the exported table
cols = [
    'mention_id',
    'text',
    'mention',
    'prev_texts',
    'next_texts',
    'q_id',
    'q_category',
    'category',
    'label',
]
df = df.loc[:, cols]

In [20]:
# display the consolidated frame (the rendered output is truncated by pandas)
df

Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,q_id,q_category,category,label
0,11110_200209-393006-1,Students with children have great difficulties...,Students with children,The school should have the resources to carry ...,We want to reintroduce a child allowance.\nAs ...,universal_attributes,-1,,No
1,11110_200209-393006-1,Students with children have great difficulties...,Students with children,The school should have the resources to carry ...,We want to reintroduce a child allowance.\nAs ...,economic_attributes,1,class membership,No
2,11110_200209-393006-1,Students with children have great difficulties...,Students with children,The school should have the resources to carry ...,We want to reintroduce a child allowance.\nAs ...,economic_attributes,2,employment status,No
3,11110_200209-393006-1,Students with children have great difficulties...,Students with children,The school should have the resources to carry ...,We want to reintroduce a child allowance.\nAs ...,economic_attributes,3,education level,Yes
4,11110_200209-393006-1,Students with children have great difficulties...,Students with children,The school should have the resources to carry ...,We want to reintroduce a child allowance.\nAs ...,economic_attributes,4,income/wealth/economic status,No
...,...,...,...,...,...,...,...,...,...
2845,97710_200809-386883-4,new employment opportunities according to the ...,foreign workforce,We advocate for:\nElimination of all new busin...,"Childcare, Education, Culture and Sport\nWe ad...",non-economic_attributes,5,nationality,Yes
2846,97710_200809-386883-4,new employment opportunities according to the ...,foreign workforce,We advocate for:\nElimination of all new busin...,"Childcare, Education, Culture and Sport\nWe ad...",non-economic_attributes,6,ethnicity,No
2847,97710_200809-386883-4,new employment opportunities according to the ...,foreign workforce,We advocate for:\nElimination of all new busin...,"Childcare, Education, Culture and Sport\nWe ad...",non-economic_attributes,7,religion,No
2848,97710_200809-386883-4,new employment opportunities according to the ...,foreign workforce,We advocate for:\nElimination of all new busin...,"Childcare, Education, Culture and Sport\nWe ad...",non-economic_attributes,8,health,No


In [21]:
# Write the consolidated annotations into the round's 'parsed' folder as a
# tab-separated file without the row index.
fp = os.path.join(args.data_path, 'parsed', 'consolidated_annotations.tsv')
df.to_csv(path_or_buf=fp, index=False, sep='\t')