In [41]:
from types import SimpleNamespace

# Notebook configuration: every input location and column name used below lives in this namespace.
args = SimpleNamespace(
    # base folder of the annotation round; the 'parsed' files are read from and written to it
    data_path='../../../data/annotations/group_mention_categorization/social-group-mention-categorization-round03',
    # text data file and the names of the columns that are read from it
    text_data_file='../../../data/labeled/manifesto_sentences_predicted_group_mentions_spans.tsv',
    text_col='sentence_text',
    text_id_col='sentence_id',
    mention_col='text',
    mention_id_col='span_nr',
    # template that combines a text id and a mention id into one mention identifier
    mention_id_format='{text_id}-{mention_id}',
    # YAML file holding the attribute ontology
    attributes_file='../../../data/annotations/group_mention_categorization/group_attributes_v2.yaml',
)

In [42]:
import os
import yaml
import pandas as pd

In [43]:
# Read the attribute ontology from YAML.
# NOTE(review): yaml.FullLoader is kept as in the original; yaml.safe_load would presumably
# be sufficient if the file only contains plain mappings — confirm before switching.
with open(args.attributes_file, 'r') as f:
    ontology = yaml.load(f, Loader=yaml.FullLoader)

# One row per (question, attribute); q_category is the attribute's 1-based position
# within its question.
attributes = pd.DataFrame([
    {'q_id': question, 'q_category': position, 'label': attribute}
    for question, spec in ontology['social_group'].items()
    for position, attribute in enumerate(spec['attributes'].keys(), start=1)
])

# Normalise question ids ('non_' -> 'non-') and the HTML-styled "other" label,
# and store the category codes as strings.
attributes['q_id'] = attributes['q_id'].str.replace('non_', 'non-')
attributes['label'] = attributes['label'].str.replace('<i>Other attribute</i>', 'other')
attributes['q_category'] = attributes['q_category'].astype(str)

# Lookup tables from category code to attribute label, one per attribute question.
econ_attributes_map = dict(
    attributes
    .loc[attributes['q_id'] == 'economic_attributes', ['q_category', 'label']]
    .itertuples(index=False, name=None)
)
nonecon_attributes_map = dict(
    attributes
    .loc[attributes['q_id'] == 'non-economic_attributes', ['q_category', 'label']]
    .itertuples(index=False, name=None)
)

In [46]:
# Load the parsed annotator responses of this round.
fp = os.path.join(args.data_path, 'parsed', 'annotations.tsv')
annotations = pd.read_csv(fp, sep='\t')

# Replace missing values: category codes become -1 (and the column integer),
# category names become the empty string.
annotations['q_category'] = annotations['q_category'].fillna(-1).astype(int)
annotations['category'] = annotations['category'].fillna('')

# number of distinct annotated mentions
annotations['mention_id'].nunique()

179

In [47]:
# Preview the first five annotation rows.
annotations.head(n=5)

Unnamed: 0,mention_id,text,mention,q_id,q_category,category,annotator,response
0,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,universal_attributes,-1,,Ford,No
1,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,1,class membership,Ford,No
2,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,2,employment status,Ford,No
3,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,3,education level,Ford,No
4,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,4,income/wealth/economic status,Ford,No


In [50]:
# Load the reviewed examples.
fp = os.path.join(args.data_path, 'parsed', 'reviewed_examples.tsv')
consolidated = pd.read_csv(fp, sep='\t')

# apply expert decision to disagreement cases: wherever a decision was recorded,
# it replaces the stored response
consolidated['response'] = consolidated['response'].where(
    consolidated['decision_hauke'].isna(),
    consolidated['decision_hauke'],
)
consolidated['response'] = consolidated['response'].str.title()

# missing category names become the empty string
consolidated['category'] = consolidated['category'].fillna('')

# number of distinct reviewed mentions
consolidated['mention_id'].nunique()

95

In [52]:
# Restrict the annotations to the three attribute questions.
out = annotations[annotations['q_id'].isin(['universal_attributes', 'economic_attributes', 'non-economic_attributes'])]

# Group on every column except the last two and join the distinct non-missing
# responses of each group with '; '.
cols = out.columns[:-2].tolist()
out = (
    out.groupby(cols)['response']
    .agg(lambda x: '; '.join(x.dropna().unique().tolist()))
    .reset_index()
)
# more than one distinct response in a group shows up as a '; ' separator
out['coder_disagreement'] = out['response'].str.contains('; ')

# Order rows by mention, then question (universal, economic, non-economic), then category code.
out = (
    out.assign(q_vals=out['q_id'].map({'universal_attributes': 1, 'economic_attributes': 2, 'non-economic_attributes': 3}))
    .sort_values(by=['mention_id', 'q_vals', 'q_category'], ascending=True)
    .drop(columns='q_vals')
)

out['mention_id'].nunique()

179

In [53]:
# Keep only the mentions for which no row is flagged as a coder disagreement.
# `Series.any()` yields a scalar, so it is negated with `not`: `~` is a bitwise operator
# that only acts as logical negation on numpy booleans (on a plain `bool`, `~True == -2`,
# which is truthy and not a valid boolean filter result).
aggreement_cases = out.groupby('mention_id').filter(lambda x: not x.coder_disagreement.any())
# the flag is False for every remaining row, so it is dropped
del aggreement_cases['coder_disagreement']

aggreement_cases.mention_id.nunique()

84

In [55]:
# Distinct mention counts: all annotations, agreement cases, reviewed cases.
tuple(frame['mention_id'].nunique() for frame in (annotations, aggreement_cases, consolidated))

(179, 84, 95)

In [56]:
# Stack the agreement cases and the reviewed cases, restricting the latter to the
# columns of the former, and renumber the rows.
out = pd.concat(
    [aggreement_cases, consolidated.loc[:, aggreement_cases.columns]],
    ignore_index=True,
)

In [64]:
# TODO: discard this line
# Drops every mention that has at least one response equal to '??'.
# The scalar from `.any()` is negated with `not`; bitwise `~` only works as logical
# negation on numpy booleans and turns a plain `bool` into -1 / -2.
out = out.groupby('mention_id').filter(lambda x: not (x.response == '??').any())

In [65]:
# number of distinct mentions left after filtering
out['mention_id'].nunique()

151

In [66]:
# number of rows per question
out.q_id.value_counts()

q_id
non-economic_attributes    1661
economic_attributes        1057
universal_attributes        151
Name: count, dtype: int64

In [67]:
# Response distribution for the universal-attributes question.
# NOTE: this is expected because of attribute query-based sampling
out[out['q_id'] == 'universal_attributes']['response'].value_counts()

response
No    151
Name: count, dtype: int64

In [None]:
# Category-by-response counts for the economic attributes (absent combinations shown as 0).
# NOTE: this worked nicely as we have succeeded in oversampling attribute instances that were underrepresented when sampling at random
(
    out.loc[out['q_id'] == 'economic_attributes', ['category', 'response']]
    .value_counts(sort=False)
    .unstack()
    .fillna(0)
    .astype(int)
)

response,No,Yes
category,Unnamed: 1_level_1,Unnamed: 2_level_1
class membership,146,5
ecology of group,141,10
education level,141,10
employment status,149,2
income/wealth/economic status,150,1
occupation/profession,143,8
other,151,0


In [None]:
# Category-by-response counts for the non-economic attributes (absent combinations shown as 0).
# NOTE: this worked nicely as we have succeeded in oversampling attribute instances that were underrepresented when sampling at random
(
    out.loc[out['q_id'] == 'non-economic_attributes', ['category', 'response']]
    .value_counts(sort=False)
    .unstack()
    .fillna(0)
    .astype(int)
)

response,No,Yes
category,Unnamed: 1_level_1,Unnamed: 2_level_1
age,145,6
crime,136,15
ethnicity,124,27
family,137,14
gender/sexuality,114,37
health,136,15
nationality,139,12
other,149,2
place/location,148,3
religion,131,20


In [71]:
# sanity check: count the rows whose mention string does not occur in the row's text
(~out.apply(lambda row: row['mention'] in row['text'], axis=1)).sum()

0

In [120]:
# Load the text data and derive the identifiers used to link it to the annotations.
texts = pd.read_csv(args.text_data_file, sep='\t')

# mention id: text id and mention id combined through the configured template
texts['mention_id'] = [
    args.mention_id_format.format(text_id=text_id, mention_id=mention_id)
    for text_id, mention_id in zip(texts[args.text_id_col], texts[args.mention_id_col])
]
# manifesto id: the part of the text id in front of the first '-'
texts['manifesto_id'] = texts[args.text_id_col].str.split('-').str[0]

# harmonise column names with the annotation tables
texts = texts.rename(columns={args.mention_col: 'mention', args.text_col: 'text', 'label': 'group_type'})
texts.head()

Unnamed: 0,country_iso3c,sentence_id,text,span_nr,group_type,mention,mention_id,manifesto_id
0,AUS,63110_200410-00001,The Greens believe that everyone has a right t...,1,social group,everyone,63110_200410-00001-1,63110_200410
1,AUS,63110_200410-00010,The most obvious example in Australia is the a...,1,social group,Indigenous peoples,63110_200410-00010-1,63110_200410
2,AUS,63110_200410-00016,Our plan for the timber industry moves wood pr...,1,organizational group,the timber industry,63110_200410-00016-1,63110_200410
3,AUS,63110_200410-00017,It promotes paper recycling and incorporates r...,1,social group,workers,63110_200410-00017-1,63110_200410
4,AUS,63110_200410-00030,Australians are the world's highest per capita...,1,social group,Australians,63110_200410-00030-1,63110_200410


In [121]:
# get leading and trailing texts with shift window width 2
# `texts` holds one row per mention id (text id + span number), so a text with several
# spans presumably occupies several consecutive rows. Shifting those rows directly would
# give such a text its own content as "previous"/"next" context, so the window is computed
# over unique texts and then mapped back onto every mention row.
# NOTE(review): neighbours are taken in file order within a manifesto; if the file only
# lists texts that contain a mention, they need not be textually adjacent — confirm.
sentence_texts = texts.drop_duplicates(subset=args.text_id_col).set_index(args.text_id_col)
texts['prev_texts'] = texts[args.text_id_col].map(
    sentence_texts.groupby('manifesto_id').text.shift((2, 1)).apply(lambda x: '  '.join(x.dropna().tolist()), axis=1)
)
texts['next_texts'] = texts[args.text_id_col].map(
    sentence_texts.groupby('manifesto_id').text.shift((-1, -2)).apply(lambda x: '  '.join(x.dropna().tolist()), axis=1)
)

In [122]:
# Attach the context texts to every consolidated annotation row.
# The join key is spelled out instead of being inferred from the shared column names, and
# `validate` makes pandas raise if a mention id occurs more than once in `texts`, which
# would otherwise silently duplicate annotation rows.
df = out.merge(
    texts[['mention_id', 'prev_texts', 'next_texts']],
    on='mention_id',
    how='left',
    validate='many_to_one',
    indicator=True,
)
# every annotated mention must have a match in the text data
assert (df['_merge'] == 'both').all(), 'some annotated mentions have no match in the text data'

In [123]:
# the consolidated response is exported under the name 'label'
df = df.rename(columns={'response': 'label'})

In [124]:
# Columns of the exported table, in output order; everything else is dropped.
cols = [
    'mention_id',
    'text',
    'mention',
    'prev_texts',
    'next_texts',
    'q_id',
    'q_category',
    'category',
    'label',
]
df = df.loc[:, cols]

In [125]:
# Write the consolidated annotations into the round's 'parsed' folder as a
# tab-separated file without the index column.
fp = os.path.join(args.data_path, 'parsed', 'consolidated_annotations.tsv')
df.to_csv(fp, index=False, sep='\t')