In [45]:
from types import SimpleNamespace

# Notebook configuration: input locations and the column names used by the cells below.
args = SimpleNamespace(
    # folder that contains the exported 'responses.tsv' of this annotation round
    data_path='../../../data/annotations/group_mention_categorization/social-group-mention-categorization-round03',
    # tab-separated file that is read into `texts`
    text_data_file='../../../data/labeled/manifesto_sentences_predicted_group_mentions_spans.tsv',
    text_col='sentence_text',
    text_id_col='sentence_id',
    mention_col='text',
    mention_id_col='span_nr',
    # template combining the text id and the mention id into one identifier
    mention_id_format='{text_id}-{mention_id}',
    # YAML file from which the attribute ontology is loaded
    attributes_file='../../../data/annotations/group_mention_categorization/group_attributes_v2.yaml',
)

In [2]:
import os
import yaml
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
# Load the tab-separated text data configured in `args.text_data_file`.
texts = pd.read_csv(args.text_data_file, sep='\t')
# Build one identifier per mention from the text id and the mention id columns.
# Iterating the two columns directly avoids `DataFrame.apply(axis=1)`, which
# materialises a Series for every single row.
texts['mention_id'] = [
    args.mention_id_format.format(text_id=text_id, mention_id=mention_id)
    for text_id, mention_id in zip(texts[args.text_id_col], texts[args.mention_id_col])
]
# Standardise the column names used downstream ('mention' and 'text') without
# mutating the frame in place.
texts = texts.rename(columns={args.mention_col: 'mention', args.text_col: 'text'})
texts.head()

In [4]:
# TODO:
#  - wrap reading and parsing logic in a function 

In [4]:
# Parse the attribute ontology into a table with one row per (question, attribute).
# NOTE(review): yaml.safe_load is the more restrictive loader; consider it if the
# ontology file only holds plain mappings, lists and scalars.
with open(args.attributes_file, 'r') as f:
    ontology = yaml.load(f, Loader=yaml.FullLoader)

attributes = pd.DataFrame(
    [
        # attributes are numbered from 1 in the order they appear in the ontology
        dict(q_id=question, q_category=position, label=attr_label)
        for question, definition in ontology['social_group'].items()
        for position, attr_label in enumerate(definition['attributes'].keys(), start=1)
    ]
)

# align the ontology's spelling with the one used in the response column names
attributes['q_id'] = attributes['q_id'].str.replace('non_', 'non-')
attributes['label'] = attributes['label'].str.replace('<i>Other attribute</i>', 'other')

# category codes are compared against string codes later on
attributes['q_category'] = attributes['q_category'].astype(str)

## Clean the responses data

In [5]:
# Read the raw annotation export 'responses.tsv' from the data folder
# (tab-separated, UTF-16 encoded).
fp = os.path.join(args.data_path, 'responses.tsv')
df = pd.read_csv(fp, sep='\t', encoding='UTF-16')

In [6]:
# Response columns start with '<digits>_<six digits>'; every other column is
# treated as metadata. The pattern is a raw string so that '\d' reaches the regex
# engine untouched instead of being an invalid string escape.
is_data_col = df.columns.str.match(r'^\d+_\d{6}')
# positional indexes of the two column groups
metadata_cols = np.where(~is_data_col)[0]
data_cols = np.where(is_data_col)[0]

In [7]:
# Distinct question names, i.e. the part of each response column name after the last '__'.
df.columns[data_cols].str.split('__').str[-1].unique()

Index(['universal_attributes', 'economic_attributes_1',
       'economic_attributes_2', 'economic_attributes_3',
       'economic_attributes_4', 'economic_attributes_5',
       'economic_attributes_6', 'economic_attributes_7',
       'non-economic_attributes_1', 'non-economic_attributes_2',
       'non-economic_attributes_3', 'non-economic_attributes_4',
       'non-economic_attributes_5', 'non-economic_attributes_6',
       'non-economic_attributes_7', 'non-economic_attributes_8',
       'non-economic_attributes_9', 'non-economic_attributes_10',
       'non-economic_attributes_11', 'stance', 'comments'],
      dtype='object')

In [8]:
# Number of rows per recipient last name.
df.RecipientLastName.value_counts()

RecipientLastName
Eichholz                            3
Ford                                2
Recipient Last Name                 1
{"ImportId":"recipientLastName"}    1
RÃ¶th                                1
Name: count, dtype: int64

In [9]:
annotators = ['Eichholz', 'Ford']
# NOTE: we had a glitch in data collection, so only the last response of each
# annotator counts. The rows to keep are derived from the data instead of
# hard-coding how many earlier submissions have to be skipped.
candidate_positions = np.where(df.RecipientLastName.isin(annotators))[0]
last_position = {}
for pos in candidate_positions:
    # positions are visited in file order, so later rows overwrite earlier ones
    last_position[df.RecipientLastName.iloc[pos]] = pos
# positional row indexes (ascending) of the responses that are kept
ridxs = np.array(sorted(last_position.values()), dtype=int)

In [10]:
# Restrict to the kept response rows, then separate answers from metadata.
kept_rows = df.iloc[ridxs]
data = kept_rows.iloc[:, data_cols]
metadata = kept_rows.iloc[:, metadata_cols]

In [11]:
# Look-up tables between the row label of a kept response and its annotator.
id2annotator = metadata.RecipientLastName.to_dict()
annotator2id = {name: row_label for row_label, name in id2annotator.items()}
annotator2id

{'Ford': 6, 'Eichholz': 7}

## Reshape the responses to long format

In [12]:
# Transpose so that every response column becomes a row; the former column
# names end up in a column called 'index'.
tmp = data.T.reset_index()
# a column name has the form '<mention_id>__<question>'
tmp[['mention_id', 'q_id']] = tmp['index'].str.split('__', expand=True)
tmp = tmp.drop(columns=['index'])
# pivot longer: the per-response columns become rows keyed by 'annotator'
tmp = tmp.melt(
    id_vars=['mention_id', 'q_id'],
    var_name='annotator',
    value_name='value',
)

In [13]:
# Keep only the mentions that occur in the responses, and only the needed columns.
is_annotated = texts.mention_id.isin(tmp.mention_id)
texts = texts.loc[is_annotated, ['mention_id', 'text', 'mention']]

In [14]:
# Split a trailing '_<digits>' or '-<digits>' off the question id and store the digits
# in 'q_category'; question ids without such a suffix get no q_category value.
tmp[['q_id', 'q_category']] = tmp.q_id.str.split(r'[_-](?=\d+$)', regex=True, expand=True)

In [16]:
# Number of distinct mentions in the long-format responses.
tmp.mention_id.nunique()

179

In [17]:
# Number of rows per question id, sorted by question id.
tmp.q_id.value_counts().sort_index()

q_id
comments                    358
economic_attributes        2506
non-economic_attributes    3938
stance                      358
universal_attributes        358
Name: count, dtype: int64

In [18]:
# Tabulate the (value, q_category) combinations of the universal-attributes question,
# counting missing values as well.
tmp[tmp.q_id == 'universal_attributes'][['value', 'q_category']].value_counts(dropna=False)

value  q_category
NaN    NaN           357
Yes    NaN             1
Name: count, dtype: int64

In [19]:
# Record missing responses to the universal-attributes question as 'No'.
is_universal = tmp.q_id == 'universal_attributes'
tmp.loc[is_universal, 'value'] = tmp.loc[is_universal, 'value'].replace({np.nan: 'No'})

In [20]:
# Distribution of responses to the economic-attributes questions, including missing values.
tmp[tmp.q_id == 'economic_attributes'].value.value_counts(dropna=False)

value
NaN       2378
Yes        124
Unsure       4
Name: count, dtype: int64

In [21]:
# Record missing responses to the economic-attributes questions as 'No'.
is_econ = tmp.q_id == 'economic_attributes'
tmp.loc[is_econ, 'value'] = tmp.loc[is_econ, 'value'].replace({np.nan: 'No'})

In [22]:
# Distribution of responses to the non-economic-attributes questions, including missing values.
tmp[tmp.q_id == 'non-economic_attributes'].value.value_counts(dropna=False)

value
NaN           3504
Yes            424
Unsure           8
Yes,Unsure       2
Name: count, dtype: int64

In [23]:
# Record missing responses to the non-economic-attributes questions as 'No' and
# collapse the combined answer 'Yes,Unsure' into 'Unsure'.
is_nonecon = tmp.q_id == 'non-economic_attributes'
recode = {np.nan: 'No', 'Yes,Unsure': 'Unsure'}
tmp.loc[is_nonecon, 'value'] = tmp.loc[is_nonecon, 'value'].replace(recode)

In [24]:
# Distribution of responses to the stance question, including missing values.
tmp[tmp.q_id == 'stance'].value.value_counts(dropna=False)

value
Positive    295
Negative     35
Neutral      16
Unsure       10
NaN           2
Name: count, dtype: int64

In [25]:
# Inspect a single mention with the annotators' answers side by side.
mid = '11110_200609-393916-1'
foo = tmp.loc[tmp.mention_id == mid]
# pivot wider: one column per annotator, filled with the entries of 'value'
foo_wide = foo.pivot(
    index=['mention_id', 'q_id', 'q_category'],
    columns='annotator',
    values='value',
)
foo_wide.reset_index()

annotator,mention_id,q_id,q_category,6,7
0,11110_200609-393916-1,comments,,,
1,11110_200609-393916-1,economic_attributes,1.0,No,No
2,11110_200609-393916-1,economic_attributes,2.0,No,No
3,11110_200609-393916-1,economic_attributes,3.0,No,No
4,11110_200609-393916-1,economic_attributes,4.0,No,No
5,11110_200609-393916-1,economic_attributes,5.0,No,No
6,11110_200609-393916-1,economic_attributes,6.0,No,No
7,11110_200609-393916-1,economic_attributes,7.0,No,No
8,11110_200609-393916-1,non-economic_attributes,1.0,No,No
9,11110_200609-393916-1,non-economic_attributes,10.0,No,No


## Write in useful format to disk



In [26]:
# Code -> label look-ups for the two attribute question blocks.
econ_rows = attributes[attributes.q_id == 'economic_attributes']
econ_attributes_map = dict(zip(econ_rows['q_category'], econ_rows['label']))

nonecon_rows = attributes[attributes.q_id == 'non-economic_attributes']
nonecon_attributes_map = dict(zip(nonecon_rows['q_category'], nonecon_rows['label']))

print(econ_attributes_map)
print(nonecon_attributes_map)

{'1': 'class membership', '2': 'employment status', '3': 'education level', '4': 'income/wealth/economic status', '5': 'occupation/profession', '6': 'ecology of group', '7': 'other'}
{'1': 'age', '2': 'family', '3': 'gender/sexuality', '4': 'place/location', '5': 'nationality', '6': 'ethnicity', '7': 'religion', '8': 'health', '9': 'crime', '10': 'shared values/mentalities', '11': 'other'}


In [56]:
# Attach text and mention to every response row; the right join keeps all rows of `tmp`.
annotations = pd.merge(
    texts[['mention_id', 'text', 'mention']],
    tmp,
    on='mention_id',
    how='right',
)

In [57]:
# replace the response row labels by the annotators' names
annotations['annotator'] = annotations['annotator'].map(id2annotator)

# translate the category codes of the attribute questions into readable labels
for question, code2label in (
    ('economic_attributes', econ_attributes_map),
    ('non-economic_attributes', nonecon_attributes_map),
):
    is_question = annotations.q_id == question
    annotations.loc[is_question, 'category'] = annotations.loc[is_question, 'q_category'].map(code2label)

# fix the column order and call the answer column 'response'
cols = ['mention_id', 'text', 'mention', 'q_id', 'q_category', 'category', 'annotator', 'value']
annotations = annotations[cols].rename(columns={'value': 'response'})

annotations.head()


Unnamed: 0,mention_id,text,mention,q_id,q_category,category,annotator,response
0,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,universal_attributes,,,Ford,No
1,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,1.0,class membership,Ford,No
2,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,2.0,employment status,Ford,No
3,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,3.0,education level,Ford,No
4,11110_200609-393916-1,Sweden will work in the UN to end discriminati...,people on the basis of sexual orientation or g...,economic_attributes,4.0,income/wealth/economic status,Ford,No


In [58]:
# Write the annotations table to '<data_path>/parsed/annotations.tsv' (tab-separated, no index).
dest = os.path.join(args.data_path, 'parsed')
os.makedirs(dest, exist_ok=True)  # create the output folder if it does not exist yet
fp = os.path.join(dest, 'annotations.tsv')
annotations.to_csv(fp, sep='\t', index=False)

## Evaluate inter-coder agreement (ICA)

In [59]:
from sklearn.metrics import cohen_kappa_score, f1_score
import krippendorff
from typing import List, Dict
def compute_metrics(
        a: pd.Series,
        b: pd.Series,
        labels: List[str] = ['Yes', 'No'],
        pos_label: str = 'Yes',
    ) -> Dict:
    """Compute agreement metrics between two annotators' label series.

    Args:
        a: Labels assigned by the first annotator, one entry per unit.
        b: Labels assigned by the second annotator, in the same unit order as ``a``.
        labels: The label set. With exactly two labels a single binary F1 score
            (for ``pos_label``) is reported, otherwise one F1 score per label.
            The default list is never mutated.
        pos_label: The label counted as positive for the ``prop_*`` entry and
            for the binary F1 score.

    Returns:
        A dict with the keys ``n`` (number of units), ``prop_<pos_label>`` (share
        of units where at least one annotator chose ``pos_label``), ``f1_score``
        or ``f1_score_<label>`` per label, ``cohens_kappa`` and
        ``krippendorff_alpha``.
    """
    out = {
        'n': len(a),
        f'prop_{pos_label.lower()}': np.logical_or(a == pos_label, b == pos_label).mean(),
    }
    if len(labels) == 2:
        out['f1_score'] = f1_score(a, b, average='binary', pos_label=pos_label)
    else:
        f1s = f1_score(a, b, average=None, labels=labels)
        out.update({f'f1_score_{l.lower()}': f1s[i] for i, l in enumerate(labels)})
    out['cohens_kappa'] = cohen_kappa_score(a, b)

    # Encode both series with ONE shared category set. Encoding each series on its
    # own assigns codes by that series' own sorted labels, so the same label gets
    # different codes whenever one annotator never used one of the labels.
    observed = pd.unique(pd.concat([a, b], ignore_index=True).dropna())
    shared_dtype = pd.CategoricalDtype(categories=observed)
    codes = np.array(
        [a.astype(shared_dtype).cat.codes, b.astype(shared_dtype).cat.codes],
        dtype=float,
    )
    # categorical code -1 marks a missing value; pass it on as missing (np.nan)
    # instead of letting it count as a category of its own
    codes[codes < 0] = np.nan
    out['krippendorff_alpha'] = krippendorff.alpha(reliability_data=codes, level_of_measurement='nominal')

    return out

In [60]:
# Collects one metrics dict per evaluated question; the cells below add entries keyed by tuples.
ica = {}

In [61]:
# Agreement on the universal-attributes question.
first, second = (annotator2id[name] for name in annotators)

# pivot wider: one column per annotator, filled with the entries of 'value'
foo = (
    tmp[tmp.q_id == 'universal_attributes']
    .pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value')
    .reset_index()
)
# leave out units where at least one annotator answered 'Unsure'
any_unsure = (foo[first] == 'Unsure') | (foo[second] == 'Unsure')
foo = foo[~any_unsure]

ica[('universal_attributes', None)] = compute_metrics(foo[first], foo[second])

In [62]:
# Agreement per individual economic / non-economic attribute.
first, second = (annotator2id[name] for name in annotators)

for attr in attributes.itertuples():
    matches = (tmp.q_id == attr.q_id) & (tmp.q_category == str(attr.q_category))
    foo = tmp[matches]
    if foo.empty:
        # no responses were collected for this attribute
        continue
    # pivot wider: one column per annotator, filled with the entries of 'value'
    foo = foo.pivot(
        index=['mention_id', 'q_id', 'q_category'],
        columns='annotator',
        values='value',
    ).reset_index()
    # leave out units where at least one annotator answered 'Unsure'
    any_unsure = (foo[first] == 'Unsure') | (foo[second] == 'Unsure')
    foo = foo[~any_unsure]

    ica[(attr.q_id, attr.q_category, attr.label)] = compute_metrics(foo[first], foo[second])

In [63]:
# Agreement on whether ANY economic attribute was selected for a mention.
first, second = (annotator2id[name] for name in annotators)

# per mention and annotator: True if any answer differs from 'No'
# (so 'Unsure' counts as selected here)
any_selected = (
    tmp[tmp.q_id == 'economic_attributes']
    .groupby(['mention_id', 'annotator'])['value']
    .agg(lambda v: (v != 'No').any())
)
foo = any_selected.reset_index()
foo['value'] = foo['value'].map({True: 'Yes', False: 'No'})
# one row per mention, one column per annotator
foo = foo.pivot(index=['mention_id'], columns='annotator', values='value').reset_index()

ica[('economic_attributes', 'overall')] = compute_metrics(foo[first], foo[second])

In [64]:
# Agreement on whether ANY non-economic attribute was selected for a mention.
first, second = (annotator2id[name] for name in annotators)

# per mention and annotator: True if any answer differs from 'No'
# (so 'Unsure' counts as selected here)
any_selected = (
    tmp[tmp.q_id == 'non-economic_attributes']
    .groupby(['mention_id', 'annotator'])['value']
    .agg(lambda v: (v != 'No').any())
)
foo = any_selected.reset_index()
foo['value'] = foo['value'].map({True: 'Yes', False: 'No'})
# one row per mention, one column per annotator
foo = foo.pivot(index=['mention_id'], columns='annotator', values='value').reset_index()

ica[('non-economic_attributes', 'overall')] = compute_metrics(foo[first], foo[second])

In [65]:
# Agreement on the stance question, restricted to the three substantive answers.
cats = ['Positive', 'Neutral', 'Negative']
first, second = (annotator2id[name] for name in annotators)

is_stance = (tmp.q_id == 'stance') & tmp.q_category.isna()
foo = tmp[is_stance]
foo = foo[foo.value.notna()]
# pivot wider: one column per annotator, filled with the entries of 'value'
foo = foo.pivot(
    index=['mention_id', 'q_id', 'q_category'],
    columns='annotator',
    values='value',
).reset_index()
# keep only units where both annotators chose one of `cats`
both_valid = foo[first].isin(cats) & foo[second].isin(cats)
foo = foo[both_valid]

ica[('stance', None)] = compute_metrics(foo[first], foo[second], labels=cats)

In [66]:
# One row per entry of `ica`; the parts of the tuple keys become the columns
# q_id / q_category / label.
pd.DataFrame(ica).T.reset_index().rename(columns={'level_0': 'q_id', 'level_1': 'q_category', 'level_2': 'label'})

Unnamed: 0,q_id,q_category,label,n,prop_yes,f1_score,cohens_kappa,krippendorff_alpha,f1_score_positive,f1_score_neutral,f1_score_negative
0,universal_attributes,,,179.0,0.005587,0.0,0.0,0.0,,,
1,economic_attributes,1,class membership,178.0,0.050562,0.8,0.791569,0.791789,,,
2,economic_attributes,2,employment status,179.0,0.067039,0.666667,0.649935,0.65,,,
3,economic_attributes,3,education level,179.0,0.089385,0.896552,0.887445,0.887748,,,
4,economic_attributes,4,income/wealth/economic status,179.0,0.027933,0.75,0.74465,0.745,,,
5,economic_attributes,5,occupation/profession,178.0,0.129213,0.296296,0.268282,0.240684,,,
6,economic_attributes,6,ecology of group,179.0,0.067039,0.956522,0.953543,0.953666,,,
7,economic_attributes,7,other,177.0,0.016949,0.0,0.0,-0.005698,,,
8,non-economic_attributes,1,age,178.0,0.106742,0.48,0.448785,0.442296,,,
9,non-economic_attributes,2,family,178.0,0.106742,0.774194,0.753267,0.75335,,,


## Export for consolidation

In [67]:
# Replace missing category codes / labels by placeholders.
# NOTE(review): presumably needed because these columns are used as group keys
# below and groupby leaves out rows with missing keys by default - confirm.
annotations['q_category'] = annotations['q_category'].mask(annotations['q_category'].isna(), -1)
annotations['category'] = annotations['category'].mask(annotations['category'].isna(), '')

In [68]:
# Per mention, join all non-missing coder comments into a single string.
comment_rows = annotations[annotations.q_id == 'comments']
comments = (
    comment_rows
    .groupby('mention_id')
    .response
    .apply(lambda responses: responses.dropna().str.cat(sep='; '))
    .reset_index()
    .rename(columns={'response': 'coder_comments'})
)

In [69]:
# Consolidate the attribute questions: one row per mention and attribute, holding
# the distinct answers the coders gave.
attribute_questions = ['universal_attributes', 'economic_attributes', 'non-economic_attributes']
out = annotations[annotations.q_id.isin(attribute_questions)]

# group on every column except who answered and what they answered; the columns
# are selected by name so the result does not depend on the column order
cols = [c for c in out.columns if c not in ('annotator', 'response')]
out = (
    out.groupby(cols)['response']
    .agg(lambda x: '; '.join(x[x.notna()].unique().tolist()))
    .reset_index()
)
# more than one distinct answer (joined by '; ') means the coders disagree
out['coder_disagreement'] = out.response.str.contains('; ')

# Sort by mention, question block and category code. The codes are stored as
# strings, which would sort as '1', '10', '11', '2', ...; compare them numerically.
out['q_vals'] = out.q_id.map({q: i for i, q in enumerate(attribute_questions, start=1)})
out['q_category_num'] = pd.to_numeric(out.q_category)
out.sort_values(by=['mention_id', 'q_vals', 'q_category_num'], inplace=True, ascending=True)
out.drop(columns=['q_vals', 'q_category_num'], inplace=True)

out.mention_id.nunique()

179

In [70]:
# Attach the joined coder comments to every row of the mention they belong to.
out = pd.merge(out, comments, on='mention_id', how='left')
out.mention_id.nunique()

179

In [71]:
# Keep all rows of every mention that has at least one disagreement.
mention_has_disagreement = out.groupby('mention_id')['coder_disagreement'].transform('any')
disagreement_cases = out[mention_has_disagreement]
disagreement_cases.mention_id.nunique()

95

In [73]:
# Number of rows in `disagreement_cases` with and without the disagreement flag.
disagreement_cases.coder_disagreement.value_counts()

coder_disagreement
False    1656
True      149
Name: count, dtype: int64

In [72]:
# Write `disagreement_cases` to 'disagreement_cases.tsv' in the `dest` folder
# (tab-separated, no index).
fp = os.path.join(dest, 'disagreement_cases.tsv')
disagreement_cases.to_csv(fp, sep='\t', index=False)