In [130]:
from types import SimpleNamespace

# Notebook configuration: input locations and the column names used to build mention ids.
args = SimpleNamespace(
    # folder of this annotation round (raw survey export is read from it, parsed files are written to it)
    data_path='../../../data/annotations/group_mention_categorization/social-group-mention-categorization-round02',
    # tab-separated table with the texts and their mentions
    text_data_file='../../../data/intermediate/social_group_mentions_ranked.tsv',
    text_col='text',
    text_id_col='text_id',
    mention_col='mention',
    mention_id_col='mention_nr',
    # template that combines a text id and a mention number into one mention id
    mention_id_format='{text_id}-{mention_id}',
    # YAML file describing the group attributes
    attributes_file='../../../data/annotations/group_mention_categorization/group_attributes_v2.yaml',
)

In [8]:
import os
import yaml
import pandas as pd
import numpy as np
from collections import Counter

In [9]:
# Load the tab-separated table of texts and mentions.
texts = pd.read_csv(args.text_data_file, sep='\t')


def _format_mention_id(row):
    """Build one row's mention id from its text id and mention number."""
    return args.mention_id_format.format(
        text_id=row[args.text_id_col],
        mention_id=row[args.mention_id_col],
    )


texts['mention_id'] = texts.apply(_format_mention_id, axis=1)

In [22]:
# TODO:
#  - wrap reading and parsing logic in a function 

In [10]:
# Load the attribute ontology from YAML. The encoding is given explicitly so that
# decoding does not depend on the platform's locale default.
# NOTE(review): yaml.load with FullLoader is kept as-is; if this file only holds plain
# mappings/lists, yaml.safe_load would be the safer choice — confirm before switching.
with open(args.attributes_file, 'r', encoding='utf-8') as f:
    ontology = yaml.load(f, Loader=yaml.FullLoader)

# One row per (question, attribute); q_category is the attribute's 1-based
# position within its question's 'attributes' mapping.
attributes = pd.DataFrame([
    {
        'q_id': question,
        'q_category': position,
        'label': attribute,
    }
    for question, spec in ontology['social_group'].items()
    for position, attribute in enumerate(spec['attributes'].keys(), start=1)
])

# Literal (non-regex) replacements, stated explicitly because the default of
# `regex` in Series.str.replace differs between pandas versions.
attributes['q_id'] = attributes['q_id'].str.replace('non_', 'non-', regex=False)
attributes['label'] = attributes['label'].str.replace('<i>Other attribute</i>', 'other', regex=False)

# category numbers are compared against strings later on, so store them as strings
attributes['q_category'] = attributes['q_category'].astype(str)

## Clean the responses data

In [131]:
# Load the raw survey export (tab-separated, UTF-16 encoded) from the round's folder.
fp = os.path.join(args.data_path, 'qualtrix_responses.tsv')
df = pd.read_csv(
    fp,
    encoding='UTF-16',
    sep='\t',
)

In [132]:
# Columns whose name starts with '<digits>_<six digits>' hold annotation responses;
# all remaining columns are treated as survey metadata.
# The pattern is a raw string: '\d' in a normal string literal is an invalid escape sequence.
is_data_col = df.columns.str.match(r'^\d+_\d{6}')

# positional indices of both column groups (used with .iloc below)
metadata_cols = np.where(~is_data_col)[0]
data_cols = np.where(is_data_col)[0]

In [133]:
# Distinct question keys: the part of each response column name after the last '__'.
(
    df.columns[data_cols]
    .str.split('__')
    .str[-1]
    .unique()
)

Index(['data_quality_1', 'data_quality_2', 'universal_attributes',
       'economic_attributes_1', 'economic_attributes_2',
       'economic_attributes_3', 'economic_attributes_4',
       'economic_attributes_5', 'economic_attributes_6',
       'economic_attributes_7', 'non-economic_attributes_1',
       'non-economic_attributes_2', 'non-economic_attributes_3',
       'non-economic_attributes_4', 'non-economic_attributes_5',
       'non-economic_attributes_6', 'non-economic_attributes_7',
       'non-economic_attributes_8', 'non-economic_attributes_9',
       'non-economic_attributes_10', 'non-economic_attributes_11', 'stance',
       'comments'],
      dtype='object')

In [134]:
# Which recipients (and header rows) appear in the export?
df['RecipientLastName'].value_counts()

RecipientLastName
Recipient Last Name                 1
{"ImportId":"recipientLastName"}    1
Licht                               1
Röth                                1
Eichholz                            1
Ford                                1
Name: count, dtype: int64

In [135]:
# the two coders whose responses are compared
annotators = ['Eichholz', 'Ford']

# positional row indices of these coders' responses in the export
ridxs = np.flatnonzero(df['RecipientLastName'].isin(annotators).to_numpy())

# split their rows into response columns and metadata columns
data = df.iloc[ridxs, data_cols]
metadata = df.iloc[ridxs, metadata_cols]

In [136]:
# row label in the export -> annotator last name, plus the reverse lookup
id2annotator = metadata['RecipientLastName'].to_dict()
annotator2id = {name: row_label for row_label, name in id2annotator.items()}
annotator2id

{'Eichholz': 4, 'Ford': 5}

## Reshape the responses to long format

In [137]:
# Transpose so that every response column becomes a row, keeping its name in 'index'.
tmp = data.T.reset_index()
# the column name has the form '<mention_id>__<question key>'
tmp[['mention_id', 'q_id']] = tmp['index'].str.split('__', expand=True)
tmp = tmp.drop(columns=['index'])
# pivot longer: one row per (mention, question, annotator) with the answer in 'value'
tmp = tmp.melt(
    id_vars=['mention_id', 'q_id'],
    var_name='annotator',
    value_name='value',
)

In [138]:
# Keep only the mentions that were annotated, and only the columns needed downstream.
is_annotated = texts['mention_id'].isin(tmp['mention_id'])
texts = texts.loc[is_annotated, ['mention_id', 'text', 'mention']]

In [139]:
# Split a trailing '_<number>' / '-<number>' off the question key: the stem stays in
# 'q_id', the number goes to 'q_category' (missing for keys without a trailing number).
q_parts = tmp['q_id'].str.split(r'[_-](?=\d+$)', regex=True, expand=True)
tmp[['q_id', 'q_category']] = q_parts

In [140]:
# number of rows per question block
tmp['q_id'].value_counts().sort_index()

q_id
comments                    300
data_quality                600
economic_attributes        2100
non-economic_attributes    3300
stance                      300
universal_attributes        300
Name: count, dtype: int64

In [141]:
# answer distribution of the data-quality items (NaN included)
tmp.loc[tmp['q_id'] == 'data_quality', 'value'].value_counts(dropna=False)

value
NaN       593
Yes         6
Unsure      1
Name: count, dtype: int64

In [142]:
# Missing data-quality answers (NaN) are recoded as an explicit 'No'.
is_data_quality = tmp['q_id'] == 'data_quality'
tmp.loc[is_data_quality, 'value'] = tmp.loc[is_data_quality, 'value'].replace({np.nan: 'No'})

In [143]:
# answer distribution of the universal-attributes item (NaN included)
tmp.loc[
    tmp['q_id'] == 'universal_attributes',
    ['value', 'q_category'],
].value_counts(dropna=False)

value   q_category
NaN     NaN           270
Yes     NaN            26
Unsure  NaN             4
Name: count, dtype: int64

In [144]:
# Missing universal-attributes answers (NaN) are recoded as an explicit 'No'.
is_universal = tmp['q_id'] == 'universal_attributes'
tmp.loc[is_universal, 'value'] = tmp.loc[is_universal, 'value'].replace({np.nan: 'No'})

In [145]:
# answer distribution of the economic-attribute items (NaN included)
tmp.loc[tmp['q_id'] == 'economic_attributes', 'value'].value_counts(dropna=False)

value
NaN       1894
Yes        186
Unsure      20
Name: count, dtype: int64

In [146]:
# Missing economic-attribute answers (NaN) are recoded as an explicit 'No'.
is_economic = tmp['q_id'] == 'economic_attributes'
tmp.loc[is_economic, 'value'] = tmp.loc[is_economic, 'value'].replace({np.nan: 'No'})

In [147]:
# answer distribution of the non-economic-attribute items (NaN included)
tmp.loc[tmp['q_id'] == 'non-economic_attributes', 'value'].value_counts(dropna=False)

value
NaN           3073
Yes            216
Unsure          10
Yes,Unsure       1
Name: count, dtype: int64

In [148]:
# Missing non-economic answers (NaN) become 'No'; the combined answer
# 'Yes,Unsure' is collapsed to 'Unsure'.
is_noneconomic = tmp['q_id'] == 'non-economic_attributes'
tmp.loc[is_noneconomic, 'value'] = tmp.loc[is_noneconomic, 'value'].replace(
    {np.nan: 'No', 'Yes,Unsure': 'Unsure'}
)

In [149]:
# answer distribution of the stance item (NaN included)
tmp.loc[tmp['q_id'] == 'stance', 'value'].value_counts(dropna=False)

value
Positive    225
Negative     58
Neutral       8
Unsure        5
NaN           4
Name: count, dtype: int64

In [150]:
# Spot-check one mention: both annotators' answers side by side.
mid = '41112_199012-127111-2'
foo = tmp[tmp['mention_id'] == mid]
# pivot wider: one column per annotator, filled with the entries of 'value'
(
    foo
    .pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value')
    .reset_index()
)

annotator,mention_id,q_id,q_category,4,5
0,41112_199012-127111-2,comments,,Unsure what all walks of life refers to,
1,41112_199012-127111-2,data_quality,1.0,No,No
2,41112_199012-127111-2,data_quality,2.0,No,No
3,41112_199012-127111-2,economic_attributes,1.0,Unsure,No
4,41112_199012-127111-2,economic_attributes,2.0,No,No
5,41112_199012-127111-2,economic_attributes,3.0,No,No
6,41112_199012-127111-2,economic_attributes,4.0,No,No
7,41112_199012-127111-2,economic_attributes,5.0,Yes,Yes
8,41112_199012-127111-2,economic_attributes,6.0,No,No
9,41112_199012-127111-2,economic_attributes,7.0,No,No


## Write in useful format to disk



In [151]:
import re

# Lookups from category number (stored as a string) to attribute label, one per question block.
econ_rows = attributes[attributes['q_id'] == 'economic_attributes']
econ_attributes_map = dict(zip(econ_rows['q_category'], econ_rows['label']))

nonecon_rows = attributes[attributes['q_id'] == 'non-economic_attributes']
nonecon_attributes_map = dict(zip(nonecon_rows['q_category'], nonecon_rows['label']))

# labels of the two data-quality items are hard-coded here
data_quality_map = {'1': 'has_formatting_issue', '2': 'has_translation_issue'}

print(econ_attributes_map)
print(nonecon_attributes_map)

{'1': 'class membership', '2': 'employment status', '3': 'education level', '4': 'income/wealth/economic status', '5': 'occupation/profession', '6': 'ecology of group', '7': 'other'}
{'1': 'age', '2': 'family', '3': 'gender/sexuality', '4': 'place/location', '5': 'nationality', '6': 'ethnicity', '7': 'religion', '8': 'health', '9': 'crime', '10': 'shared values/mentalities', '11': 'other'}


In [152]:
# Work on a copy so the long-format table `tmp` stays untouched.
annotations = tmp.copy(deep=True)

# replace the export's row labels with the annotators' names
annotations['annotator'] = annotations['annotator'].map(id2annotator)

# translate category numbers into readable labels, question block by question block
category_maps = {
    'economic_attributes': econ_attributes_map,
    'non-economic_attributes': nonecon_attributes_map,
    'data_quality': data_quality_map,
}
for question, mapping in category_maps.items():
    in_question = annotations['q_id'] == question
    annotations.loc[in_question, 'category'] = annotations.loc[in_question, 'q_category'].map(mapping)

# attach each mention's text and mention string
annotations = annotations.merge(texts, on='mention_id', how='left')

cols = ['mention_id', 'text', 'mention', 'q_id', 'q_category', 'category', 'annotator', 'value']
annotations = annotations[cols].rename(columns={'value': 'response'})

# write the parsed annotations next to the raw export
dest = os.path.join(args.data_path, 'parsed')
os.makedirs(dest, exist_ok=True)
fp = os.path.join(dest, 'annotations.tsv')
annotations.to_csv(fp, sep='\t', index=False)

## Evaluate inter-coder agreement (ICA)

In [154]:
from sklearn.metrics import cohen_kappa_score, f1_score
import krippendorff
from typing import List, Dict
def compute_metrics(
        a: pd.Series,
        b: pd.Series,
        labels: List[str] = ['Yes', 'No'],
        pos_label: str = 'Yes',
    ) -> Dict:
    """Compute agreement metrics between two annotators' label series.

    Parameters
    ----------
    a, b : pd.Series
        The two annotators' labels for the same items, in the same order.
    labels : list of str
        Label set. With exactly two labels a binary F1 for `pos_label` is
        reported; otherwise one F1 per label (`f1_score_<label>`).
    pos_label : str
        Label counted as positive (binary F1 and the `prop_<pos_label>` entry).

    Returns
    -------
    dict
        `n` (number of items), `prop_<pos_label>` (share of items where at least
        one annotator chose `pos_label`), the F1 score(s), `cohens_kappa` and
        `krippendorff_alpha` (nominal).
    """
    out = {
        'n': len(a),
        f'prop_{pos_label.lower()}': np.logical_or(a == pos_label, b == pos_label).mean(),
    }
    if len(labels) == 2:
        out['f1_score'] = f1_score(a, b, average='binary', pos_label=pos_label)
    else:
        f1s = f1_score(a, b, average=None, labels=labels)
        out.update({f'f1_score_{l.lower()}': f1s[i] for i, l in enumerate(labels)})
    out['cohens_kappa'] = cohen_kappa_score(a, b)

    # Encode both series against ONE shared category list. Encoding each series on
    # its own gives the same integer different meanings whenever the annotators
    # used different label sets, which corrupts the alpha computed below.
    shared_categories = pd.concat([a, b], ignore_index=True).dropna().unique()
    codes = np.array(
        [pd.Categorical(s, categories=shared_categories).codes for s in (a, b)],
        dtype=float,
    )
    # Categorical marks missing values with code -1; krippendorff expects NaN for those
    codes[codes < 0] = np.nan
    out['krippendorff_alpha'] = krippendorff.alpha(reliability_data=codes, level_of_measurement='nominal')

    return out

In [155]:
# collects the agreement metrics: key tuple (question id, category, ...) -> dict returned by compute_metrics
ica = {}

In [156]:
# Agreement on each of the data-quality items.
q = 'data_quality'
first_col = annotator2id[annotators[0]]
second_col = annotator2id[annotators[1]]
for i, label in data_quality_map.items():
    in_item = (tmp['q_id'] == q) & (tmp['q_category'] == str(i))
    # pivot wider: one column per annotator, filled with the entries of 'value'
    wide = (
        tmp[in_item]
        .pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value')
        .reset_index()
    )
    # leave out mentions where either annotator answered 'Unsure'
    any_unsure = (wide[first_col] == 'Unsure') | (wide[second_col] == 'Unsure')
    wide = wide[~any_unsure]

    ica[(q, i, label)] = compute_metrics(wide[first_col], wide[second_col])

In [157]:
# Agreement on the universal-attributes item.
first_col = annotator2id[annotators[0]]
second_col = annotator2id[annotators[1]]

# pivot wider: one column per annotator, filled with the entries of 'value'
universal_wide = (
    tmp[tmp['q_id'] == 'universal_attributes']
    .pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value')
    .reset_index()
)
# leave out mentions where either annotator answered 'Unsure'
any_unsure = (universal_wide[first_col] == 'Unsure') | (universal_wide[second_col] == 'Unsure')
universal_wide = universal_wide[~any_unsure]

ica[('universal_attributes', None)] = compute_metrics(universal_wide[first_col], universal_wide[second_col])

In [158]:
# Agreement per economic / non-economic attribute.
first_col = annotator2id[annotators[0]]
second_col = annotator2id[annotators[1]]
for d in attributes.itertuples():
    in_item = (tmp['q_id'] == d.q_id) & (tmp['q_category'] == str(d.q_category))
    if not in_item.any():
        # this attribute has no rows in the responses
        continue
    # pivot wider: one column per annotator, filled with the entries of 'value'
    wide = (
        tmp[in_item]
        .pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value')
        .reset_index()
    )
    # leave out mentions where either annotator answered 'Unsure'
    any_unsure = (wide[first_col] == 'Unsure') | (wide[second_col] == 'Unsure')
    wide = wide[~any_unsure]

    ica[(d.q_id, d.q_category, d.label)] = compute_metrics(wide[first_col], wide[second_col])

In [159]:
# Overall agreement on economic attributes: per mention and annotator,
# 'Yes' if any economic item has an answer other than 'No'.
first_col = annotator2id[annotators[0]]
second_col = annotator2id[annotators[1]]

econ_any = (
    tmp[tmp['q_id'] == 'economic_attributes']
    .groupby(['mention_id', 'annotator'])
    .agg({'value': lambda v: (v != 'No').any()})
    .reset_index()
)
econ_any['value'] = econ_any['value'].map({True: 'Yes', False: 'No'})
# one column per annotator
econ_wide = econ_any.pivot(index=['mention_id'], columns='annotator', values='value').reset_index()

ica[('economic_attributes', 'overall')] = compute_metrics(econ_wide[first_col], econ_wide[second_col])

In [160]:
# Overall agreement on non-economic attributes: per mention and annotator,
# 'Yes' if any non-economic item has an answer other than 'No'.
first_col = annotator2id[annotators[0]]
second_col = annotator2id[annotators[1]]

nonecon_any = (
    tmp[tmp['q_id'] == 'non-economic_attributes']
    .groupby(['mention_id', 'annotator'])
    .agg({'value': lambda v: (v != 'No').any()})
    .reset_index()
)
nonecon_any['value'] = nonecon_any['value'].map({True: 'Yes', False: 'No'})
# one column per annotator
nonecon_wide = nonecon_any.pivot(index=['mention_id'], columns='annotator', values='value').reset_index()

ica[('non-economic_attributes', 'overall')] = compute_metrics(nonecon_wide[first_col], nonecon_wide[second_col])

In [161]:
# Agreement on stance, restricted to mentions where both annotators chose one of `cats`.
cats = ['Positive', 'Neutral', 'Negative']
first_col = annotator2id[annotators[0]]
second_col = annotator2id[annotators[1]]

is_stance = (tmp['q_id'] == 'stance') & tmp['q_category'].isna()
stance = tmp[is_stance]
# drop unanswered stance items before pivoting
stance = stance[stance['value'].notna()]
# one column per annotator
stance_wide = (
    stance
    .pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value')
    .reset_index()
)
both_in_cats = stance_wide[first_col].isin(cats) & stance_wide[second_col].isin(cats)
stance_wide = stance_wide[both_in_cats]

ica[('stance', None)] = compute_metrics(stance_wide[first_col], stance_wide[second_col], labels=cats)

In [162]:
# One row per evaluated item; the key tuple's parts become the leading columns.
(
    pd.DataFrame(ica)
    .T
    .reset_index()
    .rename(columns={'level_0': 'q_id', 'level_1': 'q_category', 'level_2': 'label'})
)

Unnamed: 0,q_id,q_category,label,n,prop_yes,f1_score,cohens_kappa,krippendorff_alpha,f1_score_positive,f1_score_neutral,f1_score_negative
0,data_quality,1,has_formatting_issue,149.0,0.026846,0.0,0.0,-0.010204,,,
1,data_quality,2,has_translation_issue,150.0,0.006667,0.0,0.0,0.0,,,
2,universal_attributes,,,146.0,0.143836,0.32,0.259104,0.258876,,,
3,economic_attributes,1,class membership,144.0,0.048611,0.0,-0.02439,-0.021352,,,
4,economic_attributes,2,employment status,148.0,0.128378,0.642857,0.608879,0.606876,,,
5,economic_attributes,3,education level,150.0,0.08,0.857143,0.846626,0.846902,,,
6,economic_attributes,4,income/wealth/economic status,146.0,0.130137,0.914286,0.90271,0.902946,,,
7,economic_attributes,5,occupation/profession,146.0,0.287671,0.764706,0.695278,0.694328,,,
8,economic_attributes,6,ecology of group,150.0,0.066667,0.666667,0.64986,0.650292,,,
9,economic_attributes,7,other,146.0,0.034247,0.0,-0.01108,-0.013937,,,


## Export for consolidation

In [163]:
# Replace missing category keys with sentinels (-1 / empty string);
# presumably so these rows are not dropped when grouping on the columns below — TODO confirm.
q_category_missing = annotations['q_category'].isna()
annotations.loc[q_category_missing, 'q_category'] = -1

category_missing = annotations['category'].isna()
annotations.loc[category_missing, 'category'] = ''

In [164]:
# Per mention: all non-missing free-text comments, joined with '; '.
comments = (
    annotations[annotations.q_id == 'comments']
    .groupby('mention_id')
    .response
    .apply(lambda x: x[x.notna()].str.cat(sep='; '))
    .reset_index()
    .rename(columns={'response': 'coder_comments'})
)

out = annotations[annotations.q_id.isin(['universal_attributes', 'economic_attributes', 'non-economic_attributes'])]

# Group on every column except the last two, then list the distinct
# responses of each item; more than one distinct response means the coders disagree.
cols = out.columns[:-2].tolist()
out = (
    out.groupby(cols)['response']
    .agg(lambda x: '; '.join(x[x.notna()].unique().tolist()))
    .reset_index()
)
out['coder_disagreement'] = out.response.str.contains('; ', regex=False)

# Order rows by mention, question block and category number. The category is
# sorted numerically: as strings, '10' and '11' would sort before '2'.
question_order = {'universal_attributes': 1, 'economic_attributes': 2, 'non-economic_attributes': 3}
out = (
    out.assign(
        q_vals=out.q_id.map(question_order),
        q_category_num=pd.to_numeric(out.q_category),
    )
    .sort_values(by=['mention_id', 'q_vals', 'q_category_num'], ascending=True)
    .drop(columns=['q_vals', 'q_category_num'])
)

In [165]:
# attach the joined coder comments to every row of the same mention
out = pd.merge(out, comments, how='left', on='mention_id')

In [166]:
def _has_disagreement(group):
    """Return True when any row of the mention's group is flagged as a coder disagreement."""
    return group['coder_disagreement'].any()


# keep all rows of every mention that has at least one disagreement
disagreement_cases = out.groupby('mention_id').filter(_has_disagreement)

In [168]:
# write the disagreement cases as a tab-separated file into the 'parsed' folder
fp = os.path.join(dest, 'disagreement_cases.tsv')
disagreement_cases.to_csv(
    fp,
    index=False,
    sep='\t',
)