In [19]:
from types import SimpleNamespace

# Notebook configuration: input file locations and the column names used to
# build the mention identifiers.
args = SimpleNamespace(
    # mentions table
    text_data_file='../../data/intermediate/social_group_mentions_ranked.tsv',
    text_col='text',
    text_id_col='text_id',
    mention_col='mention',
    mention_id_col='mention_nr',
    mention_id_format='{text_id}-{mention_id}',
    # attribute definitions
    attributes_file='../../data/annotations/group_mention_categorization/group_attributes.yaml',
)

In [20]:
import os
import yaml
import pandas as pd
import numpy as np
from collections import Counter

In [21]:
# Load the mentions table and derive one identifier per mention from the
# text ID and the mention number.
texts = pd.read_csv(args.text_data_file, sep='\t')


def _format_mention_id(row):
    """Fill the configured ID template with the row's text ID and mention number."""
    return args.mention_id_format.format(
        text_id=row[args.text_id_col],
        mention_id=row[args.mention_id_col],
    )


texts['mention_id'] = texts.apply(_format_mention_id, axis=1)

In [22]:
# TODO:
#  - wrap reading and parsing logic in a function 

In [23]:
def load_attribute_ontology(path):
    """Read the attribute definitions file and tabulate its attributes.

    Parameters
    ----------
    path : str
        Location of the YAML file with the attribute definitions.

    Returns
    -------
    ontology : dict
        The parsed YAML content.
    attributes : pandas.DataFrame
        One row per attribute listed under ``ontology['social_group']``, with
        columns ``q_id`` (the group key, with 'non_' rewritten to 'non-'),
        ``q_category`` (1-based position within the group, as a string) and
        ``label`` (the attribute key, with the HTML-formatted "Other attribute"
        entry shortened to 'other').
    """
    with open(path, 'r', encoding='utf-8') as f:
        # NOTE(review): the file is assumed to be trusted project data. If it only
        #  holds plain mappings/strings, `yaml.safe_load` is the safer choice -
        #  confirm before switching.
        ontology = yaml.load(f, Loader=yaml.FullLoader)

    attributes = pd.DataFrame([
        {
            'q_id': group,
            'q_category': position,
            'label': label,
        }
        for group, spec in ontology['social_group'].items()
        for position, label in enumerate(spec['attributes'].keys(), start=1)
    ])
    # literal (non-regex) replacements, independent of the pandas default for `regex`
    attributes['q_id'] = attributes['q_id'].str.replace('non_', 'non-', regex=False)
    attributes['label'] = attributes['label'].str.replace('<i>Other attribute</i>', 'other', regex=False)

    # category codes are compared against string codes parsed from the question IDs
    attributes['q_category'] = attributes['q_category'].astype(str)

    return ontology, attributes


# use the configured path instead of repeating the literal
attributes_file = args.attributes_file
ontology, attributes = load_attribute_ontology(attributes_file)

In [24]:
# read the annotations data
# (the responses file is tab-separated and UTF-16 encoded)
data_path = '../../data/annotations/group_mention_categorization/social-group-mention-categorization-coder-training'
fp = os.path.join(data_path, 'responses/responses.tsv')
df = pd.read_csv(fp, sep='\t', encoding='UTF-16')

In [25]:
# Response columns are those whose name starts with '<digits>_<six digits>';
# all remaining columns are treated as metadata.
# The pattern is a raw string so that `\d` reaches the regex engine instead of
# being parsed as an (invalid) string escape sequence.
is_data_col = df.columns.str.match(r'^\d+_\d{6}')

# positional indices of both column groups, for use with `.iloc`
metadata_cols = np.where(~is_data_col)[0]
data_cols = np.where(is_data_col)[0]

In [26]:
# Restrict the responses to the two annotators of interest.
annotators = ['Eichholz', 'Ford']

# positional indices of the rows submitted by these annotators
is_annotator_row = df.RecipientLastName.isin(annotators)
ridxs = np.where(is_annotator_row)[0]

# split the selected rows into metadata columns and response columns
metadata = df.iloc[ridxs, metadata_cols]
data = df.iloc[ridxs, data_cols]

In [27]:
# map the row index of each selected response to the annotator's last name
id2annotator = dict(metadata.RecipientLastName)
id2annotator

{5: 'Ford', 6: 'Eichholz'}

In [28]:
# Reshape the responses to long format: one row per mention, question and annotator.
tmp = data.T.reset_index()
# the original column names have the form '<mention ID>__<question ID>'
tmp[['mention_id', 'q_id']] = tmp['index'].str.split('__', expand=True)
tmp = tmp.drop(columns=['index'])
# pivot longer: the remaining (per-annotator) columns become rows
tmp = tmp.melt(id_vars=['mention_id', 'q_id'], var_name='annotator', value_name='value')

In [29]:
# keep only the mentions that occur in the annotations, with their ID, text and mention
texts = texts.loc[texts.mention_id.isin(tmp.mention_id), ['mention_id', 'text', 'mention']]

In [30]:
# NOTE: implementation error (now fixed): questions about data quality were mis-IDed as <mention ID>__universal_attributes, here mapped to *__universal_attributes_1 and *__universal_attributes_2
# recode: question IDs containing 'universal_attributes_' (i.e., followed by an underscore) are renamed to 'data_quality_'
tmp.q_id = tmp.q_id.str.replace('universal_attributes_', 'data_quality_')

In [31]:
# split a trailing number (preceded by '_' or '-') off the question ID and store it in `q_category`;
# the remainder stays in `q_id`
tmp[['q_id', 'q_category']] = tmp.q_id.str.split(r'[_-](?=\d+$)', regex=True, expand=True)

In [32]:
tmp.q_id.value_counts().sort_index()

q_id
comments                    600
data_quality               1200
economic_attributes        4200
non-economic_attributes    6000
stance                      600
universal_attributes        600
Name: count, dtype: int64

In [33]:
tmp[tmp.q_id == 'data_quality'].value.value_counts(dropna=False)

value
NaN           1180
Yes             12
Unsure           5
Yes,Unsure       3
Name: count, dtype: int64

In [34]:
# Data quality items: a missing response is recoded to "No" and the combined
# response "Yes,Unsure" to "Unsure".
is_data_quality = tmp.q_id == 'data_quality'
data_quality_recoded = tmp.loc[is_data_quality, 'value'].replace({np.nan: 'No', 'Yes,Unsure': 'Unsure'})
tmp.loc[is_data_quality, 'value'] = data_quality_recoded

In [35]:
tmp[tmp.q_id == 'universal_attributes'][['value', 'q_category']].value_counts(dropna=False)

value   q_category
No      1             417
Yes     1             133
Unsure  1              31
NaN     1              19
Name: count, dtype: int64

In [36]:
# Universal attributes question: a missing response is recoded to "No".
is_universal = tmp.q_id == 'universal_attributes'
universal_recoded = tmp.loc[is_universal, 'value'].replace({np.nan: 'No'})
tmp.loc[is_universal, 'value'] = universal_recoded

In [37]:
tmp[tmp.q_id == 'economic_attributes'].value.value_counts(dropna=False)

value
NaN           3959
Yes            198
Unsure          32
Yes,Unsure      11
Name: count, dtype: int64

In [38]:
# Economic attribute items: a missing response is recoded to "No" and the
# combined response "Yes,Unsure" to "Unsure".
is_economic = tmp.q_id == 'economic_attributes'
economic_recoded = tmp.loc[is_economic, 'value'].replace({np.nan: 'No', 'Yes,Unsure': 'Unsure'})
tmp.loc[is_economic, 'value'] = economic_recoded

In [39]:
tmp[tmp.q_id == 'non-economic_attributes'].value.value_counts(dropna=False)

value
NaN           5648
Yes            297
Unsure          39
Yes,Unsure      16
Name: count, dtype: int64

In [40]:
# Non-economic attribute items: a missing response is recoded to "No" and the
# combined response "Yes,Unsure" to "Unsure".
is_non_economic = tmp.q_id == 'non-economic_attributes'
non_economic_recoded = tmp.loc[is_non_economic, 'value'].replace({np.nan: 'No', 'Yes,Unsure': 'Unsure'})
tmp.loc[is_non_economic, 'value'] = non_economic_recoded

In [41]:
tmp[tmp.q_id == 'stance'].value.value_counts(dropna=False)

value
Positive    457
Negative     56
Neutral      49
Unsure       28
NaN          10
Name: count, dtype: int64

In [42]:
# inspect all responses for a single example mention, side by side per annotator
mid = '13230_201109-192881-2' # '35120_201910-375318-1'
foo = tmp[tmp.mention_id==mid]
# pivot wider: one column per entry of the 'annotator' column, filled with the entries of the 'value' column
foo.pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value').reset_index()

annotator,mention_id,q_id,q_category,5,6
0,13230_201109-192881-2,comments,,Stance uncertain because focal sentence is a q...,
1,13230_201109-192881-2,data_quality,1.0,No,No
2,13230_201109-192881-2,data_quality,2.0,No,No
3,13230_201109-192881-2,economic_attributes,1.0,No,No
4,13230_201109-192881-2,economic_attributes,2.0,No,Yes
5,13230_201109-192881-2,economic_attributes,3.0,No,No
6,13230_201109-192881-2,economic_attributes,4.0,No,No
7,13230_201109-192881-2,economic_attributes,5.0,No,No
8,13230_201109-192881-2,economic_attributes,6.0,No,No
9,13230_201109-192881-2,economic_attributes,7.0,No,No


## TODO: Write in useful format to disk



In [61]:
import re

# Attribute labels in the order in which they are listed in the ontology.
_econ_labels = list(ontology['social_group']['economic_attributes']['attributes'].keys())
_nonecon_labels = list(ontology['social_group']['non_economic_attributes']['attributes'].keys())

# category code (1-based position, as a string) -> attribute label
econ_attributes_map = {str(pos): label for pos, label in enumerate(_econ_labels, start=1)}
nonecon_attributes_map = {str(pos): label for pos, label in enumerate(_nonecon_labels, start=1)}
# codes '7' and '10' get the short label 'other'
econ_attributes_map['7'] = 'other'
nonecon_attributes_map['10'] = 'other'


def clean_cat(x):
    """Turn a label into an identifier: drop <i>/</i> tags, replace spaces and slashes by '_', lowercase."""
    without_tags = re.sub(r'</?i>', '', x)
    return re.sub(r'[ /]', '_', without_tags).lower()


# category code -> cleaned attribute label (no 'other' override here)
econ_attributes_map_cleaned = {str(pos): clean_cat(label) for pos, label in enumerate(_econ_labels, start=1)}
nonecon_attributes_map_cleaned = {str(pos): clean_cat(label) for pos, label in enumerate(_nonecon_labels, start=1)}

# category code -> column name for the two data quality items
data_quality_map = {'1': 'has_formatting_issue', '2': 'has_translation_issue'}

In [73]:
# Build the long-format annotations table (one row per mention, question item
# and annotator) and write it to disk.
annotations = tmp.copy(deep=True)

# replace the response row index by the annotator's last name
annotations['annotator'] = annotations['annotator'].map(id2annotator)

# translate the category codes into readable names, separately per question type
# (rows of other question types keep a missing `category`)
category_maps = {
    'economic_attributes': econ_attributes_map,
    'non-economic_attributes': nonecon_attributes_map,
    'data_quality': data_quality_map,
}
for question_id, code_to_name in category_maps.items():
    is_question = annotations.q_id == question_id
    annotations.loc[is_question, 'category'] = annotations.loc[is_question, 'q_category'].map(code_to_name)

# add the text and the mention
annotations = annotations.merge(texts, on='mention_id', how='left')

cols = ['mention_id', 'text', 'mention', 'q_id', 'q_category', 'category', 'annotator', 'value']
annotations = annotations[cols].rename(columns={'value': 'response'})

# derive the target from `data_path` and make sure the output folder exists,
# as the other export cells of this notebook do
parsed_dir = os.path.join(data_path, 'parsed')
os.makedirs(parsed_dir, exist_ok=True)
annotations.to_csv(os.path.join(parsed_dir, 'annotations.tsv'), sep='\t', index=False)

In [None]:
def _collapse_responses(x):
    """Collapse the rows of one (mention, question, annotator) group.

    Questions without a category code yield the single response value; all
    others yield a dict mapping category code -> response.
    """
    # `pd.isna` covers None as well as NaN/NA, so the check does not depend on
    # which missing-value marker the string split produced
    if pd.isna(x.q_category.values[0]):
        return x.value.values[0]
    return dict(zip(x.q_category, x.value))


foo = tmp.groupby(['mention_id', 'q_id', 'annotator'])[tmp.columns].apply(_collapse_responses).reset_index()
# pivot wider again: one column per question type (the collapsed values sit in column 0)
foo = foo.pivot(index=['mention_id', 'annotator'], columns='q_id', values=0).reset_index()

# data quality: rename the category codes and spread the two items into columns of their own
foo.data_quality = foo.data_quality.apply(lambda x: {data_quality_map[k]: v for k, v in x.items()})
foo[list(data_quality_map.values())] = foo.data_quality.apply(pd.Series)

# universal attributes: unpack the dict into its value
foo.universal_attributes = foo.universal_attributes.apply(pd.Series)

# economic / non-economic attributes: key the dicts by the cleaned attribute names
foo.economic_attributes = foo.economic_attributes.apply(lambda x: {econ_attributes_map_cleaned[k]: v for k, v in x.items()})
foo['non-economic_attributes'] = foo['non-economic_attributes'].apply(lambda x: {nonecon_attributes_map_cleaned[k]: v for k, v in x.items()})

cols = ['mention_id', 'annotator', 'universal_attributes', 'economic_attributes', 'non-economic_attributes', 'stance', 'comments'] + list(data_quality_map.values())
foo = foo[cols]
# display with text and mention added (the merged table is not stored)
foo.merge(texts, on='mention_id', how='left')

Unnamed: 0,mention_id,annotator,universal_attributes,economic_attributes,non-economic_attributes,stance,comments,has_formatting_issue,has_translation_issue,text,mention
0,11110_198809-390636-1,5,No,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'Yes', 'gender_sexuali...",Positive,,No,No,Give parents the right to become municipal day...,parents
1,11110_198809-390636-1,6,No,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'Yes', 'gender_sexuali...",Positive,,No,No,Give parents the right to become municipal day...,parents
2,11110_199109-390940-1,5,Yes,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'No', 'gender_sexualit...",Positive,,No,No,It is only within the ecological framework tha...,a society for survival in prosperity and well-...
3,11110_199109-390940-1,6,Unsure,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'No', 'gender_sexualit...",Positive,It's quite universal (society) but with the ec...,No,No,It is only within the ecological framework tha...,a society for survival in prosperity and well-...
4,11110_200609-393907-1,5,No,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'No', 'gender_sexualit...",Positive,,No,No,Everyone who is exposed to violence or threats...,Everyone who is exposed to violence or threats...
...,...,...,...,...,...,...,...,...,...,...,...
595,97330_200809-386545-2,6,No,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'No', 'gender_sexualit...",Neutral,,No,No,We will introduce the internal e-business of t...,attorneys
596,97710_200809-386883-5,5,No,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'No', 'gender_sexualit...",Positive,,No,No,new employment opportunities according to the ...,employees
597,97710_200809-386883-5,6,No,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'No', 'gender_sexualit...",Positive,,No,No,new employment opportunities according to the ...,employees
598,97710_200809-386888-1,5,No,"{'class_membership': 'No', 'employment_status'...","{'age': 'No', 'family': 'No', 'gender_sexualit...",Positive,,No,No,We oppose the restriction of social rights and...,employees


In [45]:
# TODO: write to disc

## Evaluate ICA

In [27]:
from sklearn.metrics import cohen_kappa_score, f1_score
import krippendorff
from typing import List, Dict
def compute_metrics(
        a: pd.Series,
        b: pd.Series,
        labels: List[str] = ['Yes', 'No'],
        pos_label: str = 'Yes',
    ) -> Dict:
    """Compute agreement metrics between the labels of two annotators.

    Parameters
    ----------
    a, b : pd.Series
        Labels of the first and second annotator for the same items, in the
        same order. `a` is passed to scikit-learn as `y_true` and `b` as `y_pred`.
    labels : list of str
        The label set. With exactly two labels a single binary F1 score for
        `pos_label` is reported; otherwise one F1 score per label.
        (The default list is never modified.)
    pos_label : str
        Label treated as the positive class.

    Returns
    -------
    dict
        'n' (number of items), 'prop_<pos_label>' (share of items to which at
        least one annotator assigned `pos_label`), 'f1_score' or
        'f1_score_<label>' entries, 'cohens_kappa' and 'krippendorff_alpha'.
    """
    out = {
        'n': len(a),
        f'prop_{pos_label.lower()}': np.logical_or(a == pos_label, b == pos_label).mean(),
    }
    if len(labels) == 2:
        out['f1_score'] = f1_score(a, b, average='binary', pos_label=pos_label)
    else:
        f1s = f1_score(a, b, average=None, labels=labels)
        out.update({f'f1_score_{l.lower()}': f1s[i] for i, l in enumerate(labels)})
    out['cohens_kappa'] = cohen_kappa_score(a, b)

    # Encode both annotators with ONE shared label -> code mapping. Coding each
    # series on its own gives the same integer to different labels whenever the
    # two annotators did not use the same set of labels.
    observed = pd.unique(pd.concat([pd.Series(a), pd.Series(b)], ignore_index=True).dropna())
    a_codes = pd.Categorical(a, categories=observed).codes.astype(float)
    b_codes = pd.Categorical(b, categories=observed).codes.astype(float)
    # missing labels are coded as -1; pass them on as NaN so they count as
    # missing rather than as a category of their own
    a_codes[a_codes < 0] = np.nan
    b_codes[b_codes < 0] = np.nan
    out['krippendorff_alpha'] = krippendorff.alpha(reliability_data=np.array([a_codes, b_codes]), level_of_measurement='nominal')

    return out

In [28]:
# collects the agreement metrics, keyed by tuples that start with the question ID
ica = {}

In [29]:
# Agreement on each data quality item, leaving out mentions for which an
# annotator answered "Unsure".
q = 'data_quality'
for i, label in data_quality_map.items():
    is_item = (tmp.q_id == q) & (tmp.q_category == str(i))
    # pivot wider: one column per annotator (5 and 6), filled with the entries of 'value'
    foo = tmp[is_item].pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value').reset_index()
    either_unsure = (foo[5] == 'Unsure') | (foo[6] == 'Unsure')
    foo = foo[~either_unsure]

    ica[(q, i, label)] = compute_metrics(foo[5], foo[6])

In [30]:
# Agreement on the universal attributes question, leaving out mentions for
# which an annotator answered "Unsure".
is_universal_question = tmp.q_id == 'universal_attributes'

# pivot wider: one column per annotator (5 and 6), filled with the entries of 'value'
foo = tmp[is_universal_question].pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value').reset_index()
either_unsure = (foo[5] == 'Unsure') | (foo[6] == 'Unsure')
foo = foo[~either_unsure]

ica[('universal_attributes', None)] = compute_metrics(foo[5], foo[6])

In [31]:
# econ/non-econ attributes: agreement per attribute category
for d in attributes.itertuples():

    is_item = (tmp.q_id == d.q_id) & (tmp.q_category == str(d.q_category))
    foo = tmp[is_item]
    # attribute entries without any annotation rows are skipped
    if len(foo) == 0:
        continue
    # pivot wider: one column per annotator (5 and 6), filled with the entries of 'value'
    foo = foo.pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value').reset_index()
    # leave out mentions for which an annotator answered "Unsure"
    either_unsure = (foo[5] == 'Unsure') | (foo[6] == 'Unsure')
    foo = foo[~either_unsure]

    ica[(d.q_id, d.q_category, d.label)] = compute_metrics(foo[5], foo[6])

In [32]:
# overall agreement on economic attributes: per mention and annotator, "Yes" if any
# category has a response other than "No" (so "Unsure" counts as "Yes" here), else "No"
foo = tmp[tmp.q_id=='economic_attributes']
foo = foo.groupby(['mention_id', 'annotator']).agg({'value': lambda v: (v != 'No').any()}).reset_index()
foo.value = foo.value.map({True: 'Yes', False: 'No'})
# pivot wider: one column per annotator
foo = foo.pivot(index=['mention_id'], columns='annotator', values='value').reset_index()

ica[('economic_attributes', 'overall')] = compute_metrics(foo[5], foo[6])

In [33]:
# overall agreement on non-economic attributes: per mention and annotator, "Yes" if any
# category has a response other than "No" (so "Unsure" counts as "Yes" here), else "No"
foo = tmp[tmp.q_id=='non-economic_attributes']
foo = foo.groupby(['mention_id', 'annotator']).agg({'value': lambda v: (v != 'No').any()}).reset_index()
foo.value = foo.value.map({True: 'Yes', False: 'No'})
# pivot wider: one column per annotator
foo = foo.pivot(index=['mention_id'], columns='annotator', values='value').reset_index()

ica[('non-economic_attributes', 'overall')] = compute_metrics(foo[5], foo[6])

In [34]:
# Agreement on stance, restricted to mentions for which both annotators chose
# one of the three stance categories (i.e., neither "Unsure" nor missing).
cats = ['Positive', 'Neutral', 'Negative']
foo = tmp[np.logical_and(tmp.q_id=='stance', tmp.q_category.isna())]
foo = foo[~foo.value.isna()]
# pivot wider: one column per annotator (5 and 6), filled with the entries of 'value'
foo = foo.pivot(index=['mention_id', 'q_id', 'q_category'], columns='annotator', values='value').reset_index()
foo = foo[np.logical_and(foo[5].isin(cats), foo[6].isin(cats))]

ica[('stance', None)] = compute_metrics(foo[5], foo[6], labels=cats)

In [35]:
pd.DataFrame(ica).T.reset_index().rename(columns={'level_0': 'q_id', 'level_1': 'q_category', 'level_2': 'label'})

Unnamed: 0,q_id,q_category,label,n,prop_yes,f1_score,cohens_kappa,krippendorff_alpha,f1_score_positive,f1_score_neutral,f1_score_negative
0,data_quality,1,has_formatting_issue,299.0,0.016722,0.0,-0.008092,-0.006745,,,
1,data_quality,2,has_translation_issue,293.0,0.020478,0.0,0.0,-0.008621,,,
2,universal_attributes,,,276.0,0.275362,0.764228,0.69869,0.697178,,,
3,economic_attributes,1,class membership,298.0,0.016779,0.571429,0.56644,0.567063,,,
4,economic_attributes,2,employment status,295.0,0.088136,0.555556,0.529036,0.527477,,,
5,economic_attributes,3,education level,298.0,0.010067,0.8,0.798376,0.798646,,,
6,economic_attributes,4,income/wealth/economic status,289.0,0.069204,0.709677,0.693387,0.693755,,,
7,economic_attributes,5,occupation/profession,284.0,0.18662,0.835165,0.804264,0.804064,,,
8,economic_attributes,6,ecology of group,297.0,0.030303,0.8,0.795031,0.795164,,,
9,economic_attributes,7,other,296.0,0.006757,0.0,0.0,-0.001695,,,


## Identify easy and hard examples

### Universal

In [36]:
# NOTE: mention-level agreement on the universal attributes question
#  (True if all responses for a mention are identical)
foo = tmp[tmp.q_id=='universal_attributes']
foo = foo.groupby(['mention_id']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
foo.value.value_counts()

value
True     254
False     46
Name: count, dtype: int64

In [37]:
# NOTE: mention-level agreement on the universal attributes question, restricted to mentions with at least one response other than "No"
foo = tmp[tmp.q_id=='universal_attributes']
# remove cases where all values across annotators are "No"
idxs = foo.groupby('mention_id').agg({'value': lambda v: v.isin(['No']).all()})
foo = foo[~foo.mention_id.isin(idxs[idxs.value].index.values)]
# now compute agreement
foo = foo.groupby(['mention_id']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
foo.value.value_counts()

# NOTE: unsurprisingly, all disagreement cases involve at least one response other than "No"

value
True     54
False    46
Name: count, dtype: int64

#### Get examples

In [38]:
# negative examples for the universal attributes question: mentions for which all responses are "No"
foo = tmp[tmp.q_id=='universal_attributes']

# cases where all values across annotator are "No"
idxs = foo.groupby('mention_id').agg({'value': lambda v: v.isin(['No']).all()})

neg_expls_univ = foo[foo.mention_id.isin(idxs[idxs.value].index.values)]
neg_expls_univ_list = neg_expls_univ.mention_id.unique().tolist()
# one row per mention (the per-annotator rows are identical after dropping the annotator column)
neg_expls_univ_df = neg_expls_univ[['mention_id', 'q_id', 'value']].drop_duplicates()

In [39]:
# remaining mentions (at least one response other than "No"): flag those on which all responses are identical
# NOTE: relies on `foo` and `neg_expls_univ_list` from the previous cell
pos_expls_univ = foo[~foo.mention_id.isin(neg_expls_univ_list)]
pos_expls_univ = pos_expls_univ.groupby(['mention_id']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
# "positive" examples: mentions with a unanimous response other than "No" (this can also be a unanimous "Unsure")
pos_expls_univ_list = pos_expls_univ[pos_expls_univ.value].mention_id.unique().tolist()
pos_expls_univ_df = foo.loc[foo.mention_id.isin(pos_expls_univ_list), ['mention_id', 'q_id', 'value']].drop_duplicates()

In [40]:
# "grey" examples: mentions on which the responses to the universal attributes question differ
# NOTE: relies on `foo` and `pos_expls_univ` from the previous cells
grey_expls_univ_list = pos_expls_univ[~pos_expls_univ.value].mention_id.unique().tolist()

grey_expls_univ_df = foo.loc[foo.mention_id.isin(grey_expls_univ_list), :]
# count how often each response was given per mention
grey_expls_univ_df = grey_expls_univ_df.groupby(['mention_id', 'q_id']).agg({'value': Counter}).reset_index()
# keep the rows with more than one distinct response
grey_expls_univ_df = grey_expls_univ_df[grey_expls_univ_df.value.apply(len) > 1]

### Economic attributes

In [41]:
# NOTE: agreement rates at mention level: share of economic attribute categories on which all responses for a mention are identical
foo = tmp[tmp.q_id=='economic_attributes']
# per mention and category: True if all responses are identical
foo = foo.groupby(['mention_id', 'q_category']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
# per mention: share of categories with identical responses
foo = foo.groupby(['mention_id']).agg({'value': 'mean'}).reset_index()
foo.value.value_counts()

value
1.000000    234
0.857143     45
0.714286     16
0.571429      5
Name: count, dtype: int64

In [42]:
# NOTE: agreement rates at mention level within economic attribute categories, restricted to mentions with at least one response other than "No"
foo = tmp[tmp.q_id=='economic_attributes']
# remove cases where all values across q_category and annotator are "No"
idxs = foo.groupby('mention_id').agg({'value': lambda v: v.isin(['No']).all()})
foo = foo[~foo.mention_id.isin(idxs[idxs.value].index.values)]
# now compute agreement
foo = foo.groupby(['mention_id', 'q_category']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
foo = foo.groupby(['mention_id']).agg({'value': 'mean'}).reset_index()
foo.value.value_counts()

value
1.000000    57
0.857143    45
0.714286    16
0.571429     5
Name: count, dtype: int64

#### Get examples

In [43]:
# economic attributes: flag the mentions for which all responses are "No"
# (`foo` and `idxs` are read by the following cells)
foo = tmp[tmp.q_id=='economic_attributes']
# cases where all values across q_category and annotator are "No"
idxs = foo.groupby('mention_id').agg({'value': lambda v: v.isin(['No']).all()})

In [44]:
# negative examples: mentions for which all responses in all economic attribute categories are "No"
# NOTE: relies on `foo` and `idxs` from the previous cell
neg_expls_econ = foo[foo.mention_id.isin(idxs[idxs.value].index.values)]
neg_expls_econ_list = neg_expls_econ.mention_id.unique().tolist()
# one row per mention and category
neg_expls_econ_df = neg_expls_econ[['mention_id', 'q_id', 'q_category', 'value']].drop_duplicates()

In [45]:
# remaining mentions (at least one response other than "No"): share of categories with identical responses
# NOTE: relies on `foo` and `idxs` from the previous cells
pos_expls_econ = foo[~foo.mention_id.isin(idxs[idxs.value].index.values)]
pos_expls_econ = pos_expls_econ.groupby(['mention_id', 'q_category']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
pos_expls_econ = pos_expls_econ.groupby(['mention_id']).agg({'value': 'mean'}).reset_index()
# "positive" examples: mentions with identical responses in every category
pos_expls_econ_list = pos_expls_econ[pos_expls_econ.value == 1.0].mention_id.unique().tolist()

pos_expls_econ_df = foo.loc[foo.mention_id.isin(pos_expls_econ_list), ['mention_id', 'q_id', 'q_category', 'value']].drop_duplicates()
# number of "Yes" labels per attribute among these examples
pos_expls_econ_df[pos_expls_econ_df.value == 'Yes'].q_category.map(econ_attributes_map).value_counts()

q_category
occupation_profession            31
income_wealth_economic_status    10
employment_status                 8
ecology_of_group                  4
class_membership                  2
education_level                   2
Name: count, dtype: int64

In [46]:
# "grey" examples: mentions with differing responses in at least one economic attribute category
# NOTE: relies on `foo` and `pos_expls_econ` from the previous cells
grey_expls_econ_list = pos_expls_econ[pos_expls_econ.value < 1.0].mention_id.unique().tolist()

grey_expls_econ_df = foo.loc[foo.mention_id.isin(grey_expls_econ_list), :]
# count how often each response was given per mention and category
grey_expls_econ_df = grey_expls_econ_df.groupby(['mention_id', 'q_id', 'q_category']).agg({'value': Counter}).reset_index()
# keep the categories with more than one distinct response
grey_expls_econ_df = grey_expls_econ_df[grey_expls_econ_df.value.apply(len) > 1]

### Non-econ attributes

In [47]:
# NOTE: agreement rates at mention level: share of non-economic attribute categories on which all responses for a mention are identical
foo = tmp[tmp.q_id=='non-economic_attributes']
# per mention and category: True if all responses are identical
foo = foo.groupby(['mention_id', 'q_category']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
# per mention: share of categories with identical responses
foo = foo.groupby(['mention_id']).agg({'value': 'mean'}).reset_index()
foo.value.value_counts()

value
1.0    220
0.9     63
0.8     15
0.7      2
Name: count, dtype: int64

In [124]:
# NOTE: agreement rates at mention level within non-economic attribute categories, restricted to mentions with at least one response other than "No"
foo = tmp[tmp.q_id=='non-economic_attributes']
# remove cases where all values across q_category and annotator are "No"
idxs = foo.groupby('mention_id').agg({'value': lambda v: v.isin(['No']).all()})
foo = foo[~foo.mention_id.isin(idxs[idxs.value].index.values)]
# now compute agreement
foo = foo.groupby(['mention_id', 'q_category']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
foo = foo.groupby(['mention_id']).agg({'value': 'mean'}).reset_index()
foo.value.value_counts()

value
1.0    89
0.9    63
0.8    15
0.7     2
Name: count, dtype: int64

#### Get examples

In [48]:
# non-economic attributes: flag the mentions for which all responses are "No"
# (`foo` and `idxs` are read by the following cells)
foo = tmp[tmp.q_id=='non-economic_attributes']
# cases where all values across q_category and annotator are "No"
idxs = foo.groupby('mention_id').agg({'value': lambda v: v.isin(['No']).all()})

In [49]:
# negative examples: mentions for which all responses in all non-economic attribute categories are "No"
# NOTE: relies on `foo` and `idxs` from the previous cell
neg_expls_nonecon = foo[foo.mention_id.isin(idxs[idxs.value].index.values)]
neg_expls_nonecon_list = neg_expls_nonecon.mention_id.unique().tolist()
# one row per mention and category
neg_expls_nonecon_df = neg_expls_nonecon[['mention_id', 'q_id', 'q_category', 'value']].drop_duplicates()
neg_expls_nonecon_df.value.value_counts()

value
No    1310
Name: count, dtype: int64

In [50]:
# remaining mentions (at least one response other than "No"): share of categories with identical responses
# NOTE: relies on `foo` and `idxs` from the previous cells
pos_expls_nonecon = foo[~foo.mention_id.isin(idxs[idxs.value].index.values)]
pos_expls_nonecon = pos_expls_nonecon.groupby(['mention_id', 'q_category']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
pos_expls_nonecon = pos_expls_nonecon.groupby(['mention_id']).agg({'value': 'mean'}).reset_index()
# "positive" examples: mentions with identical responses in every category
pos_expls_nonecon_list = pos_expls_nonecon[pos_expls_nonecon.value == 1.0].mention_id.unique().tolist()

pos_expls_nonecon_df = foo.loc[foo.mention_id.isin(pos_expls_nonecon_list), ['mention_id', 'q_id', 'q_category', 'value']].drop_duplicates()

In [51]:
pos_expls_nonecon_df.groupby('mention_id')['value'].agg(lambda v: (v == 'Yes').sum()).value_counts()

value
1    80
2     5
0     4
Name: count, dtype: int64

In [52]:
pos_expls_nonecon_df[pos_expls_nonecon_df.value == 'Yes'].q_category.map(nonecon_attributes_map).value_counts()

q_category
age                 32
nationality         24
family              14
health               8
crime                4
other_attribute      3
place_location       3
gender_sexuality     2
Name: count, dtype: int64

In [53]:
# "grey" examples: mentions with differing responses in at least one non-economic attribute category
# NOTE: relies on `foo` and `pos_expls_nonecon` from the previous cells
grey_expls_nonecon_list = pos_expls_nonecon[pos_expls_nonecon.value < 1.0].mention_id.unique().tolist()

grey_expls_nonecon_df = foo.loc[foo.mention_id.isin(grey_expls_nonecon_list), :]
# count how often each response was given per mention and category
grey_expls_nonecon_df = grey_expls_nonecon_df.groupby(['mention_id', 'q_id', 'q_category']).agg({'value': Counter}).reset_index()
# keep the categories with more than one distinct response
grey_expls_nonecon_df = grey_expls_nonecon_df[grey_expls_nonecon_df.value.apply(len) > 1]

### Stance

In [54]:
# stance responses, without missing values (`foo` is read by the following cells)
foo = tmp[tmp.q_id=='stance']
foo = foo[~foo.value.isna()]

#### Get examples

In [55]:
# cases where all annotators agree
# (after removing missing responses, so a mention with a single remaining response also counts)
agreement_cases = foo.groupby(['mention_id']).agg({'value': lambda v: len(v.unique()) == 1}).reset_index()
# IDs of the mentions with identical stance responses
agreement_cases = agreement_cases.mention_id[agreement_cases.value].unique().tolist()

In [56]:
# gold examples for stance: responses of the mentions in `agreement_cases`,
# collected separately for each stance value and then reduced to one row per mention
gold_expls_stance_df = pd.concat([foo.loc[np.logical_and(foo.mention_id.isin(agreement_cases), foo.value==v), ['mention_id', 'q_id', 'value']] for v in foo.value.unique()])
gold_expls_stance_df = gold_expls_stance_df.drop_duplicates()
gold_expls_stance_df.value.value_counts()

value
Positive    215
Negative     24
Neutral      12
Unsure        4
Name: count, dtype: int64

In [57]:
# "grey" examples for stance: mentions that are not in `agreement_cases`
grey_expls_stance_df = foo[~foo.mention_id.isin(agreement_cases)]
# count how often each stance was chosen per mention
grey_expls_stance_df = grey_expls_stance_df.groupby(['mention_id', 'q_id']).agg({'value': Counter}).reset_index()
# keep the mentions with more than one distinct response
grey_expls_stance_df = grey_expls_stance_df[grey_expls_stance_df.value.apply(len) > 1]
grey_expls_stance_df

Unnamed: 0,mention_id,q_id,value
0,11110_200609-393948-1,stance,"{'Unsure': 1, 'Neutral': 1}"
1,11110_201809-399601-2,stance,"{'Unsure': 1, 'Positive': 1}"
2,11320_197009-389704-1,stance,"{'Neutral': 1, 'Positive': 1}"
3,11320_197909-390063-2,stance,"{'Neutral': 1, 'Positive': 1}"
4,11320_201009-395337-2,stance,"{'Unsure': 1, 'Positive': 1}"
5,11620_197309-389896-2,stance,"{'Positive': 1, 'Neutral': 1}"
6,11620_201809-400440-2,stance,"{'Positive': 1, 'Negative': 1}"
7,12951_200909-340654-5,stance,"{'Positive': 1, 'Neutral': 1}"
8,13230_198112-182709-1,stance,"{'Negative': 1, 'Positive': 1}"
9,13230_198709-183595-1,stance,"{'Unsure': 1, 'Neutral': 1}"


## Export

### Gold examples

In [58]:
# stack the examples with unanimous annotations from all question types
# (the universal and stance tables have no 'q_category' column, so it is missing for their rows)
gold_examples_df = pd.concat([
    pos_expls_univ_df,
    neg_expls_univ_df,
    pos_expls_econ_df,
    neg_expls_econ_df,
    pos_expls_nonecon_df,
    neg_expls_nonecon_df,
    gold_expls_stance_df,
])
gold_examples_df = gold_examples_df[['mention_id', 'q_id', 'q_category', 'value']]
# "Unsure" responses are set aside (they are added to the review set further below)
# and removed from the gold examples
unsure_gold = gold_examples_df[gold_examples_df.value == 'Unsure']
gold_examples_df = gold_examples_df[gold_examples_df.value != 'Unsure']
gold_examples_df.rename(columns={'value': 'label'}, inplace=True)

# number of gold examples per question type and label
gold_examples_df[['q_id', 'label', ]].value_counts(sort=False)

q_id                     label   
economic_attributes      No          1581
                         Yes           57
non-economic_attributes  No          2106
                         Yes           90
stance                   Negative      24
                         Neutral       12
                         Positive     215
universal_attributes     No           200
                         Yes           47
Name: count, dtype: int64

In [59]:
# Add the attribute name ('category') and the text and mention to each gold example.
# The keys of the attributes join are spelled out instead of being inferred from
# the columns both tables happen to share.
attribute_names = attributes.rename(columns={'label': 'category'})
gold_examples_df = gold_examples_df.\
    merge(attribute_names, on=['q_id', 'q_category'], how='left').\
    merge(texts, on='mention_id', how='left')

gold_examples_df = gold_examples_df[['mention_id', 'text', 'mention', 'q_id', 'category', 'label']]

In [60]:
gold_examples_df[['q_id', 'category', 'label', ]].value_counts(sort=False, dropna=False)

q_id                     category                       label   
economic_attributes      class membership               No          232
                                                        Yes           2
                         ecology of group               No          230
                                                        Yes           4
                         education level                No          232
                                                        Yes           2
                         employment status              No          226
                                                        Yes           8
                         income/wealth/economic status  No          224
                                                        Yes          10
                         occupation/profession          No          203
                                                        Yes          31
                         other                          No          234

In [140]:
# write the gold examples as a tab-separated file (missing values as empty strings)
os.makedirs(os.path.join(data_path, 'parsed'), exist_ok=True)
fp = os.path.join(data_path, 'parsed', 'gold_examples.tsv')
gold_examples_df.to_csv(fp, sep='\t', index=False, na_rep='')

### Need to review

In [61]:
unsure_gold

Unnamed: 0,mention_id,q_id,q_category,value
420,13229_199409-186604-1,universal_attributes,,Unsure
530,11620_201809-400440-2,universal_attributes,,Unsure
904,53110_199211-276429-1,universal_attributes,,Unsure
992,11110_201809-399481-1,universal_attributes,,Unsure
1212,14110_199903-198263-2,universal_attributes,,Unsure
1520,13730_201906-195725-1,universal_attributes,,Unsure
5436,97110_199212-382866-1,universal_attributes,,Unsure
1009,11110_201809-399481-1,non-economic_attributes,10.0,Unsure
1537,13730_201906-195725-1,non-economic_attributes,10.0,Unsure
1867,41320_199012-128021-1,non-economic_attributes,10.0,Unsure


In [81]:
# stack everything that needs a review: the "Unsure" rows set aside from the gold
# examples and the "grey" examples of all question types
# NOTE: 'value' holds a plain string for the former and a Counter of responses for the latter
need_to_review_df = pd.concat([
    unsure_gold,
    grey_expls_univ_df, 
    grey_expls_econ_df, 
    grey_expls_nonecon_df, 
    grey_expls_stance_df
])
need_to_review_df = need_to_review_df.sort_values(['q_id', 'q_category']).reset_index(drop=True)
need_to_review_df.value.value_counts()

value
{'Yes': 1, 'No': 1}               136
{'No': 1, 'Unsure': 1}             65
{'Unsure': 1, 'Yes': 1}            36
{'Neutral': 1, 'Positive': 1}      16
Unsure                             15
{'Unsure': 1, 'Positive': 1}       13
{'Unsure': 1, 'Neutral': 1}         6
{'Positive': 1, 'Negative': 1}      5
{'Negative': 1, 'Neutral': 1}       3
{'Negative': 1, 'Unsure': 1}        2
Name: count, dtype: int64

In [82]:
need_to_review_df.q_id.value_counts()

q_id
non-economic_attributes    103
economic_attributes         92
universal_attributes        53
stance                      49
Name: count, dtype: int64

In [83]:
len(need_to_review_df), need_to_review_df.mention_id.nunique()

(297, 165)

In [84]:
# collect the non-missing comments and join them into one newline-separated string per mention
comments = tmp.loc[tmp.q_id=='comments', ['mention_id', 'value']]
comments = comments[~comments.value.isna()]
comments = comments.groupby('mention_id').agg({'value': lambda c: '\n'.join(c)}).reset_index().rename(columns={'value': 'comments'})

In [85]:
# Add the attribute name ('category'), the text and mention, and the annotators'
# comments to each example that needs a review.
# The keys of the attributes join are spelled out instead of being inferred from
# the columns both tables happen to share.
attribute_names = attributes.rename(columns={'label': 'category'})
need_to_review_df = need_to_review_df.\
    merge(attribute_names, on=['q_id', 'q_category'], how='left').\
    merge(texts, on='mention_id', how='left').\
    merge(comments, on='mention_id', how='left')

In [86]:
need_to_review_df = need_to_review_df[['mention_id', 'text', 'mention', 'q_id', 'category', 'value', 'comments']]
need_to_review_df.value.value_counts()

value
{'Yes': 1, 'No': 1}               136
{'No': 1, 'Unsure': 1}             65
{'Unsure': 1, 'Yes': 1}            36
{'Neutral': 1, 'Positive': 1}      16
Unsure                             15
{'Unsure': 1, 'Positive': 1}       13
{'Unsure': 1, 'Neutral': 1}         6
{'Positive': 1, 'Negative': 1}      5
{'Negative': 1, 'Neutral': 1}       3
{'Negative': 1, 'Unsure': 1}        2
Name: count, dtype: int64

In [87]:
need_to_review_df.rename(columns={'value': 'annotations'}, inplace=True)

In [88]:
# write the examples to review as a tab-separated file (missing values as empty strings)
os.makedirs(os.path.join(data_path, 'parsed'), exist_ok=True)
fp = os.path.join(data_path, 'parsed', 'review_examples.tsv')
need_to_review_df.to_csv(fp, sep='\t', index=False, na_rep='')

In [89]:
reviewed = pd.read_csv(os.path.join(data_path, 'parsed', 'reviewed_examples.tsv'), sep='\t')

In [90]:
# perform anti-join to remove cases in `reviewed` from `need_to_review_df` using columns ['mention_id', 'q_id', 'category']
tmp = need_to_review_df.merge(reviewed[['mention_id', 'q_id', 'category']], on=['mention_id', 'q_id', 'category'], how='left', indicator=True)

In [94]:
fp = os.path.join(data_path, 'parsed', 'review_examples-02.tsv')
tmp[tmp._merge == 'left_only'].to_csv(fp, sep='\t', index=False, na_rep='')