In [60]:
import os 
import pandas as pd
import numpy as np
from pathlib import Path

# Directory containing the annotation files read and written below.
# The path is relative, so it resolves against the kernel's current working directory.
data_path = Path('../../../data/annotations/group_mention_categorization')

### Load previous annotations

In [61]:
# Read the existing annotations (tab-separated file) into a DataFrame.
fp = data_path.joinpath('final_annotations.tsv')
annotations = pd.read_csv(filepath_or_buffer=fp, sep='\t')

In [62]:
# Number of annotation rows per `q_id` value.
annotations['q_id'].value_counts()

q_id
non-economic_attributes    6600
economic_attributes        4200
Name: count, dtype: int64

In [63]:
# Sanity check: every (mention, attribute combination) pair must carry exactly one distinct label.
conflicting = annotations.groupby(['mention_id', 'attribute_combination']).filter(lambda x: x.label.nunique() > 1)
# Enforce the invariant instead of relying on someone noticing a non-empty table:
# a "Run All" now stops here if the input annotations are inconsistent.
assert conflicting.empty, f"{conflicting.mention_id.nunique()} mention(s) have conflicting labels for the same attribute combination"
conflicting
# NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,q_id,q_category,category,round,attribute,attribute_combination,label


In [64]:
# Derive a column-name-safe key from `attribute_combination`,
# e.g. 'non-economic: crime' -> 'noneconomic__crime'.
# This key is later matched against the melted column names of the reviewed file.
annotations['attribute_combination_val'] = (
    annotations['attribute_combination']
    .str.replace(': ', '__', regex=False)        # literal separator between question and attribute
    .str.replace('non-', 'non', regex=False)     # literal prefix clean-up
    .str.replace(r'[^a-z_]+', '_', regex=True)   # any remaining run of other characters -> single underscore
)

### Load subset of reviewed and corrected annotations

In [65]:
# Read the expert-reviewed file (comma-separated).
# NOTE (original author): only the values for selected attribute combinations have been
# reviewed by the expert, although the file carries binary indicators for all of them.
fp = data_path.joinpath('attribute-misclassification-review', 'reviewed.csv')
reviewed = pd.read_csv(filepath_or_buffer=fp)

In [66]:
# Reshape wide -> long: every indicator column (name starting with "economic__" or
# "noneconomic__") becomes one row per mention_id, with the former column name stored
# in `attribute_combination_val` and its value in `corrected_label`.
indicator_prefixes = ('economic__', 'noneconomic__')
indicator_cols = [name for name in reviewed.columns if name.startswith(indicator_prefixes)]
reviewed_long = reviewed.melt(
    id_vars=['mention_id'],
    value_vars=indicator_cols,
    var_name='attribute_combination_val',
    value_name='corrected_label',
)

In [67]:
# Recode the 0/1 indicators to the 'No'/'Yes' strings that are compared with `label` further down.
# 'No'/'Yes' map to themselves so that re-running this cell is a no-op; with a plain
# {0: 'No', 1: 'Yes'} mapping a second run would turn every already-recoded value into NaN
# and silently discard all corrections.
label_names = {0: 'No', 1: 'Yes', 'No': 'No', 'Yes': 'Yes'}
# Missing values and any unrecognised code become NaN.
reviewed_long['corrected_label'] = reviewed_long['corrected_label'].map(
    lambda code: label_names.get(code, np.nan)
)

In [68]:
# Distribution of the recoded review labels (missing values are not counted).
reviewed_long.value_counts(subset='corrected_label')

corrected_label
No     2712
Yes     264
Name: count, dtype: int64

## merge

In [71]:
# Attach the reviewed label (if any) to every annotation row.
# `indicator=True` adds `_merge`, which is used below to tell reviewed mentions from the rest.
# `validate='many_to_one'` makes the merge fail loudly if `reviewed_long` ever contains
# duplicate (mention_id, attribute_combination_val) keys, which would otherwise silently
# duplicate annotation rows in the output.
out = annotations.merge(
    reviewed_long,
    on=['mention_id', 'attribute_combination_val'],
    how='left',
    indicator=True,
    validate='many_to_one',
)

### Inspect impact of corrections

In [74]:
# Take the first `_merge` flag of each mention and count mentions per flag;
# "both" means the row was matched in the reviewed file.
merge_flag_per_mention = out.groupby("mention_id").agg(_merge=('_merge', 'first'))
merge_flag_per_mention.value_counts(subset='_merge')
# NOTE: 186 mentions have been reviewed in error analysis

_merge
left_only     414
both          186
right_only      0
Name: count, dtype: int64

In [None]:
# Rows that were matched in the reviewed file.
reviewed_rows = out.query('_merge=="both"')
# A row counts as changed only if a corrected label is present AND differs from the original.
# The explicit notna() matters: `label != NaN` evaluates to True, so rows without a
# corrected label would otherwise be counted as label changes.
is_changed = reviewed_rows['corrected_label'].notna() & reviewed_rows['label'].ne(reviewed_rows['corrected_label'])
# Number of changed labels per mention, then how many mentions have 0, 1, 2, ... changes.
is_changed.groupby(reviewed_rows['mention_id']).sum().value_counts()
# NOTE: this is the distribution of number of label changes per reviewed mention-in-context instance

0    145
1     37
2      3
3      1
Name: count, dtype: int64

In [87]:
# Flag rows whose label was actually changed by the review: a corrected label exists and differs.
reviewed_and_changed = out['corrected_label'].notna() & out['label'].ne(out['corrected_label'])
# Vectorized replacement for the former row-wise apply: build "<old> -> <new>" for all rows,
# then keep it only where the label changed (everything else becomes NaN).
transition = out['label'].astype(str) + ' -> ' + out['corrected_label'].astype(str)
out['label_change'] = transition.where(reviewed_and_changed)
# Show the changed rows with their context, ordered by mention.
changed_rows = out.loc[out['label_change'].notna(), ["mention_id", "text", "mention", "attribute_combination", "label_change"]]
changed_rows.sort_values("mention_id")

Unnamed: 0,mention_id,text,mention,attribute_combination,label_change
80,11110_200609-393907-1,Everyone who is exposed to violence or threats...,Everyone who is exposed to violence or threats...,non-economic: crime,No -> Yes
656,11620_201009-396018-1,"In many situations, it is evident from the cir...",the woman who sells her body,non-economic: crime,Yes -> No
1039,12951_199709-332992-1,The Progress Party takes a strong distance fro...,"people based on race, religion and ethnic origin",non-economic: nationality,No -> Yes
1917,14110_199903-198263-2,Every person belongs to a minority.,a minority,non-economic: ethnicity,No -> Yes
2019,14110_201104-200153-1,The holiday banking system also enables annual...,workers in the field of grinding,economic: employment status,No -> Yes
2126,14110_201904-203886-1,Those who do well do as well as before.,Those who do well,economic: education level,Yes -> No
2128,14110_201904-203886-1,Those who do well do as well as before.,Those who do well,economic: income/wealth/economic status,No -> Yes
2267,14820_197903-197512-1,Do not submit to the betrayers of your promises.,the betrayers of your promises,non-economic: shared values/mentalities,No -> Yes
2296,14820_198703-197618-1,Familial farms must be guaranteed adequate opp...,Familial,non-economic: family,No -> Yes
2645,171101_200306-303761-1,"In the matter of environmental pollution, the ...",the members of the Green Ecologist,non-economic: shared values/mentalities,No -> Yes


In [89]:
# Cross-tabulate original label (rows) against corrected label (columns);
# rows without a corrected label are not counted.
pair_counts = out[['label', 'corrected_label']].value_counts()
pair_counts.unstack(fill_value=0)

corrected_label,No,Yes
label,Unnamed: 1_level_1,Unnamed: 2_level_1
No,2698,32
Yes,14,232


In [90]:
# Remove the helper columns that were only needed for inspecting the corrections.
# errors='ignore' keeps the cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
out = out.drop(columns=["_merge", "label_change"], errors='ignore')

### output corrected annotations

In [91]:
# Take the corrected label wherever one exists, otherwise keep the original label.
# pop() removes `corrected_label` from `out` and hands back the column in one step.
corrected = out.pop('corrected_label')
out['label'] = out['label'].mask(corrected.notna(), corrected)

In [None]:
# Verify the final label column; dropna=False would list missing labels as their own row.
out['label'].value_counts(dropna=False)

label
No     10081
Yes      719
Name: count, dtype: int64

In [95]:
# Number of distinct mentions in the corrected data.
out['mention_id'].nunique()

600

In [93]:
# Write the corrected annotations as a tab-separated file into the data directory, without the index.
fp = data_path.joinpath('final_corrected_annotations.tsv')
out.to_csv(path_or_buf=fp, sep='\t', index=False)