In [65]:
import os 
import pandas as pd
import numpy as np
from pathlib import Path

# Directory holding the annotation files this notebook reads and writes. The path is relative,
# so it is resolved against the kernel's current working directory.
data_path = Path('../../../data/annotations/group_mention_categorization')

In [66]:
# Load the consolidated annotations and drop the attribute combinations that are
# out of scope for this notebook (listed in `ignore`).
fp = data_path / 'consolidated_annotations_post_review.tsv'
ignore = ['stance: ', 'universal: ']
annotations = (
    pd.read_csv(fp, sep='\t')
    .query("attribute_combination not in @ignore")
)

In [68]:
def _has_label_conflict(group):
    """Return True if the rows of one (mention, attribute combination) group carry different labels."""
    return group['label'].nunique() > 1

# Rows of every (mention, attribute combination) pair with more than one distinct label
annotations.groupby(['mention_id', 'attribute_combination']).filter(_has_label_conflict)
# NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,q_id,q_category,category,round,attribute,attribute_combination,label


In [None]:
# Keep only the judgment key and its label; this is the input the expert edits get joined onto.
pre_review = annotations.loc[:, ['mention_id', 'attribute_combination', 'label']]

Unnamed: 0,mention_id,attribute_combination,label
0,11110_198809-390636-1,economic: class membership,No
1,11110_198809-390636-1,economic: ecology of group,No
2,11110_198809-390636-1,economic: education level,No
3,11110_198809-390636-1,economic: employment status,No
4,11110_198809-390636-1,economic: income/wealth/economic status,No


# Parse expert review annotations

In [84]:
fp = data_path / 'social-group-mention-categorization-expert-consolidation' / 'annotations_reviewed_wordwise.xlsx'

# list sheets in workbook; the context manager closes the file handle once the names are read
# (a bare `pd.ExcelFile(fp).sheet_names` leaves the workbook open)
with pd.ExcelFile(fp) as workbook:
    sheets = workbook.sheet_names

### check for internal inconsistencies

In [None]:
# dfs = []
# for i, sheet in enumerate(sheets):
#     df = pd.read_excel(fp, sheet_name=sheet)
#     attribute_cols = df.columns[df.columns.str.contains(' DO NOT EDIT')]
#     df = df.melt(id_vars=['mention_id', 'text', 'mention'], value_vars=attribute_cols, var_name='attribute', value_name='category')#.sort_values(by=['mention_id', 'attribute_combination']).reset_index(drop=True)
#     # df['sheet'] = i+1
#     df['attribute'] = df['attribute'].str.removesuffix(' DO NOT EDIT')
#     df['category'] = df['category'].str.split('; ')
#     df = df.explode('category')
#     df = df[df['category'].notna()]
#     df['attribute_combination'] = df['attribute'] + ': ' + df['category']
#     df['label'] = 'Yes'
#     dfs.append(df)

# df = pd.concat(dfs, ignore_index=True)

# # NOTE: should be none !!!
# df.sort_values(['mention_id', 'attribute_combination']).groupby(['mention_id', 'attribute_combination']).filter(lambda x: x.label.nunique() > 1)

### List edits to be implemented

In [140]:
def split_and_strip(x):
    """Turn a raw codings cell into a set of clean entries.

    Parameters
    ----------
    x : str, list, or anything else
        A ';'-separated string, a list of strings, or any other value
        (e.g. the NaN pandas yields for an empty spreadsheet cell).

    Returns
    -------
    set of str
        The whitespace-stripped, non-empty entries. Empty set when `x` is
        neither a string nor a list.
    """
    # No explicit NaN test is needed: NaN is neither a str nor a list, so it ends up in the
    # empty-set default below. The former `x is np.NaN` check depended on the `np.NaN` alias
    # (removed in NumPy 2.0) and on object identity, which does not hold for every NaN value.
    if isinstance(x, list):
        parts = x
    elif isinstance(x, str):
        parts = x.split(';')
    else:
        return set()
    return {part.strip() for part in parts if part.strip() != ''}

# Collect, from every sheet, the rows in which a reviewer entered codings to discard and/or add.
dfs = []
cols = ['mention_id', 'text', 'mention', 'discard codings', 'add codings']
# Open the workbook once and reuse it for all sheets instead of re-opening the file per sheet.
with pd.ExcelFile(fp) as workbook:
    for i, sheet in enumerate(sheets):
        sheet_df = pd.read_excel(workbook, sheet_name=sheet)
        idxs = sheet_df['discard codings'].notna() | sheet_df['add codings'].notna()
        # .copy() detaches the selection from `sheet_df`, so the column assignments below
        # write to an independent frame rather than to a slice of the sheet.
        df = sheet_df.loc[idxs, cols].copy()
        df['discard codings'] = df['discard codings'].map(split_and_strip)
        df['add codings'] = df['add codings'].map(split_and_strip)
        df['sheet_nr'] = i + 1  # 1-based position of the sheet in `sheets`
        dfs.append(df)

# One frame with all edits, ordered by mention and then by sheet
df = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(['mention_id', 'sheet_nr'])
    .reset_index(drop=True)
)

In [141]:
# function checking disagreement between sets
from typing import List, Union
def any_inconsistencies(sets: Union[pd.Series, List[set]]) -> bool:
    """Report whether the given sets disagree with each other.

    Parameters
    ----------
    sets : list of set or pandas.Series of set
        The sets to compare.

    Returns
    -------
    bool
        True if at least one set differs from the others, False if all sets
        are equal or if `sets` is empty.

    Raises
    ------
    TypeError
        If `sets` is neither a list nor a pandas Series.
    """
    if isinstance(sets, pd.Series):
        sets = sets.tolist()
    elif not isinstance(sets, list):
        raise TypeError("Input must be a list or a pandas Series of sets.")
    if not sets:
        return False
    # Every set equals the union of all sets exactly when all sets are equal, so comparing
    # against the first one is sufficient. (The former extra `s - union_set` test could never
    # fire, because each set is a subset of the union.)
    reference = sets[0]
    return any(other != reference for other in sets[1:])

# print(any_inconsistencies([{1, 2}, {1, 2, 3}]))  # should return True
# print(any_inconsistencies([{1, 2}, {3, 4}]))  # should return True
# print(any_inconsistencies([{1, 2}, {2, 3}]))  # should return True
# print(any_inconsistencies([{1, 2}, {1, 2}]))  # should return False

In [142]:
# check if any inconsistencies in the discard codings for a given mention
# Mentions whose 'discard codings' sets differ between the rows (sheets) they appear in
discard_disagrees = df.groupby('mention_id')['discard codings'].agg(any_inconsistencies)
these = discard_disagrees[discard_disagrees].index.tolist()

# NOTE: should be none !!!
df.loc[df['mention_id'].isin(these)]

Unnamed: 0,mention_id,text,mention,discard codings,add codings,sheet_nr


In [143]:
# check if any inconsistencies in the discard codings for a given mention
# Mentions whose 'add codings' sets differ between the rows (sheets) they appear in
add_disagrees = df.groupby('mention_id')['add codings'].agg(any_inconsistencies)
these = add_disagrees[add_disagrees].index.tolist()

# NOTE: should be none !!!
df.loc[df['mention_id'].isin(these)]

Unnamed: 0,mention_id,text,mention,discard codings,add codings,sheet_nr


In [144]:
# All categories that any reviewer asked to discard. Starting from an empty set keeps this
# working when there are no edit rows (`set.union()` without arguments raises TypeError).
set().union(*df['discard codings'].tolist())

{'ecology of group',
 'education level',
 'occupation/profession',
 'shared values/mentalities'}

In [145]:
# All categories that any reviewer asked to add. Starting from an empty set keeps this
# working when there are no edit rows (`set.union()` without arguments raises TypeError).
set().union(*df['add codings'].tolist())

{'employment status', 'shared values/mentalities'}

## Unite

Now:

1. for each sheet, parse the info about codings to add/discard
2. left-join with the pre-review codings
3. (optionally) tabulate the post-review edits

#### "1. take info about codings to add/discard"

In [146]:
# Reshape the edit sets into one row per (mention, category) edit. Entries from 'discard codings'
# get label 'No', entries from 'add codings' get label 'Yes'; empty sets explode to a missing
# category and are dropped.
action_labels = {'discard codings': 'No', 'add codings': 'Yes'}
reviewed_codings = (
    df
    .melt(
        id_vars=['mention_id', 'text', 'mention'],
        value_vars=['discard codings', 'add codings'],
        var_name='action',
        value_name='category',
    )
    .explode('category')
    .loc[lambda edits: edits['category'].notna()]
    .assign(label=lambda edits: edits['action'].map(action_labels))
    .drop(columns='action')
)

# Categories that were both added and discarded for the same mention
reviewed_codings.groupby(['mention_id', 'category']).filter(lambda grp: grp['label'].nunique() > 1)
# NOTE: should be none

Unnamed: 0,mention_id,text,mention,category,label


In [147]:
# The same edit can be entered on several sheets; keep its first occurrence only.
dedup_keys = ['mention_id', 'category', 'label']
reviewed_codings = reviewed_codings.drop_duplicates(subset=dedup_keys, keep='first')

In [148]:
# Order the edits by mention and category and renumber the rows from 0.
reviewed_codings = reviewed_codings.sort_values(by=['mention_id', 'category'], ignore_index=True)

In [152]:
# Attribute each known category belongs to
_ATTRIBUTE_BY_CATEGORY = {
    **dict.fromkeys(
        ['class membership', 'ecology of group', 'education level', 'employment status',
         'income/wealth/economic status', 'occupation/profession'],
        'economic',
    ),
    **dict.fromkeys(
        ['age', 'crime', 'ethnicity', 'family', 'gender/sexuality', 'health', 'nationality',
         'place/location', 'religion', 'shared values/mentalities'],
        'non-economic',
    ),
}


def make_attribute_combination(x):
    """Prefix a category name with the attribute it belongs to.

    Parameters
    ----------
    x : str or missing value
        A category name such as 'education level'.

    Returns
    -------
    str or the input unchanged
        'economic: <x>' or 'non-economic: <x>' for the known categories.
        Unknown strings and non-string values (e.g. NaN) are returned as they are.
    """
    # Non-strings (including NaN) pass through untouched. This replaces the former
    # `x is np.NaN` test, which breaks on NumPy >= 2.0 where the `np.NaN` alias no longer exists.
    if not isinstance(x, str):
        return x
    attribute = _ATTRIBUTE_BY_CATEGORY.get(x)
    if attribute is None:
        return x
    return attribute + ': ' + x

# Derive the '<attribute>: <category>' key used by the pre-review annotations.
reviewed_codings = reviewed_codings.assign(
    attribute_combination=reviewed_codings['category'].map(make_attribute_combination)
)

In [156]:
# Number of reviewed edits per attribute combination, split by the resulting label
edit_counts = reviewed_codings.value_counts(['attribute_combination', 'label'])
edit_counts.unstack(fill_value=0).astype(int)

label,No,Yes
attribute_combination,Unnamed: 1_level_1,Unnamed: 2_level_1
economic: ecology of group,4,0
economic: education level,1,0
economic: employment status,0,6
economic: occupation/profession,7,0
non-economic: shared values/mentalities,1,4


In [158]:
# Sanity checks on the reviewed codings: every (mention, attribute combination) pair must occur
# exactly once and therefore carry a single label. A cell only renders its last expression, so
# both checks are asserted instead of relying on displayed output.
check_keys = ['mention_id', 'attribute_combination']
label_conflicts = reviewed_codings.groupby(check_keys).filter(lambda grp: grp['label'].nunique() > 1)
duplicate_rows = reviewed_codings.groupby(check_keys).filter(lambda grp: len(grp) > 1)
assert label_conflicts.empty, f"{len(label_conflicts)} reviewed rows have conflicting labels"
assert duplicate_rows.empty, f"{len(duplicate_rows)} reviewed rows are duplicated"
duplicate_rows # NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,category,label,attribute_combination


#### "2. left-join with pre-review codings"

In [162]:
id_cols = ['mention_id', 'attribute_combination']

# Attach the reviewed label to every pre-review judgment; judgments without an expert edit
# get a missing value in 'label_reviewed'.
reviewed_labels = reviewed_codings[id_cols + ['label']]
tmp = (
    pre_review
    .query("attribute_combination not in @ignore")
    .merge(reviewed_labels, how='left', on=id_cols, suffixes=('', '_reviewed'))
    .sort_values(id_cols)
    .reset_index(drop=True)
)

# Bring back each mention's sentence and mention string (the merge uses the columns the two
# frames have in common).
cols = ['mention_id', 'text', 'mention']
mention_texts = annotations[cols].drop_duplicates().reset_index(drop=True)
tmp = mention_texts.merge(tmp)

In [164]:
# Sanity checks after the join: every (mention, attribute combination) pair must occur exactly
# once and therefore carry a single label. A cell only renders its last expression, so both
# checks are asserted instead of relying on displayed output.
check_keys = ['mention_id', 'attribute_combination']
label_conflicts = tmp.groupby(check_keys).filter(lambda grp: grp['label'].nunique() > 1)
duplicate_rows = tmp.groupby(check_keys).filter(lambda grp: len(grp) > 1)
assert label_conflicts.empty, f"{len(label_conflicts)} joined rows have conflicting labels"
assert duplicate_rows.empty, f"{len(duplicate_rows)} joined rows are duplicated"
duplicate_rows # NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,attribute_combination,label,label_reviewed


In [165]:
# A judgment counts as corrected when the experts supplied a label that differs from the
# pre-review label.
has_review = tmp['label_reviewed'].notna()
label_differs = tmp['label'].ne(tmp['label_reviewed'])
idxs = has_review & label_differs
idxs.mean() # share (as a fraction, not a percentage) of judgments corrected in expert review

0.002037037037037037

In [166]:
# Print every corrected judgment grouped by mention: first the sentence with the mention
# highlighted via ANSI escape codes, then one bullet per attribute combination (old -> new label).
for mid, m in tmp.loc[idxs].groupby('mention_id'):
    first_row = m.iloc[0]
    text, mention = first_row['text'], first_row['mention']
    highlighted = text.replace(mention, '\u001B[30m\u001B[43m'+mention+'\033[0m')
    print(f'{mid}: "' + highlighted + '"')
    for _, row in m.iterrows():
        print(f"  - \033[1m{row['attribute_combination']}\033[0m:  {row['label']} -> {row['label_reviewed']}")
    print()

11110_199109-390940-1: "It is only within the ecological framework that one can build [30m[43ma society for survival in prosperity and well-being[0m."
  - [1mnon-economic: shared values/mentalities[0m:  No -> Yes

12110_201309-341792-1: "The transition to [30m[43ma future-oriented green society[0m depends on new thinking and active efforts that include and engage, organizations and each of us."
  - [1meconomic: ecology of group[0m:  Yes -> No

13951_199803-188190-1: "We want [30m[43ma society of security and well-being for all[0m."
  - [1mnon-economic: shared values/mentalities[0m:  No -> Yes

14110_201904-203800-2: "Involve all stakeholders in society: [30m[43mworkers[0m, industry, civil society."
  - [1meconomic: employment status[0m:  No -> Yes
  - [1meconomic: occupation/profession[0m:  Yes -> No

21111_200706-59279-1: "Building [30m[43ma truly intergenerational society[0m."
  - [1mnon-economic: shared values/mentalities[0m:  Yes -> No

21112_199111-34952

In [167]:
# Replace the pre-review label by the expert label on the corrected rows, then drop the
# helper column.
tmp['label'] = tmp['label'].mask(idxs, tmp['label_reviewed'])
tmp = tmp.drop(columns='label_reviewed')

In [168]:
# Re-attach the remaining annotation columns and mark the expert-corrected judgments as round 5.
cols = [col for col in annotations.columns if col != 'label']

# `idxs` was computed on the pre-merge `tmp` and is only aligned with that frame's index.
# The outer merge returns a new frame with a fresh index (and key-sorted rows), so applying
# `idxs` to it directly could flag the wrong rows. The flag is therefore carried through the
# merge as a helper column.
merge_keys = tmp.columns[:-1].tolist()  # all columns but the last one (expected to be 'label')
tmp = annotations[cols].merge(tmp.assign(_corrected=idxs), on=merge_keys, how='outer')
corrected = tmp.pop('_corrected').eq(True)  # rows without a match hold NaN here -> False
tmp.loc[corrected, 'round'] = 5

In [170]:
# Final sanity checks before writing: every (mention, attribute combination) pair must occur
# exactly once and therefore carry a single label. A cell only renders its last expression, so
# both checks are asserted instead of relying on displayed output.
check_keys = ['mention_id', 'attribute_combination']
label_conflicts = tmp.groupby(check_keys).filter(lambda grp: grp['label'].nunique() > 1)
duplicate_rows = tmp.groupby(check_keys).filter(lambda grp: len(grp) > 1)
assert label_conflicts.empty, f"{len(label_conflicts)} final rows have conflicting labels"
assert duplicate_rows.empty, f"{len(duplicate_rows)} final rows are duplicated"
duplicate_rows # NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,q_id,q_category,category,round,attribute,attribute_combination,label


In [172]:
# Write the post-review annotations as a tab-separated file without the index column.
fp = data_path.joinpath('final_annotations.tsv')
tmp.to_csv(path_or_buf=fp, sep='\t', index=False)