In [69]:
import os 
import pandas as pd
import numpy as np
from pathlib import Path

# Folder (relative path) from which the annotation files used below are read
# and to which the post-review file is written.
data_path = Path('../../../data/annotations/group_mention_categorization')

In [70]:
# Read the consolidated annotations and drop the attribute combinations
# listed in `ignore`.
ignore = ['stance: ', 'universal: ']
fp = data_path / 'consolidated_annotations.tsv'
annotations = pd.read_csv(fp, sep='\t').query("attribute_combination not in @ignore")

In [71]:
# Show all rows of any (mention, attribute combination) pair that carries
# more than one distinct label.
(
    annotations
    .groupby(['mention_id', 'attribute_combination'])
    .filter(lambda grp: grp['label'].nunique() > 1)
)

Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,q_id,q_category,category,label,round,attribute,attribute_combination
2671,41112_199012-127111-2,Their clients are men from all walks of life a...,men from all walks of life and professions,And it’s usually men who organize and earn: as...,Any man can be.\nAnd the state also benefits f...,non-economic_attributes,3.0,gender/sexuality,Yes,1,non-economic,non-economic: gender/sexuality
7912,41112_199012-127111-2,Their clients are men from all walks of life a...,men from all walks of life and professions,And it’s usually men who organize and earn: as...,Any man can be.\nAnd the state also benefits f...,non-economic_attributes,3.0,gender/sexuality,No,2,non-economic,non-economic: gender/sexuality


In [72]:
# Remove the round-2 rows of the one mention whose labels disagree between rounds.
is_conflicting_mention = annotations['mention_id'] == '41112_199012-127111-2'
is_round_two = annotations['round'] == 2
annotations = annotations[~(is_conflicting_mention & is_round_two)]

In [73]:
# Baseline before expert review: just the join keys and the label.
pre_review = annotations.loc[:, ['mention_id', 'attribute_combination', 'label']]

# Parse expert review annotations

In [74]:
fp = data_path / 'social-group-mention-categorization-expert-consolidation' / 'reviewed.xlsx'

# list sheets in workbook; the context manager closes the workbook's file
# handle again instead of leaving it open for the rest of the session
with pd.ExcelFile(fp) as workbook:
    sheets = workbook.sheet_names

### check for internal inconsistencies

In [75]:
# Read every sheet, recode missing attribute cells to 'No' and reshape to one
# row per (mention, attribute combination); `sheet` records the 1-based sheet number.
id_cols = ['mention_id', 'text', 'mention']
dfs = []
for sheet_nr, sheet in enumerate(sheets, start=1):
    wide = pd.read_excel(fp, sheet_name=sheet)
    attribute_cols = wide.columns[wide.columns.str.contains(': ')]
    wide[attribute_cols] = wide[attribute_cols].replace({np.nan: 'No'})
    long = wide.melt(
        id_vars=id_cols,
        value_vars=attribute_cols,
        var_name='attribute_combination',
        value_name='label',
    )
    long = long.sort_values(by=['mention_id', 'attribute_combination']).reset_index(drop=True)
    long['sheet'] = sheet_nr
    dfs.append(long)

df = pd.concat(dfs, ignore_index=True)

In [76]:
# NOTE: should be none !!!
# Rows of (mention, attribute combination) pairs labelled differently across sheets.
(
    df
    .sort_values(['mention_id', 'attribute_combination'])
    .groupby(['mention_id', 'attribute_combination'])
    .filter(lambda grp: grp['label'].nunique() > 1)
)

Unnamed: 0,mention_id,text,mention,attribute_combination,label,sheet


### List edits to be implemented

In [77]:
def split_and_strip(x):
    """Turn a raw codings cell into a set of non-empty, stripped labels.

    Parameters
    ----------
    x : str, list of str, or anything else
        A ';'-separated string or a list of strings. Any other value
        (e.g. NaN or None from an empty cell) is treated as "no codings".

    Returns
    -------
    set of str
        The stripped, non-empty items; an empty set for missing or
        unsupported input.
    """
    if isinstance(x, str):
        x = x.split(';')
    elif not isinstance(x, list):
        # NaN, None and every other type carry no codings. This avoids the
        # `x is np.NaN` test: the `np.NaN` alias was removed in NumPy 2.0 and
        # an identity check misses NaN objects that are not that singleton.
        return set()
    return {s.strip() for s in x if s.strip() != ''}

# Collect, per sheet, the rows for which the reviewer requested at least one
# coding to be discarded or added, with both columns parsed into sets.
dfs = []
cols = ['mention_id', 'text', 'mention', 'discard codings', 'add codings']
for i, sheet in enumerate(sheets):
    df = pd.read_excel(fp, sheet_name=sheet)
    idxs = np.logical_or(df['discard codings'].notna(), df['add codings'].notna())
    # Explicit copy: the assignments below must write to an independent frame,
    # not to a slice of the sheet (which triggers SettingWithCopyWarning).
    df = df.loc[idxs, cols].copy()
    df['discard codings'] = df['discard codings'].map(split_and_strip)
    df['add codings'] = df['add codings'].map(split_and_strip)
    df['sheet_nr'] = i+1
    dfs.append(df)

In [78]:
# Stack the per-sheet edit requests, ordered by mention and sheet.
df = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(['mention_id', 'sheet_nr'])
    .reset_index(drop=True)
)

In [79]:
# function checking disagreement between sets
from typing import List, Union
def any_inconsistencies(sets: Union[pd.Series, List[set]]) -> bool:
    """Report whether the given sets are not all identical.

    Parameters
    ----------
    sets : list of set, or pandas Series of sets
        The sets to compare.

    Returns
    -------
    bool
        True if at least one set differs from the union of all sets (i.e. the
        sets are not all equal); False if they are all equal or `sets` is empty.

    Raises
    ------
    TypeError
        If `sets` is neither a list nor a pandas Series.
    """
    if isinstance(sets, pd.Series):
        sets = sets.tolist()
    elif not isinstance(sets, list):
        raise TypeError("Input must be a list or a pandas Series of sets.")
    if not sets:
        return False
    union_set = set().union(*sets)
    # Every set is a subset of the union, so `s - union_set` is always empty;
    # comparing each set with the union is the only test needed.
    return any(s != union_set for s in sets)
# Sanity checks; expected output, in order: True, True, True, False
for example in (
    [{1, 2}, {1, 2, 3}],
    [{1, 2}, {3, 4}],
    [{1, 2}, {2, 3}],
    [{1, 2}, {1, 2}],
):
    print(any_inconsistencies(example))

True
True
True
False


In [80]:
# Mentions whose 'discard codings' sets differ between rows (e.g. across sheets)
has_conflict = df.groupby('mention_id')['discard codings'].agg(any_inconsistencies)
these = has_conflict[has_conflict].index.tolist()
# NOTE: should be none !!!
df.loc[df['mention_id'].isin(these)]

Unnamed: 0,mention_id,text,mention,discard codings,add codings,sheet_nr


In [81]:
# check if any inconsistencies in the add codings for a given mention
these = df.groupby('mention_id')['add codings'].agg(any_inconsistencies)
these = these[these].index.tolist()
# NOTE: should be none !!!
df[df.mention_id.isin(these)]

Unnamed: 0,mention_id,text,mention,discard codings,add codings,sheet_nr


In [82]:
# mid = '80710_200907-92925-2'
# pre_review[pre_review.mention_id == mid]

In [83]:
# All distinct codings requested to be discarded. `set().union(...)` also
# works when there are no rows, where `set.union(*[])` raises a TypeError.
set().union(*df['discard codings'].tolist())

{'age',
 'class membership',
 'crime',
 'ecology of group',
 'education level',
 'employment status',
 'ethnicity',
 'family',
 'gender/sexuality',
 'health',
 'income/wealth/economic status',
 'nationality',
 'occupation/profession',
 'shared values/mentalities'}

In [84]:
# All distinct codings requested to be added. `set().union(...)` also
# works when there are no rows, where `set.union(*[])` raises a TypeError.
set().union(*df['add codings'].tolist())

{'class membership',
 'crime',
 'ecology of group',
 'employment status',
 'ethnicity',
 'family',
 'gender/sexuality',
 'health',
 'income/wealth/economic status',
 'nationality',
 'non-econ: other',
 'occupation/profession',
 'place/location',
 'religion',
 'shared values/mentalities'}

##### check if re-coding decisions are internally consistent

In [85]:
# Read every sheet, recode missing attribute cells to 'No' and reshape to one
# row per (mention, attribute combination).
dfs = []
for i, sheet in enumerate(sheets):
    df = pd.read_excel(fp, sheet_name=sheet)

    attribute_cols = df.columns[df.columns.str.contains(':')]
    df[attribute_cols] = df[attribute_cols].replace({np.nan: 'No'})

    df = df.melt(id_vars=['mention_id', 'text', 'mention'], value_vars=attribute_cols, var_name='attribute_combination', value_name='label').sort_values(by=['mention_id', 'attribute_combination']).reset_index(drop=True)

    # 1-based sheet number, consistent with the other sheet loops in this notebook
    df['sheet_nr'] = i+1
    dfs.append(df)

In [86]:
# Stack all sheets, then list (mention, attribute combination) pairs whose
# label differs between rows.
df = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(['attribute_combination', 'mention_id'])
    .reset_index(drop=True)
)
df.groupby(['mention_id', 'attribute_combination']).filter(lambda grp: grp['label'].nunique() > 1)

Unnamed: 0,mention_id,text,mention,attribute_combination,label,sheet_nr


## Unite

Now combine the reviewed codings with the pre-review annotations:

1. for each sheet parse updated codings from relevant columns containing ":"
2. add info about codings to add/discard
3. left-join with pre-review codings
4. (optionally) tabulate post-review edits

In [87]:
# Order by mention, then attribute combination, with a fresh 0..n-1 index.
df = df.sort_values(by=['mention_id', 'attribute_combination'], ignore_index=True)

#### "1. for each sheet parse updated codings from relevant columns containing ":""

In [88]:
# NOTE: taking the first label per (mention, attribute combination) is enough
# because the check above found no discrepancies across sheets for any mention
reviewed_codings = (
    df
    .groupby(['mention_id', 'attribute_combination'])['label']
    .first()
    .reset_index()
)

#### "2. add info about codings to add/discard"

In [89]:
# Collect, per sheet, the rows for which the reviewer requested at least one
# coding to be discarded or added, with both columns parsed into sets.
dfs = []
cols = ['mention_id', 'text', 'mention', 'discard codings', 'add codings']
for i, sheet in enumerate(sheets):
    df = pd.read_excel(fp, sheet_name=sheet)
    idxs = np.logical_or(df['discard codings'].notna(), df['add codings'].notna())
    # Explicit copy: the assignments below must write to an independent frame,
    # not to a slice of the sheet (which triggers SettingWithCopyWarning).
    df = df.loc[idxs, cols].copy()
    df['discard codings'] = df['discard codings'].map(split_and_strip)
    df['add codings'] = df['add codings'].map(split_and_strip)
    df['sheet_nr'] = i+1
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True).sort_values(['mention_id', 'sheet_nr']).reset_index(drop=True)

In [90]:
def _merge_sets(sets):
    """Return the union of all sets in one group's column."""
    return set.union(*sets.tolist())

# One row per mention with the combined discard/add requests of all sheets.
edits = (
    df
    .groupby(['mention_id', 'text', 'mention'])
    .agg({'discard codings': _merge_sets, 'add codings': _merge_sets})
    .reset_index()
    .sort_values(['mention_id', 'text', 'mention'])
)

In [91]:
def make_attribute_combination(x):
    """Prefix a category name with its attribute type.

    Parameters
    ----------
    x : str or NaN
        A category name as written in the 'discard codings' / 'add codings'
        columns.

    Returns
    -------
    str or NaN
        'economic: <x>' or 'non-economic: <x>' for the known categories,
        NaN for NaN input, and `x` unchanged for anything else
        (e.g. 'non-econ: other').
    """
    # Value-based NaN test: the `np.NaN` alias was removed in NumPy 2.0, and an
    # identity check misses NaN objects that are not the `np.nan` singleton.
    if isinstance(x, float) and np.isnan(x):
        return np.nan
    if x in ['class membership', 'ecology of group', 'education level', 'employment status', 'income/wealth/economic status', 'occupation/profession']:
        return 'economic: '+x
    if x in ['age', 'crime', 'ethnicity', 'family', 'gender/sexuality', 'health', 'nationality', 'place/location', 'religion', 'shared values/mentalities']:
        return 'non-economic: '+x
    return x

cols = ['mention_id', 'text', 'mention']

# One row per coding to discard, labelled 'No'.
c = 'discard codings'
to_discard = edits[cols+[c]].explode(c)
# exploding an empty set yields a missing value; drop those rows
# (copy so the column assignments below do not write to a slice)
to_discard = to_discard[to_discard[c].notnull()].copy()
to_discard['attribute_combination'] = to_discard[c].map(make_attribute_combination)
to_discard['label'] = 'No'
del to_discard[c]

# One row per coding to add, labelled 'Yes'.
c = 'add codings'
to_add = edits[cols+[c]].explode(c)
to_add = to_add[to_add[c].notnull()].copy()
to_add['attribute_combination'] = to_add[c].map(make_attribute_combination)
to_add['label'] = 'Yes'
del to_add[c]

tmp = to_discard.merge(to_add, how='outer', on=cols+['attribute_combination'], suffixes=('_discard', '_add'))

# any inconsistencies?
# After the outer merge a row carries BOTH labels only if the same coding of
# the same mention was requested to be discarded and added at the same time.
idxs = tmp[['label_discard', 'label_add']].notna().all(axis=1)
# NOTE: should be none !!!
assert idxs.sum() == 0, f"{idxs.sum()} coding(s) are marked both to discard and to add"

to_edit = to_discard.merge(to_add, how='outer', on=cols+['attribute_combination', 'label'])

In [92]:
# Number of requested edits per attribute combination, in alphabetical order.
to_edit['attribute_combination'].value_counts().sort_index()

attribute_combination
economic: class membership                  5
economic: ecology of group                  8
economic: education level                   1
economic: employment status                13
economic: income/wealth/economic status     6
economic: occupation/profession             9
non-econ: other                             2
non-economic: age                           1
non-economic: crime                         2
non-economic: ethnicity                     3
non-economic: family                        6
non-economic: gender/sexuality              5
non-economic: health                        8
non-economic: nationality                  10
non-economic: place/location                3
non-economic: religion                      2
non-economic: shared values/mentalities    17
Name: count, dtype: int64

In [93]:
# Rewrite the reviewers' shorthand so the label follows the
# 'non-economic: <category>' naming of the other attribute combinations.
# NOTE(review): the previous assignment replaced 'non-econ: other' with itself
# (a no-op); 'non-economic: other' is presumed to be the intended target —
# confirm against the labels used in the annotations file.
to_edit.loc[to_edit['attribute_combination']=='non-econ: other', 'attribute_combination'] = 'non-economic: other'

In [94]:
# Two checks per (mention, attribute combination): conflicting labels and
# duplicate rows. Only the last expression is rendered as cell output.
grouped_edits = to_edit.groupby(['mention_id', 'attribute_combination'])
grouped_edits.filter(lambda grp: grp['label'].nunique() > 1) # NOTE: should be none !!!
grouped_edits.filter(lambda grp: len(grp) > 1) # NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,attribute_combination,label


##### check inconsistencies

In [96]:
# Pairs present in both tables whose sheet label disagrees with the requested edit.
overlap = reviewed_codings.merge(
    to_edit[reviewed_codings.columns],
    how='inner',
    on=['mention_id', 'attribute_combination'],
    suffixes=('_reviewed', '_edited'),
)
these = (
    overlap
    .query('label_reviewed != label_edited')
    .sort_values(['mention_id', 'attribute_combination'])
    .reset_index(drop=True)
    .sort_values('mention_id')
)
these.merge(df[['mention_id', 'text', 'mention']].drop_duplicates())
# NOTE: in all these instances, `label_edited` is correct, so we can just use it to update the reviewed_codings

Unnamed: 0,mention_id,attribute_combination,label_reviewed,label_edited,text,mention
0,13229_201906-194411-1,economic: ecology of group,Yes,No,We want a green society that puts people first.,a green society that puts people first
1,14110_201504-201895-1,economic: ecology of group,Yes,No,Building a zero-emission society means enormou...,a zero-emission society
2,14110_201904-203801-1,economic: ecology of group,Yes,No,The transition to a zero-emission society must...,a zero-emission society
3,171101_200306-303761-1,economic: occupation/profession,No,Yes,"In the matter of environmental pollution, the ...",the members of the Green Ecologist
4,171101_200306-303788-1,economic: occupation/profession,No,Yes,"However, to the above, the Members of the Gree...",the Members of the Green Ecologist
5,21112_199111-34952-1,economic: ecology of group,Yes,No,An industry that must fit into an ecological s...,an ecological society
6,21112_199505-37048-2,non-economic: shared values/mentalities,No,Yes,Think of the producers of disposable packaging...,the agricultural lobbyists who are fighting ag...
7,22110_198909-311191-1,economic: ecology of group,Yes,No,The people who represent this ecological minds...,The people who represent this ecological mindset
8,22110_201006-322802-1,economic: income/wealth/economic status,Yes,No,From shareholders and managers who merely meas...,shareholders and managers who merely measure t...
9,31110_199303-208306-1,non-economic: shared values/mentalities,No,Yes,Or go toward an ecological society where the e...,an ecological society


In [97]:
# Attach the requested edits (outer join keeps pairs that exist in only one table).
reviewed_codings = (
    reviewed_codings
    .merge(
        to_edit[reviewed_codings.columns],
        how='outer',
        on=['mention_id', 'attribute_combination'],
        suffixes=('', '_edited'),
    )
    .sort_values(['mention_id', 'attribute_combination'])
)

# Wherever an edit exists it wins: this fills the label of pairs that only
# occur in `to_edit` and overrides labels that disagree with the edit.
idxs = reviewed_codings['label_edited'].notnull()
reviewed_codings.loc[idxs, 'label'] = reviewed_codings.loc[idxs, 'label_edited']

reviewed_codings = reviewed_codings.drop(columns='label_edited')

In [98]:
# Two checks per (mention, attribute combination): conflicting labels and
# duplicate rows. Only the last expression is rendered as cell output.
grouped_reviewed = reviewed_codings.groupby(['mention_id', 'attribute_combination'])
grouped_reviewed.filter(lambda grp: grp['label'].nunique() > 1) # NOTE: should be none !!!
grouped_reviewed.filter(lambda grp: len(grp) > 1) # NOTE: should be none !!!

Unnamed: 0,mention_id,attribute_combination,label


#### "3. left-join with pre-review codings"

In [99]:
# Pre-review labels with the reviewed label attached as `label_reviewed`.
pre_vs_reviewed = (
    pre_review
    .query("attribute_combination not in @ignore")
    .merge(
        reviewed_codings,
        how='left',
        on=['mention_id', 'attribute_combination'],
        suffixes=('', '_reviewed'),
    )
    .sort_values(['mention_id', 'attribute_combination'])
    .reset_index(drop=True)
)
# Add each mention's text columns (joined on the columns both frames share).
cols = ['mention_id', 'text', 'mention']
mention_texts = annotations[cols].drop_duplicates().reset_index(drop=True)
tmp = mention_texts.merge(pre_vs_reviewed)

In [100]:
# Two checks per (mention, attribute combination): conflicting labels and
# duplicate rows. Only the last expression is rendered as cell output.
grouped_joined = tmp.groupby(['mention_id', 'attribute_combination'])
grouped_joined.filter(lambda grp: grp['label'].nunique() > 1) # NOTE: should be none !!!
grouped_joined.filter(lambda grp: len(grp) > 1) # NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,attribute_combination,label,label_reviewed


In [101]:
# Rows that have a reviewed label which differs from the pre-review label.
was_reviewed = tmp['label_reviewed'].notnull()
label_changed = tmp['label'] != tmp['label_reviewed']
idxs = was_reviewed & label_changed
idxs.mean() # % share of judgments corrected in expert review

0.011481481481481481

In [102]:
# Print every corrected mention with the mention span highlighted in its
# text, followed by one "old -> new" line per changed attribute combination.
for mid, m in tmp[idxs].groupby('mention_id'):
    text, mention = m.text.iloc[0], m.mention.iloc[0]
    highlighted = text.replace(mention, '\u001B[30m\u001B[43m'+mention+'\033[0m')
    print(f'{mid}: "' + highlighted + '"')
    changes = zip(m['attribute_combination'], m['label'], m['label_reviewed'])
    for attribute_combination, old_label, new_label in changes:
        print(f"  - \033[1m{attribute_combination}\033[0m:  {old_label} -> {new_label}")
    print()

11110_200209-393006-1: "[30m[43mStudents with children[0m have great difficulties bringing the economy together."
  - [1mnon-economic: age[0m:  Yes -> No

11320_200609-394026-2: "Elderly care and care should be of high quality.3 500 experienced elderly care employees will be offered to become [30m[43mmentors[0m."
  - [1mnon-economic: health[0m:  Yes -> No

11320_201409-397483-1: "[30m[43mEveryone working in the elderly care sector[0m should be trained in dementia and multi-disease."
  - [1mnon-economic: health[0m:  Yes -> No

12951_199309-331714-1: "[30m[43mUnemployed persons who do not want to work[0m will lose their insurance."
  - [1mnon-economic: shared values/mentalities[0m:  No -> Yes

13229_199409-186604-1: "You can be sure to find us on the same page as the movements that work for better social conditions for [30m[43mthe Copenhagen people[0m."
  - [1mnon-economic: nationality[0m:  Yes -> No
  - [1mnon-economic: place/location[0m:  No -> Yes

13229_199

In [103]:
# Take the reviewed label on the corrected rows, keep the original elsewhere,
# then drop the helper column.
tmp['label'] = tmp['label'].where(~idxs, tmp['label_reviewed'])
tmp = tmp.drop(columns='label_reviewed')

In [104]:
cols = annotations.columns.tolist()
cols.remove('label')

# Flag the corrected rows while `tmp` is still aligned with `idxs`. The outer
# merge below returns a frame with a fresh index and its own row order, so
# indexing the merged frame with `idxs` would set `round` on the wrong rows.
merge_keys = tmp.columns[:-1].tolist()  # every column except the trailing 'label'
tmp = tmp.assign(corrected_in_review=idxs)

tmp = annotations[cols].merge(tmp, on=merge_keys, how='outer')

# Rows without a match in the flagged frame carry NaN; `.eq(True)` maps them to False.
corrected = tmp.pop('corrected_in_review').eq(True)
# presumably round 4 denotes the expert-review round — confirm
tmp.loc[corrected, 'round'] = 4

In [107]:
# Two checks per (mention, attribute combination): conflicting labels and
# duplicate rows. Only the last expression is rendered as cell output.
grouped_final = tmp.groupby(['mention_id', 'attribute_combination'])
grouped_final.filter(lambda grp: grp['label'].nunique() > 1) # NOTE: should be none !!!
grouped_final.filter(lambda grp: len(grp) > 1) # NOTE: should be none !!!

Unnamed: 0,mention_id,text,mention,prev_texts,next_texts,q_id,q_category,category,round,attribute,attribute_combination,label


In [108]:
# Write the post-review annotations as a tab-separated file without the index.
fp = data_path.joinpath('consolidated_annotations_post_review.tsv')
tmp.to_csv(fp, sep='\t', index=False)