## Setup

In [1]:
import os
import string
import warnings
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import regex
from nltk import download as nltk_download

nltk_download('punkt', quiet=True)
nltk_download('stopwords', quiet=True)
nltk_download('wordnet', quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import StratifiedGroupKFold, train_test_split

In [None]:
# folder holding the group mention categorization annotations (relative to this notebook)
data_path = Path('../../../data') / 'annotations' / 'group_mention_categorization'

## Preparing the dataset

In [3]:
# read the tab-separated table of final annotations
fp = data_path.joinpath('final_annotations.tsv')
annotations = pd.read_table(fp)
# disabled filter: would exclude the attribute combinations listed in `ignore`
# ignore = ['stance: ', 'universal: ']
# annotations.query("attribute_combination not in @ignore", inplace=True)

In [4]:
# sanity check: number of annotation rows without a `q_id`
annotations['q_id'].isna().sum()

0

In [5]:
# Derive the sentence ID by cutting the trailing "-<suffix>" off the mention ID.
# `rsplit` with n=1 splits only at the LAST hyphen, so IDs with differing numbers of
# hyphen-separated parts are handled correctly. (A column-wise `split(expand=True)` pads
# shorter IDs with None, which drops the wrong part and makes `'-'.join` fail.)
# IDs without any hyphen are kept unchanged.
annotations['sentence_id'] = annotations['mention_id'].str.rsplit('-', n=1).str[0]

In [6]:
# collect, for every mention, the attribute combinations that were annotated with label 'Yes'
mention_keys = ['sentence_id', 'mention_id', 'text', 'mention']

def _positive_attributes(grp):
    """Return the sorted, de-duplicated attribute combinations labelled 'Yes' in `grp`."""
    is_yes = grp.label == 'Yes'
    return sorted(set(grp.attribute_combination[is_yes]))

def _mention_span(row):
    """Return the (start, end) offsets of the first literal match of the mention in its text."""
    pattern = regex.escape(row['mention'])
    return regex.search(pattern, row['text']).span()

mentions_df = (
    annotations
    .groupby(mention_keys)[['attribute_combination', 'label']]
    .apply(_positive_attributes)
    .reset_index()
    # the un-named result of `apply` ends up in a column labelled 0
    .rename(columns={0: 'attributes'})
)
mentions_df['span'] = mentions_df.apply(_mention_span, axis=1)

## Get multi-label indicators

### Attribute level

In [7]:
# Work on an explicit copy of the filtered annotations: assigning into the result of a
# row filter triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether
# `annotations` itself is meant to be modified.
df = annotations.query("category!='other'").copy()

# normalize attribute combinations names
# (the first two replacements are literal; only the last one is a regular expression)
df.loc[:, 'attribute_combination'] = (
    df['attribute_combination']
    .str.replace(': ', '__', regex=False)
    .str.replace('non-', 'non', regex=False)
    .str.replace(r'[^a-z_]+', '_', regex=True)
)

features = sorted(set(df['attribute_combination']))
# encode labels as 1/0 (any value other than 'Yes'/'No' becomes NaN)
df.loc[:, 'label'] = df['label'].map({'Yes': 1, 'No': 0})

# pivot labels for attribute_combination to columns using mention_id as id vars
df = df.pivot(index=['sentence_id', 'mention_id'], columns='attribute_combination', values='label').reset_index()
df.columns.name = None

In [8]:
# remove attribute combinations with too few positive examples
MIN_COUNT = 10
cnts = df[features].sum(axis=0)
drop_these = cnts.loc[cnts < MIN_COUNT].index.tolist()
if len(drop_these) > 0:
    print(f"Dropping features with less than {MIN_COUNT} positive examples: {drop_these}")
    # keep only the feature names that survive the filter, and drop the matching columns
    features = [f for f in features if f not in drop_these]
    df = df.drop(columns=drop_these)

### Dimension level

In [9]:
# Dimension-level indicators: a mention is positive for a dimension if at least one of
# the attribute-level indicators whose name starts with that dimension is positive.
for dim in ['economic', 'noneconomic']:
    dim_features = [f for f in features if f.startswith(dim)]
    # Compare against 1 instead of casting to bool: `astype(bool)` turns NaN (a label that
    # is missing for a mention/attribute pair) into True and would wrongly flag the dimension.
    df[dim] = df[dim_features].eq(1).any(axis=1).astype(int)

## Data splitting

### Prevent data leakage

Note that some mentions are (near) duplicates.
Random sampling into train/dev/test would cause data leakage.
Hence, I

1. identify near-duplicate mentions using the token-level Jaccard similarity,
2. group these into connected components by linking mentions whose similarity is > 0.5, and
3. block by component membership during data splitting.

Additionally, I connect mentions in the same sentence to further limit leakage (i.e., make the task harder).

In [10]:
# tokenize mentions (applying lowercasing, punct. removal, stopword removal, and lemmatization)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
_PUNCT_CHARS = set(string.punctuation)

def preprocess_mention(mention):
    """Turn a mention string into a list of normalized tokens.

    The mention is lowercased and tokenized with `word_tokenize`; English stopwords and
    tokens consisting solely of punctuation characters are discarded, and the remaining
    tokens are lemmatized with the WordNet lemmatizer.

    Parameters
    ----------
    mention : str
        Raw mention text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order (duplicates are kept).
    """
    tokens = word_tokenize(mention.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
        # Check character-wise: `token in string.punctuation` is a substring test and
        # therefore misses multi-character punctuation tokens such as '``', "''", '...' or '--'.
        and not all(char in _PUNCT_CHARS for char in token)
    ]

mentions_df['mention_tokens'] = mentions_df['mention'].apply(preprocess_mention)

In [11]:
# detect near duplicates using jaccard similarity
def jaccard_sim(list1, list2):
    """Return the Jaccard similarity of two token collections.

    Both inputs are converted to sets, so duplicates and order are ignored. The result is
    the size of the intersection divided by the size of the union; if both inputs are
    empty (empty union) the similarity is defined as 0.0.
    """
    set1 = set(list1)
    set2 = set(list2)
    union = len(set1 | set2)
    if union == 0:
        return 0.0
    return len(set1 & set2) / union

def find_near_duplicates(df, threshold=0.8):
    """Find all pairs of rows whose token-level Jaccard similarity exceeds `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'mention_tokens' column holding one token collection per row.
    threshold : float, default 0.8
        A pair is reported only if its similarity is strictly greater than this value.

    Returns
    -------
    set of (int, int, float)
        Tuples `(i, j, similarity)` with `i < j`, where `i` and `j` are POSITIONAL row
        indices into `df` (not index labels).
    """
    # Build each row's token set once up front. Looking rows up with `df.iloc[i][...]`
    # inside the quadratic loop constructs a new Series per access and dominates the runtime.
    token_sets = [set(tokens) for tokens in df['mention_tokens']]
    near_duplicates = set()
    n = len(token_sets)
    for i in range(n):
        for j in range(i + 1, n):
            sim = jaccard_sim(token_sets[i], token_sets[j])
            if sim > threshold:
                near_duplicates.add((i, j, sim))
    return near_duplicates

# pairs with a token-level Jaccard similarity above this value count as near duplicates
NEAR_DUPLICATE_THRESHOLD = 0.5
near_duplicates = find_near_duplicates(mentions_df, threshold=NEAR_DUPLICATE_THRESHOLD)

In [12]:
# tabulate the near-duplicate pairs and attach the mention text of both members
near_duplicates_df = pd.DataFrame(list(near_duplicates), columns=['idx1', 'idx2', 'similarity'])

def _mention_at(pos):
    """Return the mention text stored at positional row `pos` of `mentions_df`."""
    return mentions_df.iloc[pos]['mention']

for idx_col, text_col in (('idx1', 'mention_a'), ('idx2', 'mention_b')):
    near_duplicates_df[text_col] = near_duplicates_df[idx_col].apply(_mention_at)

In [13]:
# how many near-duplicate pairs were found (pairs, not near-duplicate components!)
near_duplicates_df.shape[0]

393

In [14]:
# frequency of each similarity score, highest score first
near_duplicates_df['similarity'].value_counts().sort_index(ascending=False)

similarity
1.000000    283
0.888889      1
0.875000      2
0.857143      3
0.833333      5
0.800000      4
0.750000     14
0.714286      8
0.666667     43
0.625000      9
0.600000      3
0.571429     10
0.555556      6
0.545455      2
Name: count, dtype: int64

In [15]:
# inspect pairs whose token sets are similar but not identical, least similar first
below_identity = near_duplicates_df['similarity'] < 1.0
near_duplicates_df.loc[below_identity].sort_values(by='similarity', ascending=True)

Unnamed: 0,idx1,idx2,similarity,mention_a,mention_b
154,274,381,0.545455,"lesbian, gay, bisexual, trans*, inter* and que...","Lesbian, Gay, Bisexual, Trans, Intersex, Queer..."
120,376,381,0.545455,"Lesbian, gay, bisexual, transgender, intersex ...","Lesbian, Gay, Bisexual, Trans, Intersex, Queer..."
91,274,289,0.555556,"lesbian, gay, bisexual, trans*, inter* and que...","lesbian, gay, bisexual, trans and intersex people"
155,289,376,0.555556,"lesbian, gay, bisexual, trans and intersex people","Lesbian, gay, bisexual, transgender, intersex ..."
359,289,380,0.555556,"lesbian, gay, bisexual, trans and intersex people","Lesbian, Gay Bisexual, Trans, Intersex, Queer ..."
...,...,...,...,...,...
281,57,59,0.857143,"people based on race, religion and ethnic origin","people based on race, gender religion and ethn..."
170,57,61,0.857143,"people based on race, religion and ethnic origin","people based on race, gender, religion or ethn..."
163,274,275,0.875000,"lesbian, gay, bisexual, trans*, inter* and que...","lesbian, gay, bisexual, trans*, inter* and que..."
175,274,294,0.875000,"lesbian, gay, bisexual, trans*, inter* and que...","lesbian, gay, bisexual, trans, inter and queer..."


In [16]:
# create a graph of near-duplicate mentions
# (nodes are the positional row indices stored in `idx1`/`idx2`)
G = nx.Graph()

# add one edge per near-duplicate pair; reading the two index columns directly avoids
# `iterrows`, which builds a mixed-dtype Series for every row
G.add_edges_from(zip(near_duplicates_df['idx1'], near_duplicates_df['idx2']))

In [17]:
# clusters of mentions that are linked through near-duplicate edges
connected_components = [comp for comp in nx.connected_components(G)]
len(connected_components)

58

In [18]:
# how many near-duplicate components there are of each size (largest first)
component_sizes = pd.Series([len(comp) for comp in connected_components])
component_sizes.value_counts().sort_index(ascending=False)

16     1
10     1
9      2
8      2
7      2
6      2
5      4
4      3
3     13
2     28
Name: count, dtype: int64

In [19]:
# show mentions in connected components larger than 3
for i, comp in enumerate(connected_components):
    if len(comp) <= 3:
        continue
    mentions = mentions_df.loc[list(comp)].mention.tolist()
    print(f"Component {i} (N={len(comp)}):")
    print("-"*10)
    # Print the unique mentions in sorted order: the iteration order of a set of strings
    # changes between interpreter runs (hash randomization), so the unsorted listing
    # was not reproducible.
    print(*sorted(set(mentions)), sep='\n')
    print()

Component 0 (N=8):
----------
our society
The society
society

Component 1 (N=16):
----------
lesbian, gay, bisexual, transgender and intersex people
lesbian, gay, bisexual, trans and intersex people
lesbian, gay, bisexual, trans and intersex persons
Lesbian, Gay, Bisexual, and Transgender Persons
lesbian, gay, bisexual, transgender people
Lesbian, Gay Bisexual, Trans, Intersex, Queer and Asexual (LGBTIQA+) and
lesbian, gay, bisexual, trans, inter and queer people
gay, lesbian, bisexual, transgender and intersex persons
Lesbian, Gay, Bisexual and Transgender people
gay, lesbian, bisexual and transgender people
Lesbian, Gay, Bisexual, Transgender or Intersex (LGBTI) people
lesbian, gay, bisexual, trans*, inter* and queer (LSBTIQ*) people
Lesbian, gay, bisexual, transgender, intersex and queer (LGBTIQ) people
Lesbian, Gay, Bisexual, Trans, Intersex, Queer and Asexual (LGBTIQA+) people
gay, lesbian, bisexual and transgender
lesbian, gay, bisexual, trans*, inter* and queer people

Componen

In [20]:
# connect every pair of mentions that occur in the same sentence
for _, sentence_mentions in mentions_df.groupby('sentence_id'):
    members = sentence_mentions.index.tolist()
    for pos, left in enumerate(members):
        # pair each mention with all mentions that follow it in the group
        for right in members[pos + 1:]:
            G.add_edge(left, right)

In [21]:
# recompute the components now that same-sentence edges are part of the graph
connected_components = [comp for comp in nx.connected_components(G)]
n_comps = len(connected_components)
n_comps

63

In [22]:
# register all mentions that are not yet part of the graph as isolated nodes
# NOTE: without this, mentions that have no near-duplicate and do not share a sentence
# with another mention would not receive a component ID of their own
missing_nodes = [idx for idx in mentions_df.index if idx not in G]
G.add_nodes_from(missing_nodes)

In [23]:
# final components: near-duplicate links, same-sentence links, and isolated mentions
connected_components = [comp for comp in nx.connected_components(G)]
n_comps = len(connected_components)
n_comps

430

In [24]:
# long table assigning each graph node (row index of `mentions_df`) its component ID
component_rows = [
    (component_id, mention_idx)
    for component_id, members in enumerate(connected_components)
    for mention_idx in members
]
mention_components_df = (
    pd.DataFrame(component_rows, columns=['component_id', 'mention_idx'])
    .set_index('mention_idx')
    .rename_axis(None)
    .sort_index()
)
mention_components_df.head()

Unnamed: 0,component_id
0,19
1,63
2,64
3,65
4,66


In [25]:
# attach each mention's component ID by aligning the two frames on their row index
mentions_df = mentions_df.join(other=mention_components_df, how='left')

In [26]:
# every mention must have received a component ID
assert mentions_df['component_id'].notna().all()

### add metadata

In [27]:
# put the mention-level metadata in front of the label indicators; the outer join with an
# indicator column shows whether any row is present in only one of the two frames
metacols = ["sentence_id", "mention_id", "text", "mention", "span", "component_id"]
mention_meta = mentions_df[metacols]
df = mention_meta.merge(df, how="outer", on=["sentence_id", "mention_id"], indicator=True)
df['_merge'].value_counts()

_merge
both          600
left_only       0
right_only      0
Name: count, dtype: int64

In [28]:
# the merge indicator was only needed for the check above
df = df.drop(columns='_merge')

### create folds

We want to block by component membership (using grouping) while keeping the label distribution as similar as possible across splits.
So first, I create a "signature" of attribute-level labels that can be used for stratification.

In [29]:
def _label_signature(row):
    """Concatenate the string forms of a row's attribute-level labels into one string."""
    return ''.join(map(str, row.tolist()))

df.loc[:, 'signature'] = df.loc[:, features].apply(_label_signature, axis=1)

# replace signatures that occur fewer than 10 times with the placeholder '_'
signature_counts = df['signature'].value_counts()
frequent_signatures = signature_counts[signature_counts >= 10].index
df['signature'] = df['signature'].where(df['signature'].isin(frequent_signatures), '_')

In [30]:
# renumber the rows 0..n-1 and discard the previous index
df = df.reset_index(drop=True)

In [31]:
n_splits = 5 # <=> test_size = 0.20
dev_size = 0.15
SEED = 42  # shared by the outer and inner splitter so that the folds are reproducible

n_ = len(df)
n_dev = int(n_ * dev_size)  # targeted number of validation examples per fold

with warnings.catch_warnings():
    # silence UserWarnings raised while splitting
    # NOTE(review): presumably these are sklearn's warnings about sparsely populated strata — confirm
    warnings.filterwarnings("ignore", category=UserWarning)
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    idxs = {}
    for fold, (tmp, tst) in enumerate(splitter.split(df.index, y=df['signature'], groups=df['component_id'])):
        # carve the validation set out of the non-test rows: take the first fold of a second
        # grouped split whose number of folds makes one fold hold roughly `n_dev` rows
        sub_splitter = StratifiedGroupKFold(n_splits=int(len(tmp)/n_dev), shuffle=True, random_state=SEED)
        trn, val = next(sub_splitter.split(tmp, y=df.iloc[tmp]['signature'], groups=df.iloc[tmp]['component_id']))
        # `trn`/`val` index into `tmp`; map them back to row positions of `df`
        trn_idx, val_idx = tmp[trn], tmp[val]
        # leakage guard: no component may contribute rows to more than one split
        trn_comps, val_comps, tst_comps = (
            set(df['component_id'].iloc[rows]) for rows in (trn_idx, val_idx, tst)
        )
        assert trn_comps.isdisjoint(val_comps), f"fold {fold}: train/val share components"
        assert trn_comps.isdisjoint(tst_comps), f"fold {fold}: train/test share components"
        assert val_comps.isdisjoint(tst_comps), f"fold {fold}: val/test share components"
        print(len(trn), len(val), len(tst))
        idxs[fold] = (trn_idx, val_idx, tst)

388 81 131
391 88 121
371 97 132
394 98 108
412 80 108


In [32]:
# helper columns were only needed to build the folds
df = df.drop(columns=['component_id', 'signature'])

### write folds to disk

In [None]:
# destination folder for the fold files
splits_path = data_path.joinpath("splits", "error_detection")

In [34]:
# write one folder per fold (fold01, fold02, ...) containing the pickled train/val/test frames
for fold, (trn, val, tst) in idxs.items():
    dest = splits_path / f"fold{fold+1:02d}"
    dest.mkdir(parents=True, exist_ok=True)
    for filename, rows in (("train.pkl", trn), ("val.pkl", val), ("test.pkl", tst)):
        df.iloc[rows].to_pickle(dest / filename)