In [1]:
import os
import json

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from tqdm.autonotebook import tqdm, trange


## Load data

In [2]:
data_path = os.path.join('..', '..', '..', 'data')
annotations_path = os.path.join(data_path, 'annotations', 'group_mention_categorization')

### Group mentions with attribute annotations

In [3]:
# TODO: update file after integrating reviewed cases
fp = os.path.join(annotations_path, 'social-group-mention-categorization-llm-review', 'consolidated_annotations.tsv')
labeled_mentions = pd.read_csv(fp, sep='\t')

In [6]:
labeled_mentions['attribute'] = labeled_mentions.q_id.str.removesuffix('_attributes')

In [7]:
attributes = ['economic', 'non-economic']
labeled_mentions = labeled_mentions[labeled_mentions.attribute.isin(attributes)]

In [8]:
# inspect distribution of econ attributes
tmp = labeled_mentions[labeled_mentions.attribute==attributes[0]].value_counts(['category', 'label']).reset_index().pivot(index='category', columns='label', values='count')
tmp.fillna(0, inplace=True)
tmp = tmp.astype(int)
tmp['n'] = tmp[['No', 'Yes']].sum(axis=1)
tmp['prevalence'] = tmp['Yes'] / tmp['n']
tmp.round(3)

label,No,Yes,n,prevalence
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
class membership,278,34,312,0.109
ecology of group,297,15,312,0.048
education level,300,12,312,0.038
employment status,273,39,312,0.125
income/wealth/economic status,265,47,312,0.151
occupation/profession,232,80,312,0.256
other,310,2,312,0.006


In [9]:
# inspect distribution of non-econ attributes
tmp = labeled_mentions[labeled_mentions.attribute==attributes[1]].value_counts(['category', 'label']).reset_index().pivot(index='category', columns='label', values='count')
tmp.fillna(0, inplace=True)
tmp['n'] = tmp[['No', 'Yes']].sum(axis=1)
tmp['prevalence'] = tmp['Yes'] / tmp['n']
tmp.round(3)

label,No,Yes,n,prevalence
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,275,37,312,0.119
crime,299,13,312,0.042
ethnicity,303,9,312,0.029
family,299,13,312,0.042
gender/sexuality,306,6,312,0.019
health,296,16,312,0.051
nationality,280,32,312,0.103
other,309,3,312,0.01
place/location,300,12,312,0.038
religion,305,7,312,0.022


**NOTE:** some categories like 'economic > education level' or 'non-economic > religion' are severely underrepresented and hence likely unrepresentative of the conceptual core of the given category. This creates problems for (few-shot) learning. 

&rarr; need to sample more examples

In [10]:
labeled_mentions.query("category=='religion' and label=='Yes'").mention.drop_duplicates().values

array(['Foreign political agitators acting under the cover of Islam',
       'individuals who are discriminated against by companies or government agencies',
       'a multicultural society', 'A multicultural society',
       'white national, ethnic, religious and regional communities',
       'many of us', 'groups'], dtype=object)

### Social group mentions without attribute annotations

In [11]:
fp = os.path.join(data_path, 'labeled', 'manifesto_sentences_predicted_group_mentions_spans.tsv')
unlabeled_mentions = pd.read_csv(fp, sep='\t')

In [12]:
# subset to social group mentions
unlabeled_mentions = unlabeled_mentions.loc[unlabeled_mentions.label=='social group', ['sentence_id', 'sentence_text', 'span_nr', 'text']]

In [13]:
# discard already labeled mentions
unlabeled_mentions['mention_id'] = unlabeled_mentions['sentence_id'] + '-' + unlabeled_mentions['span_nr'].astype(str)
unlabeled_mentions = unlabeled_mentions[~unlabeled_mentions.mention_id.isin(labeled_mentions['mention_id'].tolist())]

In [14]:
# remove duplicates
unlabeled_mentions = unlabeled_mentions.sample(frac=1.0, random_state=42)
unlabeled_mentions = unlabeled_mentions.drop_duplicates('text')

## Sample additional unlabeled mentions for attribute annotation 

apply two criteria:

- similarity to attribute category definition
- try to balance the label distribution by over-sampling currently under-represented attribute categories

### Determine attribute category-specific "quotas"

In [15]:
ANNOTATION_BUDGET = 200

In [16]:
# inspect distribution of non-econ attributes
tmp = labeled_mentions.value_counts(['category', 'label']).reset_index().pivot(index='category', columns='label', values='count')
tmp.fillna(0, inplace=True)
tmp = tmp.astype(int)
tmp['n'] = tmp[['No', 'Yes']].sum(axis=1)

weights = tmp['n'] / tmp['Yes']
weights = weights[weights.index!='other']

quotas = np.round(weights / weights.sum() * ANNOTATION_BUDGET).astype(int)
quotas_dict = dict(zip(weights.index, quotas))
quotas_dict


{'age': 5,
 'class membership': 6,
 'crime': 15,
 'ecology of group': 13,
 'education level': 16,
 'employment status': 5,
 'ethnicity': 21,
 'family': 15,
 'gender/sexuality': 32,
 'health': 12,
 'income/wealth/economic status': 4,
 'nationality': 6,
 'occupation/profession': 2,
 'place/location': 16,
 'religion': 28,
 'shared values/mentalities': 3}

### embed mentions for duplicate removal and query-similarity based sampling

In [17]:
model_id = 'paraphrase-mpnet-base-v2'
embedder = SentenceTransformer(model_id, device='mps')

In [18]:
src_mentions = labeled_mentions.mention.drop_duplicates().tolist()
tgt_mentions = unlabeled_mentions.text.to_list()

embeddings = embedder.encode(src_mentions + tgt_mentions, batch_size=64, normalize_embeddings=True, show_progress_bar=True)

Batches:   0%|          | 0/1070 [00:00<?, ?it/s]

In [19]:
src_embeddings = embeddings[:len(src_mentions)]
tgt_embeddings = embeddings[len(src_mentions):]

### remove mentions that are very similar to already annotated ones

In [20]:
sims = cosine_similarity(tgt_embeddings, src_embeddings)

In [21]:
idxs = sims.max(axis=1)<0.85

unlabeled_mentions = unlabeled_mentions.iloc[idxs]
tgt_embeddings = tgt_embeddings[idxs]

In [22]:
len(unlabeled_mentions)

66788

### compute mentions' similarities to attribute definitions

In [23]:
import json5 as json

fp = os.path.join(annotations_path, 'attribute_definitions.json')
with open(fp) as f:
    attribute_defs = json.load(f)

In [24]:
adefs_embeddings = embedder.encode(list(attribute_defs.values()))

### score unlabeled mentions' similarity to attribute definitions

In [25]:
scored = cosine_similarity(tgt_embeddings, adefs_embeddings)
scored.shape # shape (# mention, # attributes)

(66788, 16)

### select most likely mentions for each attribute dimension

In [26]:
idxs = {}

quotas = [quotas_dict[a] for a in attribute_defs.keys()]
for i, (a, n) in enumerate(zip(attribute_defs.keys(), quotas)):
    these = np.argsort(scored[:,i])[::-1][:n]
    idxs[a] = unlabeled_mentions.iloc[these]

In [27]:
sampled = pd.concat(idxs).reset_index(level=0, names=['attribute'])
len(sampled)

199

In [28]:
for a in attribute_defs.keys():
    print(a, sampled.loc[sampled.attribute==a, 'text'].iloc[:5].to_list(), sep = ':\t')

class membership:	['special members of the (upper) middle class', 'social class', 'people from working-class backgrounds', 'classes of society', 'working-class people']
employment status:	['persons with different status (employed, self-employed', '-employed people', '-employed persons', 'employed / self-employed', 'unemployed persons of this category']
education level:	['those in education and training', 'graduates from specific forms of education', 'Higher vocational education students', 'people with a university degree or comparable qualifications', 'people either of origin in a structured education']
income/wealth/economic status:	['categories of people', '-income people', 'people with lower or higher incomes', 'people with low and middle incomes']
occupation/profession:	['such persons in the public sector', 'People who work in healthcare, nursing, education, social and educational professions']
ecology of group:	['social and environmental-oriented professionals', 'people at the cen

In [29]:
# NOTE: because some mentions are in the most-similar cases of multiple dimensions (as expected due to multilabel logic), fewer unique mentions than annotation budget
print(sampled.groupby('text').ngroups)
# sampled.groupby('text').size().sort_values(ascending=False)

179


## Write to disk

### aggregate at mention level

In [30]:
cols = ['sentence_id', 'mention_id', 'sentence_text','text']
out = sampled.groupby(cols).agg({'attribute': list}).reset_index()
out = out.rename(columns={'text': 'mention', 'sentence_text': 'text', 'attribute': 'attribute_candidates'})

In [77]:
# tmp = sampled.groupby(unlabeled_mentions.columns.tolist()).agg({'attribute': (list, 'count')}).reset_index().sort_values('mention_id')
# tmp[tmp.iloc[:, -1] == 3]

### add prev and following sentences

In [31]:
fp = os.path.join(data_path, 'manifestos', 'all_manifesto_sentences_translated.tsv')
sentences_df = pd.read_csv(fp, sep='\t')

In [32]:
sentences_df.rename(columns={'text_mt_m2m_100_1.2b': 'text_en'}, inplace=True)
sentences_df['manifesto_id'] = sentences_df.sentence_id.str.split('-', expand=True)[0]

# for each line, within manifesto, get the two texts occuring before and after the line in separate columns 
sentences_df['prev_texts'] = sentences_df.groupby('manifesto_id')['text_en'].shift([2, 1], fill_value='').values.tolist()
sentences_df['prev_texts'] = sentences_df['prev_texts'].apply(lambda x: [t for t in x if t != ''])
sentences_df['next_texts'] = sentences_df.groupby('manifesto_id')['text_en'].shift([-1, -2], fill_value='').values.tolist()
sentences_df['next_texts'] = sentences_df['next_texts'].apply(lambda x: [t for t in x if t != ''])

In [33]:
# merge with sampled sentenes
out = pd.merge(out, sentences_df[['sentence_id', 'prev_texts', 'next_texts']], on='sentence_id', how='left')

### write to disk

In [35]:
dest = os.path.join(annotations_path, 'social-group-mention-categorization-round03')
os.makedirs(dest, exist_ok=True)

fp = os.path.join(dest, 'sample.tsv') 
if not os.path.exists(fp):
    out.to_csv(fp, sep='\t', index=False)