## TODOS

- [ ] replicate for both group types
- [ ] put stuff in utils
- [ ] also try k-means and select the one with better eval scores
- [x] check which embedding model used for relevance ranking => `intfloat/multilingual-e5-large-instruct`

In [1]:
import os


from dataclasses import dataclass
from typing import List, Literal, Union

import torch.nn.functional as F

import torch
from transformers import AutoTokenizer, AutoModel
import math

from tqdm.auto import tqdm

import gc
import torch



import numpy as np

# for topic modeling
## dimensionality reduction
from umap import UMAP 
## clustering
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN 
## bag-of-words topic representations
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

from bertopic import BERTopic # <== topic modeling
from bertopic.representation import KeyBERTInspired


from datetime import datetime
# helpers for timestamped console logging
def ts():
    """Return the current local time formatted as ``YYYY-MM-DD HH-MM-SS``."""
    now = datetime.now()
    return now.strftime("%Y-%m-%d %H-%M-%S")


def log(msg):
    """Print ``msg`` prefixed with the current timestamp in square brackets."""
    print(f'[{ts()}] {msg}')


from typing import Union, List, Literal, Tuple, Dict
from numpy.typing import NDArray


import pandas as pd

from utils.io import read_jsonlines


In [None]:
# base directory: two levels above the current working directory
base_path = os.path.join(os.pardir, os.pardir)
# folder containing the annotation job directories
data_path = os.path.join(base_path, *('data', 'annotations'))
# seed constant
SEED = 42

In [2]:
# Annotation job folders. Sorted because os.listdir() returns entries in arbitrary
# order, which would otherwise make the row order of `data` differ between runs.
jobs = sorted(nm for nm in os.listdir(data_path) if nm.startswith('group-mention-annotation-batch-'))

# one reviewed-annotations file per job folder
fps = [os.path.join(data_path, job, 'review_annotations.jsonl') for job in jobs]


def parse_entry(x):
    """Keep only the 'id', 'text' and 'label' fields of an annotation record."""
    return {k: x[k] for k in ['id', 'text', 'label']}


# all records from all job files, in job order
data = [parse_entry(line) for fp in fps for line in read_jsonlines(fp)]

def parse_annotation(text, annotation, keep_text: bool):
    """
    Turn one span annotation into a flat record.

    Parameters:
        text: The annotated text.
        annotation: Indexable whose first three items are the start offset,
            the end offset and the type label of the span.
        keep_text (bool): If true, the full ``text`` is added under key 'text'.

    Returns:
        dict with keys 'start', 'end', 'type', 'mention' (the slice
        ``text[start:end]``) and, optionally, 'text'.
    """
    start, end, kind = annotation[0], annotation[1], annotation[2]
    record = dict(start=start, end=end, type=kind, mention=text[start:end])
    if keep_text:
        record['text'] = text
    return record
    

def unnest_sequence_annotations(data, **kwargs):
    """
    Flatten per-text annotation lists into one record per annotated span.

    Parameters:
        data: Iterable of dicts with keys 'id', 'text' and 'label', where
            'label' is a sequence of span annotations.
        **kwargs: Forwarded to ``parse_annotation`` (e.g. ``keep_text``).

    Returns:
        list of dicts; each holds 'text_id', the 1-based 'mention_nr' of the
        span within its text, and the fields produced by ``parse_annotation``.
    """
    rows = []
    for line in data:
        for nr, lab in enumerate(line['label'], start=1):
            header = {'text_id': line['id'], 'mention_nr': nr}
            rows.append(header | parse_annotation(line['text'], lab, **kwargs))
    return rows

In [3]:
# one row per annotated mention; keep_text=True also stores the full source text in column `text`
df = pd.DataFrame(unnest_sequence_annotations(data, keep_text=True))

Unnamed: 0,text_id,mention_nr,start,end,type,mention,text
0,11110_198809-390636,1,5,12,social group,parents,Give parents the right to become municipal day...
1,11110_199109-390960,1,44,51,social group,society,"Therefore, we oppose the despolitisation of so..."
2,11110_199109-390960,2,55,81,organizational group,multinational corporations,"Therefore, we oppose the despolitisation of so..."
3,11110_199109-390960,3,200,238,social group,party leaders or officials in Brussels,"Therefore, we oppose the despolitisation of so..."
4,11110_199109-390940,1,62,113,social group,a society for survival in prosperity and well-...,It is only within the ecological framework tha...


In [4]:
# restrict the analysis to mentions whose annotated type is 'social group'
examples = df[df.type == 'social group']

In [15]:
def clean_memory(device: Union[str, torch.device]):
    """
    Run garbage collection and release cached accelerator memory.

    Parameters:
        device (Union[str, torch.device]): Device whose cache should be emptied.
            Only the device *type* is considered, so indexed devices such as
            'cuda:0' are treated like 'cuda'. For any other type (e.g. 'cpu')
            only garbage collection is performed.
    """
    gc.collect()
    # str(torch.device('cuda', 0)) is 'cuda:0' -> compare on the part before the index
    device_type = str(device).split(':', 1)[0]
    if device_type == 'cuda':
        torch.cuda.empty_cache()
    elif device_type == 'mps':
        torch.mps.empty_cache()

@dataclass
class E5SentenceEmbedder:
    """
    Sentence embedder based on a Hugging Face E5 checkpoint with masked mean pooling.

    The tokenizer and model are loaded in ``__post_init__`` and the model is moved
    to ``device``.
    """
    # Hugging Face model identifier passed to AutoTokenizer/AutoModel
    model_name: str = 'intfloat/multilingual-e5-base'
    # target device; the default prefers CUDA, then MPS, then CPU (evaluated once, at class definition)
    device: Literal['cuda', 'mps', 'cpu'] = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    def __post_init__(self):
        """Load tokenizer and model and move the model to ``self.device``."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.to(self.device)

    @staticmethod
    def _average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Mean-pool token states over dim 1, ignoring positions where the mask is 0.

        Source: https://huggingface.co/intfloat/multilingual-e5-base
        """
        # zero out masked positions so they do not contribute to the sum
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    
    def encode(self, texts: List[str], batch_size: int=16, normalize: bool=True) -> torch.Tensor:
        """
        Embed texts in batches and return one half-precision CPU vector per text.

        Source: based on https://huggingface.co/intfloat/multilingual-e5-base

        Parameters:
            texts (List[str]): Texts to embed. "query: " is prepended to every text
                that does not already start with it (case-insensitive check).
            batch_size (int): Number of texts per forward pass.
            normalize (bool): If true, L2-normalize each embedding.

        Returns:
            torch.Tensor: float16 CPU tensor with one row per input text. For empty
                input an empty tensor with zero rows is returned.
        """
        # The model card asks for a "query: " or "passage: " prefix on every input
        # (also for non-English texts) and recommends "query: " for tasks other than retrieval.
        texts = ['query: ' + text if not text.lower().startswith('query: ') else text for text in texts]

        n_ = len(texts)
        if n_ == 0:
            # torch.cat() raises on an empty list -> return an empty matrix instead
            hidden_size = getattr(self.model.config, 'hidden_size', 0)
            return torch.empty((0, hidden_size), dtype=torch.float16)

        embeddings = []
        for i in tqdm(range(0, n_, batch_size), total=math.ceil(n_/batch_size)):
            # slicing clamps at the end of the list, so the last batch may be shorter
            batch_dict = self.tokenizer(texts[i:i+batch_size], max_length=512, padding=True, truncation=True, return_tensors='pt')
            batch_dict = batch_dict.to(self.model.device)
            with torch.no_grad():
                outputs = self.model(**batch_dict)
            pooled = self._average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            # keep only a half-precision CPU copy of each batch
            embeddings.append(pooled.half().cpu())
            # drop the device-side tensors before emptying the cache
            del outputs, pooled, batch_dict
            clean_memory(self.device)
        embeddings = torch.cat(embeddings, dim=0)
        
        # normalize embeddings
        if normalize:
            embeddings = F.normalize(embeddings, p=2, dim=1)

        return embeddings


In [16]:
def tune_bertopic_model(docs: List[str], embeddings: NDArray, min_cluster_size: int, seed: int=42, embedding_model=None):
    """
    Fit a BERTopic model (UMAP -> HDBSCAN -> c-TF-IDF -> KeyBERTInspired) on pre-computed embeddings.

    Parameters:
        docs (List[str]): The documents to cluster.
        embeddings (NDArray): Pre-computed document embeddings, passed to ``fit_transform``.
        min_cluster_size (int): HDBSCAN's ``min_cluster_size`` (the parameter being tuned).
        seed (int): Seed for UMAP's ``random_state`` and for ``np.random.seed``. Defaults to 42.
        embedding_model: Embedding model handed to BERTopic. If None, a new
            ``E5SentenceEmbedder`` on its auto-detected default device is created.
            Pass an existing instance to avoid re-loading the model on every call.

    Returns:
        BERTopic: The fitted topic model.
    """
    if embedding_model is None:
        # use the embedder's default device instead of hard-coding 'mps',
        # which fails on machines without an MPS backend
        embedding_model = E5SentenceEmbedder()
    # NOTE(review): BERTopic only recognises certain embedding backends; confirm that this
    # custom embedder is actually used by KeyBERTInspired and not silently replaced by
    # BERTopic's default model.

    # dimensionality reduction
    umap_model = UMAP(
        n_neighbors=30, 
        n_components=50,
        min_dist=0.0, 
        metric='cosine', 
        random_state=seed
    )

    # clustering
    cluster_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean', # <== default
        cluster_selection_method='eom', # <== default
        prediction_data=True # <== required (default)
    )

    # word vectorization
    vectorizer_model = CountVectorizer(
        max_df=0.85, 
        min_df=2, 
        tokenizer=lambda x: word_tokenize(x, language='english'),
    )

    # topic BoW representation
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    topic_model = BERTopic(

        # components
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=cluster_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=KeyBERTInspired(top_n_words=10),
        
        # parameters
        nr_topics=None, # <== I've removed this to avoid merging topics after estimation
        calculate_probabilities=False, # WARNING: this would slow down BERTopic significantly at large amounts of data (>100_000 documents)
        top_n_words=10,

        verbose=False
    )

    np.random.seed(seed) # <== set seed for reproducibility
    # assignments are available afterwards as topic_model.topics_
    topic_model.fit_transform(docs, embeddings=embeddings)

    return topic_model

In [13]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

def compute_coherece(
        model: BERTopic, 
        docs: Union[pd.Series, List[str]], 
        coherence_metric: Literal['u_mass', 'c_v', 'c_uci', 'c_npmi']='c_v',
        exclude_outlier_topic: bool=True,
    ) -> Tuple[Dict[str, Union[float, Dict[int, float]]], CoherenceModel]:
    """
    Compute coherence scores for a BERTopic model.

    Parameters:
        model (BERTopic): The BERTopic model.
        docs (Union[pd.Series, List[str]]): The documents to compute coherence on.
            Must be a pandas Series or a list of strings, aligned with ``model.topics_``.
        coherence_metric (Literal['u_mass', 'c_v', 'c_uci', 'c_npmi'], optional): The coherence metric to use. 
            Allowed values are 'u_mass', 'c_v', 'c_uci', 'c_npmi' (see https://radimrehurek.com/gensim/models/coherencemodel.html)
            Defaults to 'c_v'.
        exclude_outlier_topic (bool, optional): If True, topic -1 and the documents
            assigned to it are left out of the computation. Defaults to True.

    Returns:
        Tuple[Dict[str, Union[float, Dict[int, float]]], CoherenceModel]: 
           A tuple containing the coherence scores and the coherence model.
            - scores (Dict[str, Union[float, Dict[int, float]]]): The coherence scores.
                - overall (float): The overall coherence score.
                - by_topic (Dict[int, float]): The coherence score for each topic,
                    keyed by the topic's ID in the BERTopic model.
            - coherence_model (CoherenceModel): The coherence model object.

    """
    # remember the IDs of the evaluated topics so the per-topic scores can be keyed by
    # topic ID (a plain enumerate() is off by one when the outlier topic -1 is kept)
    topic_ids = [
        tid for tid in model.topic_representations_ # iterate over topics
        if (tid > -1 if exclude_outlier_topic else True)
    ]
    topic_words = [
        [word for word, _ in model.topic_representations_[tid]] # top-n words in topic
        for tid in topic_ids
    ]

    topics = model.topics_
    if exclude_outlier_topic:
        docs = [doc for doc, tid in zip(docs, topics) if tid > -1]
        topics = [tid for tid in topics if tid > -1]

    # extract vectorizer and analyzer from BERTopic
    vectorizer = model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    if isinstance(docs, list):
        docs = np.array(docs)
    cleaned_docs = model._preprocess_text(docs)
    toks = [analyzer(doc) for doc in cleaned_docs]

    # drop documents without any tokens (and their topic assignments)
    topics = [tid for tid, toks_ in zip(topics, toks) if len(toks_) > 0]
    toks = [t for t in toks if len(t) > 0]

    # concatenate the token lists of all documents in a topic ('sum' on lists concatenates)
    documents = pd.DataFrame({"Document": toks, "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': 'sum'})

    # extract features for Topic Coherence evaluation
    tokens = documents_per_topic.Document.to_list()
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # compile coherence model
    coherence_model = CoherenceModel(
        topics=topic_words, 
        texts=tokens, 
        corpus=corpus,
        dictionary=dictionary, 
        coherence=coherence_metric
    )

    # evaluate coherence; per-topic scores come back in the order of `topic_words`
    scores = {
        'overall': coherence_model.get_coherence(),
        'by_topic': dict(zip(topic_ids, coherence_model.get_coherence_per_topic()))
    }
    
    return scores, coherence_model


from sklearn.metrics import (
    silhouette_score, # <== compute overall, corpus-level score
    silhouette_samples # <== compute sample/document-level scores
)

def compute_silhouette_scores(model: BERTopic, seed):
    """
    Compute silhouette scores of a fitted BERTopic model in its UMAP embedding space.

    Parameters:
        model (BERTopic): Fitted model; ``model.umap_model.embedding_`` and
            ``model.topics_`` are used as features and cluster labels. All labels in
            ``model.topics_`` are used as-is (no topic is filtered out here).
        seed: Kept for backward compatibility. It was only forwarded as
            ``random_state`` for subsampling, and no subsampling is applied.

    Returns:
        dict with
            - 'overall': mean silhouette score over all documents
            - 'by_topic': DataFrame with columns 'topic', 'mean' and 'std' of the
                document-level silhouette scores per topic
    """
    labels = model.topics_
    # Compute the document-level scores once. Without subsampling, sklearn's
    # silhouette_score is the mean of silhouette_samples, so this avoids running
    # the pairwise-distance computation a second time.
    sample_scores = silhouette_samples(X=model.umap_model.embedding_, labels=labels)
    overall = np.mean(sample_scores)

    by_topic = pd.DataFrame({'topic': labels, 'silhouette_score': sample_scores})
    by_topic = by_topic.groupby('topic').agg(['mean', 'std'])
    # remove stacked columns
    by_topic.columns = by_topic.columns.droplevel(0)
    by_topic.reset_index(inplace=True)
    out = {
        'overall': overall,
        'by_topic': by_topic
    }
    return out


In [19]:
import os
import torch
# when the MPS backend is available, switch off tokenizers parallelism via its environment variable
if torch.backends.mps.is_available():
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [47]:
# Embed every mention once; the same embeddings are reused for all candidate models below.
mentions = examples.mention.to_list()
# use the embedder's auto-detected default device instead of hard-coding 'mps'
embedder = E5SentenceEmbedder()
embeddings = embedder.encode(mentions, batch_size=32).numpy()

# candidate values for HDBSCAN's min_cluster_size
min_cluster_sizes = [5, 8, 10, 15, 30, 50, 75, 100]
results = {}
for mcs in min_cluster_sizes:
    log(f'fitting model with min. cluster size = {mcs}')
    topic_model = tune_bertopic_model(docs=mentions, embeddings=embeddings, min_cluster_size=mcs, seed=SEED)
    coherence, _ = compute_coherece(topic_model, mentions)
    results[mcs] = {
        'coherence': coherence,
        'silhouette': compute_silhouette_scores(topic_model, seed=SEED),
        # number of distinct labels in topics_ (this count includes the outlier label -1 if present)
        'n_clusters': len(set(topic_model.topics_)),
        # share of documents assigned to topic -1
        'share_outlier_docs': topic_model.get_topic_freq(-1)/len(mentions),
    }

[2024-09-02 19-33-28] fitting model with min. cluster size = 5
[2024-09-02 19-33-51] fitting model with min. cluster size = 8
[2024-09-02 19-34-14] fitting model with min. cluster size = 10
[2024-09-02 19-34-41] fitting model with min. cluster size = 15
[2024-09-02 19-35-10] fitting model with min. cluster size = 30
[2024-09-02 19-35-38] fitting model with min. cluster size = 50
[2024-09-02 19-36-06] fitting model with min. cluster size = 75
[2024-09-02 19-36-32] fitting model with min. cluster size = 100


In [75]:
def parse_results(res):
    """
    Flatten one entry of the tuning results into a dict of scalar summary metrics.

    Parameters:
        res: dict with keys 'coherence' (dict with 'overall' and 'by_topic'),
            'silhouette' (dict with 'overall' and a 'by_topic' DataFrame that has
            columns 'topic' and 'mean'), 'n_clusters' and 'share_outlier_docs'.

    Returns:
        dict with overall coherence/silhouette, mean and std of the per-topic
        coherence scores, mean and std of the per-topic silhouette means
        (topic -1 excluded), the number of clusters and the outlier share.
    """
    topic_coherences = np.array([*res['coherence']['by_topic'].values()])

    silhouette_by_topic = res['silhouette']['by_topic']
    # per-topic mean silhouette, leaving out topic -1
    topic_means = silhouette_by_topic.loc[silhouette_by_topic['topic'] != -1, 'mean']

    return {
        'coherence_overall': res['coherence']['overall'],
        'coherence_topic_mean': topic_coherences.mean(),
        'coherence_topic_std': topic_coherences.std(),
        'silhouette_overall': res['silhouette']['overall'],
        'silhouette_topic_mean': topic_means.mean(),
        'silhouette_topic_std': topic_means.std(),
        'n_clusters': res['n_clusters'],
        'share_outlier_docs': res['share_outlier_docs'],
    }

In [79]:
pd.DataFrame([parse_results(res) for res in results.values()], index=results.keys()).T



Unnamed: 0,5,8,10,15,30,50,75,100
coherence_overall,0.618425,0.603573,0.608943,0.614598,0.548584,0.575422,0.488649,0.463737
coherence_topic_mean,0.618425,0.603573,0.608943,0.614598,0.548584,0.575422,0.488649,0.463737
coherence_topic_std,0.176533,0.192988,0.18199,0.19167,0.178124,0.142821,0.165014,0.133129
silhouette_overall,0.466847,0.471584,0.506012,0.497499,0.382972,0.284968,0.253242,0.176193
silhouette_topic_mean,0.727877,0.77143,0.785583,0.799151,0.812837,0.816776,0.848728,0.769265
silhouette_topic_std,0.166053,0.15936,0.16924,0.171609,0.178668,0.188743,0.223755,0.228073
n_clusters,194.0,140.0,109.0,78.0,44.0,27.0,12.0,7.0
share_outlier_docs,0.173856,0.189465,0.170171,0.177542,0.234338,0.313028,0.241058,0.337308


In [None]:
# extract the metrics into a data frame
# Each element below is one column: a per-configuration summary (mean or std) of one metric.
# NOTE(review): the first column is computed from `results`, all others from `metrics`;
# `metrics` and the 'imbalance', 'entropy' and 'crosslingual_consistency' entries are not
# created anywhere in the visible notebook -- confirm where they come from before running.
# NOTE(review): the keys 'lanuage_imbalance_*' are misspelled ("language"); they are left
# unchanged here because they are runtime column names.
metrics_df = pd.concat(
    [
        # coherence: mean / std over the per-topic scores
        pd.DataFrame(results).T.coherence.apply(lambda d: np.mean(list(d['by_topic'].values()))),
        pd.DataFrame(metrics).T.coherence.apply(lambda d: np.std(list(d['by_topic'].values()))),
        # silhouette: overall score, then mean / std of the per-topic means
        pd.DataFrame(metrics).T.silhouette.apply(lambda d: d['overall']),
        pd.DataFrame(metrics).T.silhouette.apply(lambda d: d['by_topic']['mean'].mean()),
        pd.DataFrame(metrics).T.silhouette.apply(lambda d: d['by_topic']['mean'].std()),
        # imbalance: mean / std over the strictly positive values only
        pd.DataFrame(metrics).T.imbalance.apply(lambda d: np.mean([v for v in d.values() if v > 0])),
        pd.DataFrame(metrics).T.imbalance.apply(lambda d: np.std([v for v in d.values() if v > 0])),
        # entropy: mean / std over all values
        pd.DataFrame(metrics).T.entropy.apply(lambda d: np.mean(list(d.values()))),
        pd.DataFrame(metrics).T.entropy.apply(lambda d: np.std(list(d.values()))),
        # cross-lingual consistency: mean / std over the values stored under 'averages'
        pd.DataFrame(metrics).T.crosslingual_consistency.apply(lambda d: np.mean(list(d['averages'].values()))),
        pd.DataFrame(metrics).T.crosslingual_consistency.apply(lambda d: np.std(list(d['averages'].values())))
    ],
    axis=1,
    keys=
        ['coherence_mean', 'coherence_std'] + \
        ['silhouette_overall', 'silhouette_mean', 'silhouette_std'] + \
        ['lanuage_imbalance_mean', 'lanuage_imbalance_std'] + \
        ['language_entropy_mean', 'language_entropy_std'] + \
        ['crosslingual_consistency_mean', 'crosslingual_consistency_std']
)
metrics_df.T

In [None]:
import random
import warnings
def show_examples(topic_id, n = 10):
    """
    Print the top words of a topic and a reproducible sample of its mentions.

    Reads the notebook-level ``topic_model`` and ``examples``.

    Parameters:
        topic_id: ID of the topic to inspect.
        n: Maximum number of topic words and of example mentions to print.
            If the topic has fewer unique mentions, a warning is issued and
            all of them are shown.
    """
    # get_topic() may return a falsy value for an unknown topic -> treat as "no words"
    topic_words = [w for w, _ in (topic_model.get_topic(topic_id) or []) if w.strip() != '']
    print('Topic words:', *topic_words[:n], sep='\n  - ')
    # Use the fitted model's own assignments (no notebook-level `topics` variable is defined).
    # sorted(): the iteration order of a set of strings differs between interpreter
    # sessions, which would make the seeded sample below irreproducible.
    exs = sorted({m.strip().lower() for t, m in zip(topic_model.topics_, examples.mention) if t == topic_id})
    if n > len(exs):
        n = len(exs)
        warnings.warn('WARNING: fewer unique mentions then selected sample size. setting n='+str(n))
    exs = random.Random(42).sample(exs, n)
    print()
    print('example mentions:', *exs, sep='\n  - ')

In [None]:
show_examples(0)

In [None]:
show_examples(1)

In [None]:
show_examples(2)

In [None]:
show_examples(3)

In [None]:
show_examples(4)
# note: shows need for multilabel classification scheme

In [None]:
show_examples(5)

In [None]:
show_examples(6)

In [None]:
show_examples(7)

In [None]:
show_examples(8)

In [None]:
show_examples(9)

In [None]:
show_examples(10)

In [None]:
show_examples(11)

In [None]:
show_examples(12)

In [None]:
show_examples(13)

In [None]:
show_examples(14)

In [None]:
show_examples(15)

In [None]:
show_examples(16)

In [None]:
show_examples(17)

In [None]:
show_examples(98)

In [None]:
show_examples(8)

In [None]:
import matplotlib.pyplot as plt

# histogram of the 'Count' column of the topic frequency table
topic_model.get_topic_freq()['Count'].plot(kind='hist', figsize=(5, 3))
plt.show()

In [None]:
# Use the fitted model's own topic assignments; no notebook-level `topics` variable is defined.
# NOTE(review): `_extract_representative_docs` is a private BERTopic method -- its signature
# and the number of returned values may differ between BERTopic versions; confirm.
_, representative_docs, _, repr_doc_ids = \
    topic_model._extract_representative_docs(
        c_tf_idf=topic_model.c_tf_idf_,
        documents=pd.DataFrame({
            'Document': examples.mention.to_list(),
            'Topic': topic_model.topics_,
            'ID': range(len(topic_model.topics_))
        }),
        topics=topic_model.topic_representations_,
        # request as many samples / representative docs as there are mentions
        nr_samples=len(examples),
        nr_repr_docs=len(examples),
    )

In [None]:
topic_model.representative_docs_[5]

In [None]:
topic_id = 5

# materialise the mention list once instead of once per index
_all_mentions = examples.mention.to_list()
# dict.fromkeys() de-duplicates while keeping the order of repr_doc_ids, so the first ten
# unique mentions are shown (list(set(...))[:10] picked an arbitrary, session-dependent subset)
list(dict.fromkeys(_all_mentions[i].strip().lower() for i in repr_doc_ids[topic_id]))[:10]

In [None]:
# one (topic, document index, rank) row per representative document;
# the position in repr_doc_ids is shifted by one to obtain the topic ID
# NOTE(review): assumes the first entry of repr_doc_ids belongs to topic -1 -- confirm
rows = []
for pos, idxs in enumerate(repr_doc_ids):
    for rank, doc_idx in enumerate(idxs):
        rows.append((pos - 1, doc_idx, rank))
tmp = pd.DataFrame(rows)
tmp.columns = ['topic', 'idx', 'rank']

# look up the normalised (stripped, lower-cased) mention for each document index
mentions = examples.mention.str.strip().str.lower().tolist()
tmp['mention'] = tmp['idx'].apply(lambda i: mentions[i])

# collapse duplicate mentions within a topic to their median rank, then order by rank
tmp = tmp.groupby(['topic', 'mention']).agg({'rank': 'median'}).reset_index()
tmp['rank'] = tmp['rank'].astype(int)
tmp = tmp.sort_values(['topic', 'rank'])

In [None]:
# the first ten mentions per topic, in the row order of `tmp`
# (collect each group's mentions directly instead of relying on a non-reducing
# `agg(lambda x: x)`, which pandas does not guarantee to support)
representative_docs = {
    tid: grp['mention'].head(10).tolist()
    for tid, grp in tmp.groupby('topic')
}


In [None]:
coherence_scores, _ = compute_coherece(topic_model, examples.mention, coherence_metric='c_v')

In [None]:
# visualize
def plot_topic_coherence_scores(scores, add_overall=True):
    """
    Draw a horizontal bar chart of per-topic coherence scores, sorted ascending.

    Parameters:
        scores: dict with 'by_topic' (mapping topic key -> coherence score) and
            'overall' (overall coherence score).
        add_overall: If true, mark the overall score with a dashed red vertical line.
    """
    by_topic = scores['by_topic']
    # label the bars with the keys of `by_topic` rather than with their position
    coherences_df = pd.DataFrame(
        {'coherence': list(by_topic.values())},
        index=list(by_topic.keys()),
    )
    # create new plot
    plt.figure(figsize=(7, 5))
    coherences_df.sort_values(by='coherence', inplace=True)
    coherences_df['coherence'].plot(kind='barh')
    if add_overall:
        # draw a vertical line at the overall coherence score
        plt.axvline(scores['overall'], color='red', linestyle='--')
    plt.xlim(0, 1)
    plt.show()

plot_topic_coherence_scores(coherence_scores)

In [None]:
from sklearn.metrics import (
    silhouette_score, # <== compute overall, corpus-level score
    silhouette_samples # <== compute sample/document-level scores
)

In [None]:
# corpus level
# features: the fitted UMAP embedding; labels: the model's topic assignments (used unfiltered)
overall_silhouette_score = silhouette_score(
    X=topic_model.umap_model.embedding_, 
    labels=topic_model.topics_, 
    sample_size=None, # <== None means do not subsample observations
    random_state=42 # <== only used to make subsampling reproducible (if applied)
)
overall_silhouette_score

In [None]:
# document-level silhouette scores in the UMAP embedding space
silhouette_scores = silhouette_samples(X=topic_model.umap_model.embedding_, labels=topic_model.topics_)

# per-topic mean and standard deviation, ordered from lowest to highest mean
silhouette_scores_by_topic = (
    pd.DataFrame({'topic': topic_model.topics_, 'silhouette_score': silhouette_scores})
    .groupby('topic')
    .agg(['mean', 'std'])
    .droplevel(0, axis=1)  # remove stacked columns
    .reset_index()
    .sort_values(by='mean')
)

In [None]:
silhouette_scores_by_topic.to_dict(orient='records')

In [None]:
# inspect a single topic: its size, its non-empty topic words and its representative mentions
# NOTE(review): `topic_model` is whichever model was fitted last -- confirm topic 70 exists in it
tid = 70
print('size:', topic_model.get_topic_freq(tid))
print([w for w, s in topic_model.get_topic(tid) if w.strip() != ''])
representative_docs[tid]


In [None]:
import seaborn as sns

# boxplot of the document-level silhouette scores (y-axis), one box per topic (x-axis)
# NOTE(review): the earlier comment described the axes the other way round -- confirm the intended orientation
plt.figure(figsize=(7, 5))
sns.boxplot(x=topic_model.topics_, y=silhouette_scores, fliersize=.2)
