In [1]:
import nltk
import torch
import torch.nn.functional as F
from datasets import load_dataset
import pandas as pd
import spacy
import ahocorasick
import json
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForMaskedLM, AutoConfig, AutoTokenizer
from tqdm.notebook import tqdm

⚙️  Running in WANDB offline mode


In [2]:
# Load the sentence-embedding model that is later used (via model.encode / model.similarity)
# to embed the <t>-tagged sentences for per-adverb clustering.
# NOTE: the name `model` is rebound to a masked-LM further down the notebook.
model = SentenceTransformer("sachinn1/xl-durel")
# Put the module in inference mode; as the last expression of the cell this also displays its structure.
model.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'XLMRobertaModel'})
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [3]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any other visible cell — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [4]:
def sentence_split(batch):
    """Explode a batch of documents into one row per sentence.

    Written for batched mapping (``Dataset.map(..., batched=True)``): the
    returned columns may be longer than the input columns.

    Args:
        batch: mapping with parallel lists under ``"text"`` (document text)
            and ``"id"`` (document identifier).

    Returns:
        dict with two parallel lists:
            ``"sentence"`` — every sentence produced by ``nltk.sent_tokenize``;
            ``"id"`` — ``"<document id>-<sentence index>"`` for that sentence.
    """
    examples = {"sentence": [], "id": []}
    # The crawl year used to be parsed from batch["dump"] here but was never
    # used; it only made the function fail on batches without that column.
    for doc_id, text in zip(batch["id"], batch["text"]):
        for idx, sent in enumerate(nltk.sent_tokenize(text)):
            examples["sentence"].append(sent)
            # f-string rather than `+` so non-string ids also work.
            examples["id"].append(f"{doc_id}-{idx}")
    return examples

In [6]:
# Build the adverb vocabulary: one candidate per line of the word list,
# whitespace-stripped, keeping only entries that end in "ly".
with open("adv_ends_ly.txt") as file:
    adverbs = {word for word in map(str.strip, file) if word.endswith("ly")}

In [10]:
def get_adverb(sent):
    """Return the sentence's adverb if it contains exactly one vocabulary hit.

    The sentence is tokenized with ``nltk.word_tokenize`` and each token is
    looked up (case-sensitively) in the module-level ``adverbs`` set.

    Args:
        sent: mapping with the raw text under ``"sentence"``.

    Returns:
        ``{"adverb": <token>}`` when exactly one token is in ``adverbs``,
        otherwise ``{"adverb": None}`` (no hit, or more than one hit).
    """
    hits = [token for token in nltk.word_tokenize(sent["sentence"]) if token in adverbs]
    return {"adverb": hits[0] if len(hits) == 1 else None}

# Process OpenSubtitles (English side of the en–fr pairs)

In [16]:
# Load the en–fr OpenSubtitles training split, keep only the English side as a
# single "sentence" column, and drop lines with fewer than four
# whitespace-separated tokens.
data = (
    load_dataset("open_subtitles", lang1="en", lang2="fr", split="train")
    .map(lambda x: {"sentence": x["translation"]["en"]})
    .select_columns("sentence")
    .filter(lambda x: len(x["sentence"].split()) >= 4)
)

Map (num_proc=4):   0%|          | 0/29986414 [00:00<?, ? examples/s]

Filter:   0%|          | 0/29986414 [00:00<?, ? examples/s]

In [17]:
# Inspect the first record.
# NOTE(review): the saved output already shows an 'adverb' field, which is only added by the
# later `data.map(get_adverb, ...)` cell — the cells were evidently executed out of order.
data[0]

{'sentence': "But it's better to learn the scale early.", 'adverb': 'early'}

In [8]:
# Wrap the dataset in a single-element list.
# NOTE(review): `corpus` is not used by any other visible cell — presumably a leftover from
# combining several sources; confirm before removing.
corpus = [data]

# Process FineWeb-Edu (currently disabled — the cells below are commented out)

In [9]:
# data = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train")

In [10]:
# columns = data.column_names
# data = data.map(sentence_split, remove_columns=columns, batched=True, num_proc=8)
# data = data.filter(lambda x: len(x["sentence"]) > 4, num_proc=8)
# data = data.filter(lambda x: sum([1 for end_index, val in automaton.iter(x["sentence"].lower())]) == 1, num_proc=8)

In [11]:
# Tag every sentence with its single adverb (None when it has zero or several
# vocabulary hits), then keep only the sentences that received a tag.
data = (
    data
    .map(get_adverb, num_proc=4)
    .filter(lambda x: x["adverb"] is not None)
)

Map (num_proc=8):   0%|          | 0/1358877 [00:00<?, ? examples/s]

In [12]:
def _mark_target(example):
    """Lower-case the sentence and wrap its adverb in ``<t>...</t>`` markers.

    Only whole-word occurrences are tagged. A plain ``str.replace`` would also
    tag the adverb when it is merely a substring of a longer word (for example
    "early" inside "nearly"), which corrupts the marked sentence.

    Args:
        example: mapping with ``"sentence"`` (raw text) and ``"adverb"``.

    Returns:
        ``{"sentence_t": <lower-cased sentence with the adverb tagged>}``.
    """
    import re  # local import: keeps this cell runnable on its own

    adverb = example["adverb"]
    # Lookarounds (not \b) so the match is bounded by non-word characters
    # whatever characters the adverb itself starts or ends with.
    pattern = rf"(?<!\w){re.escape(adverb)}(?!\w)"
    # A callable replacement avoids backslash/group interpretation of the adverb text.
    tagged = re.sub(pattern, lambda match: f"<t>{adverb}</t>", example["sentence"].lower())
    return {"sentence_t": tagged}


data = data.map(_mark_target, num_proc=4)

Map (num_proc=4):   0%|          | 0/1358877 [00:00<?, ? examples/s]

In [13]:
from sklearn.cluster import AgglomerativeClustering

# Tunables for the per-adverb clustering below.
# NOTE(review): the reason these four adverbs are excluded is not recorded — presumably
# they are too frequent / discourse-like; confirm.
EXCLUDED_ADVERBS = ["really", "actually", "seriously", "only"]
MIN_SENTENCES = 3          # adverbs with fewer sentences are skipped
MAX_SENTENCES = 300        # larger groups are down-sampled to this size
DISTANCE_THRESHOLD = 0.4   # average-linkage merge cut-off on (1 - similarity)
SAMPLE_SEED = 0            # fixed seed so the down-sampling (and the clusters) are reproducible

df = data.to_pandas()

clusters = []
for adverb, group in tqdm(df[~df.adverb.isin(EXCLUDED_ADVERBS)].groupby("adverb")):
    if group.shape[0] < MIN_SENTENCES:
        continue
    if group.shape[0] > MAX_SENTENCES:
        group = group.sample(MAX_SENTENCES, random_state=SAMPLE_SEED)

    # Embed the <t>-tagged sentences and turn pairwise similarity into a distance matrix.
    embs = model.encode(group.sentence_t.values, convert_to_numpy=True)
    sim = model.similarity(embs, embs)
    dist = 1 - sim

    # No fixed cluster count: merging stops once the average-linkage distance
    # between clusters exceeds DISTANCE_THRESHOLD.
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=DISTANCE_THRESHOLD,
        metric="precomputed",
        linkage="average",
    )
    labels = clustering.fit_predict(dist)

    # Cluster labels are only meaningful within one adverb's group.
    clusters.append(group.assign(cluster=labels))

# One frame with every clustered sentence: sentence, adverb, sentence_t, cluster.
clustered_df = pd.concat(clusters, ignore_index=True)

clustered_df.head()


  0%|          | 0/1557 [00:00<?, ?it/s]

                                            sentence    adverb  \
0      I've apologized abjectly, what more can I do?  abjectly   
1  But this jackal in a lion's skin... who by thr...  abjectly   
2             You have gone and abjectly appealed...  abjectly   
3               Now, look, I've apologized abjectly.  abjectly   
4                    I am abjectly at your disposal.  abjectly   

                                          sentence_t  cluster  
0  i've apologized <t>abjectly</t>, what more can...        0  
1  but this jackal in a lion's skin... who by thr...        1  
2      you have gone and <t>abjectly</t> appealed...        0  
3        now, look, i've apologized <t>abjectly</t>.        0  
4             i am <t>abjectly</t> at your disposal.        2  


In [14]:
# Load the masked language model used for pseudo-entropy scoring.
# NOTE: this rebinds `model`; the SentenceTransformer used for clustering is no longer
# reachable under that name after this cell.
model_name = "facebookai/xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
# Use the GPU when one is available; fall back to CPU instead of raising on GPU-less machines.
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

Some weights of the model checkpoint at facebookai/xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=102

In [15]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release cached GPU memory it no longer uses.
# NOTE(review): presumably meant to reclaim the SentenceTransformer that `model` referred to
# before it was rebound to the masked LM — confirm.
gc.collect()
torch.cuda.empty_cache()

In [16]:
def _find_subsequence(sequence, pattern):
    """Return the first index where `pattern` occurs contiguously in `sequence`, or -1.

    An empty `pattern` is reported as not found (-1).
    """
    width = len(pattern)
    if width == 0:
        return -1
    for start in range(len(sequence) - width + 1):
        if sequence[start:start + width] == pattern:
            return start
    return -1


def pseudo_entropy_batch_target_word(texts, targets, tokenizer, model, batch_size=8, max_length=64):
    """
    Compute pseudo-entropy for a list of (sentence, target_word) pairs in batches.

    For each sentence the target word is located by matching its token ids
    (encoded on its own, without special tokens) against the sentence's token
    ids. The first sub-token of the first match is replaced by the mask token,
    and the entropy (in nats) of the model's predictive distribution at that
    position is returned.

    Args:
        texts: list[str] — input sentences
        targets: list[str] — target words to mask (same length as texts)
        tokenizer: Hugging Face tokenizer
        model: Masked LM model (e.g., BERT); must expose ``.device``
        batch_size: int — number of sentences per batch
        max_length: int — sentences are truncated to this many tokens

    Returns:
        list[float]: entropy for the masked target word in each sentence.
        The value is ``nan`` when the target's token ids do not occur in the
        (possibly truncated) tokenized sentence. Previously such rows were
        scored at position 0 without any masking, which gave a misleading
        near-zero entropy. Drop NaN rows before ranking by entropy.
    """
    assert len(texts) == len(targets), "texts and targets must have the same length"
    entropies = []
    device = model.device

    for batch_start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[batch_start: batch_start + batch_size]
        batch_targets = targets[batch_start: batch_start + batch_size]

        tokens = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)

        # Position of each target's first sub-token in its sentence, or -1 if absent.
        positions = [
            _find_subsequence(row, tokenizer.encode(target, add_special_tokens=False))
            for row, target in zip(input_ids.tolist(), batch_targets)
        ]
        found = torch.tensor([pos >= 0 for pos in positions], dtype=torch.bool, device=device)
        # Not-found rows get a placeholder index 0 so batched indexing works;
        # their result is overwritten with NaN below.
        mask_positions = torch.tensor([max(pos, 0) for pos in positions], dtype=torch.long, device=device)
        rows = torch.arange(len(positions), device=device)

        # Mask the first subtoken of the target word (simple approach)
        masked_inputs = input_ids.clone()
        masked_inputs[rows[found], mask_positions[found]] = tokenizer.mask_token_id

        with torch.no_grad():
            logits = model(input_ids=masked_inputs, attention_mask=attention_mask).logits

        # Entropy from log-probabilities in float32; clamping keeps 0 * (-inf)
        # from producing NaN for zero-probability tokens.
        log_probs = F.log_softmax(logits[rows, mask_positions, :].float(), dim=-1)
        floor = torch.finfo(log_probs.dtype).min
        entropy = -(log_probs.exp() * log_probs.clamp(min=floor)).sum(dim=-1)
        entropy = entropy.masked_fill(~found, float("nan"))
        entropies.extend(entropy.cpu().tolist())

    return entropies

In [17]:
# Score every clustered sentence with the masked LM: one entropy value per row, computed at the
# position of the row's adverb. The original-case "sentence" column is used here, not the
# lower-cased, <t>-tagged "sentence_t" column that was used for clustering.
texts = clustered_df["sentence"].tolist()
targets = clustered_df["adverb"].tolist()
clustered_df["entropy"] = pseudo_entropy_batch_target_word(texts, targets, tokenizer, model, batch_size=32)

  0%|          | 0/3571 [00:00<?, ?it/s]

In [18]:
# Keep only (adverb, cluster) groups with at least three sentences. The size is
# broadcast back to the rows with transform("size"), which avoids calling a
# Python lambda once per group.
cluster_sizes = clustered_df.groupby(['adverb', 'cluster'])['sentence'].transform('size')
filtered = clustered_df[cluster_sizes >= 3]

# Then take the sentence with the lowest entropy per group. Rows without a
# usable entropy (NaN) are ignored so they can neither be selected nor make
# idxmin return a missing label.
scored = filtered.dropna(subset=['entropy'])
result = scored.loc[
    scored.groupby(['adverb', 'cluster'])['entropy'].idxmin(),
    ['adverb', 'cluster', 'sentence', 'entropy']
]

# Sort for a stable, readable file and persist one representative sentence per cluster.
result = result.sort_values(['adverb', 'cluster']).reset_index(drop=True)
result.to_csv("adverbs.csv")

In [19]:
# Average number of retained clusters per adverb (each row of `result` is one
# cluster representative; rows are counted through their non-null entropy).
result.groupby("adverb")["entropy"].count().mean()

np.float64(1.6145104895104896)

In [20]:
# Draw a reproducible random sample of up to 50 distinct adverbs. The sample
# size is capped at the number available so the cell also works on small runs,
# and the seed is fixed so re-running the notebook picks the same adverbs.
adverbs_unique = pd.Series(clustered_df.adverb.unique())
adverbs_unique = adverbs_unique.sample(n=min(50, len(adverbs_unique)), random_state=0).values