In [1]:
import pandas as pd
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import numpy as np

# Put the directory two levels above the working directory on the import path so that
# the project modules imported below can be resolved from this notebook.
PROJECT_ROOT = Path.cwd().parent.parent
sys.path.append(str(PROJECT_ROOT))
from models.encoder.common import SEED
from utils import DATA_DIR

# Load the two input datasets; both are read from DATA_DIR / "datasets".
DATASETS_DIR = DATA_DIR / "datasets"
ds_large = pd.read_parquet(DATASETS_DIR / "large" / "combined.parquet")
ds_small = pd.read_parquet(DATASETS_DIR / "small" / "agreed.parquet")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Top up the under-represented labels 4 and 5 in the small dataset with rows drawn from
# the large dataset, until each has at least the average number of samples of the other
# classes. The balanced frame is then written to disk together with a stratified
# train/test split.
label_counts = ds_small["label"].value_counts()
print(f"Label counts before balancing:\n{label_counts}")

target_labels = [4, 5]
other_counts = label_counts.loc[~label_counts.index.isin(target_labels)]
# mean() of an empty Series is NaN and int(NaN) raises, so fall back to 0 (nothing to
# add) when the small dataset contains no label outside target_labels.
avg_count = 0 if other_counts.empty else int(other_counts.mean())

for label in target_labels:
    # A target label may be entirely absent from the small dataset; count it as 0
    # instead of failing with a KeyError.
    current_count = int(label_counts.get(label, 0))
    needed = max(avg_count - current_count, 0)
    added = 0
    if needed > 0:
        # Candidate rows: same label, one row per text, and not already present in the
        # small dataset, so that topping up never introduces duplicated texts.
        is_candidate = (ds_large["label"] == label) & ~ds_large["text"].isin(
            ds_small["text"]
        )
        candidates = ds_large[is_candidate].drop_duplicates(subset="text")
        # sample() raises when asked for more rows than exist; take what is available.
        samples_to_add = candidates.sample(
            n=min(needed, len(candidates)), random_state=SEED
        )
        ds_small = pd.concat([ds_small, samples_to_add], ignore_index=True)
        added = len(samples_to_add)
    print(f"Added {added} samples for label {label}")

print(f"Final small dataset size: {len(ds_small)}")
label_counts = ds_small["label"].value_counts()
print(f"Label counts after balancing:\n{label_counts}")

# Ensure the 'features' column has a consistent numeric dtype across all rows to avoid
# pyarrow/arrow errors when writing to parquet (mixing float32 and float64).
# Each entry is rounded to float32 precision and then stored as a list of Python floats.
ds_small["features"] = ds_small["features"].apply(
    lambda arr: np.asarray(arr, dtype=np.float32).astype(float).tolist()
)

small_dir = DATA_DIR / "datasets" / "small"
ds_small.to_parquet(small_dir / "balanced.parquet", index=True)

# 60/40 split, stratified on the label so both parts keep the balanced class mix.
sm_train, sm_test = train_test_split(
    ds_small, test_size=0.4, random_state=SEED, stratify=ds_small["label"]
)
sm_train.to_parquet(small_dir / "train.parquet", index=True)
sm_test.to_parquet(small_dir / "test.parquet", index=True)

Label counts before balancing:
label
3    131
0    122
2    113
1     70
4     58
5      6
Name: count, dtype: int64
Added 51 samples for label 4
Added 103 samples for label 5
Final small dataset size: 654
Label counts after balancing:
label
3    131
0    122
2    113
4    109
5    109
1     70
Name: count, dtype: int64


### ModernBERT

In [4]:
from text2features import FeatureService
from text2features_paths import (
    FEATURE_PIPELINE_RESOURCES,
)
from sklearn.utils import gen_batches

# Size of each batch of texts passed to the feature service in the cells below.
BATCH_SIZE = 32

# Feature service used for the ModernBERT embedding extraction.
feature_service = FeatureService(feature_pipeline_resources=FEATURE_PIPELINE_RESOURCES)

Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 90898.23it/s]


In [16]:
# Compute ModernBERT embeddings for every text of the small dataset, batch by batch,
# and store them next to the text in a parquet file.
#
# The batch slices are materialised into a list so that tqdm knows how many batches
# there are and can show a total and an ETA; wrapping the bare generator only yields
# an iteration counter.
batches = list(gen_batches(len(ds_small), BATCH_SIZE))
embedding_series = []

for batch in tqdm(batches, desc="Extracting embeddings"):
    texts = ds_small["text"].iloc[batch].tolist()
    features = feature_service.get_modernbert_embeddings(texts)
    embedding_series.extend(features)

# One embedding per text is required for the column assignment below; fail with a
# readable message if the service returned a different number of items.
assert len(embedding_series) == len(ds_small), (
    f"{len(embedding_series)} embeddings for {len(ds_small)} texts"
)

embedding_df = ds_small[["text"]].copy()
embedding_df["cls"] = embedding_series
embedding_df.to_parquet(
    DATA_DIR / "datasets" / "small" / "modernbert_cls_embeddings.parquet", index=False
)

Extracting embeddings: 21it [05:35, 15.99s/it]


In [5]:
# Compute ModernBERT embeddings for every text of the large dataset, batch by batch,
# and store them next to the text in a parquet file.
#
# The batch slices are materialised into a list so that tqdm knows how many batches
# there are and can show a total and an ETA for this multi-hour run; wrapping the bare
# generator only yields an iteration counter.
batches = list(gen_batches(len(ds_large), BATCH_SIZE))
embedding_series = []

for batch in tqdm(batches, desc="Extracting embeddings"):
    texts = ds_large["text"].iloc[batch].tolist()
    features = feature_service.get_modernbert_embeddings(texts)
    embedding_series.extend(features)

# One embedding per text is required for the column assignment below; fail with a
# readable message if the service returned a different number of items.
assert len(embedding_series) == len(ds_large), (
    f"{len(embedding_series)} embeddings for {len(ds_large)} texts"
)

embedding_df = ds_large[["text"]].copy()
embedding_df["cls"] = embedding_series
embedding_df.to_parquet(
    DATA_DIR / "datasets" / "large" / "modernbert_cls_embeddings.parquet", index=False
)

Extracting embeddings: 3125it [6:43:54,  7.76s/it]


### MiniLM

In [11]:
from sentence_transformers import SentenceTransformer

# Sentence-transformers checkpoint used for the MiniLM embeddings in the cells below.
MINILM_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MINILM_CHECKPOINT)

2025-11-17 11:48:10.298589: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763376490.399135  723760 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763376490.427716  723760 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-17 11:48:10.661105: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# Encode all texts of the small dataset with the MiniLM model in a single call and
# save them, one vector per row, in the "cls" column next to the text.
small_texts = ds_small["text"].tolist()
small_vectors = model.encode(small_texts).tolist()

embedding_df = ds_small[["text"]].assign(cls=small_vectors)
small_minilm_path = DATA_DIR / "datasets" / "small" / "minilm_embeddings.parquet"
embedding_df.to_parquet(small_minilm_path, index=False)

In [None]:
# Encode all texts of the large dataset with the MiniLM model in a single call and
# save them, one vector per row, in the "cls" column next to the text.
large_texts = ds_large["text"].tolist()
large_vectors = model.encode(large_texts).tolist()

embedding_df = ds_large[["text"]].assign(cls=large_vectors)
large_minilm_path = DATA_DIR / "datasets" / "large" / "minilm_embeddings.parquet"
embedding_df.to_parquet(large_minilm_path, index=False)

### Validate saved data

In [None]:
import pandas as pd
import sys
from pathlib import Path

sys.path.append(str(Path.cwd().parent.parent))
from utils import DATA_DIR

# Reload the saved small-dataset embeddings and the balanced dataset they must match.
SMALL_DIR = DATA_DIR / "datasets" / "small"
sm_minilm = pd.read_parquet(SMALL_DIR / "minilm_embeddings.parquet")
sm_modernbert = pd.read_parquet(SMALL_DIR / "modernbert_cls_embeddings.parquet")

ds_small = pd.read_parquet(SMALL_DIR / "balanced.parquet")
SM_LENGTH = len(ds_small)

# Each embedding file must have exactly one row per row of the balanced dataset.
for frame in (sm_minilm, sm_modernbert):
    assert len(frame) == SM_LENGTH, f"{len(frame)} != {SM_LENGTH}"

# Neither the texts nor the embedding vectors may contain duplicates.
for frame in (sm_minilm, sm_modernbert):
    assert frame["text"].nunique() == SM_LENGTH
    assert frame["cls"].apply(tuple).nunique() == SM_LENGTH

# No embedding vector may contain a missing value.
for frame in (sm_minilm, sm_modernbert):
    exploded_values = frame.explode("cls")["cls"]
    assert not exploded_values.isna().any()

In [2]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

sys.path.append(str(Path.cwd().parent.parent))
from utils import DATA_DIR

# Reload the saved large-dataset embeddings.
LARGE_DIR = DATA_DIR / "datasets" / "large"
lg_minilm = pd.read_parquet(LARGE_DIR / "minilm_embeddings.parquet")
lg_modernbert = pd.read_parquet(LARGE_DIR / "modernbert_cls_embeddings.parquet")

# The expected row count comes from the dataset the embeddings were computed from
# rather than from a hard-coded number, mirroring the small-dataset validation.
# Only the "text" column is read to keep the load cheap.
LG_LENGTH = len(pd.read_parquet(LARGE_DIR / "combined.parquet", columns=["text"]))

assert len(lg_minilm) == LG_LENGTH, f"{len(lg_minilm)} != {LG_LENGTH}"
assert len(lg_modernbert) == LG_LENGTH, f"{len(lg_modernbert)} != {LG_LENGTH}"

# Duplicates in the 'text' and 'cls' columns
assert lg_minilm["text"].nunique() == LG_LENGTH
assert lg_minilm["cls"].apply(tuple).nunique() == LG_LENGTH

assert lg_modernbert["text"].nunique() == LG_LENGTH
assert lg_modernbert["cls"].apply(tuple).nunique() == LG_LENGTH


def has_missing_values(cls_column: pd.Series) -> bool:
    """Return True if any embedding in `cls_column` is null, empty or contains a NaN.

    Gives the same verdict as `explode()` followed by `isna().any()`, but inspects one
    vector at a time instead of materialising one row per vector element.
    """

    def vector_is_bad(vector) -> bool:
        values = np.asarray(vector)
        # explode() turns an empty list into NaN, so an empty vector counts as missing.
        return values.size == 0 or bool(pd.isna(values).any())

    return bool(cls_column.map(vector_is_bad).any())


# NaNs in 'cls' lists
assert not has_missing_values(lg_minilm["cls"])
assert not has_missing_values(lg_modernbert["cls"])

In [3]:
# Show the length of the first ModernBERT embedding vector.
first_modernbert_vector = lg_modernbert["cls"].iloc[0]
print(len(first_modernbert_vector))

1024
