In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from tqdm import tqdm
import json
import pandas as pd

In [3]:
# ✅ Choose Model (Change to "roberta-base" or other if needed)
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)
model.eval()  # Set to evaluation mode

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [4]:
# ✅ Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [5]:
# 🔹 Function to Generate Embeddings
def get_embeddings(texts, batch_size=32, pooling="cls"):
    """Generate BERT/RoBERTa embeddings for a list of texts."""
    all_embeddings = []
    
    # Ensure valid pooling method
    if pooling not in ["cls", "mean"]:
        raise ValueError("Invalid pooling method. Choose from 'cls' or 'mean'.")

    # Process texts in batches
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch_texts = texts[i : i + batch_size]

        with torch.no_grad():  # Disable gradient tracking
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
            inputs = {key: val.to(device) for key, val in inputs.items()}  # Move to GPU if available
            
            outputs = model(**inputs)
        
            # Extract embeddings
            if pooling == "cls":
                batch_embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
            elif pooling == "mean":
                batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        
        all_embeddings.append(batch_embeddings)

    # Stack all tensors and convert to NumPy at the end
    return torch.cat(all_embeddings).cpu().numpy()

In [6]:

# ✅ Load WOS JSON files normally
with open("./data/processed/wos_train_final.json", "r", encoding="utf-8") as f:
    wos_train = json.load(f)

with open("./data/processed/wos_test_final.json", "r", encoding="utf-8") as f:
    wos_test = json.load(f)

In [7]:
# ✅ Load NYT JSONL files line by line
def load_jsonl(filepath):
    data = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))  # Convert each line into a dictionary
    return data

nyt_train = load_jsonl("./data/processed/nyt_train.jsonl")
nyt_test = load_jsonl("./data/processed/nyt_test.jsonl")

print(f"✅ Loaded NYT Train: {len(nyt_train)} samples")
print(f"✅ Loaded NYT Test: {len(nyt_test)} samples")

✅ Loaded NYT Train: 1753211 samples
✅ Loaded NYT Test: 438303 samples


In [8]:
# Convert to DataFrame
nyt_train_df = pd.DataFrame(nyt_train)
nyt_test_df = pd.DataFrame(nyt_test)

In [9]:
# Load WOS JSON with proper conversion
wos_train_df = pd.read_json("./data/processed/wos_train_final.json", orient="records")  
wos_test_df = pd.read_json("./data/processed/wos_test_final.json", orient="records")

In [10]:
print(wos_train_df.columns)  # Check actual column names
print(nyt_train_df.columns)


RangeIndex(start=0, stop=37588, step=1)
Index(['text', 'tokens', 'labels', 'level1', 'level2', 'keywords'], dtype='object')


In [11]:
print(type(wos_train))  # Should be <class 'list'>


<class 'list'>


In [12]:
print(type(wos_train[0]))  # Should be <class 'dict'>

<class 'list'>


In [13]:
# Flatten wos_train if it's a nested list
if isinstance(wos_train[0], list):  
    wos_train = [item for sublist in wos_train for item in sublist]


In [14]:
print(type(wos_train[0]))  # Should now be a dict
print(wos_train[0].keys())  # Check available keys


<class 'dict'>
dict_keys(['text', 'tokens', 'labels', 'level1', 'level2', 'keywords'])


In [15]:
train_texts = [entry["text"] for entry in wos_train] + [entry["text"] for entry in nyt_train]


In [16]:
# Flatten wos_test if it's a nested list
if isinstance(wos_test[0], list):  
    wos_test = [item for sublist in wos_test for item in sublist]

In [17]:
# Check type after flattening
print(type(wos_test[0]))  # Should be dict
print(wos_test[0].keys())  # Ensure 'text' is a key

<class 'dict'>
dict_keys(['text', 'tokens', 'labels', 'level1', 'level2', 'keywords'])


In [18]:
# Extract test texts
test_texts = [entry["text"] for entry in wos_test] + [entry["text"] for entry in nyt_test]

In [None]:
train_embeddings = get_embeddings(train_texts)
test_embeddings = get_embeddings(test_texts)

Generating Embeddings:   0%|▏                                                  | 224/55963 [25:44<104:18:56,  6.74s/it]

In [None]:
# ✅ Save embeddings
torch.save(train_embeddings, "./data/processed/train_embeddings.pt")
torch.save(test_embeddings, "./data/processed/test_embeddings.pt")

print("✅ Embeddings extracted and saved!")