In [1]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [2]:
# --- Pipeline configuration -------------------------------------------------
# CSV that is read with pandas; its 'topic' column supplies the texts to embed.
INPUT_FILE: str = "./data/rcv1_final_hierarchy.csv"

# Directory that receives the saved .npy embedding matrix (created on demand).
# NOTE(review): the name says "gpt" although the vectors come from a local
# sentence-transformers model -- presumably kept for downstream compatibility; confirm.
EMBED_DIR: str = "gpt_embeddings"

# Checkpoint name handed to SentenceTransformer(...).
MODEL_NAME: str = "all-MiniLM-L6-v2"

In [3]:
# Build the cache file name from the model name so embeddings produced by
# different checkpoints never overwrite each other.
# Model identifiers may contain path separators (e.g. "org/model"); replace
# them so the result is always a single file directly inside EMBED_DIR rather
# than a path into a sub-directory that does not exist.
_MODEL_TAG = MODEL_NAME.replace("/", "_").replace(os.sep, "_")
OUTPUT_NAME = f"rcv1_hierarchy_t7.0_maxsub5_depth3_synonyms0_{_MODEL_TAG}_embed.npy"
OUTPUT_PATH = os.path.join(EMBED_DIR, OUTPUT_NAME)

In [4]:
def main():
    """Embed the 'topic' column of INPUT_FILE and cache the matrix at OUTPUT_PATH.

    The function is a no-op (apart from a printed message) when the input CSV
    is missing, when the CSV has no rows, or when OUTPUT_PATH already exists.
    Otherwise it loads the SentenceTransformer named by MODEL_NAME, encodes
    every topic in row order, and saves the resulting NumPy array.

    Raises:
        KeyError: if the CSV has no 'topic' column.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"Error: {INPUT_FILE} not found. Run Step 2 first!")
        return

    # Consult the cache before parsing the CSV so a repeat run does no work.
    if os.path.exists(OUTPUT_PATH):
        print(f"Embeddings already exist at {OUTPUT_PATH}")
        return

    df = pd.read_csv(INPUT_FILE)
    if 'topic' not in df.columns:
        raise KeyError(
            f"'topic' column not found in {INPUT_FILE}; "
            f"available columns: {list(df.columns)}"
        )

    # Empty cells are parsed as float NaN and numeric cells as numbers; the
    # tokenizer only accepts strings. Convert in place (rather than dropping
    # rows) so that embedding row i still corresponds to CSV row i.
    missing = int(df['topic'].isna().sum())
    if missing:
        print(f"Warning: {missing} rows have an empty topic; embedding them as empty strings.")
    texts = ["" if pd.isna(value) else str(value) for value in df['topic']]

    if not texts:
        # Do not cache an empty matrix: it would be mistaken for a valid result.
        print(f"Error: {INPUT_FILE} contains no rows to embed.")
        return

    print(f"Loading local model: {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)

    print(f"Generating embeddings for {len(texts)} documents...")
    # This runs on your local hardware
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

    # Create whichever directory OUTPUT_PATH actually lives in.
    os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)

    # Write to a temporary file and rename it into place: if the run is
    # interrupted mid-write, no truncated file is left at OUTPUT_PATH for the
    # cache check above to accept. Saving through a file handle also stops
    # np.save from appending its own ".npy" suffix to the temporary name.
    tmp_path = OUTPUT_PATH + ".tmp"
    with open(tmp_path, "wb") as handle:
        np.save(handle, embeddings)
    os.replace(tmp_path, OUTPUT_PATH)

    print(f"Success! Saved local embeddings to {OUTPUT_PATH}")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Loading local model: all-MiniLM-L6-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings for 20000 documents...


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Success! Saved local embeddings to gpt_embeddings/rcv1_hierarchy_t7.0_maxsub5_depth3_synonyms0_all-MiniLM-L6-v2_embed.npy
