In [7]:
import os
import sys
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [8]:
# Name of the directory that is treated as the project root.
target_folder = "CMSE495"
# Set the tokenizers parallelism flag to "false" for this process
# (presumably to avoid the HuggingFace fork/parallelism warning — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Climb from the current working directory until a directory whose name is
# `target_folder` is reached.
current_dir = os.getcwd()
while True:
    if os.path.basename(current_dir) == target_folder:
        break
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        # Going up no longer changes the path, so the search is exhausted.
        raise FileNotFoundError(f"{target_folder} not found in the directory tree.")
    current_dir = parent_dir

# Make the project root both the working directory and the first import path,
# so the relative paths and project-local imports used later resolve from it.
os.chdir(current_dir)
sys.path.insert(0, current_dir)

In [9]:
# Run configuration. These values are interpolated verbatim into the input and
# output file names, so they must match the values used when the data was
# generated.
theme = "rcv1"
t = 7.0
max_sub = 5
depth = 3
synonyms = 0
add_noise = 0.0
branching = "balanced"
embedding_model = "all-MiniLM-L6-v2"

# Directory that receives the saved embedding arrays.
os.makedirs('gpt_embeddings', exist_ok=True)

# The noise level is only part of the file name when noise is actually applied.
# NOTE(review): `branching` is not part of the embedding file name, although the
# input CSV name built in main() includes it — runs that differ only in
# `branching` would write to the same file. Confirm whether that is intended.
_noise_suffix = f"_noise{add_noise}" if float(add_noise) > 0 else ""
embed_file = (
    f'gpt_embeddings/{theme}_hierarchy_t{t}_maxsub{max_sub}_depth{depth}'
    f'_synonyms{synonyms}{_noise_suffix}_{embedding_model}_embed.npy'
)

In [10]:
def main():
    """Embed the ``topic`` column of the generated hierarchy CSV and save it.

    The input CSV path is built from the module-level configuration (``theme``,
    ``t``, ``max_sub``, ``depth``, ``synonyms``, ``add_noise``, ``branching``).
    Every value in its ``topic`` column is encoded with the SentenceTransformer
    named by ``embedding_model`` and the resulting array is written to
    ``embed_file`` with ``np.save``.

    One embedding is produced per CSV row, in row order. Missing topic values
    are embedded as empty strings (with a warning) rather than dropped, so the
    saved array stays aligned with the CSV rows.

    Prints an error and returns without writing anything when the CSV does not
    exist or has no ``topic`` column.

    Returns:
        None
    """
    input_csv = f'data_generation/generated_data/{theme}_hierarchy_t{t}_maxsub{max_sub}_depth{depth}_synonyms{synonyms}_noise{add_noise}_{branching}.csv'

    if not os.path.exists(input_csv):
        print(f"Error: Run File 1 first. {input_csv} is missing.")
        return

    topic_data = pd.read_csv(input_csv)

    # Fail with a readable message instead of a bare KeyError further down.
    if 'topic' not in topic_data.columns:
        print(f"Error: {input_csv} has no 'topic' column (found: {list(topic_data.columns)}).")
        return

    topics = topic_data['topic']
    missing_count = int(topics.isna().sum())
    if missing_count:
        print(f"Warning: {missing_count} rows have no topic text; embedding them as empty strings.")

    # The encoder expects strings: NaN cells arrive from pandas as floats and a
    # purely numeric column arrives as ints/floats, so coerce everything to str.
    texts = ["" if pd.isna(value) else str(value) for value in topics]

    print(f"Embedding {len(texts)} docs using {embedding_model}...")
    model = SentenceTransformer(embedding_model)
    embeddings = model.encode(texts, show_progress_bar=True)

    # Do not rely on another cell having created the output directory.
    os.makedirs(os.path.dirname(embed_file) or ".", exist_ok=True)
    np.save(embed_file, embeddings)
    print(f"Success! Saved: {embed_file}")

if __name__ == "__main__":
    main()

Embedding 10788 docs using all-MiniLM-L6-v2...


Batches:   0%|          | 0/338 [00:00<?, ?it/s]

Success! Saved: gpt_embeddings/rcv1_hierarchy_t7.0_maxsub5_depth3_synonyms0_all-MiniLM-L6-v2_embed.npy
