In [7]:
import os
import sys
import pandas as pd
import nltk
from nltk.corpus import reuters

In [8]:
# --- Environment -----------------------------------------------------------
# Exported before anything else runs, so it is set even if the project-root
# search below fails.
# NOTE(review): presumably silences the HuggingFace tokenizers parallelism
# warning; nothing in this cell uses tokenizers directly -- confirm it is needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Locate the project root -----------------------------------------------
# Climb from the current working directory until a directory whose name is
# exactly `target_folder` is reached. When the parent of a directory is the
# directory itself, the filesystem root has been hit without a match.
target_folder = "CMSE495"
current_dir = os.getcwd()
while True:
    if os.path.basename(current_dir) == target_folder:
        break
    parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
    if parent_dir == current_dir:
        raise FileNotFoundError(f"{target_folder} not found in the directory tree.")
    current_dir = parent_dir

# Make the project root both the working directory (so the relative paths
# below resolve against it) and the first entry on the import path.
os.chdir(current_dir)
sys.path.insert(0, current_dir)

# --- Run parameters ----------------------------------------------------------
# In this cell these values are only interpolated into the output filename.
# NOTE(review): `theme` is "rcv1" while main() reads NLTK's `reuters` corpus --
# confirm the label is intentional.
theme = "rcv1"
t = 7.0
max_sub = 5
depth = 3
synonyms = 0
add_noise = 0.0
branching = "balanced"

# --- Output location ---------------------------------------------------------
# Relative to the project root selected above; created if it does not exist.
DATA_GEN_DIR = "data_generation/generated_data"
filename = f'{DATA_GEN_DIR}/{theme}_hierarchy_t{t}_maxsub{max_sub}_depth{depth}_synonyms{synonyms}_noise{add_noise}_{branching}.csv'
os.makedirs(DATA_GEN_DIR, exist_ok=True)

def main():
    """Export the NLTK Reuters corpus to the CSV at the module-level `filename`.

    Downloads the ``reuters`` NLTK package, then emits one row per document
    that has at least one category. Each row contains:

    * ``item_id``  -- the NLTK file id of the document.
    * ``topic``    -- the raw document text with newlines replaced by spaces
      and leading/trailing whitespace stripped.
    * ``category 0`` / ``category 1`` / ``category 2`` -- all three hold the
      document's first listed category; any further categories are dropped.

    Documents with no categories are skipped. The CSV is written without the
    DataFrame index, and progress messages are printed to stdout.
    """
    print("Fetching NLTK Reuters...")
    nltk.download('reuters')

    category_columns = ("category 0", "category 1", "category 2")

    records = []
    for doc_id in reuters.fileids():
        # Flatten the document onto a single line.
        body = reuters.raw(doc_id).replace('\n', ' ').strip()
        labels = reuters.categories(doc_id)
        if labels:
            # Only the first label is kept; it is repeated in every category column.
            records.append({
                "item_id": doc_id,
                "topic": body,
                **dict.fromkeys(category_columns, labels[0]),
            })

    pd.DataFrame(records).to_csv(filename, index=False)
    print(f"Success! Data saved to {filename}")

# Run the export when this code is executed directly; the guard prevents it
# from running if the code is imported as a module instead.
if __name__ == "__main__":
    main()

Fetching NLTK Reuters...


[nltk_data] Downloading package reuters to /home/raosidha/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Success! Data saved to data_generation/generated_data/rcv1_hierarchy_t7.0_maxsub5_depth3_synonyms0_noise0.0_balanced.csv
