In [6]:
import pandas as pd
import os

In [7]:
INPUT_FILE = "./data/rcv1_processed.csv"
OUTPUT_FILE = "./data/rcv1_final_hierarchy.csv"

In [8]:
TOP_LEVELS = {'CCAT', 'ECAT', 'GCAT', 'MCAT'}

In [9]:
def clean_hierarchy(label_string):
    """
    Finds a single hierarchical path from a list of RCV1 tags.
    """
    if not isinstance(label_string, str):
        return "None", "None", "None"
    
    labels = label_string.split('|')
    
    found_tops = [l for l in labels if l in TOP_LEVELS]
    if not found_tops:
        return "OTHER", "OTHER", "OTHER"
    
    cat0 = found_tops[0]
    
    prefix = cat0[0]
    children = [l for l in labels if l.startswith(prefix) and l != cat0]
    children.sort(key=len)
    
    cat1 = children[0] if len(children) > 0 else cat0
    cat2 = children[1] if len(children) > 1 else cat1
    
    return cat0, cat1, cat2

In [10]:
def main():
    if not os.path.exists(INPUT_FILE):
        print(f"Error: {INPUT_FILE} not found. Run the Import script first!")
        return

    print(f"Loading {INPUT_FILE}...")
    df = pd.read_csv(INPUT_FILE)

    print("Extracting hierarchy paths...")

    hierarchy = df['all_labels'].apply(lambda x: pd.Series(clean_hierarchy(x)))
    hierarchy.columns = ['category 0', 'category 1', 'category 2']
    
    final_df = pd.concat([df[['item_id', 'topic']], hierarchy], axis=1)
    
    final_df.to_csv(OUTPUT_FILE, index=False)
    print(f"Success! Saved to {OUTPUT_FILE}")
    print("\nSample of your new hierarchy:")
    print(final_df[['category 0', 'category 1', 'category 2']].head())

if __name__ == "__main__":
    main()

Loading ./data/rcv1_processed.csv...
Extracting hierarchy paths...
Success! Saved to ./data/rcv1_final_hierarchy.csv

Sample of your new hierarchy:
  category 0 category 1 category 2
0       ECAT        E11        E11
1       CCAT        C24        C24
2       CCAT        C15       C151
3       CCAT        C15       C151
4       CCAT        C11        C22
