In [2]:
import os
import sys
from dotenv import load_dotenv
load_dotenv() 

# Name of the project root directory that the notebook must run from.
target_folder = "phate-for-text"

# Start at the kernel's working directory and climb one level at a time
# until the directory's own name equals `target_folder`.
current_dir = os.getcwd()
while True:
    if os.path.basename(current_dir) == target_folder:
        break
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        # The parent of the filesystem root is the root itself, so there is
        # nowhere further up to look.
        raise FileNotFoundError(f"{target_folder} not found in the directory tree.")
    current_dir = parent_dir

# Make the located directory both the working directory and the first
# entry searched for imports.
os.chdir(current_dir)
sys.path.insert(0, current_dir)

In [3]:
import os
import pandas as pd
import re

In [4]:
def extract_filename_info(filename):
    """Parse the experiment settings encoded in a results file name.

    The name must start with ``results_all_methods_`` and end with ``.csv``;
    in between it carries the theme, ``t``, ``maxsub`` and ``depth`` values,
    optionally followed by ``_synonyms<N>``, ``_noise<X>`` and a branching
    mode (``increasing``, ``decreasing``, ``constant`` or ``random``).

    Parameters
    ----------
    filename : str
        Bare file name (not a path) to parse.

    Returns
    -------
    dict or None
        Mapping with the string-valued keys ``theme``, ``t``, ``max_sub``,
        ``depth``, ``synonyms``, ``noise`` and ``branching``. Absent optional
        parts default to ``"0"``, ``"0"`` and ``"constant"`` respectively.
        ``None`` when the name does not follow the expected layout.
    """
    filename_regex = (
        r"results_all_methods_(?P<theme>.+)_hierarchy_t(?P<t>[\d\.]+)_"
        r"maxsub(?P<max_sub>\d+)_depth(?P<depth>\d+)"
        r"(?:_synonyms(?P<synonyms>\d+))?"
        r"(?:_noise(?P<noise>[\d\.]+))?"
        r"(?:_(?P<branching>increasing|decreasing|constant|random))?"
        r"\.csv$"
    )

    parsed = re.match(filename_regex, filename)
    if parsed is None:
        return None

    fields = parsed.groupdict()
    # Optional groups that did not take part in the match come back as None;
    # replace them with the value used when the suffix is omitted.
    fallbacks = (("synonyms", "0"), ("noise", "0"), ("branching", "constant"))
    for key, fallback in fallbacks:
        fields[key] = fields[key] or fallback
    return fields


def process_files(
    folder_path,
    string_filters,
    excluded_param_substrings=None,
    excluded_file_substrings=("Indonesia",),
):
    """Merge matching ``results_all_methods_*.csv`` files into one DataFrame.

    A file in ``folder_path`` is merged when its name starts with
    ``results_all_methods_``, ends with ``.csv``, contains every string in
    ``string_filters``, contains none of ``excluded_file_substrings`` and can
    be parsed by ``extract_filename_info``. For each merged file:

    * rows whose ``cluster_params`` text contains any of
      ``excluded_param_substrings`` are dropped (if that column exists);
    * the ``level`` column (if present) is overwritten with the 1-based rank
      of each value among the file's sorted unique levels;
    * every field parsed from the file name is added as a constant column.

    The merged frame is also written to
    ``processed_results_<filters joined by "_">.csv`` inside ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory containing the result CSV files; the output is written here.
    string_filters : list of str
        Substrings that must all occur in a file name for it to be merged.
    excluded_param_substrings : iterable of str, optional
        Substrings that disqualify a row when found in its ``cluster_params``
        value. Defaults to the set that was previously hard-coded.
    excluded_file_substrings : iterable of str, optional
        Substrings that disqualify a file when found in its name.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The merged frame (empty if nothing matched) and the names of the
        files that were merged, in processing order.
    """
    import warnings  # local import: only used to report skipped files

    if excluded_param_substrings is None:
        # NOTE(review): these are plain substring tests, so "'k': 4" also
        # matches e.g. "'k': 40" -- confirm that this is intended.
        excluded_param_substrings = (
            "'k': 4", "'t': 2", "'alpha': 2", "'alpha': 8",
            "alpha_end", "t_end", "k_end",
        )

    frames = []
    processed_files = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the merged frame reproducible between runs.
    for file in sorted(os.listdir(folder_path)):
        if not (file.endswith(".csv") and file.startswith("results_all_methods_")):
            continue
        if not all(s in file for s in string_filters):
            continue
        if any(s in file for s in excluded_file_substrings):
            continue

        # Parse the name before reading so that a file whose name does not
        # follow the expected layout is skipped instead of crashing on None.
        file_info = extract_filename_info(file)
        if file_info is None:
            warnings.warn(
                f"Skipping {file!r}: file name does not match the expected pattern."
            )
            continue

        df = pd.read_csv(os.path.join(folder_path, file))

        # Filter out rows based on undesired 'cluster_params' values
        if "cluster_params" in df.columns:
            is_excluded = (
                df["cluster_params"]
                .astype(str)
                .map(lambda param: any(p in param for p in excluded_param_substrings))
                .astype(bool)  # keeps `~` well-defined even for an empty frame
            )
            # Explicit copy so the column assignments below do not act on a
            # slice of the frame that was read from disk.
            df = df.loc[~is_excluded].copy()

        # Replace 'level' with its ordinal encoding (1 = smallest level)
        if "level" in df.columns:
            unique_levels = sorted(df["level"].unique())
            level_to_rank = {lvl: i + 1 for i, lvl in enumerate(unique_levels)}
            df["level"] = df["level"].map(level_to_rank)

        # Add extracted filename info as new columns
        for key, value in file_info.items():
            df[key] = value

        frames.append(df)
        processed_files.append(file)

    # Concatenate once at the end instead of growing a frame inside the loop.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    output_filename = "processed_results_" + "_".join(string_filters) + ".csv"
    output_path = os.path.join(folder_path, output_filename)
    combined_df.to_csv(output_path, index=False)
    return combined_df, processed_files

In [6]:

# Usage example: merge the result files of one dataset configuration.
embedding = 'text-embedding-3-large'
folder_path = f"{embedding}_results"  # Change this to the actual folder path

# Every string in the second argument must occur in a file's name for that
# file to be included in the merge.
combined_df, processed_files = process_files(
    folder_path,
    [
        'Offshore energy impacts on fisheries',
        't1.0',
        'maxsub5_depth3',
        'random',
    ],
)


In [27]:
# Run the merge for every (hierarchy shape, theme) combination. Each call
# writes its own CSV into `folder_path`; after the loop `combined_df` and
# `processed_files` hold the results of the last combination only.
for hierarchy_shape in ['maxsub5_depth3', 'maxsub3_depth5']:
    for theme in ['Offshore energy impacts on fisheries', 'Energy, Ecosystems, and Humans']:
        combined_df, processed_files = process_files(
            folder_path,
            [theme, 't1.0', hierarchy_shape, 'random'],
        )
