In [None]:
import os
import sys
from dotenv import load_dotenv
load_dotenv() 

# Name of the folder that must become the working directory
target_folder = "phate-for-text"

# Start from wherever the kernel was launched and climb one level at a time
current_dir = os.getcwd()
while True:
    if os.path.basename(current_dir) == target_folder:
        break
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        # The parent of the filesystem root is the root itself: nothing left to search
        raise FileNotFoundError(f"{target_folder} not found in the directory tree.")
    current_dir = parent_dir

# Run the rest of the notebook from the located folder ...
os.chdir(current_dir)

# ... and give that folder top priority on the import path
sys.path.insert(0, current_dir)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.io as pio
import json
import ast

In [None]:
# "Energy, Ecosystems, and Humans" "Offshore energy impacts on fisheries" "West Java, Indonesia"

In [None]:
# A value of None means "all possibilities for that data parameter are included"
theme = None
t = 1.0
maxsub = 5
depth = 3
synonyms = None
noise = None
branching = 'random'

# Keys are the prefixes used when a results filename is assembled from these
# settings; the branching value is written without a prefix, hence the '' key.
param_lst = dict(zip(
    ('theme', 't', 'maxsub', 'depth', 'synonyms', 'noise', ''),
    (theme, t, maxsub, depth, synonyms, noise, branching),
))

embedding = 'text-embedding-3-large'

In [None]:

# The generated hierarchy is only loaded when every data parameter has a concrete value
all_params_fixed = None not in param_lst.values()
if all_params_fixed:
    hierarchy_csv = (
        f'data_generation/generated_data/{theme}_hierarchy_t{t}_maxsub{maxsub}'
        f'_depth{depth}_synonyms{synonyms}_noise{noise}_{branching}.csv'
    )
    topic_data = pd.read_csv(hierarchy_csv)
    # Count distinct top-level categories (a NaN entry counts as one value)
    num_seed_topics = topic_data['category 0'].nunique(dropna=False)

In [None]:
if None not in param_lst.values():
    # Every parameter is fixed: read the results file for that exact configuration.
    # NOTE(review): unlike the generated-data filename, this pattern has no noise
    # component — confirm it matches the files on disk.
    bertopic_file = (
        f'{embedding}_results/results_all_methods_{theme}_hierarchy_t{t}'
        f'_maxsub{maxsub}_depth{depth}_synonyms{synonyms}_{branching}.csv'
    )
else:
    # Some parameters are unconstrained: build the name from the fixed ones only
    name_parts = ["processed_results"]
    for key, val in param_lst.items():
        if val is None:
            continue
        # The theme is appended bare; every other value is prefixed with its key
        name_parts.append(f'_{val}' if key == 'theme' else f'_{key}{val}')

    file_string = "".join(name_parts) + '.csv'
    bertopic_file = f'{embedding}_results/' + file_string

bertopic_result_df = pd.read_csv(bertopic_file)

In [None]:
# bertopic_result_df = bertopic_result_df[bertopic_result_df['reduction_params']!="{}"]
# Keep only the rows whose clustering step has a non-empty parameter dictionary
has_cluster_params = bertopic_result_df['cluster_params'].ne("{}")
bertopic_result_df = bertopic_result_df[has_cluster_params]

In [None]:
# Rename "t" to "temp" to avoid overlap between the GPT temperature and the PHATE parameter
bertopic_result_df = bertopic_result_df.rename(columns={"t": "temp"})

In [None]:
# Parameter strings that omit defaults are rewritten to their fully spelled-out
# form so that they can be analysed alongside the explicit configurations.
replacement_map = {
    "{}": "{'n_components': 3, 'min_dist': 0.1, 'n_neighbors': 15}",
    "{'n_components': 300}": "{'n_components': 300, 'min_dist': 0.1, 'n_neighbors': 15}",
    "{'n_components': 100}": "{'n_components': 100, 'min_dist': 0.1, 'n_neighbors': 15}"
}

# Only rows produced with UMAP are touched; unmatched strings are left unchanged
umap_rows = bertopic_result_df['reduction_method'] == 'UMAP'
bertopic_result_df.loc[umap_rows, 'reduction_params'] = (
    bertopic_result_df.loc[umap_rows, 'reduction_params']
    .map(lambda params: replacement_map.get(str(params), params))
)



In [None]:
def analyze_feature_importance(df, target_col, predictor_cols, n_estimators=100, random_state=42, plot_violin=False):
    """
    Trains a Random Forest model to evaluate feature importance and optimal values.

    Rows whose target is NaN are dropped. Non-numeric predictors are label-encoded,
    with missing entries mapped to the label "Missing". Missing numeric entries are
    replaced by the placeholder -99999 for model fitting only; the placeholder is
    never proposed as an optimal value.

    Parameters:
    - df: Pandas DataFrame containing the data (not modified)
    - target_col: String, name of the target column
    - predictor_cols: List (or pandas Index) of predictor column names
    - n_estimators: Number of trees in the Random Forest (default: 100)
    - random_state: Random seed for reproducibility (default: 42)
    - plot_violin: Boolean, whether to plot violin plots for each predictor (default: False)

    Returns:
    - feature_importance: DataFrame with columns 'Feature' and 'Importance',
      sorted by decreasing importance
    - optimal_ranges: Dictionary mapping each feature to the observed value whose
      rows have the highest mean predicted target (None if the feature has no
      observed values)
    - optimal_scores: Dictionary mapping each feature to the largest single
      prediction among the rows holding that optimal value (None if the feature
      has no observed values)

    Raises:
    - ValueError: if no rows remain after dropping NaN targets
    """
    missing_label = "Missing"
    numeric_placeholder = -99999

    # Accept any iterable of column names (e.g. the Index returned by filter_and_expand)
    predictor_cols = list(predictor_cols)

    # Keep NaNs in predictors but drop NaNs in the target column
    df = df.dropna(subset=[target_col])
    if df.empty:
        raise ValueError(f"No rows with a non-missing '{target_col}' to fit on.")

    # Anything that is not numeric is treated as categorical
    categorical_cols = [col for col in predictor_cols if not pd.api.types.is_numeric_dtype(df[col])]
    numeric_cols = [col for col in predictor_cols if col not in categorical_cols]

    # Work on a copy so the caller's frame is left untouched
    df_encoded = df.copy()
    label_encoders = {}

    # Encode categorical variables. The missing mask is taken BEFORE the string
    # conversion: once converted, a NaN is just the text 'nan' and cannot be filled.
    for col in categorical_cols:
        is_missing = df[col].isna()
        df_encoded[col] = df[col].astype(str).mask(is_missing, missing_label)
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])
        label_encoders[col] = le

    # Handle NaNs in numeric columns by replacing them with a placeholder, column
    # by column, so a gap in one column never wipes out the values of the others.
    if numeric_cols:
        df_encoded[numeric_cols] = df_encoded[numeric_cols].fillna(numeric_placeholder)

    # Define features and target
    X = df_encoded[predictor_cols]
    y = df[target_col]

    # Train Random Forest model
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, y)

    # Compute feature importance
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    # Plot feature importance
    plt.figure(figsize=(8, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance in Random Forest')
    plt.show()

    # Compute the optimal value of each feature
    optimal_ranges = {}
    optimal_scores = {}
    for feature in predictor_cols:
        if feature in label_encoders:
            # Encoded labels; "Missing" is a legitimate category here
            candidates = df_encoded[feature]
        else:
            # Only values that were really observed, never the placeholder
            candidates = df[feature].dropna()
        feature_vals = np.sort(candidates.unique())
        if len(feature_vals) == 0:
            optimal_ranges[feature] = None
            optimal_scores[feature] = None
            continue

        pred_vals = [rf.predict(X[df_encoded[feature] == val]) for val in feature_vals]
        max_pred_idx = np.argmax([np.mean(pred) for pred in pred_vals])
        optimal_value = feature_vals[max_pred_idx]
        # The value is chosen by mean prediction; the reported score is the best
        # single prediction obtained with that value.
        optimal_scores[feature] = np.max(pred_vals[max_pred_idx])

        # Decode categorical variables back to original values
        if feature in label_encoders:
            optimal_value = label_encoders[feature].inverse_transform([optimal_value])[0]

        optimal_ranges[feature] = optimal_value

    # Optionally plot violin plots
    if plot_violin:
        for feature in predictor_cols:
            plt.figure(figsize=(10, 5))
            # Missing entries get their own violin: "Missing" for categorical
            # features, 'None' for numeric ones
            fill_label = missing_label if feature in categorical_cols else 'None'
            df_plot = df.copy()
            df_plot[feature] = df[feature].astype(str).mask(df[feature].isna(), fill_label)
            sns.violinplot(x=df_plot[feature], y=target_col, data=df_plot)

            plt.title(f'Distribution of {target_col} by {feature}')
            plt.xlabel(feature)
            plt.ylabel(target_col)
            plt.xticks(rotation=45)
            plt.show()

    return feature_importance, optimal_ranges, optimal_scores

def filter_and_expand(df, category, filter_val):
    """
    Filters the DataFrame based on a specific value in the given method column
    and expands the dictionary in the corresponding params column into separate columns.

    The params column may hold dictionaries or their string representation
    (parsed with ast.literal_eval). Missing entries (None/NaN) are treated as an
    empty dictionary. Value types are preserved per column where possible, e.g.
    an all-integer parameter stays integer. When no row matches, an empty frame
    and an empty column index are returned.

    Note: a parameter key that equals an existing column name results in a
    duplicated column label in the output.

    :param df: DataFrame to process (not modified)
    :param category: Either 'reduction' or 'cluster' to specify which method to filter
    :param filter_val: Value to filter for
    :return: Tuple of (filtered DataFrame with the params column replaced by one
             column per parameter, Index of the parameter column names)
    :raises TypeError: if a params entry is neither missing nor a dictionary
    :raises ValueError, SyntaxError: if a params string is not a valid Python literal
    """
    method_col = f"{category}_method"
    params_col = f"{category}_params"

    def _as_param_dict(raw):
        # Strings are parsed regardless of the column dtype; real dicts pass through
        parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
        if isinstance(parsed, dict):
            return parsed
        if pd.api.types.is_scalar(parsed) and pd.isna(parsed):
            return {}
        raise TypeError(
            f"Expected a dict in column '{params_col}', got {type(parsed).__name__}: {parsed!r}"
        )

    # Filter the DataFrame
    filtered_df = df[df[method_col] == filter_val].copy()

    # Build the parameter table in one go. Reusing the filtered index keeps the
    # rows aligned for the concat below, and an empty match still yields a
    # DataFrame (with no columns) rather than a Series.
    records = [_as_param_dict(raw) for raw in filtered_df[params_col]]
    params_df = pd.DataFrame(records, index=filtered_df.index)

    # Concatenate the expanded parameters with the original DataFrame
    filtered_df = pd.concat([filtered_df.drop(columns=[params_col]), params_df], axis=1)

    return filtered_df, params_df.columns


In [None]:
def analyze_feature_statistics(df, target_col, predictor_cols):
    """
    Analyzes feature relevance using statistical associations with the target column.

    Rows whose target is NaN are ignored. Numeric features are scored by the
    absolute correlation with the target (Series.corr); every other feature is
    scored by the spread (max - min) of the per-category target means, with
    missing entries forming their own "Missing" category.

    Parameters:
    - df: Pandas DataFrame containing the data (not modified)
    - target_col: String, name of the target column
    - predictor_cols: List (or pandas Index) of predictor column names

    Returns:
    - feature_scores: DataFrame with columns 'Feature', 'Score' and 'Type', sorted
      by decreasing score (empty, but with those columns, if nothing was scored)
    - optimal_values: Dictionary mapping features to optimal values (based on max
      target mean); numeric features without any usable row are omitted
    """
    predictor_cols = list(predictor_cols)
    df = df.dropna(subset=[target_col])

    # Anything that is not numeric is treated as categorical
    categorical_cols = [col for col in predictor_cols if not pd.api.types.is_numeric_dtype(df[col])]
    numeric_cols = [col for col in predictor_cols if col not in categorical_cols]

    feature_scores = []
    optimal_values = {}

    # Handle numeric features
    for col in numeric_cols:
        temp_df = df[[col, target_col]].dropna()
        if temp_df.empty:
            continue
        corr = temp_df[col].corr(temp_df[target_col])
        # temp_df is non-empty and NaN-free here, so at least one group exists
        grouped = temp_df.groupby(col)[target_col].mean()
        feature_scores.append({'Feature': col, 'Score': abs(corr), 'Type': 'Numeric'})
        optimal_values[col] = grouped.idxmax()

    # Handle categorical features
    for col in categorical_cols:
        # Take the missing mask before converting to str (afterwards a NaN is just
        # the text 'nan'); use a separate label Series instead of writing into df
        labels = df[col].astype(str).mask(df[col].isna(), "Missing")
        grouped = df[target_col].groupby(labels).mean()
        if not grouped.empty:
            opt_val = grouped.idxmax()
            score = grouped.max() - grouped.min()
        else:
            opt_val = None
            score = 0
        feature_scores.append({'Feature': col, 'Score': score, 'Type': 'Categorical'})
        optimal_values[col] = opt_val

    # Explicit columns keep the sort valid even when no feature was scored
    feature_scores_df = pd.DataFrame(
        feature_scores, columns=['Feature', 'Score', 'Type']
    ).sort_values(by='Score', ascending=False)

    return feature_scores_df, optimal_values

## Compare Methods

In [None]:
score = 'ARI'
mean_score_col = f'mean_{score}'
config_cols = ['reduction_method', 'cluster_method', 'reduction_params', 'cluster_params']

# Attach to every row the mean score of all rows sharing its method/parameter combination
bertopic_result_df[mean_score_col] = (
    bertopic_result_df.groupby(config_cols)[score].transform('mean')
)

# Rank the two method choices by how much they drive the averaged score
feature_importance, optimal_ranges, optimal_scores = analyze_feature_importance(
    bertopic_result_df,
    mean_score_col,
    ['reduction_method', 'cluster_method'],
    plot_violin=True,
)

print(optimal_ranges)
optimal_scores

## Compare parameters for any given method

In [None]:
# Restrict to UMAP rows and turn each UMAP parameter into its own column
df_param, cols = filter_and_expand(df=bertopic_result_df, category='reduction', filter_val='UMAP')
df_param1 = df_param

# Rank the UMAP parameters by their influence on the averaged score
feature_importance, optimal_ranges, optimal_scores = analyze_feature_importance(
    df=df_param,
    target_col=f'mean_{score}',
    predictor_cols=cols,
    plot_violin=True,
)
print(optimal_scores, optimal_ranges, sep='\n')

In [None]:
# Restrict to PHATE rows and turn each PHATE parameter into its own column
df_param, cols = filter_and_expand(df=bertopic_result_df, category='reduction', filter_val='PHATE')
df_param1 = df_param

# Rank the PHATE parameters by their influence on the averaged score
feature_importance, optimal_ranges, optimal_scores = analyze_feature_importance(
    df=df_param,
    target_col=f'mean_{score}',
    predictor_cols=cols,
    plot_violin=True,
)
print(optimal_scores, optimal_ranges, sep='\n')

In [None]:
# Model-free cross-check on the most recently expanded parameter frame
analyze_feature_statistics(
    df=df_param,
    target_col=f'mean_{score}',
    predictor_cols=cols,
)