In [10]:
import os
import sys
from dotenv import load_dotenv
load_dotenv() 

# Set the target folder name you want to reach
target_folder = "phate-for-text"

# Get the current working directory
current_dir = os.getcwd()

# Loop to move up the directory tree until we reach the target folder
while os.path.basename(current_dir) != target_folder:
    parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
    if parent_dir == current_dir:
        # If we reach the root directory and haven't found the target, exit
        raise FileNotFoundError(f"{target_folder} not found in the directory tree.")
    current_dir = parent_dir

# Change the working directory to the folder where "phate-for-text" is found
os.chdir(current_dir)

# Add the "phate-for-text" directory to sys.path
sys.path.insert(0, current_dir)

In [11]:
import pandas as pd


In [12]:

# Example dictionary: replace this with your actual dictionary
data_sources = {
    "Amazon":"text-embedding-3-large_results/other_amz_results.csv",
    "Web of science":"text-embedding-3-large_results/other_WOS_results.csv",
    "DBpedia":"text-embedding-3-large_results/other_dbpedia_results_test.csv",
    "Ecosystems (d)":"text-embedding-3-large_results/processed_results_Energy, Ecosystems, and Humans_t1.0_maxsub3_depth5_random.csv",
    "Fisheries (d)":"text-embedding-3-large_results/processed_results_Offshore energy impacts on fisheries_t1.0_maxsub3_depth5_random.csv",
    "Ecosystems (s)":"text-embedding-3-large_results/processed_results_Energy, Ecosystems, and Humans_t1.0_maxsub5_depth3_random.csv",
    "Fisheries (s)":"text-embedding-3-large_results/processed_results_Offshore energy impacts on fisheries_t1.0_maxsub5_depth3_random.csv",
}



In [13]:
results = []
maximum = False

for source, filepath in data_sources.items():
    df = pd.read_csv(filepath)
    df=df.fillna("None")


    if 'reduction_params' in df.columns:
        df['Params']= df['reduction_params']+df['cluster_params']
    elif 'Params' not in df.columns:
        df['Params'] = ['None']*len(df)
    
    # Group by the three columns and take the median of the score columns

    if maximum:
        grouped_mean = df.groupby(['reduction_method', 'cluster_method', 'level','Params'])[['ARI']].mean().reset_index()
    
        # Then group by the two columns and take the mean
        grouped_max = grouped_mean.groupby(['reduction_method', 'cluster_method'])[['ARI']].max().reset_index()
        grouped_max['source'] = source
    
    # Append to list
        results.append(grouped_max)
    else:
        grouped_median = df.groupby(['reduction_method', 'cluster_method', 'level'])[['ARI']].median().reset_index()
        
        # Then group by the two columns and take the meanmi
        grouped_mean = grouped_median.groupby(['reduction_method', 'cluster_method'])[['ARI']].mean().reset_index()
        
        # Add the source column
        grouped_mean['source'] = source
    
    # Append to list
        results.append(grouped_mean)

# Concatenate all results
final_df = pd.concat(results, ignore_index=True)

# Optional: display or save
final_df=final_df.replace({"DC":"Diffusion condensation"})
final_df=final_df.replace({"Diffusion Condensation":"Diffusion condensation"})
final_df = final_df[final_df['reduction_method']!="BASE-PHATE"]
final_df = final_df[final_df['reduction_method']!="None"]
final_df = final_df[final_df['reduction_method']!="tSNE"]

In [14]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
final_df[final_df['cluster_method']=="Diffusion Condensation"].sort_values(by='ARI',ascending= False)

Unnamed: 0,reduction_method,cluster_method,ARI,source


In [15]:
import matplotlib.pyplot as plt

# Assuming you already have your DataFrame
# Replace this with your actual DataFrame
df = final_df

# Create a multi-index DataFrame for the pivot table
reduction_order = ['PHATE', 'PCA', 'UMAP', 'T-SNE']
cluster_order = ['Diffusion condensation', 'Agglomerative','HDBSCAN']
source_order = ['Web of science', 'DBpedia', 'Amazon','Fisheries (d)', 'Fisheries (s)','Ecosystems (d)','Ecosystems (s)']

# Set categorical types with specified order
df['reduction_method'] = pd.Categorical(df['reduction_method'], categories=reduction_order, ordered=True)
df['cluster_method'] = pd.Categorical(df['cluster_method'], categories=cluster_order, ordered=True)
df['source'] = pd.Categorical(df['source'], categories=source_order, ordered=True)

# Now pivot
pivot = df.pivot_table(
    index=['reduction_method', 'cluster_method'],
    columns='source',
    values='ARI'  # No need for ['ARI'] unless you have multiple values
)

# Optional: sort the index to reflect the custom order
pivot = pivot.sort_index()
def pivot_to_latex(pivot_table, file_name="pivot_table.tex"):
    # Determine max and second max values per column
    max_values = pivot_table.max(axis=0)
    second_max_values = pivot_table.apply(lambda col: col[col != col.max()].max())

    with open(file_name, "w") as f:
        f.write("\\begin{table}\n")
        f.write("\\begin{document}\n")
        f.write("[ht]\n")
        f.write("\\centering\n")
        f.write("\\caption{Comparison of clustering metrics by reduction and cluster method}\n")
        f.write("\\label{tab:pivot_table}\n")
        f.write("\\begin{adjustbox}{max width=\\textwidth}\n")
        f.write("\\begin{tabular}{ll" + "c" * len(pivot_table.columns) + "}\n")
        f.write("\\toprule\n")
        print( pivot_table.columns)
        # Write the column headers
        headers = "reduction method & cluster method & " + " & ".join(
            [f"{source}" for source in pivot_table.columns]) + " \\\\\n"
        f.write(headers)
        f.write("\\midrule\n")

        # Write the table rows
        for (reduction_method, cluster_method), row in pivot_table.iterrows():
            f.write(f"{reduction_method} & {cluster_method} & ")
            row_values = []
            for col, value in row.items():
                if value == max_values[col]:
                    formatted = f"\\textbf{{{value:.3f}}}"
                elif value == second_max_values[col]:
                    formatted = f"\\textit{{{value:.3f}}}"
                else:
                    formatted = f"{value:.3f}"
                row_values.append(formatted)
            f.write(" & ".join(row_values) + " \\\\\n")

        f.write("\\bottomrule\n")
        f.write("\\end{tabular}\n")
        f.write("\\end{adjustbox}\n")
        f.write("\\end{table}\n")
        f.write("\\end{document}\n")


# Assuming `pivot` is already created from your DataFrame:
# pivot = df.pivot_table(...) as you described

pivot_to_latex(pivot, "pivot_table.tex")
# print("LaTeX code has been saved to 'pivot_table.tex'")

print("LaTeX code has been saved to 'pivot_table.tex'")


CategoricalIndex(['Web of science', 'DBpedia', 'Amazon', 'Fisheries (d)',
                  'Fisheries (s)', 'Ecosystems (d)', 'Ecosystems (s)'],
                 categories=['Web of science', 'DBpedia', 'Amazon', 'Fisheries (d)', 'Fisheries (s)', 'Ecosystems (d)', 'Ecosystems (s)'], ordered=True, dtype='category', name='source')
LaTeX code has been saved to 'pivot_table.tex'


In [16]:
pivot

Unnamed: 0_level_0,source,Web of science,DBpedia,Amazon,Fisheries (d),Fisheries (s),Ecosystems (d),Ecosystems (s)
reduction_method,cluster_method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PHATE,Diffusion condensation,0.005842,0.358797,0.246718,0.185601,0.207137,0.178871,0.230118
PHATE,Agglomerative,0.224911,0.467109,0.315135,0.210014,0.245929,0.207039,0.248099
PHATE,HDBSCAN,0.000121,0.007816,0.000637,0.015402,0.018614,0.082103,0.09724
PCA,Diffusion condensation,-0.001385,0.283691,0.246718,0.163419,0.171305,0.160698,0.199161
PCA,Agglomerative,0.2522,0.379651,0.343061,0.182017,0.240366,0.146692,0.245135
PCA,HDBSCAN,-3e-06,4.9e-05,-7.3e-05,0.055415,0.000385,0.032584,0.112675
UMAP,Diffusion condensation,0.00393,0.165275,0.212233,0.130139,0.125598,0.121873,0.133857
UMAP,Agglomerative,0.279785,0.418425,0.391752,0.245529,0.228249,0.228762,0.228528
UMAP,HDBSCAN,0.000183,0.000881,0.05837,0.142991,0.018243,0.075198,0.205423
