In [None]:
import pandas as pd
import glob
import matplotlib.pyplot as plt

# Scatter plot of LLM vs. medical-specialist clustering scores for one CSV.
def gen_scatter_clustering_comparison(file_comp):
    """Show a scatter plot of the two clustering columns of one comparison CSV.

    Reads ``file_comp`` with pandas, replaces the placeholder strings
    ``"0.0/0.0"`` and ``"0/0"`` with 0 in ``clustering_llm`` and
    ``clustering_medical_specialist``, casts both columns to float, and plots
    ``clustering_llm`` (x) against ``clustering_medical_specialist`` (y).

    Args:
        file_comp: Path of the CSV file. The figure title is this string with
            its first ``len("self_order_clustering_metrics_")`` characters and
            its last four characters removed, so the path is expected to start
            with that prefix and end in a four-character suffix such as
            ``.csv``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    x_col = 'clustering_llm'
    y_col = 'clustering_medical_specialist'

    scores = pd.read_csv(file_comp)
    for col in (x_col, y_col):
        # Placeholder ratio strings are mapped to 0 before the float cast.
        scores[col] = scores[col].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

    plt.figure(figsize=(10, 6))
    comparison_label = file_comp[len("self_order_clustering_metrics_"):-4]
    plt.title(f'\n{comparison_label}')
    plt.scatter(scores[x_col], scores[y_col], alpha=0.5)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.show()

# Collect the comparison CSVs from the working directory. glob returns its
# matches in a filesystem-dependent order, so the list is sorted to keep the
# order of the generated figures stable across runs and machines.
csv_files = sorted(glob.glob("self_order_clustering_metrics_medical_specialist_vs*.csv"))

# Draw one clustering scatter plot per comparison file.
for csv_path in csv_files:
    gen_scatter_clustering_comparison(csv_path)

In [None]:
def gen_clustering_correlation(file_comp):
    """Print the correlation between the two clustering columns of one CSV.

    The columns ``clustering_llm`` and ``clustering_medical_specialist`` are
    cleaned (``"0.0/0.0"`` and ``"0/0"`` become 0) and cast to float; the
    correlation is then computed with ``Series.corr`` using its default
    method.

    Args:
        file_comp: Path of the CSV file. The printed label is this string with
            its first ``len("self_order_clustering_metrics_")`` characters and
            its last four characters removed.

    Returns:
        None. The result is written to stdout.
    """
    columns = ('clustering_llm', 'clustering_medical_specialist')

    scores = pd.read_csv(file_comp)
    for col in columns:
        # Placeholder ratio strings are mapped to 0 before the float cast.
        scores[col] = scores[col].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

    llm_scores, specialist_scores = (scores[col] for col in columns)
    coefficient = llm_scores.corr(specialist_scores)

    comparison_label = file_comp[len("self_order_clustering_metrics_"):-4]
    print(f'Clustering correlation: {comparison_label}: {coefficient}')


# Print the clustering correlation of every comparison file.
for csv_path in csv_files:
    gen_clustering_correlation(csv_path)

In [None]:
def gen_scatter_self_order_comparison(file_comp):
    """Show a scatter plot of the two self-order columns of one comparison CSV.

    Reads ``file_comp`` with pandas, replaces the placeholder strings
    ``"0.0/0.0"`` and ``"0/0"`` with 0 in ``self_order_llm`` and
    ``self_order_medical_specialist``, casts both columns to float, and plots
    ``self_order_llm`` (x) against ``self_order_medical_specialist`` (y).

    Args:
        file_comp: Path of the CSV file. The figure title is this string with
            its first ``len("self_order_clustering_metrics_")`` characters and
            its last four characters removed.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    x_col = 'self_order_llm'
    y_col = 'self_order_medical_specialist'

    df = pd.read_csv(file_comp)
    for col in (x_col, y_col):
        # Placeholder ratio strings are mapped to 0 before the float cast.
        df[col] = df[col].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

    # Create the scatter plot
    plt.figure(figsize=(10, 6))
    plt.title(f'\n{file_comp[len("self_order_clustering_metrics_"):-4]}')
    plt.scatter(df[x_col], df[y_col], alpha=0.5)
    plt.xticks(list(range(9)))
    # The x-axis shows self_order_llm, so its limits must come from that
    # column; taking them from the y column could clip plotted points.
    plt.xlim(df[x_col].min(), df[x_col].max())
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.show()

# Draw one self-order scatter plot per comparison file.
for csv_path in csv_files:
    gen_scatter_self_order_comparison(csv_path)

In [None]:
def gen_self_order_correlation(file_comp):
    """Print the correlation between the two self-order columns of one CSV.

    The columns ``self_order_llm`` and ``self_order_medical_specialist`` are
    cleaned (``"0.0/0.0"`` and ``"0/0"`` become 0) and cast to float; the
    correlation is then computed with ``Series.corr`` using its default
    method.

    Args:
        file_comp: Path of the CSV file. The printed label is this string with
            its first ``len("self_order_clustering_metrics_")`` characters and
            its last four characters removed.

    Returns:
        None. The result is written to stdout.
    """
    columns = ('self_order_llm', 'self_order_medical_specialist')

    orders = pd.read_csv(file_comp)
    for col in columns:
        # Placeholder ratio strings are mapped to 0 before the float cast.
        orders[col] = orders[col].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

    llm_order, specialist_order = (orders[col] for col in columns)
    coefficient = llm_order.corr(specialist_order)

    comparison_label = file_comp[len("self_order_clustering_metrics_"):-4]
    print(f'Self Order correlation: {comparison_label}: {coefficient}')


# Print the self-order correlation of every comparison file.
for csv_path in csv_files:
    gen_self_order_correlation(csv_path)

## Ignoring negative values

In [None]:
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt

# Clustering scatter plots restricted to non-negative scores.
def gen_scatter_clustering_comparison(file_comp):
    """Plot the non-negative clustering scores of one CSV in two figures.

    NOTE: this definition replaces the function of the same name defined in an
    earlier cell; after this cell runs, the name refers to this version.

    The columns ``clustering_llm`` and ``clustering_medical_specialist`` are
    cleaned (``"0.0/0.0"`` and ``"0/0"`` become 0) and cast to float, and rows
    where either value is not ``>= 0`` are dropped. Two figures are then
    shown: a seaborn ``regplot`` (scatter plus regression line) and a plain
    matplotlib scatter of the same filtered data.

    Args:
        file_comp: Path of the CSV file. Both figure titles are this string
            with its first ``len("self_order_clustering_metrics_")``
            characters and its last four characters removed.

    Returns:
        None. Both figures are displayed with ``plt.show()``.
    """
    x_col = 'clustering_llm'
    y_col = 'clustering_medical_specialist'

    df = pd.read_csv(file_comp)
    for col in (x_col, y_col):
        # Placeholder ratio strings are mapped to 0 before the float cast.
        df[col] = df[col].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

    # Filter once; both figures use the same rows. NaN values fail the >= 0
    # comparison, so they are dropped as well.
    non_negative = df[(df[x_col] >= 0) & (df[y_col] >= 0)]
    comparison_label = file_comp[len("self_order_clustering_metrics_"):-4]

    def _start_figure():
        # Open a new figure carrying the comparison label as its title.
        plt.figure(figsize=(10, 6))
        plt.title(f'\n{comparison_label}')

    def _finish_figure():
        # Apply the shared axis labels and grid, then display the figure.
        plt.xlabel(x_col)
        plt.ylabel(y_col)
        plt.grid(True)
        plt.show()

    # Figure 1: scatter plot with regression line
    _start_figure()
    sns.regplot(x=x_col, y=y_col, data=non_negative, scatter_kws={'alpha': 0.5})
    _finish_figure()

    # Figure 2: plain scatter plot of the same filtered data
    _start_figure()
    plt.scatter(non_negative[x_col], non_negative[y_col], alpha=0.5)
    _finish_figure()

# Collect the comparison CSVs from the working directory. glob returns its
# matches in a filesystem-dependent order, so the list is sorted to keep the
# order of the generated figures stable across runs and machines.
csv_files = sorted(glob.glob("self_order_clustering_metrics_medical_specialist_vs*.csv"))

# Draw the non-negative clustering plots for every comparison file.
for csv_path in csv_files:
    gen_scatter_clustering_comparison(csv_path)

In [None]:
import pandas as pd

# Accumulator for the per-file results: one {'file', 'correlation'} dict is
# appended for every CSV passed to store_correlation.
correlation_results = []


def store_correlation(file_comp):
    """Compute the clustering correlation of one CSV and record it.

    The columns ``clustering_llm`` and ``clustering_medical_specialist`` are
    cleaned (``"0.0/0.0"`` and ``"0/0"`` become 0) and cast to float, their
    correlation is computed with ``Series.corr`` using its default method, and
    a record is appended to the module-level ``correlation_results`` list.

    NOTE(review): no negative-value filtering is applied here, although this
    cell sits under the "Ignoring negative values" heading - confirm that this
    is intended.

    Args:
        file_comp: Path of the CSV file. The record's ``'file'`` value is this
            string with its first ``len("self_order_clustering_metrics_")``
            characters and its last four characters removed.

    Returns:
        None. The result is stored in ``correlation_results``.
    """
    columns = ('clustering_llm', 'clustering_medical_specialist')

    scores = pd.read_csv(file_comp)
    for col in columns:
        # Placeholder ratio strings are mapped to 0 before the float cast.
        scores[col] = scores[col].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

    llm_scores, specialist_scores = (scores[col] for col in columns)
    coefficient = llm_scores.corr(specialist_scores)

    record = {
        'file': file_comp[len("self_order_clustering_metrics_"):-4],
        'correlation': coefficient,
    }
    correlation_results.append(record)

# Compute and record the clustering correlation of every comparison file.
for csv_path in csv_files:
    store_correlation(csv_path)

# Build the results table. The columns are named explicitly so that the
# 'file' column exists even when no CSV matched; otherwise the empty frame
# would make the sort below raise KeyError.
correlation_df = pd.DataFrame(correlation_results, columns=['file', 'correlation'])

# Order the rows by comparison label so the saved file is stable.
correlation_df = correlation_df.sort_values(by='file')

# Persist the table (without the index) next to the notebook.
correlation_df.to_csv('clustering_correlation_results.csv', index=False)

In [None]:
# def gen_scatter_clustering_comparison(file_comp):
#     df = pd.read_csv(file_comp)

#     df['clustering_llm'] = df['clustering_llm'].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)
#     df['clustering_medical_specialist'] = df['clustering_medical_specialist'].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

#     # Create the scatter plot
#     plt.figure(figsize=(10, 6))
#     plt.title(f'\n{file_comp[len("self_order_clustering_metrics_"):-4]}')
#     plt.scatter(df['clustering_llm'], df['clustering_medical_specialist'], alpha=0.5)
#     plt.xlabel('clustering_llm')
#     plt.ylabel('clustering_medical_specialist')
#     plt.grid(True)
#     plt.show()

# # Get the list of CSV files that match the pattern
# csv_files = glob.glob("self_order_clustering_metrics_medical_specialist_vs*.csv")

def gen_heatmap_clustering_comparison(file_comp):
    """Show a heatmap of the correlation matrix of one comparison CSV.

    The columns ``clustering_llm`` and ``clustering_medical_specialist`` are
    cleaned (``"0.0/0.0"`` and ``"0/0"`` become 0) and cast to float. The
    pairwise correlation matrix of all numeric columns of the file is then
    drawn as an annotated seaborn heatmap on a fixed [-1, 1] colour scale.

    Args:
        file_comp: Path of the CSV file. The figure title is this string with
            its first ``len("self_order_clustering_metrics_")`` characters and
            its last four characters removed.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    df = pd.read_csv(file_comp)

    for col in ('clustering_llm', 'clustering_medical_specialist'):
        # Placeholder ratio strings are mapped to 0 before the float cast.
        df[col] = df[col].replace("0.0/0.0", 0).replace("0/0", 0).astype(float)

    # Correlate numeric columns only. Any column that is still non-numeric
    # (only the two clustering columns are converted above) would make
    # DataFrame.corr() raise on pandas >= 2.0, where non-numeric columns are
    # no longer dropped silently.
    correlation_matrix = df.select_dtypes(include='number').corr()

    # Create the heatmap
    plt.figure(figsize=(10, 6))
    plt.title(f'\n{file_comp[len("self_order_clustering_metrics_"):-4]}')
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.show()


# Run the function for each file
for csv_path in csv_files:
    gen_heatmap_clustering_comparison(csv_path)

