In [None]:
import pandas as pd

# Load one metrics table per approach, keyed by approach name.
# NOTE(review): the 'Human_84' entry reads a file ending in `_metrics_86.csv`,
# whereas the later cells of this notebook read `_metrics_84.csv` under the
# same key -- confirm which file is intended.
# NOTE(review): 'Llama_aug' currently reads the same CSV as 'Llama'; the
# augmented file is kept below as a commented-out entry.
metrics_all = {
    approach: pd.read_csv(csv_path)
    for approach, csv_path in (
        ('Human_84', 'medical_specialist/annotations-dpoc-medical_specialist_metrics_86.csv'),
        ('Human_435', 'medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv'),
        ('BioBERT', 'biobert_balanced/annotations-dpoc-biobert_metrics.csv'),
        ('BioBERT_Llama', 'biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv'),
        ('Llama', 'llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv'),
        # ('Llama_aug', 'llama/annotations-dpoc-llm_augmented_10_tf_idf_custom_shot_metrics.csv'),
        ('Llama_aug', 'llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv'),
    )
}

In [None]:
import pandas as pd
import os # Import os to check for file existence
import numpy as np # Import numpy for calculations

def calculate_correlations(metrics_dict, year_col='year', cluster_col='cluster kmeans 3 cat'):
    """
    Calculates the Pearson correlation between the *biannual year* group of
    the student (years 1-2 -> 1, 3-4 -> 2, 5-6 -> 3) and the cluster column
    for each DataFrame in the input dictionary.

    Args:
        metrics_dict (dict): A dictionary where keys are approach names (str)
                             and values are pandas DataFrames.
        year_col (str): Name of the column holding the student's year.
                        Defaults to 'year'.
        cluster_col (str): Name of the column holding the cluster assignment.
                           Defaults to 'cluster kmeans 3 cat'.

    Returns:
        dict: A dictionary where keys are the approach names and values
              are the calculated correlation coefficients. The value is
              None when a required column is missing, when no row has a
              numeric value in both columns, or when the calculation raised.
    """
    correlations = {}

    for approach_name, df in metrics_dict.items():
        print(f"--- Processing: {approach_name} ---")

        # Guard clause: report every missing column, then move on.
        missing_cols = [col for col in (year_col, cluster_col) if col not in df.columns]
        if missing_cols:
            print(f"Skipping {approach_name}: Missing required columns.")
            for col in missing_cols:
                print(f"  Missing column: '{col}'")
            correlations[approach_name] = None
            continue

        try:
            # Non-numeric entries become NaN and are dropped below.
            year_data = pd.to_numeric(df[year_col], errors='coerce')
            cluster_data = pd.to_numeric(df[cluster_col], errors='coerce')

            # Map years to biannual groups (1,2)->1, (3,4)->2, (5,6)->3
            biannual_year_data = np.ceil(year_data / 2)

            # Keep only rows where both values are numeric
            valid_data = pd.DataFrame({
                'biannual_year': biannual_year_data,
                'cluster': cluster_data,
            }).dropna()

            if valid_data.empty:
                print("No valid numeric data to correlate after cleaning.")
                correlations[approach_name] = None
                continue

            # Pearson is the pandas default; stated explicitly for the reader
            corr = valid_data['biannual_year'].corr(valid_data['cluster'], method='pearson')
            correlations[approach_name] = corr
            print(f"Biannual Correlation: {corr}")

        except Exception as e:
            # Best effort: one bad table must not stop the remaining approaches.
            print(f"Error calculating correlation for {approach_name}: {e}")
            correlations[approach_name] = None

    return correlations

# --- Example Usage ---
if __name__ == "__main__":
    # Runs when the file is executed as a script.
    # NOTE: inside a notebook kernel `__name__` is also "__main__", so this
    # block runs when the cell is executed and rebinds `metrics_all`.

    # Approach name -> CSV file with that approach's metrics
    file_paths = {
        'Human_84': 'medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv',
        'Human_435': 'medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv',
        'BioBERT': 'biobert_balanced/annotations-dpoc-biobert_metrics.csv',
        'BioBERT_Llama': 'biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv',
        'Llama': 'llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv',
    }
    
    metrics_all = {}
    
    print("Loading CSV files...")
    # Load dataframes; a missing file falls back to mock data, an unreadable
    # file to an empty DataFrame (reported later as missing columns).
    for name, path in file_paths.items():
        if os.path.exists(path):
            try:
                metrics_all[name] = pd.read_csv(path)
                print(f"Successfully loaded: {path}")
            except Exception as e:
                print(f"Error reading {path}: {e}")
                metrics_all[name] = pd.DataFrame() # Add empty df to avoid errors later
        else:
            print(f"File not found: {path}. Skipping '{name}'.")
            # Create mock data if files don't exist, so the function can be tested
            print(f"Creating mock data for '{name}' for demonstration.")
            # The mock columns must carry the names calculate_correlations
            # looks up ('year' and 'cluster kmeans 3 cat'); with any other
            # names every mocked approach is skipped as "missing columns".
            metrics_all[name] = pd.DataFrame({
                'year': [1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 3, 5],
                'cluster kmeans 3 cat': [0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0],
                'other_data': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm']
            })

    # Check if we have any data to process
    if metrics_all:
        # Calculate the correlations
        all_correlations = calculate_correlations(metrics_all)
        
        print("\n--- Final Correlation Results ---")
        for approach, corr_value in all_correlations.items():
            if corr_value is not None:
                print(f"{approach}: {corr_value:.4f}")
            else:
                print(f"{approach}: Calculation failed or columns missing")
    else:
        print("\nNo data was loaded. Please check your file paths.")



In [None]:
import pandas as pd
import os # Import os to check for file existence
import numpy as np # Import numpy for calculations
import matplotlib.pyplot as plt # For plotting
import seaborn as sns # For nicer plots

def _save_cluster_year_plot(valid_data, approach_name, cluster_col, corr):
    """Draw the box + strip plot for one approach, save it as a PNG and return the filename."""
    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        # A boxplot shows the distribution (median, quartiles)
        # NOTE(review): newer seaborn releases warn when `palette` is passed
        # without `hue` -- revisit when upgrading seaborn.
        sns.boxplot(
            data=valid_data,
            x='cluster',
            y='biannual_year',
            palette='viridis',
            ax=ax,
        )
        # A stripplot (jitter) shows the individual data points
        sns.stripplot(
            data=valid_data,
            x='cluster',
            y='biannual_year',
            color='0.3',  # dark grey
            alpha=0.4,
            jitter=0.2,
            ax=ax,
        )

        ax.set_title(
            f'Distribution of Biannual Year by Cluster\nApproach: {approach_name} (Pearson r: {corr:.3f})',
            fontsize=16,
        )
        ax.set_xlabel(cluster_col, fontsize=12)
        ax.set_ylabel('Biannual Year Group (1-2, 3-4, 5-6)', fontsize=12)

        # Relative filename: the PNG lands in the current working directory.
        plot_filename = f"{approach_name}_cluster_year_distribution.png"
        fig.savefig(plot_filename)
    finally:
        # Always release the figure, even when drawing or saving failed.
        plt.close(fig)
    return plot_filename


def analyze_and_plot_clusters(metrics_dict, year_col='year', cluster_col='cluster kmeans 3 cat'):
    """
    Calculates the Pearson correlation between the *biannual year* group
    (years 1-2 -> 1, 3-4 -> 2, 5-6 -> 3) of the student and the cluster
    column for each DataFrame.

    Also generates and saves a boxplot (with the individual points overlaid)
    visualizing the distribution of biannual years within each cluster.

    Args:
        metrics_dict (dict): A dictionary where keys are approach names (str)
                             and values are pandas DataFrames.
        year_col (str): Name of the column holding the student's year.
                        Defaults to 'year'.
        cluster_col (str): Name of the column holding the cluster assignment.
                           Defaults to 'cluster kmeans 3 cat'.

    Returns:
        tuple: (dict, list)
            - A dictionary of correlation coefficients. The value is None
              when a required column is missing, when no row has a numeric
              value in both columns, or when the calculation raised.
            - A list of filenames for the saved plots. An approach whose
              plot could not be produced is absent from this list but keeps
              its correlation.
    """
    correlations = {}
    plot_files = []

    for approach_name, df in metrics_dict.items():
        print(f"--- Processing: {approach_name} ---")

        # Guard clause: report every missing column, then move on.
        missing_cols = [col for col in (year_col, cluster_col) if col not in df.columns]
        if missing_cols:
            print(f"Skipping {approach_name}: Missing required columns.")
            for col in missing_cols:
                print(f"  Missing column: '{col}'")
            correlations[approach_name] = None
            continue

        # 1. Correlation
        try:
            # Non-numeric entries become NaN and are dropped below.
            year_data = pd.to_numeric(df[year_col], errors='coerce')
            cluster_data = pd.to_numeric(df[cluster_col], errors='coerce')

            # Map years to biannual groups (1,2)->1, (3,4)->2, (5,6)->3
            biannual_year_data = np.ceil(year_data / 2)

            # Clean frame shared by the correlation and the plot
            valid_data = pd.DataFrame({
                'biannual_year': biannual_year_data,
                'cluster': cluster_data,
            }).dropna()

            if valid_data.empty:
                print("No valid numeric data to correlate after cleaning.")
                correlations[approach_name] = None
                continue

            corr = valid_data['biannual_year'].corr(valid_data['cluster'], method='pearson')
            correlations[approach_name] = corr
            print(f"Biannual Correlation: {corr:.4f}")
        except Exception as e:
            print(f"Error during analysis for {approach_name}: {e}")
            correlations[approach_name] = None
            continue

        # 2. Plot. Handled separately so that a drawing or saving problem
        #    does not throw away the correlation computed above.
        try:
            plot_filename = _save_cluster_year_plot(valid_data, approach_name, cluster_col, corr)
        except Exception as e:
            print(f"Error while plotting {approach_name}: {e}")
            continue

        plot_files.append(plot_filename)
        print(f"Saved plot to {plot_filename}")

    return correlations, plot_files

# --- Example Usage ---
if __name__ == "__main__":
    # Entry point when the file is run as a script.

    # Approach name -> CSV file with that approach's metrics
    file_paths = dict(
        Human_84='medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv',
        Human_435='medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv',
        BioBERT='biobert_balanced/annotations-dpoc-biobert_metrics.csv',
        BioBERT_Llama='biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv',
        Llama='llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv',
    )

    metrics_all = {}

    print("Loading CSV files...")
    for name, path in file_paths.items():
        if not os.path.exists(path):
            # Missing file: substitute a small mock table so the analysis
            # below can still be demonstrated end to end.
            print(f"File not found: {path}. Skipping '{name}'.")
            print(f"Creating mock data for '{name}' for demonstration.")
            metrics_all[name] = pd.DataFrame({
                'year': [1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 3, 5],
                'cluster kmeans 3 cat': [0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0],
                'other_data': list('abcdefghijklm'),
            })
            continue

        try:
            metrics_all[name] = pd.read_csv(path)
            print(f"Successfully loaded: {path}")
        except Exception as err:
            # Unreadable file: keep the key with an empty frame so the
            # analysis reports it instead of failing.
            print(f"Error reading {path}: {err}")
            metrics_all[name] = pd.DataFrame()

    if not metrics_all:
        print("\nNo data was loaded. Please check your file paths.")
    else:
        # Correlations plus one saved plot per approach
        all_correlations, saved_plots = analyze_and_plot_clusters(metrics_all)

        print("\n--- Final Correlation Results ---")
        for approach, corr_value in all_correlations.items():
            print(
                f"{approach}: {corr_value:.4f}"
                if corr_value is not None
                else f"{approach}: Calculation failed or columns missing"
            )

        print("\n--- Saved Plot Files ---")
        for plot_file in saved_plots:
            print(plot_file)




In [None]:
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np

# --- User's Data Loading ---
# The paths below are relative to the working directory of this notebook.
# If any file is missing, a dummy table is generated instead so the analysis
# can still be demonstrated.
# NOTE(review): 'Llama_aug' reads the same CSV as 'Llama' -- confirm whether
# the augmented metrics file was meant here.
try:
    metrics_all = {
        'Human_84': pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv'),
        'Human_435': pd.read_csv('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv'),
        'BioBERT': pd.read_csv('biobert_balanced/annotations-dpoc-biobert_metrics.csv'),
        'BioBERT_Llama': pd.read_csv('biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv'),
        'Llama': pd.read_csv('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv'),
        'Llama_aug': pd.read_csv('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv')
    }
    print("Data loaded successfully.\n")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print("Please ensure the file paths are correct. Creating dummy data to demonstrate the analysis.\n")
    # Seeded generator: the dummy table (and every number printed from it)
    # is identical on each run. One generator feeds both columns so the two
    # draws come from different parts of the same stream.
    dummy_rng = np.random.default_rng(42)
    data = {
        'year': dummy_rng.choice([1, 2, 3, 4, 5, 6], 200),
        'cluster kmeans 3 cat': dummy_rng.choice(['A', 'B', 'C'], 200)
    }
    metrics_all = {'Dummy_Data': pd.DataFrame(data)}

# --- Analysis ---

# Columns of interest and the significance level used to interpret p-values
year_col = "year"
cluster_col = "cluster kmeans 3 cat"
alpha = 0.05  # Significance level

# Map a single year value to the label of its biannual group
def group_years(year):
    """
    Return 'Year 1-2', 'Year 3-4' or 'Year 5-6' for a year equal to one of
    1..6, and NaN for any other value.
    """
    year_groups = (
        ((1, 2), 'Year 1-2'),
        ((3, 4), 'Year 3-4'),
        ((5, 6), 'Year 5-6'),
    )
    for members, label in year_groups:
        if year in members:
            return label
    # Anything outside 1..6 (including missing values) has no group
    return np.nan

# For every approach, run a Chi-squared test of independence between the
# biannual year group and the cluster column, printing the observed table,
# the expected frequencies, the test statistics and an interpretation.
for model_name, df in metrics_all.items():
    print(f"--- Analyzing: {model_name} ---")

    # 1. Check if columns exist (before copying, so skipped tables cost nothing)
    if year_col not in df.columns or cluster_col not in df.columns:
        print(f"Skipping {model_name}: Missing required columns ('{year_col}' or '{cluster_col}').\n")
        continue

    # Work on a copy so the loaded DataFrame is left untouched
    df_analysis = df.copy()

    # 2. Create the biannual year group (NaN for years outside 1..6)
    df_analysis['biannual_group'] = df_analysis[year_col].apply(group_years)

    # 3. Handle missing data
    original_count = len(df_analysis)
    df_analysis = df_analysis.dropna(subset=['biannual_group', cluster_col])
    valid_count = len(df_analysis)
    
    if valid_count < original_count:
        print(f"Note: Removed {original_count - valid_count} rows with missing data in 'year' or 'cluster' columns.")
    
    if valid_count == 0:
        print("Skipping: No valid data remaining after filtering NaNs.\n")
        continue

    # 4. Create a contingency table (crosstab)
    contingency_table = pd.crosstab(df_analysis['biannual_group'], df_analysis[cluster_col])
    
    print("\nContingency Table (Observed Frequencies):")
    print(contingency_table)

    # With a single row or a single column the test has zero degrees of
    # freedom, so its p-value says nothing about association. Report that
    # instead of printing a misleading "no association" verdict.
    if min(contingency_table.shape) < 2:
        print("\nSkipping Chi-squared test: at least two year groups and two clusters are required.")
        print("\n" + "-"*50 + "\n")
        continue

    # 5. Perform the Chi-squared test
    try:
        chi2_stat, p_value, dof, expected_freqs = chi2_contingency(contingency_table)
        
        print(f"\nExpected Frequencies (from Chi-squared test):")
        print(pd.DataFrame(expected_freqs, 
                           index=contingency_table.index, 
                           columns=contingency_table.columns).round(2))

        # Check assumption: expected frequency > 5
        if (expected_freqs < 5).any():
            print("\nWarning: Some cells have an expected frequency of less than 5.")
            print("The Chi-squared test results may be less reliable.")

        print(f"\nChi-squared Test Results:")
        print(f"  Chi-squared statistic: {chi2_stat:.4f}")
        print(f"  Degrees of freedom (dof): {dof}")
        print(f"  p-value: {p_value:.4f}")

        # 6. Interpret the results against the significance level `alpha`
        if p_value < alpha:
            print(f"\nInterpretation (p < {alpha}):")
            print(f"  There IS a statistically significant association between")
            print(f"  the biannual year group and cluster membership ('{cluster_col}').")
        else:
            print(f"\nInterpretation (p >= {alpha}):")
            print(f"  There is NO statistically significant association between")
            print(f"  the biannual year group and cluster membership ('{cluster_col}').")
    
    except ValueError as e:
        print(f"\nError during Chi-squared test: {e}")
        print("This can happen if the contingency table has a row or column with all zeros.")

    print("\n" + "-"*50 + "\n")

print("Analysis complete.")
