In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_mean_category_values(csv_file, data_name=''):
    """
    Draw a single line chart of the per-year mean of every category column.

    Rows are grouped by the 'year' column; each category becomes one line
    (colors are left to matplotlib's default cycle) and the legend is
    anchored just outside the upper-right corner of the axes.

    Parameters:
        csv_file (str): Path to the CSV file containing the data. It must
            provide a 'year' column and the eight category columns listed
            in the function body.
        data_name (str): Text appended to the plot title. Defaults to ''.
    """
    category_columns = ["pathophysiology", "epidemiology", "etiology", "history",
                        "physical", "exams", "differential", "therapeutic"]

    # Read the raw rows and average every category within each year
    records = pd.read_csv(csv_file)
    yearly_means = records.groupby("year")[category_columns].mean()
    years = yearly_means.index

    # One line per category, in the order of category_columns
    plt.figure(figsize=(10, 6))
    for column_name, column_means in yearly_means.items():
        plt.plot(years, column_means, label=column_name)

    # Decorations: title, axis labels, grid and the outside-right legend
    plt.title("Mean Category Values by Year " + data_name)
    plt.xlabel("Year")
    plt.ylabel("Mean Category Values")
    plt.grid(True)
    plt.legend(title="Categories", bbox_to_anchor=(1.0, 1), loc='upper left')

    # Note: tight_layout() is not called before the figure is shown
    plt.show()

# Example usage:
# plot_mean_category_values("aligned_annotations-dpoc-medical_specialist_metrics_435.csv")

In [None]:
# Draw the basic per-year chart once for each (CSV path, title suffix) pair
for csv_path, title_suffix in [
    ('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', '- Medical specialist'),
    ('llama/annotations-dpoc-llm_3_static_shot_metrics.csv', '- Llama'),
]:
    plot_mean_category_values(csv_path, title_suffix)

In [None]:
def plot_mean_category_values_subplots(csv_file, data_name=''):
    """
    Draw the per-year mean of each category in its own panel of a 2x4 grid.

    All panels share the y-axis; only the first panel of each row gets a
    y-axis label.

    Parameters:
        csv_file (str): Path to the CSV file. It must provide a 'year' column
            and the eight category columns listed in the function body.
        data_name (str): Text appended to the figure title. Defaults to ''.
    """
    category_columns = ["pathophysiology", "epidemiology", "etiology", "history",
                        "physical", "exams", "differential", "therapeutic"]
    grid_rows, grid_cols = 2, 4

    # Per-year means, with 'year' kept as a regular column
    yearly_means = (
        pd.read_csv(csv_file)
        .groupby("year")[category_columns]
        .mean()
        .reset_index()
    )
    years = yearly_means["year"]

    _, panel_grid = plt.subplots(grid_rows, grid_cols, figsize=(16, 8), sharey=True)
    panels = panel_grid.flatten()

    for position, (panel, column_name) in enumerate(zip(panels, category_columns)):
        # Positions 0 and 4 are the first panel of each row
        starts_row = position % grid_cols == 0

        panel.plot(years, yearly_means[column_name],
                   marker='o', linestyle='-', linewidth=2)
        panel.set_title(column_name.title())
        panel.set_xlabel("Year")
        panel.set_ylabel("Mean Value" if starts_row else "")
        panel.set_xticks(years)
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    # The overall title is added after tight_layout and placed at y=1.02
    plt.suptitle(f"Mean Category Values by Year {data_name}", y=1.02, fontsize=14)
    plt.show()
# Draw the 2x4 panel figure once for each (CSV path, title suffix) pair
for csv_path, title_suffix in [
    ('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', '- Medical specialist'),
    ('llama/annotations-dpoc-llm_3_static_shot_metrics.csv', '- Llama'),
]:
    plot_mean_category_values_subplots(csv_path, title_suffix)

In [None]:
def plot_mean_category_values_enhanced(csv_file, data_name=''):
    """
    Plot the per-year mean of every category with a fixed color/marker scheme.

    Each category gets its own color and marker; even-positioned categories
    are drawn solid and odd-positioned ones dashed. The legend is anchored
    outside the right edge of the axes.

    Parameters:
        csv_file (str): Path to the CSV file. It must provide a 'year' column
            and the eight category columns listed in the function body.
        data_name (str): Text appended to the plot title. Defaults to ''.
    """
    category_columns = ["pathophysiology", "epidemiology", "etiology", "history",
                        "physical", "exams", "differential", "therapeutic"]
    # Eight distinct colors and markers, paired with the categories by position
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
               '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']
    marker_shapes = ['o', 's', 'D', '^', 'v', '<', '>', 'p']

    # Per-year means, with 'year' kept as a regular column
    records = pd.read_csv(csv_file)
    yearly_means = records.groupby("year")[category_columns].mean().reset_index()
    years = yearly_means["year"]

    # The figure is opened first, then the style sheet is switched on
    # (plt.style.use changes matplotlib's global settings)
    plt.figure(figsize=(12, 6))
    plt.style.use('seaborn-v0_8-whitegrid')

    shared_line_options = {"markersize": 8, "linewidth": 2}
    styled_columns = zip(category_columns, palette, marker_shapes)
    for position, (column_name, hue, shape) in enumerate(styled_columns):
        dash_pattern = '-' if position % 2 == 0 else '--'
        plt.plot(years, yearly_means[column_name],
                 color=hue,
                 marker=shape,
                 linestyle=dash_pattern,
                 label=column_name.title(),
                 **shared_line_options)

    plt.title(f"Mean Category Values by Year {data_name}", fontsize=14)
    plt.xlabel("Year", fontsize=12)
    plt.ylabel("Mean Value", fontsize=12)
    plt.xticks(years)
    plt.legend(title="Categories",
               bbox_to_anchor=(1.05, 1),
               loc='upper left',
               frameon=True)
    plt.tight_layout()
    plt.show()
# Draw the styled per-year chart once for each (CSV path, title suffix) pair
for csv_path, title_suffix in [
    ('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', '- Medical specialist'),
    ('llama/annotations-dpoc-llm_3_static_shot_metrics.csv', '- Llama'),
]:
    plot_mean_category_values_enhanced(csv_path, title_suffix)

In [None]:
def plot_mean_category_values_enhanced(csv_file, data_name=''):
    """
    Plot the per-year mean of every category, one styled line per category.

    NOTE(review): a function with this exact name is also defined in other
    cells of this notebook; running this cell rebinds the name.

    Parameters:
        csv_file (str): Path to the CSV file. It must provide a 'year' column
            and the eight category columns listed in the function body.
        data_name (str): Text appended to the plot title. Defaults to ''.
    """
    categories = ["pathophysiology", "epidemiology", "etiology", "history",
                  "physical", "exams", "differential", "therapeutic"]
    # Position-matched styling: categories[k] uses line_colors[k] and point_markers[k]
    line_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
                   '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']
    point_markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p']

    # Average each category within every year
    source = pd.read_csv(csv_file)
    per_year = source.groupby("year")[categories].mean().reset_index()

    # Open the figure, then activate the style sheet (a global matplotlib setting)
    plt.figure(figsize=(12, 6))
    plt.style.use('seaborn-v0_8-whitegrid')

    for index in range(len(categories)):
        name = categories[index]
        # Odd positions are dashed, even positions solid
        if index % 2:
            pattern = '--'
        else:
            pattern = '-'
        plt.plot(per_year["year"], per_year[name],
                 label=name.title(),
                 linestyle=pattern,
                 linewidth=2,
                 markersize=8,
                 marker=point_markers[index],
                 color=line_colors[index])

    # Axis text, one tick per year, and a legend outside the right edge
    plt.xlabel("Year", fontsize=12)
    plt.ylabel("Mean Value", fontsize=12)
    plt.title(f"Mean Category Values by Year {data_name}", fontsize=14)
    plt.xticks(per_year["year"])
    plt.legend(title="Categories",
               bbox_to_anchor=(1.05, 1),
               loc='upper left',
               frameon=True)
    plt.tight_layout()
    plt.show()
# Draw the styled per-year chart once for each (CSV path, title suffix) pair
for csv_path, title_suffix in [
    ('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', '- Medical specialist'),
    ('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv', '- Llama'),
]:
    plot_mean_category_values_enhanced(csv_path, title_suffix)

In [None]:
def plot_mean_category_values_enhanced(csv_file, data_name='', ax=None):
    """
    Plot the per-year mean of every category onto a matplotlib Axes.

    The lines are labelled with the title-cased category names, but this
    function neither draws a legend nor shows the figure; both are left to
    the caller.

    Parameters:
        csv_file (str): Path to the CSV file. It must provide a 'year' column
            and the eight category columns listed in the function body.
        data_name (str): Text appended to the Axes title. Defaults to ''.
        ax: Axes to draw on. When None, a new figure of size (6, 3) with a
            single Axes is created via plt.subplots.
    """
    category_columns = ["pathophysiology", "epidemiology", "etiology", "history",
                        "physical", "exams", "differential", "therapeutic"]
    # Position-matched styling for the eight categories
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
               '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']
    marker_shapes = ['o', 's', 'D', '^', 'v', '<', '>', 'p']

    # Per-year means, with 'year' kept as a regular column
    yearly_means = (
        pd.read_csv(csv_file)
        .groupby("year")[category_columns]
        .mean()
        .reset_index()
    )
    years = yearly_means["year"]

    # Fall back to a fresh single-Axes figure when the caller supplied none
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 3))

    for position, column_name in enumerate(category_columns):
        # Even positions are solid, odd positions dashed
        dash_pattern = '-' if position % 2 == 0 else '--'
        ax.plot(years, yearly_means[column_name],
                color=palette[position],
                marker=marker_shapes[position],
                markersize=8,
                linewidth=2,
                linestyle=dash_pattern,
                label=column_name.title())

    ax.set_title(f"Mean Category Values by Year {data_name}", fontsize=12)
    ax.set_xlabel("Year", fontsize=10)
    ax.set_ylabel("Mean Value", fontsize=10)
    ax.set_xticks(years)
    # Fixed y ticks at the integers 0 through 7
    ax.set_yticks(range(0, 8))
    ax.grid(True, alpha=0.3)

# Prepare a 2x2 grid and draw one CSV per panel
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
plot_mean_category_values_enhanced('medical_specialist/annotations-dpoc-medical_specialist_metrics_86.csv', '- Medical specialist', ax=axes[0, 0])
plot_mean_category_values_enhanced('biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv', '- biobert-llama', ax=axes[0, 1])
plot_mean_category_values_enhanced('biobert_balanced/annotations-dpoc-biobert_metrics.csv', '- biobert', ax=axes[1, 0])
plot_mean_category_values_enhanced('llama/86_filtered/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv', '- Llama', ax=axes[1, 1])

# The plotting function labels every line but never draws a legend, so without
# this the eight series in each panel cannot be told apart. All panels use the
# same categories and line styles, so one shared legend built from the first
# panel's handles describes all four.
handles, labels = axes[0, 0].get_legend_handles_labels()

# Keep the right-hand strip of the figure free for the shared legend
plt.tight_layout(rect=(0, 0, 0.84, 1))
fig.legend(handles, labels, title="Categories", loc='center right', frameon=True)
plt.show()

In [None]:
def plot_mean_category_values_enhanced(csv_file, data_name=''):
    """
    Plot the per-year mean of row-normalized category values.

    Every row's category values are first divided by that row's category
    total, so each row sums to 1 across the categories; the per-year means of
    those shares are then drawn as one styled line per category.

    NOTE(review): a function with this exact name is also defined in other
    cells of this notebook; running this cell rebinds the name to this
    normalizing variant.

    Parameters:
        csv_file (str): Path to the CSV file. It must provide a 'year' column
            and the eight category columns listed in the function body.
        data_name (str): Text appended to the plot title. Defaults to ''.
    """
    category_columns = ["pathophysiology", "epidemiology", "etiology", "history",
                        "physical", "exams", "differential", "therapeutic"]
    # Position-matched styling for the eight categories
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
               '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']
    marker_shapes = ['o', 's', 'D', '^', 'v', '<', '>', 'p']

    raw = pd.read_csv(csv_file)

    # Turn each row's category values into shares of that row's total
    row_totals = raw[category_columns].sum(axis=1)
    shares = raw.copy()
    shares[category_columns] = raw[category_columns].div(row_totals, axis=0)

    # Average the shares within every year
    yearly_shares = shares.groupby("year")[category_columns].mean().reset_index()
    years = yearly_shares["year"]

    # Open the figure, then activate the style sheet (a global matplotlib setting)
    plt.figure(figsize=(12, 6))
    plt.style.use('seaborn-v0_8-whitegrid')

    fixed_line_options = dict(markersize=8, linewidth=2)
    for position, column_name in enumerate(category_columns):
        # Even positions are solid, odd positions dashed
        dash_pattern = '--' if position % 2 == 1 else '-'
        plt.plot(years, yearly_shares[column_name],
                 color=palette[position],
                 marker=marker_shapes[position],
                 linestyle=dash_pattern,
                 label=column_name.title(),
                 **fixed_line_options)

    plt.title(f"Normalized Mean Category Values by Year {data_name}", fontsize=14)
    plt.xlabel("Year", fontsize=12)
    plt.ylabel("Mean Value", fontsize=12)
    plt.xticks(years)
    plt.legend(title="Categories",
               bbox_to_anchor=(1.05, 1),
               loc='upper left',
               frameon=True)
    plt.tight_layout()
    plt.show()
# Draw the normalized per-year chart once for each (CSV path, title suffix) pair
for csv_path, title_suffix in [
    ('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', '- Medical specialist'),
    ('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv', '- Llama'),
]:
    plot_mean_category_values_enhanced(csv_path, title_suffix)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D # Import Line2D for custom legend handles
import numpy as np # Import numpy for Jaccard calculation

def plot_mean_category_values_enhanced_grouped_per_interval(csv_file, data_name=''):
    """
    Plots normalized mean category values from two groupings: 2-year intervals and clusters.

    - Solid lines represent rows grouped into 2-year intervals of 'year'.
    - Dashed lines represent rows grouped by the 'cluster kmeans 3 cat' column.
    - When both groupings yield the same number of groups, a weighted Jaccard
      index (sum of element-wise minima over sum of element-wise maxima) between
      the i-th interval and cluster i is written below the x-axis.

    Parameters:
        csv_file (str): Path to the CSV file. It must contain 'year',
            'cluster kmeans 3 cat' and the eight category columns listed below.
        data_name (str): Text appended to the plot title. Defaults to ''.

    Notes:
        Cluster ids are assumed to be the integers 0, 1, 2, ... in the same
        order as the year intervals; this is not checked against the data.
        The y-axis is fixed to the range 0 to 0.6.
    """
    categories = ["pathophysiology", "epidemiology", "etiology", "history",
                  "physical", "exams", "differential", "therapeutic"]
    # Stage names appended, in order, to the first interval labels
    stage_names = ["novice", "developing", "proficient"]

    # Load and process data
    df = pd.read_csv(csv_file)

    # Normalize each row so that the sum of category values is 1
    df_norm = df.copy()
    df_norm[categories] = df[categories].div(df[categories].sum(axis=1), axis=0)

    # Create a new column for 2-year groups (years 1-2 -> 1, years 3-4 -> 3, ...)
    df_norm['year_group'] = ((df_norm['year'] - 1) // 2) * 2 + 1
    mean_values = df_norm.groupby('year_group')[categories].mean().reset_index()

    # One label per interval actually present in the data, e.g. "1-2 (novice)".
    # Building the list from the data (instead of indexing three fixed entries)
    # keeps it the same length as mean_values, so files with fewer or more than
    # three intervals can still be plotted.
    intervals = []
    for position, start in enumerate(mean_values["year_group"]):
        interval_label = f"{int(start)}-{int(start) + 1}"
        if position < len(stage_names):
            interval_label = f"{interval_label} ({stage_names[position]})"
        intervals.append(interval_label)

    # --- Jaccard Index Calculation ---
    # Compares the mean vector of the i-th year interval with the mean vector of cluster i.
    cluster_means = df_norm.groupby('cluster kmeans 3 cat')[categories].mean()
    jaccard_scores = []

    # Only computed when both groupings have the same number of groups
    if len(cluster_means) == len(mean_values):
        for i in range(len(intervals)):
            year_vector = mean_values.loc[i, categories].values
            cluster_vector = cluster_means.loc[i, categories].values

            # Jaccard = sum(min(A,B)) / sum(max(A,B))
            intersection = np.sum(np.minimum(year_vector, cluster_vector))
            union = np.sum(np.maximum(year_vector, cluster_vector))

            # Avoid division by zero
            jaccard_scores.append(intersection / union if union != 0 else 0)

    # --- Plotting Configuration ---
    # Activate the style before the figure is created so it is already in
    # effect for everything drawn below.
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.figure(figsize=(13, 7))

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']
    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p']

    # Place cluster k at the x position of the k-th interval label
    cluster_ids = cluster_means.index.map(dict(enumerate(intervals)))

    # --- Plotting Loop ---
    for i, category in enumerate(categories):
        # Year interval data (solid line); only this line carries the legend label
        plt.plot(intervals, mean_values[category],
                 color=colors[i],
                 marker=markers[i],
                 markersize=8,
                 linewidth=2,
                 linestyle='-',
                 label=category.title())

        # Cluster data (dashed line), same color and marker as its category
        plt.plot(cluster_ids, cluster_means[category],
                 marker=markers[i],
                 linestyle='--',
                 color=colors[i],
                 markersize=8,
                 linewidth=2)

    # A space separates the fixed title from data_name, as in the other plot titles
    plt.title(f"Category Distribution by 2-Year Interval and Cluster {data_name}", fontsize=16)
    plt.ylabel("Mean Value", fontsize=12)
    plt.ylim(0, 0.6)
    plt.xticks(intervals)

    # --- Add Jaccard Index Text to the Plot ---
    if jaccard_scores:
        y_pos = -0.1
        # The heading is positioned in axes coordinates (transAxes) ...
        plt.text(0.5, y_pos - 0.014, "Jaccard Index:", transform=plt.gca().transAxes,
                 ha='center', fontsize=10, weight='bold')
        # ... while each score is positioned in data coordinates, under the
        # i-th interval tick and below the bottom of the y-axis range.
        for i, score in enumerate(jaccard_scores):
            plt.text(i, y_pos, f"{score:.3f}", ha='center', va='top', fontsize=10)

    # --- Create Custom Legend ---
    # Category handles come from the labelled (solid) lines
    handles, _ = plt.gca().get_legend_handles_labels()

    # Two extra gray entries explain what solid and dashed lines mean
    solid_line = Line2D([0], [0], color='gray', lw=2, linestyle='-', label='Year Interval Data')
    dashed_line = Line2D([0], [0], color='gray', lw=2, linestyle='--', label='Cluster Data')
    handles.extend([solid_line, dashed_line])

    plt.legend(handles=handles,
               title="Legend",
               bbox_to_anchor=(1.04, 1),
               loc='upper left',
               frameon=True)

    # Leave room on the right for the legend and below for the Jaccard text
    plt.subplots_adjust(right=0.80, bottom=0.15)

    plt.show()

# --- Calling the function once per (CSV path, title suffix) pair ---
interval_plot_inputs = [
    ('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', '- Medical specialist'),

    ('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv', '- Llama 10 shot tfidf'),
    ('llama/annotations-dpoc-llm_10_static_shot_metrics.csv', '- Llama 10 shot static'),
    ('llama/annotations-dpoc-llm_3_static_shot_metrics.csv', '- Llama 3 shot static'),

    ('biobert_balanced/annotations-dpoc-biobert_metrics.csv', '- Fine-tuned BioBERTpt'),
    ('biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv', '- Llama-BioBERTpt'),
]
for csv_path, title_suffix in interval_plot_inputs:
    plot_mean_category_values_enhanced_grouped_per_interval(csv_path, title_suffix)

In [None]:
import numpy as np

def plot_radar_mean_category_values_per_interval(csv_file, data_name=''):
    """
    Draw one radar (polar) chart per 2-year interval of normalized category means.

    Every row's category values are divided by that row's category total,
    rows are grouped into 2-year intervals of 'year', and the mean share of
    each category within an interval is drawn as a closed, filled polygon.

    Parameters:
        csv_file (str): Path to the CSV file. It must provide a 'year' column
            and the eight category columns listed in the function body.
        data_name (str): Text appended to the figure title. Defaults to ''.

    Notes:
        The radial axis is fixed to the range 0 to 0.6.
    """
    df = pd.read_csv(csv_file)
    categories = ["pathophysiology", "epidemiology", "etiology", "history",
                  "physical", "exams", "differential", "therapeutic"]

    # Normalize each row so that the sum of category values is 1
    df_norm = df.copy()
    df_norm[categories] = df[categories].div(df[categories].sum(axis=1), axis=0)

    # Years 1-2 -> group 1, years 3-4 -> group 3, ...
    df_norm['year_group'] = ((df_norm['year'] - 1) // 2) * 2 + 1
    mean_values = df_norm.groupby('year_group')[categories].mean().reset_index()
    intervals = [f"{int(start)}-{int(start)+1}" for start in mean_values["year_group"]]

    # One evenly spaced angle per category; the first angle is repeated at the
    # end so that the plotted polygon closes on itself
    num_vars = len(categories)
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]

    n_intervals = len(intervals)

    # Activate the style BEFORE the figure and axes are created: applying it
    # afterwards would not restyle the axes that already exist.
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, axes = plt.subplots(1, n_intervals, subplot_kw=dict(polar=True), figsize=(5*n_intervals, 5))
    # With a single column plt.subplots returns one Axes rather than a sequence
    if n_intervals == 1:
        axes = [axes]

    for ax, interval, row in zip(axes, intervals, mean_values[categories].values):
        # Repeat the first value to match the closed list of angles
        values = row.tolist()
        values += values[:1]

        ax.plot(angles, values, linewidth=2)
        ax.fill(angles, values, alpha=0.25)
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels([cat.title() for cat in categories], fontsize=11)
        ax.set_yticks(np.linspace(0, 0.6, 7))
        ax.set_ylim(0, 0.6)
        ax.set_title(f"Years {interval}", fontsize=13, pad=20)
        ax.grid(True, alpha=0.3)

    fig.suptitle(f"Normalized Mean Category Values by 2-Year Groups (Radar) {data_name}", fontsize=16, y=1.08)
    plt.tight_layout()
    plt.show()

# Draw the radar charts once for each (CSV path, title suffix) pair
for csv_path, title_suffix in [
    ('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', '- Medical specialist'),
    ('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv', '- Llama'),
]:
    plot_radar_mean_category_values_per_interval(csv_path, title_suffix)
