In [None]:
import pandas as pd
import ast
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix,roc_auc_score

# Paths of the prediction CSV files. They are passed as `files` to
# generate_table_and_calculate_metrics (read with pd.read_csv) and are also
# used as the keys of `model_dict`, so the four values must be distinct once
# filled in. They are empty placeholders here: set them before running the
# cells below.
before_5_non_acoustic: str = ''
before_5_non_linguistic: str = ''

before_10_non_acoustic: str = ''
before_10_non_linguistic: str = ''

In [None]:
# Embedding names associated with each prediction file, keyed by the file-path
# variables. Replace every list with the actual top-3 embedding names.
# NOTE(review): while the path variables still hold identical placeholder
# values the keys collide, and only the last entry of this literal survives.
model_dict = {
    before_5_non_acoustic: ['wav2vec', 'whisper', 'hubert'],
    before_10_non_acoustic: ['wav2vec', 'hubert', 'xvector'],
    before_5_non_linguistic: ['lama3', 'e5-large', 'cross-en-fr-roberta'],
    before_10_non_linguistic: ['cross-en-fr-roberta', 'distiluse-v1', 'e5-large'],
}

In [None]:
# Speaker-pairing sheet: the 'Name' column holds the control names and
# 'Use as a control for' holds the patient each control is paired with.
df_stats = pd.read_excel('speakes_pairs.xlsx')

# Normalise both columns to underscore-separated identifiers. For the paired
# column only the text before the first tab character is kept.
cn_names = [raw_name.replace(" ", "_") for raw_name in df_stats['Name'].tolist()]
pd_names = [
    paired.split("\t")[0].replace(" ", "_")
    for paired in df_stats['Use as a control for'].tolist()
]

# True labels: control names (cn_names) -> 1, patient names (pd_names) -> 0.
# The patient entries are merged last, so a name present in both lists ends
# up labelled 0.
label_dict = {**dict.fromkeys(cn_names, 1), **dict.fromkeys(pd_names, 0)}



In [5]:
import pandas as pd
import os

def combine_speaker_label(files):
    """
    Stacks the 'Speaker' and 'Predicted_Label' columns of several CSV files into one DataFrame.

    Args:
    - files (list of str): Paths of CSV files readable by ``pd.read_csv``.

    Returns:
    - combined_df (pd.DataFrame): The 'Speaker' and 'Predicted_Label' columns of every
      usable file, concatenated in input order with a fresh 0..n-1 index. When no file
      qualifies (or ``files`` is empty) an empty frame with those two columns is returned.

    Files that lack either required column are reported on stdout and skipped.
    """
    required_columns = ['Speaker', 'Predicted_Label']
    usable_frames = []

    for file in files:
        df = pd.read_csv(file)

        # A file missing either column is skipped rather than treated as an error.
        if all(column in df.columns for column in required_columns):
            usable_frames.append(df[required_columns])
        else:
            print(f"File {file} does not contain both 'Speaker' and 'Predicted_Label' columns and was skipped.")

    if not usable_frames:
        # Preserve the two-column shape even when nothing was collected.
        return pd.DataFrame(columns=required_columns)

    # Concatenate once at the end: this avoids re-copying the accumulated rows on
    # every iteration and avoids seeding the result with an empty object-dtype
    # frame, which would take part in dtype inference of the combined columns.
    return pd.concat(usable_frames, ignore_index=True)



In [None]:
def generate_table_and_calculate_metrics(files, label_dict, threshold=0.5):
    """
    Combines 'Speaker' and 'Predicted_Label' columns from multiple files, groups by 'Speaker' to calculate
    the average of 'Predicted_Label', applies a threshold, maps true labels, and calculates metrics.

    Args:
    - files (list of str): List of file paths.
    - label_dict (dict): Dictionary mapping speakers to true labels (0 or 1).
    - threshold (float): Speaker-level mean at or above which the prediction becomes 1.
      Defaults to 0.5.

    Returns:
    - pd.DataFrame: DataFrame with 'Speaker', thresholded 'Predicted_Label', and 'True_Label'.
      Speakers missing from label_dict are kept in the table with a NaN 'True_Label'.
    - dict: Dictionary with keys 'accuracy', 'f1_score', 'sensitivity', 'specificity' and 'AUC'.
      'specificity' is None when there are no true-0 speakers; 'AUC' is None when the labelled
      speakers all belong to a single class.
    """
    # Combine data from files
    combined_df = combine_speaker_label(files)

    # Group by 'Speaker' and calculate the mean of 'Predicted_Label' across all files
    combined_df = combined_df.groupby('Speaker', as_index=False).mean()

    # Keep the un-thresholded means: they serve as the ranking scores for AUC
    raw_predictions = combined_df['Predicted_Label'].copy()

    # Apply threshold to 'Predicted_Label': >= threshold becomes 1, otherwise 0.
    # A mean exactly equal to the threshold resolves to 1.
    # NOTE(review): if the per-file predictions are hard 0/1 labels and two files are
    # combined, a 1-vs-0 disagreement averages to 0.5 and is counted as 1 - confirm intended.
    combined_df['Predicted_Label'] = (raw_predictions >= threshold).astype('int64')

    # Map true labels to a new column 'True_Label' using label_dict
    combined_df['True_Label'] = combined_df['Speaker'].map(label_dict)

    # Only speakers with a known true label take part in the metrics
    valid_rows = combined_df['True_Label'].notna()
    y_true = combined_df.loc[valid_rows, 'True_Label']
    y_pred = combined_df.loc[valid_rows, 'Predicted_Label']
    y_scores = raw_predictions[valid_rows]  # Use raw predictions for AUC

    # Calculate metrics using sklearn
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Sensitivity is the recall of label 1.
    # NOTE(review): the label_dict built in this notebook assigns 1 to control names,
    # so "sensitivity" here is recall on that group - confirm this is the intended positive class.
    sensitivity = recall_score(y_true, y_pred)
    # labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking also works when
    # only one class occurs among the true and predicted labels.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else None
    # ROC AUC is undefined with a single true class; report None like specificity does.
    auc = roc_auc_score(y_true, y_scores) if y_true.nunique() > 1 else None

    # Store metrics in a dictionary
    metrics = {
        'accuracy': accuracy,
        'f1_score': f1,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'AUC': auc
    }

    return combined_df, metrics

In [None]:
# Evaluate the `before_5` pair: acoustic and linguistic prediction files are
# averaged per speaker and scored against label_dict.
files = [
    before_5_non_acoustic,
    before_5_non_linguistic,
]  # Replace with actual file paths
combined_df, metrics = generate_table_and_calculate_metrics(files, label_dict)
metrics

{'accuracy': 0.8,
 'f1_score': 0.8125,
 'sensitivity': 0.8666666666666667,
 'specificity': 0.7333333333333333,
 'AUC': 0.8577777777777778}

In [None]:
# Evaluate the `before_10` pair: acoustic and linguistic prediction files are
# averaged per speaker and scored against label_dict.
files = [
    before_10_non_acoustic,
    before_10_non_linguistic,
]  # Replace with actual file paths
combined_df, metrics = generate_table_and_calculate_metrics(files, label_dict)
metrics

{'accuracy': 0.7166666666666667,
 'f1_score': 0.7384615384615385,
 'sensitivity': 0.8,
 'specificity': 0.6333333333333333,
 'AUC': 0.7744444444444445}