In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load the aggregated LLM experiment results and keep every row whose
# 'Model' is not 'GPT4' (read and filter in a single chained expression).
agg = (
    pd.read_csv('aggregated_table_2024_12_01_6pm.csv')
    .loc[lambda frame: frame['Model'].ne('GPT4')]
)

In [3]:
# Load the combined results table of the traditional ML baselines.
# NOTE(review): the file name is spelled 'traiditional'; presumably this matches
# the file on disk -- confirm before correcting the spelling.
agg_trad = pd.read_csv('traiditional_ML_results_combined.csv')
# Optional filter (currently disabled): restrict the baselines to LogisticRegression.
# agg_trad = agg_trad[agg_trad['Model'] == 'LogisticRegression']

In [4]:
# Preview the last rows of the LLM results to check the columns and the
# string-encoded 'Prediction' lists.
agg.tail()

Unnamed: 0,Model,Prompt Type,Num Features,Sample Size,Class 1 Proportion,Set ID,Run Number,Accuracy,Precision,Recall,F1 Score,Prediction,PR_AUC,ROC_AUC
3771,13B-8bit,t_annony,20.0,32.0,0.5,Set_1_Prop_0.5,1.0,0.67,0.142857,0.307692,0.195122,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, ...",0.133956,0.515915
3772,13B-8bit,t_annony,20.0,32.0,0.5,Set_2_Prop_0.5,1.0,0.64,0.103448,0.230769,0.142857,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...",0.123873,0.465959
3773,13B-8bit,t_annony,20.0,32.0,0.5,Set_3_Prop_0.5,1.0,0.58,0.108108,0.307692,0.16,"[1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, ...",0.123264,0.464191
3774,13B-8bit,t_annony,20.0,32.0,0.5,Set_4_Prop_0.5,1.0,0.33,0.114286,0.615385,0.192771,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, ...",0.12033,0.45137
3775,13B-8bit,t_annony,20.0,32.0,0.5,Set_5_Prop_0.5,1.0,0.6,0.114286,0.307692,0.166667,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...",0.125165,0.475685


In [5]:
# Preview the first rows of the traditional ML baseline results.
agg_trad.head()

Unnamed: 0,Model,Tuning,Train Size,Minority Proportion,F1,ROCAUC,PRAUC,Iteration
0,DecisionTree,No,8,0.1,0.084405,0.505747,0.137094,5.5
1,DecisionTree,No,8,0.3,0.083794,0.513263,0.139826,5.5
2,DecisionTree,No,8,0.5,0.185305,0.481432,0.131115,5.5
3,DecisionTree,No,16,0.1,0.05404,0.480592,0.136,5.5
4,DecisionTree,No,16,0.3,0.144016,0.509947,0.137749,5.5


In [6]:
# List the distinct 'Model' values remaining in `agg` (GPT4 rows were removed
# when the table was loaded).
unique_values = agg['Model'].unique()
print(unique_values)

['7B-unquant' '7B-8bit' '13B-8bit']


In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import ast

def ensemble_predictions(experiment_results_df, voting='majority'):
    """
    Build one ensemble prediction per experimental configuration and score it.

    Rows are grouped by ('Prompt Type', 'Num Features', 'Sample Size',
    'Class 1 Proportion'). Every row of a group contributes the prediction
    list stored as a string in its 'Prediction' column; the lists are
    truncated to the shortest one and combined position by position.

    Parameters
    ----------
    experiment_results_df : pd.DataFrame
        Must contain the four grouping columns and 'Prediction'. 'F1 Score'
        is additionally read when ``voting='weighted'``.
    voting : {'majority', 'weighted'}
        'majority' rounds the plain mean of the votes. 'weighted' rounds the
        mean weighted by the 'F1 Score' of each row that actually voted; if
        the total weight is not a positive finite number the plain mean is
        used instead.

    Returns
    -------
    pd.DataFrame
        One row per scored group holding the grouping columns and
        'Ensemble F1'. Empty when no group could be scored.

    Raises
    ------
    ValueError
        If ``voting`` is not one of the supported methods.

    Notes
    -----
    NOTE(review): no ground-truth column is read here. The reference labels
    handed to ``f1_score`` are the predictions of the first parsed row of the
    group, so 'Ensemble F1' measures agreement with that row rather than
    accuracy against true labels. Confirm whether this is intended.
    """
    if voting not in ('majority', 'weighted'):
        raise ValueError(f"voting must be 'majority' or 'weighted', got {voting!r}")

    group_keys = ['Prompt Type', 'Num Features', 'Sample Size', 'Class 1 Proportion']
    ensemble_results = []

    for (prompt_type, num_features, sample_size, class_prop), group in experiment_results_df.groupby(group_keys):
        try:
            raw_predictions = group['Prediction'].tolist()
            if voting == 'weighted':
                row_weights = group['F1 Score'].tolist()
            else:
                row_weights = [1.0] * len(raw_predictions)

            # Keep every parsed prediction list paired with the weight of the
            # row it came from, so a skipped row can never shift the weights
            # onto the wrong predictions.
            members = []
            for raw, weight in zip(raw_predictions, row_weights):
                if not isinstance(raw, str):
                    continue
                try:
                    parsed = ast.literal_eval(raw)
                except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                    # Unparseable cell: ignore this row only.
                    continue
                if isinstance(parsed, (list, tuple)):
                    members.append((parsed, weight))

            if len(members) < 2:
                continue

            min_length = min(len(preds) for preds, _ in members)

            ensemble_preds = []
            for i in range(min_length):
                votes = [
                    (preds[i], weight)
                    for preds, weight in members
                    if isinstance(preds[i], (int, float))
                ]
                if not votes:
                    # No numeric vote at this position: nothing to combine.
                    continue

                total_weight = sum(weight for _, weight in votes)
                if voting == 'weighted' and 0 < total_weight < float('inf'):
                    score = sum(value * weight for value, weight in votes) / total_weight
                else:
                    # Majority vote, or weighted vote whose weights are all
                    # zero / NaN and therefore carry no information.
                    score = sum(value for value, _ in votes) / len(votes)
                # Built-in round() rounds halves to the even integer, so an
                # exact tie (0.5) resolves to class 0.
                ensemble_preds.append(round(score))

            # See NOTE(review) in the docstring: the reference is the first
            # parsed row's own predictions.
            reference = [y for y in members[0][0][:min_length] if isinstance(y, (int, float))]
            reference = reference[:len(ensemble_preds)]

            if ensemble_preds and len(reference) == len(ensemble_preds):
                ensemble_results.append({
                    'Prompt Type': prompt_type,
                    'Num Features': num_features,
                    'Sample Size': sample_size,
                    'Class 1 Proportion': class_prop,
                    'Ensemble F1': f1_score(reference, ensemble_preds, zero_division=1)
                })
        except Exception:
            # Best effort: a group that cannot be scored is left out of the
            # result, but KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    return pd.DataFrame(ensemble_results)

def run_ensemble_analysis(experiment_results_df):
    """
    Score the majority and weighted voting ensembles and report them.

    Both voting methods are computed first. Every non-empty result table is
    then written to '<method>_voting_results.csv' and stored in the returned
    dictionary, and finally the mean 'Ensemble F1' per 'Prompt Type' is
    printed for each stored method.

    Returns a dict mapping the method name to its result DataFrame; methods
    that produced no rows are omitted.
    """
    computed = {
        name: ensemble_predictions(experiment_results_df, voting=name)
        for name in ('majority', 'weighted')
    }

    results = {}
    for name, table in computed.items():
        if table.empty:
            continue
        table.to_csv(f'{name}_voting_results.csv', index=False)
        results[name] = table

    # Only non-empty tables were stored, so each one can be summarised directly.
    for name, table in results.items():
        print(f"\n{name.capitalize()} Voting Results by Prompt Type:")
        per_prompt = table.groupby('Prompt Type')[['Ensemble F1']].mean()
        print(per_prompt)

    return results

# Run the voting analysis on the table loaded earlier in the notebook.
if __name__ == "__main__":
    # Alternative input (disabled): re-read the aggregated table from disk.
    # experiment_results_df = pd.read_csv('aggregated_table_2024_12_01_6pm.csv')
    # Reuse `agg`, which already has the GPT4 rows filtered out.
    experiment_results_df = agg
    results = run_ensemble_analysis(experiment_results_df)


Majority Voting Results by Prompt Type:
             Ensemble F1
Prompt Type             
t_annony        0.396087
t_table         0.281389

Weighted Voting Results by Prompt Type:
             Ensemble F1
Prompt Type             
t_annony        0.149239
t_table         0.229471


In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import ast

# 1. Stacking Ensemble
def stacking_ensemble(experiment_results_df, meta_classifier=LogisticRegression()):
    """
    Fit a meta-classifier on the stacked prediction lists of each group.

    Rows are grouped by ('Prompt Type', 'Num Features', 'Sample Size',
    'Class 1 Proportion'). The prediction lists of a group are truncated to
    the shortest one, non-numeric entries are replaced by 0.0, and the lists
    become the feature columns of the meta-classifier. The data are split
    80/20 (``random_state=42``), the classifier is fitted on the larger part
    and 'Ensemble F1' is computed on the held-out part.

    Parameters
    ----------
    experiment_results_df : pd.DataFrame
        Must contain the four grouping columns and 'Prediction'.
    meta_classifier : estimator with ``fit`` / ``predict``
        The same instance is refitted for every group.

    Returns
    -------
    pd.DataFrame
        One row per scored group holding the grouping columns and
        'Ensemble F1'. Groups that cannot be split or fitted are left out.

    Notes
    -----
    NOTE(review): no ground-truth column is read here. The target vector is
    the first row's prediction list, which is also feature column 0 of the
    meta-classifier, so the target is available to the model as an input and
    the resulting F1 is inflated by construction. Confirm the intended source
    of the true labels.
    """
    group_keys = ['Prompt Type', 'Num Features', 'Sample Size', 'Class 1 Proportion']
    stacking_results = []

    for (prompt_type, num_features, sample_size, class_prop), group in experiment_results_df.groupby(group_keys):
        try:
            predictions_list = []
            for raw in group['Prediction'].tolist():
                if isinstance(raw, str):
                    try:
                        raw = ast.literal_eval(raw)
                    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                        # Unparseable cell: ignore this row only.
                        continue
                # Skip scalars (e.g. NaN) instead of letting len() fail and
                # discard the whole group.
                if isinstance(raw, (list, tuple, np.ndarray)):
                    predictions_list.append(raw)

            if len(predictions_list) < 2:
                continue

            min_length = min(len(preds) for preds in predictions_list)
            if min_length == 0:
                continue

            cleaned_predictions = [
                [float(p) if isinstance(p, (int, float)) else 0.0 for p in preds[:min_length]]
                for preds in predictions_list
            ]

            # One row per test instance, one column per contributing result row.
            X_meta = np.array(cleaned_predictions).T
            # See NOTE(review) in the docstring: this is also X_meta[:, 0].
            y_true = np.array(cleaned_predictions[0])

            X_train, X_test, y_train, y_test = train_test_split(
                X_meta, y_true, test_size=0.2, random_state=42
            )
            meta_classifier.fit(X_train, y_train)
            ensemble_preds = meta_classifier.predict(X_test)

            stacking_results.append({
                'Prompt Type': prompt_type,
                'Num Features': num_features,
                'Sample Size': sample_size,
                'Class 1 Proportion': class_prop,
                'Ensemble F1': f1_score(y_test, ensemble_preds, zero_division=1)
            })
        except Exception:
            # Best effort: groups that cannot be split, fitted or scored are
            # left out, but KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    return pd.DataFrame(stacking_results)

# 2. Confidence-based Voting
def confidence_voting(experiment_results_df, confidence_threshold=0.7):
    """
    Combine prediction lists using only rows whose 'F1 Score' is high enough.

    Rows are grouped by ('Prompt Type', 'Num Features', 'Sample Size',
    'Class 1 Proportion'). For every position of the (truncated) prediction
    lists, the numeric votes of rows with ``'F1 Score' >= confidence_threshold``
    are averaged with the F1 score as weight and rounded. When no row
    qualifies (or the qualifying weights sum to zero), the rounded plain mean
    of all numeric votes is used instead.

    Parameters
    ----------
    experiment_results_df : pd.DataFrame
        Must contain the four grouping columns, 'Prediction' and 'F1 Score'.
    confidence_threshold : float
        Minimum 'F1 Score' a row needs for its votes to count as confident.

    Returns
    -------
    pd.DataFrame
        One row per scored group holding the grouping columns and
        'Ensemble F1'. Empty when no group could be scored.

    Notes
    -----
    NOTE(review): no ground-truth column is read here. The reference labels
    handed to ``f1_score`` are the predictions of the first usable row of the
    group, so 'Ensemble F1' measures agreement with that row rather than
    accuracy against true labels. Confirm whether this is intended.
    """
    group_keys = ['Prompt Type', 'Num Features', 'Sample Size', 'Class 1 Proportion']
    confidence_results = []

    for (prompt_type, num_features, sample_size, class_prop), group in experiment_results_df.groupby(group_keys):
        try:
            # Pair every usable prediction list with the confidence of its own
            # row so that a skipped row cannot shift the confidences.
            members = []
            for raw, conf in zip(group['Prediction'].tolist(), group['F1 Score'].tolist()):
                if isinstance(raw, str):
                    try:
                        raw = ast.literal_eval(raw)
                    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                        # Unparseable cell: ignore this row only.
                        continue
                if isinstance(raw, (list, tuple, np.ndarray)):
                    members.append((raw, conf))

            if len(members) < 2:
                continue

            min_length = min(len(preds) for preds, _ in members)

            ensemble_preds = []
            for i in range(min_length):
                numeric_votes = [
                    (preds[i], conf)
                    for preds, conf in members
                    if isinstance(preds[i], (int, float))
                ]
                if not numeric_votes:
                    # No numeric vote at this position: nothing to combine.
                    continue

                trusted = [(value, conf) for value, conf in numeric_votes if conf >= confidence_threshold]
                total_conf = sum(conf for _, conf in trusted)
                if trusted and total_conf > 0:
                    score = sum(value * conf for value, conf in trusted) / total_conf
                else:
                    # Nobody is confident enough: fall back to the plain mean
                    # of every numeric vote.
                    score = float(np.mean([value for value, _ in numeric_votes]))
                # Built-in round() rounds halves to the even integer, so an
                # exact tie (0.5) resolves to class 0.
                ensemble_preds.append(round(score))

            # See NOTE(review) in the docstring: the reference is the first
            # usable row's own predictions.
            reference = [y for y in members[0][0][:min_length] if isinstance(y, (int, float))]
            reference = reference[:len(ensemble_preds)]

            if ensemble_preds and len(reference) == len(ensemble_preds):
                confidence_results.append({
                    'Prompt Type': prompt_type,
                    'Num Features': num_features,
                    'Sample Size': sample_size,
                    'Class 1 Proportion': class_prop,
                    'Ensemble F1': f1_score(reference, ensemble_preds, zero_division=1)
                })
        except Exception:
            # Best effort: a group that cannot be scored is left out of the
            # result, but KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    return pd.DataFrame(confidence_results)

# 3. Dynamic Weighted Voting
def dynamic_weighted_voting(experiment_results_df, window_size=3):
    """
    Combine prediction lists with weights from a rolling mean of 'F1 Score'.

    Rows are grouped by ('Prompt Type', 'Num Features', 'Sample Size',
    'Class 1 Proportion'). Within a group, the weight of a row is the rolling
    mean (``window=window_size``, ``min_periods=1``) of the 'F1 Score' column
    taken in the group's row order. For every position of the (truncated)
    prediction lists the numeric votes are averaged with these weights and
    rounded.

    Parameters
    ----------
    experiment_results_df : pd.DataFrame
        Must contain the four grouping columns, 'Prediction' and 'F1 Score'.
    window_size : int
        Window length of the rolling mean over the group's F1 scores.

    Returns
    -------
    pd.DataFrame
        One row per scored group holding the grouping columns and
        'Ensemble F1'. Empty when no group could be scored.

    Notes
    -----
    NOTE(review): no ground-truth column is read here. The reference labels
    handed to ``f1_score`` are the predictions of the first usable row of the
    group, so 'Ensemble F1' measures agreement with that row rather than
    accuracy against true labels. Confirm whether this is intended.

    NOTE(review): when the weights at a position do not sum to a positive
    number the weighted sum is divided by 1, as before; with all-zero weights
    the ensemble therefore predicts 0 regardless of the votes.
    """
    group_keys = ['Prompt Type', 'Num Features', 'Sample Size', 'Class 1 Proportion']
    dynamic_results = []

    for (prompt_type, num_features, sample_size, class_prop), group in experiment_results_df.groupby(group_keys):
        try:
            # Weights are computed over every row of the group first, then
            # paired with the row's own predictions, so a skipped row cannot
            # shift the weights onto the wrong predictions.
            dynamic_weights = (
                pd.Series(group['F1 Score'].values)
                .rolling(window=window_size, min_periods=1)
                .mean()
                .tolist()
            )

            members = []
            for raw, weight in zip(group['Prediction'].tolist(), dynamic_weights):
                if isinstance(raw, str):
                    try:
                        raw = ast.literal_eval(raw)
                    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                        # Unparseable cell: ignore this row only.
                        continue
                if isinstance(raw, (list, tuple, np.ndarray)):
                    members.append((raw, weight))

            if len(members) < 2:
                continue

            min_length = min(len(preds) for preds, _ in members)

            ensemble_preds = []
            for i in range(min_length):
                votes = [
                    (preds[i], weight)
                    for preds, weight in members
                    if isinstance(preds[i], (int, float))
                ]
                if not votes:
                    # No numeric vote at this position: nothing to combine.
                    continue

                weighted_sum = sum(value * weight for value, weight in votes)
                total_weight = sum(weight for _, weight in votes)
                divisor = total_weight if total_weight > 0 else 1
                # Built-in round() rounds halves to the even integer, so an
                # exact tie (0.5) resolves to class 0.
                ensemble_preds.append(round(weighted_sum / divisor))

            # See NOTE(review) in the docstring: the reference is the first
            # usable row's own predictions.
            reference = [y for y in members[0][0][:min_length] if isinstance(y, (int, float))]
            reference = reference[:len(ensemble_preds)]

            if ensemble_preds and len(reference) == len(ensemble_preds):
                dynamic_results.append({
                    'Prompt Type': prompt_type,
                    'Num Features': num_features,
                    'Sample Size': sample_size,
                    'Class 1 Proportion': class_prop,
                    'Ensemble F1': f1_score(reference, ensemble_preds, zero_division=1)
                })
        except Exception:
            # Best effort: a group that cannot be scored is left out of the
            # result, but KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    return pd.DataFrame(dynamic_results)

def run_all_ensemble_methods(experiment_results_df):
    """
    Evaluate the stacking, confidence and dynamic ensembles in that order.

    Each non-empty result table is written to '<method>_ensemble_results.csv'
    and its mean 'Ensemble F1' per 'Prompt Type' is printed. The returned
    dict maps every method name to its result DataFrame, including methods
    whose table is empty.
    """
    results = {
        'stacking': stacking_ensemble(experiment_results_df),
        'confidence': confidence_voting(experiment_results_df, confidence_threshold=0.7),
        'dynamic': dynamic_weighted_voting(experiment_results_df, window_size=3),
    }

    for method in results:
        table = results[method]
        if table.empty:
            # Nothing to save or summarise for this method.
            continue
        table.to_csv(f'{method}_ensemble_results.csv', index=False)
        print(f"\n{method.capitalize()} Ensemble Results by Prompt Type:")
        per_prompt = table.groupby('Prompt Type')[['Ensemble F1']].mean()
        print(per_prompt)

    return results

# Run the stacking / confidence / dynamic ensembles on the table loaded earlier
# in the notebook.
if __name__ == "__main__":
    # Alternative input (disabled): re-read the aggregated table from disk.
    # experiment_results_df = pd.read_csv('aggregated_table_2024_12_01_6pm.csv')
    # Reuse `agg`, which already has the GPT4 rows filtered out.
    experiment_results_df = agg
    results = run_all_ensemble_methods(experiment_results_df)


Stacking Ensemble Results by Prompt Type:
             Ensemble F1
Prompt Type             
t_annony        0.779458
t_table         0.800872

Confidence Ensemble Results by Prompt Type:
             Ensemble F1
Prompt Type             
t_annony        0.396087
t_table         0.281389

Dynamic Ensemble Results by Prompt Type:
             Ensemble F1
Prompt Type             
t_annony        0.386992
t_table         0.288887


In [9]:
# # WITH SAMPLE SIZES

# def stacking_ensemble(experiment_results_df, meta_classifier=LogisticRegression()):
#     """
#     Implements stacking ensemble considering sample sizes and prompt types
#     """
#     grouped_predictions = experiment_results_df.groupby(['Prompt Type', 'Sample Size'])
#     stacking_results = []
    
#     for (prompt_type, sample_size), group in grouped_predictions:
#         try:
#             predictions_list = []
#             for pred in group['Prediction'].tolist():
#                 try:
#                     if isinstance(pred, str):
#                         pred_list = ast.literal_eval(pred)
#                         predictions_list.append(pred_list)
#                 except:
#                     continue
            
#             if len(predictions_list) < 2:
#                 continue
                
#             min_length = min(len(pred_list) for pred_list in predictions_list)
#             cleaned_predictions = []
#             for pred_list in predictions_list:
#                 numeric_preds = []
#                 for p in pred_list[:min_length]:
#                     if isinstance(p, (int, float)):
#                         numeric_preds.append(float(p))
#                     else:
#                         numeric_preds.append(0.0)
#                 cleaned_predictions.append(numeric_preds)
            
#             X_meta = np.array(cleaned_predictions).T
#             y_true = np.array([float(y) for y in cleaned_predictions[0]])
            
#             if X_meta.shape[0] == len(y_true) and X_meta.shape[0] > 0:
#                 X_train, X_test, y_train, y_test = train_test_split(X_meta, y_true, test_size=0.2, random_state=42)
#                 meta_classifier.fit(X_train, y_train)
#                 ensemble_preds = meta_classifier.predict(X_test)
                
#                 metrics = {
#                     'Prompt Type': prompt_type,
#                     'Sample Size': sample_size,
#                     'Ensemble F1': f1_score(y_test, ensemble_preds, zero_division=1)
#                 }
#                 stacking_results.append(metrics)
#         except:
#             continue
    
#     return pd.DataFrame(stacking_results)

# def confidence_voting(experiment_results_df, confidence_threshold=0.7):
#     """
#     Implements confidence-based voting considering sample sizes and prompt types
#     """
#     grouped_predictions = experiment_results_df.groupby(['Prompt Type', 'Sample Size'])
#     confidence_results = []
    
#     for (prompt_type, sample_size), group in grouped_predictions:
#         try:
#             predictions_list = []
#             for pred in group['Prediction'].tolist():
#                 try:
#                     if isinstance(pred, str):
#                         pred_list = ast.literal_eval(pred)
#                         predictions_list.append(pred_list)
#                 except:
#                     continue
            
#             if len(predictions_list) < 2:
#                 continue
                
#             min_length = min(len(pred_list) for pred_list in predictions_list)
#             predictions_list = [pred_list[:min_length] for pred_list in predictions_list]
#             confidences = group['F1 Score'].values
            
#             ensemble_preds = []
#             for i in range(min_length):
#                 instance_predictions = []
#                 instance_confidences = []
                
#                 for pred_list, conf in zip(predictions_list, confidences):
#                     try:
#                         if isinstance(pred_list[i], (int, float)) and conf >= confidence_threshold:
#                             instance_predictions.append(pred_list[i])
#                             instance_confidences.append(conf)
#                     except:
#                         continue
                
#                 if instance_predictions:
#                     weighted_pred = sum(p * c for p, c in zip(instance_predictions, instance_confidences))
#                     weighted_pred /= sum(instance_confidences)
#                     ensemble_preds.append(round(weighted_pred))
#                 elif predictions_list:
#                     ensemble_preds.append(round(np.mean([p[i] for p in predictions_list if isinstance(p[i], (int, float))])))
            
#             true_labels = [y for y in predictions_list[0] if isinstance(y, (int, float))][:len(ensemble_preds)]
            
#             if len(true_labels) == len(ensemble_preds) and len(ensemble_preds) > 0:
#                 metrics = {
#                     'Prompt Type': prompt_type,
#                     'Sample Size': sample_size,
#                     'Ensemble F1': f1_score(true_labels, ensemble_preds, zero_division=1)
#                 }
#                 confidence_results.append(metrics)
#         except:
#             continue
    
#     return pd.DataFrame(confidence_results)

# def dynamic_weighted_voting(experiment_results_df, window_size=3):
#     """
#     Implements dynamic weighted voting considering sample sizes and prompt types
#     """
#     grouped_predictions = experiment_results_df.groupby(['Prompt Type', 'Sample Size'])
#     dynamic_results = []
    
#     for (prompt_type, sample_size), group in grouped_predictions:
#         try:
#             predictions_list = []
#             for pred in group['Prediction'].tolist():
#                 try:
#                     if isinstance(pred, str):
#                         pred_list = ast.literal_eval(pred)
#                         predictions_list.append(pred_list)
#                 except:
#                     continue
            
#             if len(predictions_list) < 2:
#                 continue
                
#             min_length = min(len(pred_list) for pred_list in predictions_list)
#             predictions_list = [pred_list[:min_length] for pred_list in predictions_list]
            
#             f1_scores = group['F1 Score'].values
#             dynamic_weights = pd.Series(f1_scores).rolling(window=window_size, min_periods=1).mean().values
            
#             ensemble_preds = []
#             for i in range(min_length):
#                 instance_predictions = []
#                 instance_weights = []
                
#                 for pred_list, weight in zip(predictions_list, dynamic_weights):
#                     try:
#                         if isinstance(pred_list[i], (int, float)):
#                             instance_predictions.append(pred_list[i])
#                             instance_weights.append(weight)
#                     except:
#                         continue
                
#                 if instance_predictions:
#                     weighted_pred = sum(p * w for p, w in zip(instance_predictions, instance_weights))
#                     weighted_pred /= sum(instance_weights) if sum(instance_weights) > 0 else 1
#                     ensemble_preds.append(round(weighted_pred))
#                 elif predictions_list:
#                     ensemble_preds.append(round(np.mean([p[i] for p in predictions_list if isinstance(p[i], (int, float))])))
            
#             true_labels = [y for y in predictions_list[0] if isinstance(y, (int, float))][:len(ensemble_preds)]
            
#             if len(true_labels) == len(ensemble_preds) and len(ensemble_preds) > 0:
#                 metrics = {
#                     'Prompt Type': prompt_type,
#                     'Sample Size': sample_size,
#                     'Ensemble F1': f1_score(true_labels, ensemble_preds, zero_division=1)
#                 }
#                 dynamic_results.append(metrics)
#         except:
#             continue
    
#     return pd.DataFrame(dynamic_results)

# def run_all_ensemble_methods(experiment_results_df):
#     """
#     Run all ensemble methods and compare results by prompt type and sample size
#     """

#     experiment_results_df = experiment_results_df[~experiment_results_df['Model'].str.contains('gpt4', case=False)]

#     results = {}
    
#     results['stacking'] = stacking_ensemble(experiment_results_df)
#     results['confidence'] = confidence_voting(experiment_results_df, confidence_threshold=0.7)
#     results['dynamic'] = dynamic_weighted_voting(experiment_results_df, window_size=3)
    
#     for method, df in results.items():
#         if not df.empty:
#             df.to_csv(f'{method}_ensemble_results.csv', index=False)
#             print(f"\n{method.capitalize()} Ensemble Results:")
#             summary = df.pivot_table(
#                 values='Ensemble F1',
#                 index='Prompt Type',
#                 columns='Sample Size',
#                 aggfunc='mean'
#             )
#             print(summary)
    
#     return results

# if __name__ == "__main__":
#     # experiment_results_df = pd.read_csv('aggregated_table_2024_12_01_6pm.csv')
#     experiment_results_df = agg

#     results = run_all_ensemble_methods(experiment_results_df)

In [10]:
# #WITH SAMPLE SIZES

# def ensemble_predictions(experiment_results_df, voting='majority'):
#     """
#     Performs ensemble predictions for 3 models and 2 prompt types, considering sample sizes
#     """
#     experiment_results_df = experiment_results_df[~experiment_results_df['Model'].str.contains('gpt4', case=False)]

#     grouped_predictions = experiment_results_df.groupby(['Prompt Type', 'Sample Size'])
#     ensemble_results = []
    
#     for (prompt_type, sample_size), group in grouped_predictions:
#         try:
#             predictions_list = []
#             for pred in group['Prediction'].tolist():
#                 try:
#                     if isinstance(pred, str):
#                         pred_list = ast.literal_eval(pred)
#                         predictions_list.append(pred_list)
#                 except:
#                     continue
            
#             if len(predictions_list) < 2:
#                 continue
                
#             min_length = min(len(pred_list) for pred_list in predictions_list)
#             predictions_list = [pred_list[:min_length] for pred_list in predictions_list]
            
#             ensemble_preds = []
#             for i in range(min_length):
#                 instance_predictions = []
#                 for pred_list in predictions_list:
#                     try:
#                         if isinstance(pred_list[i], (int, float)):
#                             instance_predictions.append(pred_list[i])
#                     except:
#                         continue
                
#                 if instance_predictions:
#                     if voting == 'majority':
#                         ensemble_pred = round(sum(instance_predictions) / len(instance_predictions))
#                     elif voting == 'weighted':
#                         weights = group['F1 Score'].tolist()
#                         weighted_sum = sum(p * w for p, w in zip(instance_predictions, weights[:len(instance_predictions)]))
#                         ensemble_pred = round(weighted_sum / sum(weights[:len(instance_predictions)]))
#                     ensemble_preds.append(ensemble_pred)
            
#             true_labels = [y for y in predictions_list[0] if isinstance(y, (int, float))][:len(ensemble_preds)]
            
#             if len(true_labels) == len(ensemble_preds) and len(ensemble_preds) > 0:
#                 metrics = {
#                     'Prompt Type': prompt_type,
#                     'Sample Size': sample_size,
#                     'Ensemble F1': f1_score(true_labels, ensemble_preds, zero_division=1)
#                 }
#                 ensemble_results.append(metrics)
#         except:
#             continue
    
#     return pd.DataFrame(ensemble_results)

# def run_ensemble_analysis(experiment_results_df):
#     experiment_results_df = experiment_results_df[~experiment_results_df['Model'].str.contains('gpt4', case=False)]

#     """
#     Run ensemble analysis for different voting methods
#     """
#     results = {}
    
#     majority_results = ensemble_predictions(experiment_results_df, voting='majority')
#     weighted_results = ensemble_predictions(experiment_results_df, voting='weighted')
    
#     if not majority_results.empty:
#         majority_results.to_csv('majority_voting_results.csv', index=False)
#         results['majority'] = majority_results
        
#     if not weighted_results.empty:
#         weighted_results.to_csv('weighted_voting_results.csv', index=False)
#         results['weighted'] = weighted_results
    
#     for method, df in results.items():
#         if not df.empty:
#             print(f"\n{method.capitalize()} Voting Results:")
#             summary = df.pivot_table(
#                 values='Ensemble F1',
#                 index='Prompt Type',
#                 columns='Sample Size',
#                 aggfunc='mean'
#             )
#             print(summary)
    
#     return results

# if __name__ == "__main__":
#     # experiment_results_df = pd.read_csv('aggregated_table_2024_12_01_6pm.csv')
#     experiment_results_df = agg

#     results = run_ensemble_analysis(experiment_results_df)