In [None]:
# Summarise the experiment records gathered in `all_results`.
# NOTE(review): `all_results` and `experiment_count` are not defined in this cell;
# they are assigned by the experiment-runner cell, which must have been run first.
results_df = pd.DataFrame(all_results)
print(f"Total experiments run (attempted): {experiment_count}")

# Guard first: without a 'status' column there is nothing to count.
if 'status' not in results_df.columns:
    print("Status column not found in results, cannot count successful experiments.")
else:
    # Count the rows whose status is exactly 'success'.
    success_count = int((results_df['status'] == 'success').sum())
    print(f"Successfully completed experiments: {success_count}")

# Persist the table as CSV, omitting the DataFrame index.
results_df.to_csv('layer12.csv', index=False)

Total experiments run (attempted): 3
Successfully completed experiments: 3


In [None]:
import gc # Garbage collector
import itertools
import time # Added for potential timing if needed
import traceback # For error printing

import numpy as np # np.mean / np.argmax are used for best-epoch selection below
import pandas as pd
import torch
from torch.optim import AdamW # Import specifically AdamW from torch.optim
from transformers import get_linear_schedule_with_warmup # Optional scheduler

# Prerequisites: this cell expects the following names to exist in the kernel already.
#   Data / weights : df_raw, CLASS_WEIGHTS
#   Functions      : create_dataloaders, get_model, train_and_evaluate
#                    (compute_metrics and FocalLoss are not called here directly;
#                     presumably they are used inside the helpers above - verify)
#   Constants      : PREPROCESSING_OPTIONS, BASE_MODELS_TO_TRY, TASK_STRATEGIES,
#                    LOSS_FUNCTIONS_TO_TRY, BATCH_SIZE, MAX_LENGTH, LEARNING_RATE, EPOCHS,
#                    DEVICE, SEED, TASK_LIST, NUM_TASKS, NUM_CLASSES
#   Modules        : numpy as np (used for best-epoch selection)


def _pick_best_epoch(val_metrics_per_epoch, score_fn):
    """Return the per-epoch validation entry with the highest score.

    Args:
        val_metrics_per_epoch: non-empty sequence holding one validation-metrics
            entry per epoch.
        score_fn: callable mapping a truthy entry to a number. Falsy entries
            (None / empty) are scored -1 without calling score_fn.

    Returns:
        The entry with the highest score. The earliest epoch wins ties because
        np.argmax returns the first maximum.
    """
    scores = [score_fn(entry) if entry else -1 for entry in val_metrics_per_epoch]
    return val_metrics_per_epoch[int(np.argmax(scores))]


all_results = []

# --- Generate all preprocessing combinations ---
# Cartesian product over the option values: one flag dict per combination.
pp_keys = PREPROCESSING_OPTIONS.keys()
pp_value_combinations = list(itertools.product(*PREPROCESSING_OPTIONS.values()))
preprocessing_configs = [dict(zip(pp_keys, values)) for values in pp_value_combinations]

print("Starting Experiment Runner...")
print(f"Total preprocessing configs: {len(preprocessing_configs)}")
print(f"Models to try: {BASE_MODELS_TO_TRY}")
print(f"Strategies to try: {TASK_STRATEGIES}")
print(f"Loss functions to try: {LOSS_FUNCTIONS_TO_TRY}") # Should include new ones now
print("-" * 50)

# --- Main Experiment Loop ---
# itertools.product advances the right-most iterable fastest, so the visiting
# order is the same as nesting the four loops (preprocessing outermost, loss innermost).
experiment_count = 0
for pp_flags, base_model_name, strategy, loss_fn_name in itertools.product(
    preprocessing_configs, BASE_MODELS_TO_TRY, TASK_STRATEGIES, LOSS_FUNCTIONS_TO_TRY
):
    experiment_count += 1
    config = {
        'exp_id': experiment_count,
        'model': base_model_name,
        'strategy': strategy,
        'loss': loss_fn_name,
        **pp_flags # Add preprocessing flags
    }
    print(f"\n--- Running Experiment {experiment_count} ---")
    print(f"Config: {config}")

    # --- 1. Data Preparation ---
    print("Preparing Dataloaders...")
    try:
        train_loader, val_loader, test_loader, tokenizer = create_dataloaders(
            df=df_raw,
            tokenizer_name=base_model_name,
            batch_size=BATCH_SIZE,
            max_len=MAX_LENGTH,
            preprocessing_flags=pp_flags,
            random_state=SEED
        )
        # A falsy train/val loader (e.g. None) is treated as a creation failure.
        if not train_loader or not val_loader:
            print("Error: Dataloader creation failed. Skipping experiment.")
            result_entry = {'config': config, 'status': 'dataloader_error', 'val_metrics': None}
            all_results.append(result_entry)
            continue
    except Exception as e:
        print(f"Error during dataloader creation for config {config}: {e}")
        traceback.print_exc()
        result_entry = {'config': config, 'status': 'dataloader_exception', 'val_metrics': None}
        all_results.append(result_entry)
        continue

    # --- 2. Model, Optimizer, Scheduler Setup ---
    # Everything the cleanup step deletes is bound up front so that cleanup can
    # run unconditionally, whichever branch executed or failed.
    model = None
    optimizer = None
    scheduler = None  # Optional; no scheduler is wired in at the moment.
    best_model_state = None
    history = None
    best_val_metrics_agg = {}

    try:
        if strategy == 'multi_task':
            print("Initializing Multi-Task Model...")
            model = get_model(strategy, base_model_name)
            if not model:
                raise ValueError("Multi-task model creation failed.")
            model.to(DEVICE)
            optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

            print("Starting Multi-Task Training...")
            best_model_state, history = train_and_evaluate(
                model=model,
                train_loader=train_loader,
                val_loader=val_loader,
                optimizer=optimizer,
                loss_fn_provider=loss_fn_name, # Pass loss name
                class_weights=CLASS_WEIGHTS, # *** Pass class weights ***
                device=DEVICE,
                epochs=EPOCHS,
                strategy=strategy,
                scheduler=scheduler
            )

            # Best epoch = highest mean lenient F1 across the tasks evaluated in that epoch.
            val_history = history.get('val_metrics') if history else None
            if val_history:
                best_val_metrics_agg = _pick_best_epoch(
                    val_history,
                    lambda per_task: np.mean([m.get('lenient_f1', 0.0) for m in per_task.values()]),
                )
            else:
                print("Warning: No valid validation metrics found in history.")

        elif strategy == 'single_task':
            print("Initializing Single-Task Models (one per task)...")
            model_creator = get_model(strategy, base_model_name)
            if not model_creator:
                raise ValueError("Single-task model creator function not obtained.")

            task_best_metrics = {}

            for task_idx, task_name in enumerate(TASK_LIST):
                print(f"\n-- Training Single-Task Model for: {task_name} (Task {task_idx+1}/{NUM_TASKS}) --")
                task_model = model_creator(base_model_name, NUM_CLASSES)
                if not task_model:
                    print(f"Error creating model for task {task_name}. Skipping task.")
                    continue

                task_optimizer = None
                best_task_model_state = None
                task_history = None
                try:
                    task_model.to(DEVICE)
                    task_optimizer = AdamW(task_model.parameters(), lr=LEARNING_RATE)

                    best_task_model_state, task_history = train_and_evaluate(
                        model=task_model,
                        train_loader=train_loader,
                        val_loader=val_loader,
                        optimizer=task_optimizer,
                        loss_fn_provider=loss_fn_name, # Pass loss name
                        class_weights=CLASS_WEIGHTS, # *** Pass class weights ***
                        device=DEVICE,
                        epochs=EPOCHS,
                        strategy=strategy,
                        task_index=task_idx,
                        scheduler=None # Optional: task_scheduler
                    )

                    # Best epoch = highest lenient F1 for this task. Epochs that
                    # lack the task or the metric are scored -1.
                    task_val_history = task_history.get('val_metrics') if task_history else None
                    if task_val_history:
                        best_entry = _pick_best_epoch(
                            task_val_history,
                            lambda per_task: per_task.get(task_name, {}).get('lenient_f1', -1),
                        )
                        task_best_metrics[task_name] = (best_entry or {}).get(task_name, {})
                    else:
                        print(f"Warning: No valid validation metrics found in history for task {task_name}.")
                        task_best_metrics[task_name] = {}
                finally:
                    # Drop the per-task objects even when training raised, so a
                    # failed task does not keep its model referenced while the
                    # remaining experiments run.
                    print(f"Cleaning up resources for task: {task_name}")
                    del task_model, task_optimizer, best_task_model_state, task_history
                    gc.collect()
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

            # An experiment in which no task was trained must not count as a success.
            if not task_best_metrics:
                raise ValueError("No single-task model could be trained for any task.")

            best_val_metrics_agg = task_best_metrics

        else:
            # Previously an unrecognised strategy skipped both branches and was
            # still stored as 'success' with empty metrics.
            raise ValueError(f"Unknown strategy: {strategy!r}")

        # --- Store results for this configuration ---
        result_entry = { 'config': config, 'status': 'success', 'val_metrics': best_val_metrics_agg }
        all_results.append(result_entry)
        print(f"--- Experiment {experiment_count} Completed Successfully ---")

    except Exception as e:
        print(f"Error during training/evaluation for config {config}: {e}")
        traceback.print_exc()
        result_entry = {'config': config, 'status': 'train_eval_exception', 'val_metrics': None}
        all_results.append(result_entry)
        print(f"--- Experiment {experiment_count} Failed ---")

    # --- Cleanup for the experiment ---
    # All of these names are guaranteed to be bound at this point (see step 1 and step 2).
    print("Cleaning up experiment resources...")
    del model, optimizer, scheduler
    del train_loader, val_loader, test_loader, tokenizer
    del best_model_state, history
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("-" * 50)


# --- Convert results to DataFrame ---
print("\n--- Experiment Runner Finished ---")
results_df = pd.DataFrame(all_results)
print(f"Total experiments run (attempted): {experiment_count}")
if 'status' in results_df.columns:
    success_count = len(results_df[results_df['status'] == 'success'])
    print(f"Successfully completed experiments: {success_count}")
else:
    print("Status column not found in results, cannot count successful experiments.")



Starting Experiment Runner...
Total preprocessing configs: 1
Models to try: ['distilbert-base-uncased', 'bert-base-uncased', 'roberta-base']
Strategies to try: ['single_task']
Loss functions to try: ['FocalLoss']
--------------------------------------------------

--- Running Experiment 1 ---
Config: {'exp_id': 1, 'model': 'distilbert-base-uncased', 'strategy': 'single_task', 'loss': 'FocalLoss', 'remove_punct': False, 'remove_stopwords': False, 'include_history': False}
Preparing Dataloaders...
Data split sizes:
  Train: 1980
  Validation: 248
  Test: 248
Loading tokenizer: distilbert-base-uncased
Creating Train Dataset...
Creating Validation Dataset...
Creating Test Dataset...
Using 19 workers for DataLoaders.
Dataloaders created successfully.
Initializing Single-Task Models (one per task)...
Strategy: single_task. Will load distilbert-base-uncased with 3 labels per task during training.
Loading custom concat-based single-task model: distilbert-base-uncased with 3 labels.

-- Trainin

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



-- Training Single-Task Model for: Mistake_Identification (Task 1/4) --
Loading custom concat-based single-task model: roberta-base with 3 labels.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting Training --- [Epochs: 5, Strategy: single_task, Loss: FocalLoss, Task Index: 0]
  Epoch 1/5, Step 50/62, Batch Loss: 0.1807
  Epoch 1/5, Step 62/62, Batch Loss: 0.1898
Epoch 1/5 | Time: 77.54s | Avg Train Loss: 0.2557
  Avg Validation Lenient F1 (over evaluated tasks): 0.9626
    Mistake_Identification: Exact Acc: 0.8669, Exact F1: 0.5954, Lenient Acc: 0.9355, Lenient F1: 0.9626
  Validation metric improved (-inf --> 0.9626).
  Epoch 2/5, Step 50/62, Batch Loss: 0.1682
  Epoch 2/5, Step 62/62, Batch Loss: 0.0899
Epoch 2/5 | Time: 76.24s | Avg Train Loss: 0.1522
  Avg Validation Lenient F1 (over evaluated tasks): 0.9446
    Mistake_Identification: Exact Acc: 0.8468, Exact F1: 0.6351, Lenient Acc: 0.9073, Lenient F1: 0.9446
  Validation metric did not improve. Patience: 1/3.
  Epoch 3/5, Step 50/62, Batch Loss: 0.2639
  Epoch 3/5, Step 62/62, Batch Loss: 0.1764
Epoch 3/5 | Time: 75.78s | Avg Train Loss: 0.1357
  Avg Validation Lenient F1 (over evaluated tasks): 0.9287
    Mi

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting Training --- [Epochs: 5, Strategy: single_task, Loss: FocalLoss, Task Index: 1]
  Epoch 1/5, Step 50/62, Batch Loss: 0.3848
  Epoch 1/5, Step 62/62, Batch Loss: 0.2451
Epoch 1/5 | Time: 38.45s | Avg Train Loss: 0.3791
  Avg Validation Lenient F1 (over evaluated tasks): 0.8282
    Mistake_Location: Exact Acc: 0.6855, Exact F1: 0.4486, Lenient Acc: 0.7540, Lenient F1: 0.8282
  Validation metric improved (-inf --> 0.8282).
  Epoch 2/5, Step 50/62, Batch Loss: 0.1608
  Epoch 2/5, Step 62/62, Batch Loss: 0.2368
Epoch 2/5 | Time: 38.56s | Avg Train Loss: 0.2871
  Avg Validation Lenient F1 (over evaluated tasks): 0.8627
    Mistake_Location: Exact Acc: 0.7298, Exact F1: 0.4857, Lenient Acc: 0.8024, Lenient F1: 0.8627
  Validation metric improved (0.8282 --> 0.8627).
  Epoch 3/5, Step 50/62, Batch Loss: 0.3854
  Epoch 3/5, Step 62/62, Batch Loss: 0.2304
Epoch 3/5 | Time: 38.56s | Avg Train Loss: 0.2343
  Avg Validation Lenient F1 (over evaluated tasks): 0.8780
    Mistake_Location

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting Training --- [Epochs: 5, Strategy: single_task, Loss: FocalLoss, Task Index: 2]
  Epoch 1/5, Step 50/62, Batch Loss: 0.3384
  Epoch 1/5, Step 62/62, Batch Loss: 0.4159
Epoch 1/5 | Time: 59.71s | Avg Train Loss: 0.4161
  Avg Validation Lenient F1 (over evaluated tasks): 0.8696
    Providing_Guidance: Exact Acc: 0.6492, Exact F1: 0.4267, Lenient Acc: 0.7944, Lenient F1: 0.8696
  Validation metric improved (-inf --> 0.8696).
  Epoch 2/5, Step 50/62, Batch Loss: 0.4222
  Epoch 2/5, Step 62/62, Batch Loss: 0.2570
Epoch 2/5 | Time: 78.32s | Avg Train Loss: 0.3470
  Avg Validation Lenient F1 (over evaluated tasks): 0.9059
    Providing_Guidance: Exact Acc: 0.6935, Exact F1: 0.4620, Lenient Acc: 0.8468, Lenient F1: 0.9059
  Validation metric improved (0.8696 --> 0.9059).
  Epoch 3/5, Step 50/62, Batch Loss: 0.2319
  Epoch 3/5, Step 62/62, Batch Loss: 0.4197
Epoch 3/5 | Time: 75.94s | Avg Train Loss: 0.3000
  Avg Validation Lenient F1 (over evaluated tasks): 0.8684
    Providing_Gu

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting Training --- [Epochs: 5, Strategy: single_task, Loss: FocalLoss, Task Index: 3]
  Epoch 1/5, Step 50/62, Batch Loss: 0.3698
  Epoch 1/5, Step 62/62, Batch Loss: 0.2898
Epoch 1/5 | Time: 78.54s | Avg Train Loss: 0.4000
  Avg Validation Lenient F1 (over evaluated tasks): 0.8296
    Actionability: Exact Acc: 0.6734, Exact F1: 0.4836, Lenient Acc: 0.7863, Lenient F1: 0.8296
  Validation metric improved (-inf --> 0.8296).
  Epoch 2/5, Step 50/62, Batch Loss: 0.2523
  Epoch 2/5, Step 62/62, Batch Loss: 0.3501
Epoch 2/5 | Time: 74.85s | Avg Train Loss: 0.2872
  Avg Validation Lenient F1 (over evaluated tasks): 0.8693
    Actionability: Exact Acc: 0.6815, Exact F1: 0.5728, Lenient Acc: 0.8266, Lenient F1: 0.8693
  Validation metric improved (0.8296 --> 0.8693).
  Epoch 3/5, Step 50/62, Batch Loss: 0.2771
  Epoch 3/5, Step 62/62, Batch Loss: 0.3625
Epoch 3/5 | Time: 78.56s | Avg Train Loss: 0.2259
  Avg Validation Lenient F1 (over evaluated tasks): 0.9138
    Actionability: Exact A

In [None]:
# Build the results table from the records accumulated in `all_results`.
# NOTE(review): this cell repeats the summary/export cell found earlier in the
# notebook; `all_results` and `experiment_count` come from the experiment-runner cell.
results_df = pd.DataFrame(all_results)
print(f"Total experiments run (attempted): {experiment_count}")

has_status_column = 'status' in results_df.columns
if not has_status_column:
    print("Status column not found in results, cannot count successful experiments.")
else:
    # Number of rows whose status equals 'success'.
    success_count = int(results_df['status'].eq('success').sum())
    print(f"Successfully completed experiments: {success_count}")

# Save the table to CSV (index column not written).
results_df.to_csv('layer12.csv', index=False)

Total experiments run (attempted): 3
Successfully completed experiments: 3
