In [None]:
# @title 1. Install Unsloth + dependencies
!pip install "unsloth[torch]" -q
!pip install "datasets" "accelerate" -q



[0m

In [None]:
# @title 2. Mount Google Drive (optional but recommended)
from google.colab import drive
drive.mount('/content/drive')




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @title 3. Load CSV and inspect columns
import pandas as pd

CSV_PATH = "/content/drive/MyDrive/extracted_incident_reports.csv"  # <-- change this

df = pd.read_csv(CSV_PATH)

# Drop rows where Root Cause Analysis is NaN or empty/whitespace
df["Root Cause Analysis"] = df["Root Cause Analysis"].astype(str)

# Remove rows where the string is empty or only spaces
df = df[df["Root Cause Analysis"].str.strip() != ""]

# Optionally reset index
df = df.reset_index(drop=True)

print("Rows after filtering:", len(df))
df.tail()



Rows after filtering: 349


Unnamed: 0,Filename,Incident ID,Description of the Accident,Investigation of the Accident,Discussion,Root Cause Analysis
344,September_28__2022_-_Final_Report.pdf,"September 28, 2022-Final Report-Surface (Const...","On September 28, 2022, at 5:30 a.m., Duarte st...","On September 28, 2022, at 1:04 p.m., Aaron Aut...",Location of the Accident \nThe accident occurr...,The accident investigator conducted an analysi...
345,September_5__2019_Fatality_-_Final_Report_0.pdf,"September 5, 2019-Final Report-Underground Coa...","On Thursday, September 5, 2019, Jeremy Elder s...","On Thursday, September 5, 2019, at 3:00 p.m., ...",Accident Scene \nThe accident occurred in the ...,MSHA conducted an analysis to identify the fun...
346,September_28__2024_-_Final_Report_-_Leer_Mine.pdf,"September 28, 2024-Final Report-Underground (C...","On September 27, 2024, at 11:00 p.m., Walls st...","On September 28, 2024, at 8:07 a.m., Ralph Fra...",Accident Location \nThe accident occurred on ...,The accident investigation team conducted an a...
347,Sept_18_2024_-_Fatality_Report_-_American_Asph...,"Sept 18, 2024-Final Report-Surface (Crushed, B...","On September 18, 2024, at 6:26 a.m., Brace arr...","At 9:54 a.m., Banks called the Department of L...",Location of the Accident \nThe accident occurr...,The accident investigation team conducted an a...
348,Sept_14__2021_-_Fatal_Report_-_Butte_Gulch_Rep...,"Sept 14, 2021-Final Report-Surface (Gold)","On September 14, 2021, Branden Dunsmore, Equip...","On September 14, 2021, at 4:39 p.m., Curtis Pe...",Location of the Accident \nThe accident occurr...,The accident investigation team conducted an a...


In [None]:
# @title 4. Build prompt → root cause dataset
def build_prompt(row):
    return (
        "You are an MSHA safety investigator. "
        "Read the following sections and write a clear, concise ROOT CAUSE ANALYSIS.\n\n"
        "SECTION: DESCRIPTION OF THE ACCIDENT:\n"
        f"{row['Description of the Accident']}\n\n"
        "SECTION: INVESTIGATION OF THE ACCIDENT:\n"
        f"{row['Investigation of the Accident']}\n\n"
        "SECTION: DISCUSSION:\n"
        f"{row['Discussion']}\n\n"
        "TASK: Provide the ROOT CAUSE ANALYSIS for this accident."
    )
records = []
for _, r in df.iterrows():
    rc = r.get("Root Cause Analysis", "")
    if isinstance(rc, str) and rc.strip():
        records.append({
            "instruction": build_prompt(r),
            "output": rc.strip(),
        })

len(records), records[0]


(349,
 {'instruction': 'You are an MSHA safety investigator. Read the following sections and write a clear, concise ROOT CAUSE ANALYSIS.\n\nSECTION: DESCRIPTION OF THE ACCIDENT:\nOn April 11, 2023, at approximately 5:00 a.m., Nutting arrived at the Galena mine to begin his \nshift.  Ryan Jurado, Shifter, met with the production crew.  Based on information obtained from \ninterviews, Nutting and Jason Lauck, Stope Miner, traveled down the shaft to the 3200 level.  \nFrom there, Nutting and Lauck climbed a ladder to the 3200-033 stope (see Appendix A).  \nNutting and Lauck then checked the results of the explosive rounds shot during the previous shift \nand started their normal mining cycle. \n \nAccording to interviews, prior to 9:30 a.m., Jurado visited the 3200-033 stope, spoke with both \nminers, and signed their examination cards.  At approximately 9:30 a.m., Kelly Gallogly, \nGeologist, visited the stope and witnessed Nutting moving a jackleg drill in the 001A Vein.  \nNutting appe

In [None]:
# @title 5. Create Hugging Face Dataset
from datasets import Dataset

dataset = Dataset.from_list(records)
dataset
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
val_dataset = dataset["test"]
len(train_dataset), len(val_dataset)


(314, 35)

In [None]:
# @title 6. Load Llama-3.2-3B-Instruct with Unsloth (4-bit)
from unsloth import FastLanguageModel
import torch

MAX_SEQ_LENGTH = 2048  # adjust if needed
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    load_in_4bit = True,   # QLoRA-style 4-bit loading
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:
# @title 7. Enable LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0.0,
    bias = "none",
    use_rslora = True,
)


Unsloth 2025.11.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# @title 8. Apply chat template to build training text

def format_example(example):
    # One conversation: user gives all sections, assistant gives root cause analysis.
    text = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["output"]},
        ],
        tokenize = False,
    )
    return {"text": text}

use_split = isinstance(dataset, dict)

if use_split:
    train_dataset_fmt = train_dataset.map(format_example)
    val_dataset_fmt = val_dataset.map(format_example)
else:
    train_dataset_fmt = dataset.map(format_example)
    val_dataset_fmt = None

train_dataset_fmt[0]


Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

{'instruction': 'You are an MSHA safety investigator. Read the following sections and write a clear, concise ROOT CAUSE ANALYSIS.\n\nSECTION: DESCRIPTION OF THE ACCIDENT:\nOn Thursday, November 29, 2018, at 6:00 a.m., George Ney (victim) and John Seasock, \nMechanics, began their shift. They spent the morning performing maintenance on \nseveral mining vehicles.  Then they went to a garage located at the facility to have \nlunch. \nWhile in the garage, David Morgan, Mine Superintendent, modified the hydraulic \nsystem on the Ford F-550 service truck that he used at the mine.  He removed the \nhydraulic hose reel and replaced it with a hose coupling connecting the hoses together.  \nMorgan then removed the hydraulic oil cooler and plugged its lines with fittings.  He \nstarted the truck to check for hydraulic leaks, checked the operation of a bed-mounted \ncrane, and then turned off the truck. \nAfter Ney finished his lunch, Morgan asked him if he knew anything about the \nhydraulics in 

In [None]:
# @title 9. Configure SFTTrainer
from trl import SFTTrainer
from transformers import TrainingArguments

output_dir = "llama32-3b-msha-rootcause"

training_args = TrainingArguments(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    num_train_epochs = 2,
    learning_rate = 2e-4,
    fp16 = True,
    logging_steps = 10,
    save_strategy = "epoch",
    output_dir = output_dir,
    report_to = "none",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer, # Add the tokenizer here
    train_dataset = train_dataset_fmt,
    eval_dataset = val_dataset_fmt,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    packing = True,
    args = training_args,
)
trainer

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/314 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/35 [00:00<?, ? examples/s]

<UnslothSFTTrainer.UnslothSFTTrainer at 0x7b008f1811c0>

In [None]:
# @title 10. Start fine-tuning
trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 314 | Num Epochs = 2 | Total steps = 158
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.1506
20,1.8508
30,1.8366
40,1.835
50,1.8204
60,1.7896
70,1.7525
80,1.7219
90,1.5942
100,1.5605


TrainOutput(global_step=158, training_loss=1.7033394741106638, metrics={'train_runtime': 913.471, 'train_samples_per_second': 0.687, 'train_steps_per_second': 0.173, 'total_flos': 1.095863644164096e+16, 'train_loss': 1.7033394741106638, 'epoch': 2.0})

In [None]:
# @title 11. Save fine-tuned LoRA model + tokenizer
save_dir = "/content/drive/MyDrive/Llama-root-cause"  # you can also save to Drive

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
save_dir


'/content/drive/MyDrive/Llama-root-cause'

In [None]:
!pip install rouge-score bert-score sentence-transformers




In [None]:
import os
import pandas as pd
import torch
import unsloth
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from sentence_transformers import SentenceTransformer, util

from unsloth import FastLanguageModel

# ========= CONFIG =========
MODEL_PATH = "/content/drive/MyDrive/Llama-root-cause"
EVAL_CSV_PATH = "/content/drive/MyDrive/evaluation_incident_reports.csv"
OUTPUT_CSV_PATH = "/content/drive/MyDrive/MSHA/eval_results_with_all_metrics.csv"
MAX_SEQ_LENGTH = 2048
MAX_NEW_TOKENS = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # or any suitable model
# ==========================

# 1. Load LLM
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_PATH,
    max_seq_length = MAX_SEQ_LENGTH,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)
model.to(DEVICE)

# 2. Load embedding model
emb_model = SentenceTransformer(EMB_MODEL_NAME, device=DEVICE)

# 3. Load data
df = pd.read_csv(EVAL_CSV_PATH)
df["Root Cause Analysis"] = df["Root Cause Analysis"].astype(str)
df = df[df["Root Cause Analysis"].str.strip() != ""].reset_index(drop=True)

# 4. Helpers
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def build_prompt(row):
    return (
        "You are an MSHA safety investigator. "
        "Read the following sections and write a clear, concise ROOT CAUSE ANALYSIS.\n\n"
        "SECTION: DESCRIPTION OF THE ACCIDENT:\n"
        f"{row['Description of the Accident']}\n\n"
        "SECTION: INVESTIGATION OF THE ACCIDENT:\n"
        f"{row['Investigation of the Accident']}\n\n"
        "SECTION: DISCUSSION:\n"
        f"{row['Discussion']}\n\n"
        "TASK: Provide the ROOT CAUSE ANALYSIS for this accident."
    )

def generate_root_cause(prompt: str) -> str:
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        return_tensors = "pt",
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens = MAX_NEW_TOKENS,
            temperature = 0.7,
            top_p = 0.9,
            do_sample = True,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# 5. Run model and collect texts
preds = []
refs = []

for idx, row in df.iterrows():
    print(f"Evaluating {idx+1}/{len(df)}...")
    prompt = build_prompt(row)
    pred = generate_root_cause(prompt)
    ref = row["Root Cause Analysis"].strip()
    preds.append(pred)
    refs.append(ref)

df["Model Root Cause Analysis"] = preds

# 6. ROUGE-L (per example)
rouge_l_scores = []
for hyp, ref in zip(preds, refs):
    s = rouge.score(ref, hyp)["rougeL"].fmeasure
    rouge_l_scores.append(s)

df["ROUGE_L"] = rouge_l_scores

# 7. BERTScore (batched)
P, R, F1 = bertscore(preds, refs, lang="en")
df["BERTScore_F1"] = F1.tolist()

# 8. Embedding similarity (cosine)
emb_preds = emb_model.encode(preds, convert_to_tensor=True, normalize_embeddings=True)
emb_refs = emb_model.encode(refs, convert_to_tensor=True, normalize_embeddings=True)
cos_sims = util.cos_sim(emb_preds, emb_refs).diagonal()  # similarity per pair
df["Embedding_CosineSim"] = cos_sims.cpu().tolist()

# 9. Print global averages
print("Average ROUGE-L:", df["ROUGE_L"].mean())
print("Average BERTScore F1:", df["BERTScore_F1"].mean())
print("Average Embedding CosineSim:", df["Embedding_CosineSim"].mean())

# 10. Save detailed results
df.to_csv(OUTPUT_CSV_PATH, index=False)
print("Saved metrics to:", OUTPUT_CSV_PATH)


==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
import os
import pandas as pd
import torch
import unsloth
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from sentence_transformers import SentenceTransformer, util

# Ensure the latest bitsandbytes version by uninstalling and reinstalling
# Reinstall unsloth and bitsandbytes to ensure compatibility
!pip install "unsloth[torch]" -q
!pip install -U bitsandbytes -q

from unsloth import FastLanguageModel

# ========= CONFIG =========
MODEL_PATH = "/content/drive/MyDrive/Llama-root-cause"
EVAL_CSV_PATH = "/content/drive/MyDrive/evaluation_incident_reports.csv"
OUTPUT_CSV_PATH = "/content/drive/MyDrive/MSHA/eval_results_with_all_metrics.csv"
MAX_SEQ_LENGTH = 2048
MAX_NEW_TOKENS = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # or any suitable model
# ==========================

# 1. Load LLM
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_PATH,
    max_seq_length = MAX_SEQ_LENGTH,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)
model.to(DEVICE)

# 2. Load embedding model
emb_model = SentenceTransformer(EMB_MODEL_NAME, device=DEVICE)

# 3. Load data
df = pd.read_csv(EVAL_CSV_PATH)
df["Root Cause Analysis"] = df["Root Cause Analysis"].astype(str)
df = df[df["Root Cause Analysis"].str.strip() != ""].reset_index(drop=True)

# 4. Helpers
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def build_prompt(row):
    return (
        "You are an MSHA safety investigator. "
        "Read the following sections and write a clear, concise ROOT CAUSE ANALYSIS.\n\n"
        "SECTION: DESCRIPTION OF THE ACCIDENT:\n"
        f"{row['Description of the Accident']}\n\n"
        "SECTION: INVESTIGATION OF THE ACCIDENT:\n"
        f"{row['Investigation of the Accident']}\n\n"
        "SECTION: DISCUSSION:\n"
        f"{row['Discussion']}\n\n"
        "TASK: Provide the ROOT CAUSE ANALYSIS for this accident."
    )

def generate_root_cause(prompt: str) -> str:
    # Get input_ids tensor from chat template
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        return_tensors = "pt",
        max_length = MAX_SEQ_LENGTH,
        truncation = True,
    ).to(DEVICE)  # shape: (1, seq_len)

    # Build attention_mask: 1 for tokens to attend to, 0 for padding
    # Here we treat pad_token_id as padding; if no pad token, we treat all tokens as non‑padding.
    if tokenizer.pad_token_id is not None:
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
    else:
        attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model.generate(
            input_ids = input_ids,
            attention_mask = attention_mask,
            max_new_tokens = MAX_NEW_TOKENS,
            temperature = 0.7,
            top_p = 0.9,
            do_sample = True,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()



# 5. Run model and collect texts
preds = []
refs = []

for idx, row in df.iterrows():
    print(f"Evaluating {idx+1}/{len(df)}...")
    prompt = build_prompt(row)
    pred = generate_root_cause(prompt)
    ref = row["Root Cause Analysis"].strip()
    preds.append(pred)
    refs.append(ref)

df["Model Root Cause Analysis"] = preds

# 6. ROUGE-L (per example)
rouge_l_scores = []
for hyp, ref in zip(preds, refs):
    s = rouge.score(ref, hyp)["rougeL"].fmeasure
    rouge_l_scores.append(s)

df["ROUGE_L"] = rouge_l_scores

# 7. BERTScore (batched)
P, R, F1 = bertscore(preds, refs, lang="en")
df["BERTScore_F1"] = F1.tolist()

# 8. Embedding similarity (cosine)
emb_preds = emb_model.encode(preds, convert_to_tensor=True, normalize_embeddings=True)
emb_refs = emb_model.encode(refs, convert_to_tensor=True, normalize_embeddings=True)
cos_sims = util.cos_sim(emb_preds, emb_refs).diagonal()  # similarity per pair
df["Embedding_CosineSim"] = cos_sims.cpu().tolist()

# 9. Print global averages
print("Average ROUGE-L:", df["ROUGE_L"].mean())
print("Average BERTScore F1:", df["BERTScore_F1"].mean())
print("Average Embedding CosineSim:", df["Embedding_CosineSim"].mean())

# 10. Save detailed results
df.to_csv(OUTPUT_CSV_PATH, index=False)
print("Saved metrics to:", OUTPUT_CSV_PATH)

[0m==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Evaluating 1/46...
Evaluating 2/46...
Evaluating 3/46...
Evaluating 4/46...
Evaluating 5/46...
Evaluating 6/46...
Evaluating 7/46...
Evaluating 8/46...
Evaluating 9/46...
Evaluating 10/46...
Evaluating 11/46...
Evaluating 12/46...
Evaluating 13/46...
Evaluating 14/46...
Evaluating 15/46...
Evaluating 16/46...
Evaluating 17/46...
Evaluating 18/46...
Evaluating 19/46...
Evaluating 20/46...
Evaluating 21/46...
Evaluating 22/46...
Evaluating 23/46...
Evaluating 24/46...
Evaluating 25/46...
Evaluating 26/46...
Evaluating 27/46...
Evaluati

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average ROUGE-L: 0.09871147018309516
Average BERTScore F1: 0.8251945117245549
Average Embedding CosineSim: 0.49305908576301904


OSError: Cannot save file into a non-existent directory: '/content/drive/MyDrive/MSHA'

In [None]:
df.to_csv("/content/drive/MyDrive", index=False)

IsADirectoryError: [Errno 21] Is a directory: '/content/drive/MyDrive'

# Task
Create the directory `/content/drive/MyDrive/MSHA` and save the `df` DataFrame to `/content/drive/MyDrive/MSHA/eval_results_with_all_metrics.csv`.

## Create output directory and save results

### Subtask:
Create the directory `/content/drive/MyDrive/MSHA` and save the `df` DataFrame to `/content/drive/MyDrive/MSHA/eval_results_with_all_metrics.csv`.


**Reasoning**:
The previous attempt to save the CSV failed because the directory did not exist. This step will create the necessary directory and then save the DataFrame to the specified path.



In [None]:
import os

# Ensure the output directory exists
os.makedirs(os.path.dirname(OUTPUT_CSV_PATH), exist_ok=True)

# Save detailed results
df.to_csv(OUTPUT_CSV_PATH, index=False)
print("Saved metrics to:", OUTPUT_CSV_PATH)

Saved metrics to: /content/drive/MyDrive/MSHA/eval_results_with_all_metrics.csv
