In [None]:
import os
import torch
import pandas as pd
import json
import zipfile
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import numpy as np
import traceback

# Pick the compute device: use CUDA when torch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained seq2seq checkpoint (T5, used here for generative QA)
# together with its tokenizer, then move the model onto the chosen device
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Read the section/variable definitions (JSON) and the feature/label tables (CSV)
with open("data/feature_classification_input.json", "r") as f:
    json_data = json.load(f)
features_df = pd.read_csv('data/private_data/train_features.csv')
labels_df = pd.read_csv('data/private_data/train_labels.csv')

# `uid` is not a label column: remove it when present, do nothing otherwise
labels_df = labels_df.drop(columns=["uid"], errors="ignore")

# Missing labels become a default answer string so every row has a target
labels_df.fillna('No answer provided.', inplace=True)

# Create generative QA data with enriched context
def _join_field(value):
    """Render an optional list-valued variable field as one '; '-separated string."""
    if value is None:
        # Field present but null in the JSON: treat it like an empty list.
        return ""
    if isinstance(value, str):
        # A bare string is a single item; joining it would split it into characters.
        return value
    return "; ".join(str(item) for item in value)


def create_qa_data(section):
    """Build generative-QA training records for every variable of one section.

    Reads the module-level ``features_df`` and ``labels_df``. For each variable
    in ``section["variables"]`` whose ``id`` is a column of ``labels_df``, one
    record is emitted per row of ``features_df``. Labels are looked up by the
    index label of the feature row, so both frames must share index labels.

    Args:
        section: Mapping with a ``"variables"`` list; each variable is a mapping
            that may carry ``id``, ``question``, ``definition``, ``criteria``,
            ``examples``, ``exclusions`` and ``notes``.

    Returns:
        A list of dicts with keys ``"input_text"`` (prompt followed by the
        narrative context) and ``"target_text"`` (the label as a string).
        Variables without a matching label column are reported and skipped.
    """
    missing_context = "No context provided."

    # The narrative depends only on the row, not on the variable, so extract it
    # once instead of re-running iterrows() for every variable. A NaN narrative
    # is treated as missing rather than being rendered as the text "nan".
    if "NarrativeCME" in features_df.columns:
        contexts = [
            (idx, missing_context if pd.isna(narrative) else narrative)
            for idx, narrative in features_df["NarrativeCME"].items()
        ]
    else:
        contexts = [(idx, missing_context) for idx in features_df.index]

    qa_data = []
    for variable in section["variables"]:
        variable_id = variable.get("id")

        # Only variables with a label column can produce training targets.
        if variable_id not in labels_df.columns:
            print(f"Variable ID '{variable_id}' not found in labels_df columns.")
            continue

        question = variable.get("question", "No question provided.")
        criteria = _join_field(variable.get("criteria", []))
        examples = _join_field(variable.get("examples", []))
        exclusions = _join_field(variable.get("exclusions", []))
        notes = _join_field(variable.get("notes", []))

        # Full prompt with context
        full_prompt = (
            f"Question: {question}\n"
            f"Definition: {variable.get('definition', 'No definition provided.')}\n"
            f"Criteria: {criteria}\n"
            f"Examples: {examples}\n"
            f"Exclusions: {exclusions}\n"
            f"Notes: {notes}\n"
            "Answer:"
        )

        # One QA sample per feature row, paired with that row's label.
        label_column = labels_df[variable_id]
        for idx, context in contexts:
            answer = str(label_column.loc[idx])
            qa_data.append({
                "input_text": f"{full_prompt} Context: {context}",
                "target_text": answer
            })
    return qa_data

# Tokenize input and target text for seq2seq training
def preprocess_function(examples):
    """Tokenize one batch of QA examples with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``"input_text"`` and ``"target_text"``
            columns (lists of strings).

    Returns:
        The tokenized inputs (truncated and padded to 512 tokens) with an added
        ``"labels"`` entry holding the target token ids (truncated to 128
        tokens, unpadded). Returns ``None`` after printing an error when the
        label list is empty or any target produced no token ids.
    """
    inputs, targets = examples["input_text"], examples["target_text"]

    # Inputs are padded to a fixed length here; labels are left ragged.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    tokenized_targets = tokenizer(targets, max_length=128, truncation=True)
    labels = tokenized_targets["input_ids"]

    # Every example in the batch must have at least one label token id.
    all_labels_present = bool(labels) and all(len(label) > 0 for label in labels)
    if not all_labels_present:
        print(f"Error: Encountered empty or malformed labels: {labels}")
        return None

    model_inputs["labels"] = labels
    return model_inputs

# Custom Data Collator for Seq2Seq that skips features without labels
class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """Seq2seq collator that drops label-less features before collating.

    Features whose ``"labels"`` entry is missing, ``None`` or empty are logged
    and removed; the remaining features are handed to the parent collator.
    """

    def __call__(self, features, return_tensors=None):
        """Filter out features without labels, then collate the rest.

        Args:
            features: Iterable of feature mappings, as produced by the
                tokenized dataset.
            return_tensors: Optional tensor-format override, forwarded
                unchanged to the parent collator. Accepting it keeps this
                override call-compatible with the parent's ``__call__``.

        Returns:
            Whatever the parent collator returns for the kept features.

        Raises:
            ValueError: If no feature in the batch has usable labels.
        """
        filtered_features = []
        for f in features:
            labels = f.get("labels")
            if labels is not None and len(labels) > 0:
                filtered_features.append(f)
            else:
                print(f"Warning: Encountered a feature with empty or None labels: {f}")

        if not filtered_features:
            raise ValueError("Error: No valid features with labels available for collation.")

        return super().__call__(filtered_features, return_tensors=return_tensors)

# Build the shared collator: batches are padded dynamically (padding=True) and
# the model is passed along to the collator as well
data_collator = CustomDataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
)

class SaveCheckpointCallback(TrainerCallback):
    """Trainer callback that writes an extra, section-named checkpoint per epoch.

    The checkpoint directory name uses the module-level ``section_name``, so
    that global must be set before training starts.
    """

    def on_epoch_end(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Save the model and tokenizer (whichever are provided) under ``args.output_dir``."""
        # Directory name encodes the epoch (one decimal) and the current section
        checkpoint_dir = f"{args.output_dir}/checkpoint_{state.epoch:.1f}_section_{section_name}"

        # Model first, then tokenizer; either is skipped when not supplied
        for artifact in (model, tokenizer):
            if artifact is None:
                continue
            artifact.save_pretrained(checkpoint_dir)

        print(f"Saved checkpoint at {checkpoint_dir} for epoch {state.epoch}")

# Proceed with dataset preparation and training: one fine-tuning run per section.
# NOTE(review): the same `model` object is passed to every Trainer below, so each
# section continues from the weights left by the previous section rather than
# from a fresh "t5-base" — confirm this sequential fine-tuning is intended.
for section in json_data["sections"]:
    # Also read by SaveCheckpointCallback as a module-level global.
    section_name = section["name"].replace(" ", "_").lower()
    model_dir = f"./models/{section_name}_qa_finetuned"

    qa_data = create_qa_data(section)
    if not qa_data:
        print(f"No valid QA data found for section: {section_name}")
        continue

    # Create dataset from qa_data and hold out 20% for evaluation
    dataset = pd.DataFrame(qa_data)
    train_data, eval_data = train_test_split(dataset, test_size=0.2, random_state=42)

    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_pandas(train_data)
    eval_dataset = Dataset.from_pandas(eval_data)

    # Tokenize both splits batch-wise
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

    # Define training arguments; evaluation and checkpointing happen once per epoch
    training_args = TrainingArguments(
        output_dir=model_dir,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,
        num_train_epochs=4,
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # breaks the CPU fallback chosen at the top of the notebook.
        fp16=torch.cuda.is_available(),
        eval_strategy="epoch",
        save_strategy="epoch",  # Save checkpoint at each epoch
        load_best_model_at_end=True,
        save_total_limit=2,  # Keep only the last 2 Trainer-managed checkpoints
        logging_dir=f"./logs/{section_name}_qa_finetuned",
    )

    # Initialize the Trainer with checkpoint callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        data_collator=data_collator,
        callbacks=[SaveCheckpointCallback()]  # Add custom callback for checkpoint saving
    )

    # Best-effort: a failure in one section is reported and the loop moves on.
    try:
        print(f"\nStarting training for section: {section_name}...")
        trainer.train()
    except Exception as e:
        print(f"\nTraining failed for section: {section_name} with error: {e}")
        traceback.print_exc()

    # Zip the whole output directory (all checkpoints written for this section).
    # If training failed before anything was written the directory may not
    # exist, in which case opening the archive would raise; skip it instead.
    if os.path.isdir(model_dir):
        zip_file_path = f"{model_dir}.zip"
        with zipfile.ZipFile(zip_file_path, 'w') as zipf:
            for root, dirs, files in os.walk(model_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Store paths relative to the model directory
                    zipf.write(file_path, os.path.relpath(file_path, model_dir))
    else:
        print(f"No output directory at {model_dir}; skipping zip for section: {section_name}")

    # Release Python and CUDA memory before the next section
    gc.collect()
    torch.cuda.empty_cache()

Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training for section: mental_health_history_and_current_state...


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkaungkhant-ko[0m ([33mkaungkhant-ko-self-employed[0m). Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.235831
2,No log,0.221549
3,0.335100,0.20036
4,0.335100,0.201634


Saved checkpoint at ./models/mental_health_history_and_current_state_qa_finetuned/checkpoint_1.0_section_mental_health_history_and_current_state for epoch 1.0
Saved checkpoint at ./models/mental_health_history_and_current_state_qa_finetuned/checkpoint_2.0_section_mental_health_history_and_current_state for epoch 2.0
Saved checkpoint at ./models/mental_health_history_and_current_state_qa_finetuned/checkpoint_3.0_section_mental_health_history_and_current_state for epoch 3.0
Saved checkpoint at ./models/mental_health_history_and_current_state_qa_finetuned/checkpoint_4.0_section_mental_health_history_and_current_state for epoch 4.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Map:   0%|          | 0/12800 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training for section: specific_mental_health_diagnoses...


Epoch,Training Loss,Validation Loss
1,No log,0.070776
2,0.082200,0.069701
3,0.069000,0.069766
4,0.062900,0.075022


Saved checkpoint at ./models/specific_mental_health_diagnoses_qa_finetuned/checkpoint_1.0_section_specific_mental_health_diagnoses for epoch 1.0
Saved checkpoint at ./models/specific_mental_health_diagnoses_qa_finetuned/checkpoint_2.0_section_specific_mental_health_diagnoses for epoch 2.0
Saved checkpoint at ./models/specific_mental_health_diagnoses_qa_finetuned/checkpoint_3.0_section_specific_mental_health_diagnoses for epoch 3.0
Saved checkpoint at ./models/specific_mental_health_diagnoses_qa_finetuned/checkpoint_4.0_section_specific_mental_health_diagnoses for epoch 4.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training for section: contributing_factors...


Epoch,Training Loss,Validation Loss
1,0.1275,0.106754
2,0.106,0.108021
3,0.0949,0.107495
4,0.0884,0.107155


Saved checkpoint at ./models/contributing_factors_qa_finetuned/checkpoint_1.0_section_contributing_factors for epoch 1.0
Saved checkpoint at ./models/contributing_factors_qa_finetuned/checkpoint_2.0_section_contributing_factors for epoch 2.0
Saved checkpoint at ./models/contributing_factors_qa_finetuned/checkpoint_3.0_section_contributing_factors for epoch 3.0
Saved checkpoint at ./models/contributing_factors_qa_finetuned/checkpoint_4.0_section_contributing_factors for epoch 4.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training for section: disclosure_of_intent...


Epoch,Training Loss,Validation Loss
1,0.1299,0.104928
2,0.1111,0.108282
3,0.0991,0.111128
4,0.091,0.10456


Saved checkpoint at ./models/disclosure_of_intent_qa_finetuned/checkpoint_1.0_section_disclosure_of_intent for epoch 1.0
Saved checkpoint at ./models/disclosure_of_intent_qa_finetuned/checkpoint_2.0_section_disclosure_of_intent for epoch 2.0
Saved checkpoint at ./models/disclosure_of_intent_qa_finetuned/checkpoint_3.0_section_disclosure_of_intent for epoch 3.0
Saved checkpoint at ./models/disclosure_of_intent_qa_finetuned/checkpoint_4.0_section_disclosure_of_intent for epoch 4.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training for section: incident_details...


Epoch,Training Loss,Validation Loss
1,No log,0.233245
2,No log,0.207753
3,0.348300,0.200251
4,0.348300,0.197076


Saved checkpoint at ./models/incident_details_qa_finetuned/checkpoint_1.0_section_incident_details for epoch 1.0
Saved checkpoint at ./models/incident_details_qa_finetuned/checkpoint_2.0_section_incident_details for epoch 2.0
Saved checkpoint at ./models/incident_details_qa_finetuned/checkpoint_3.0_section_incident_details for epoch 3.0
Saved checkpoint at ./models/incident_details_qa_finetuned/checkpoint_4.0_section_incident_details for epoch 4.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the zipped contributing_factors output to "MyDrive/FLANT5 Training" on the mounted Drive.
!cp models/contributing_factors_qa_finetuned.zip /content/drive/MyDrive/FLANT5\ Training

In [None]:
# Copy the zipped incident_details output to "MyDrive/FLANT5 Training" on the mounted Drive.
!cp models/incident_details_qa_finetuned.zip /content/drive/MyDrive/FLANT5\ Training