In [1]:
# 📘 T5-Large Fine-Tuning with PEFT LoRA on Risk Communication Dataset

In [2]:
# 🛠️ 1. Install Required Libraries
!pip install -q transformers datasets peft accelerate sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m113.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# 📥 2. Load and Preprocess the Dataset
from datasets import load_dataset
from transformers import T5Tokenizer

In [4]:
# Upload CSV manually via Colab UI or mount Google Drive
file_path = "/content/risk_data_formatted.csv"  # <-- replace if needed

In [5]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Convert the pandas DataFrame to a datasets Dataset
# Assuming the CSV has 'input' and 'output' columns
dataset = Dataset.from_pandas(df)

# Rename columns as in the original code
dataset = dataset.rename_columns({"input": "input_text", "output": "target_text"})

# Split the dataset
# Convert the single Dataset object into a DatasetDict with 'train' and 'test' splits
train_test_split = dataset.train_test_split(test_size=0.1)
dataset = DatasetDict({'train': train_test_split['train'], 'test': train_test_split['test']})

In [6]:
# Add prompt to each input_text
def add_prompt_prefix(example):
    example["input_text"] = f"Extract structured risk information from the following sentence: {example['input_text']}"
    return example

dataset = dataset.map(add_prompt_prefix)


Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

In [7]:
tokenizer = T5Tokenizer.from_pretrained("t5-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
def preprocess(example):
    input_enc = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=512)
    target_enc = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=512)
    input_enc["labels"] = target_enc["input_ids"]
    return input_enc

In [9]:
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["input_text", "target_text"])

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

In [10]:
# 🔌 3. Load T5-Large and Apply LoRA
from transformers import T5ForConditionalGeneration
from peft import get_peft_model, LoraConfig, TaskType

In [14]:
model = T5ForConditionalGeneration.from_pretrained("t5-large")

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none"
)

In [16]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 740,027,392 || trainable%: 0.3188


In [5]:
# 🎯 4. Training Setup
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [2]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./t5_lora_risk_output", #storage place of trained model
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    learning_rate=5e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",         # ✅ Add this to log every epoch
    logging_dir="./logs",
    report_to="none",
    logging_first_step=True,          # ✅ Optional: log on first step
)


In [4]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

NameError: name 'Trainer' is not defined

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1599,0.139473
2,0.1298,0.139992
3,0.1124,0.136926
4,0.1039,0.134434
5,0.1007,0.134284


TrainOutput(global_step=755, training_loss=0.12127252135253111, metrics={'train_runtime': 564.9469, 'train_samples_per_second': 1.336, 'train_steps_per_second': 1.336, 'total_flos': 1640082765250560.0, 'train_loss': 0.12127252135253111, 'epoch': 5.0})

In [None]:
# 🔍 5. Inference Function

In [6]:
# 🔍 5. Inference Function
# %%
def predict(text):
    prompt = f"Extract structured risk information from the following sentence: {text}"
    # Ensure inputs are on the correct device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).input_ids.cuda()
    # Pass input_ids and max_length as keyword arguments
    outputs = model.generate(input_ids=inputs, max_length=512) # Changed: Pass inputs as keyword argument
    return tokenizer.decode(outputs[0], skip_special_tokens=True)





In [31]:
# ✅ 6. Test on a custom example
print("\nCustom test case:\n")
custom_text = (
    "Living in a city with high air pollution increases your risk of lung disease by 10%. "
    "While that may seem minor, it translates to an increase from 30 in 1,000 to 33 in "
    "1,000 people developing chronic respiratory issues over a decade."
)
predicted_custom_output = predict(custom_text)
print(f"Custom Input: {custom_text}")
print(f"Model Output: {predicted_custom_output}")


Custom test case:

Custom Input: Living in a city with high air pollution increases your risk of lung disease by 10%. While that may seem minor, it translates to an increase from 30 in 1,000 to 33 in 1,000 people developing chronic respiratory issues over a decade.
Model Output: Risk communication: 1 Absolute risk (base case): 30/1000 Absolute risk (new case): 33/1000 Absolute number (base case): null Absolute number (new case): null Absolute risk difference: null Relative risk difference: 0.10 Absolute number difference: null Verbal risk descriptor (base case): null Verbal risk descriptor (new situation): null Verbal risk descriptor (change from base to new): null Reference class size (base case: absolute number): 1000 Reference class size (new case: absolute number): 1000 Reference class description (base case): people living in a city with high air pollution Reference class description (new case): people living in a city with high air pollution Source (base case): null Source (new 

In [35]:
import pandas as pd
import re

# === Step 1: Save ground truth from your original DataFrame ===
df[["input", "output"]].to_csv("ground_truth.csv", index=False)

# === Step 2: Use model to predict outputs ===
pred_outputs = []
from tqdm import tqdm  # Optional: progress bar
for text in tqdm(df["input"], desc="Generating predictions"):
    input_ids = tokenizer(text, return_tensors="pt", truncation=True).input_ids.to(model.device)
    # Pass input_ids and max_length as keyword arguments
    output_ids = model.generate(input_ids=input_ids, max_length=512) # Changed: Pass inputs as keyword argument
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    pred_outputs.append(decoded)

# Save predictions
pred_df = pd.DataFrame({
    "input": df["input"],
    "output": pred_outputs
})
pred_df.to_csv("model_outputs.csv", index=False)
print("✅ Saved ground_truth.csv and model_outputs.csv")

# === Step 3: Evaluate predictions field-by-field ===

# Load both files
ground_truth = pd.read_csv("ground_truth.csv")
predictions = pd.read_csv("model_outputs.csv")
merged = pd.merge(ground_truth, predictions, on="input", suffixes=("_true", "_pred"))

def parse_output(text):
    result = {}
    for line in str(text).strip().split("\n"):
        if ": " in line:
            key, value = line.split(": ", 1)
            result[key.strip()] = value.strip()
    return result

def normalize_value(val):
    val = val.lower().strip()
    if val in ["null", "n/a", "none", ""]:
        return "null"
    val = val.replace(",", "")
    match = re.match(r"([0-9.]+)\s*(million|thousand)?", val)
    if match:
        num, unit = match.groups()
        num = float(num)
        if unit == "million":
            num *= 1_000_000
        elif unit == "thousand":
            num *= 1_000
        return str(int(num)) if num.is_integer() else str(num)
    return val

field_stats = {}
results = []

for _, row in merged.iterrows():
    true_dict = parse_output(row["output_true"])
    pred_dict = parse_output(row["output_pred"])
    row_result = {"input": row["input"]}

    for field in true_dict:
        pred_val = normalize_value(pred_dict.get(field, "null"))
        true_val = normalize_value(true_dict.get(field, "null"))
        correct = (pred_val == true_val)
        row_result[field] = "✅" if correct else f"❌ (pred: {pred_val})"
        if field not in field_stats:
            field_stats[field] = {"correct": 0, "total": 0}
        field_stats[field]["total"] += 1
        if correct:
            field_stats[field]["correct"] += 1
    results.append(row_result)

# Show sample and summary
results_df = pd.DataFrame(results)
print("\n=== Sample Comparison ===")
print(results_df.head(3).to_string())

print("\n=== Field-wise Accuracy ===")
for field, stats in field_stats.items():
    acc = 100 * stats["correct"] / stats["total"]
    print(f"{field:50s} → {acc:.1f}% ({stats['correct']}/{stats['total']})")


Generating predictions: 100%|██████████| 168/168 [10:05<00:00,  3.61s/it]

✅ Saved ground_truth.csv and model_outputs.csv

=== Sample Comparison ===
                                                                                                                                                                                                                                                                                                    input Risk communication Absolute risk (base case) Absolute risk (new case) Absolute number (base case) Absolute number (new case) Absolute risk difference Relative risk difference Absolute number difference Verbal risk descriptor (base case) Verbal risk descriptor (new situation) Verbal risk descriptor (change from base to new) Reference class size (base case Reference class size (new case Reference class description (base case) Reference class description (new case) Source (base case) Source (new situation)  Topic and unit
0                                                                                                      


