In [5]:
# === Essential Setup ===
!pip install -q transformers pandas sentence-transformers gdown

import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW # Import AdamW from torch.optim
from tqdm.auto import tqdm
import gdown
import os
# === Streamlined Configuration ===
TRAIN_URL = 'https://drive.google.com/uc?id=1HU8jXdzYPjReMCWpYnoXdw4SZgLgvkHZ'
TEST_URL = 'https://drive.google.com/uc?id=1EW71QvETOnPKWrx8JZCj7sPXY4Iq60nq'
MODEL_NAME = "t5-base"
MAX_INPUT_LENGTH = 384
MAX_TARGET_LENGTH = 160
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 8
EPOCHS = 12
LR = 3e-5
OUTPUT_DIR = './model_output'

# === Data Preparation ===
train_path = 'train.csv'
test_path = 'test.csv'
if not os.path.exists(train_path):
    gdown.download(TRAIN_URL, train_path, quiet=False)
if not os.path.exists(test_path):
    gdown.download(TEST_URL, test_path, quiet=False)

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Clean and concise input builder
def build_input(row):
    sections = [
        f"County: {row.get('county', '')}",
        f"Experience: {row.get('experience_level', '')}",
        f"Case: {row['Prompt']}",
    ]

    if 'DDX SNOMED' in row and pd.notna(row['DDX SNOMED']):
        sections.append(f"Diagnoses: {row['DDX SNOMED']}")

    return "\n".join(sections)

train_df["input_text"] = train_df.apply(build_input, axis=1)
train_df["target_text"] = train_df["Clinician"]
test_df["input_text"] = test_df.apply(build_input, axis=1)

# === Initialize Model ===
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

# === Efficient Dataset ===
class ClinicalDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        inputs = tokenizer(
            row["input_text"],
            max_length=MAX_INPUT_LENGTH,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        targets = tokenizer(
            row["target_text"],
            max_length=MAX_TARGET_LENGTH,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze()
        }

# Create datasets
train_dataset = ClinicalDataset(train_df, tokenizer)
val_size = min(100, int(0.1 * len(train_dataset)))  # Fixed-size validation
train_size = len(train_dataset) - val_size
train_ds, val_ds = torch.utils.data.random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

# === Focused Training ===
optimizer = AdamW(model.parameters(), lr=LR)
best_loss = float('inf')

for epoch in range(EPOCHS):
    # Training
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Save best model
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        print(f"Saved best model with loss: {best_loss:.4f}")

# Load best model
model = T5ForConditionalGeneration.from_pretrained(OUTPUT_DIR).to(DEVICE)

# === Smart Generation ===
def generate_response(input_text):
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    ).to(DEVICE)

    outputs = model.generate(
        **inputs,
        max_length=MAX_TARGET_LENGTH,
        num_beams=4,          # Better than sampling for quality
        early_stopping=True,
        repetition_penalty=2.5,  # Reduce repetition
        length_penalty=1.2,      # Encourage longer responses
        no_repeat_ngram_size=3    # Prevent n-gram repeats
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate test responses
print("\nGenerating test responses...")
test_df["Clinician"] = test_df["input_text"].apply(generate_response)

# Create submission
submission_df = test_df[["Master_Index", "Clinician"]]
submission_path = "clinical_reasoning_submission.csv"
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission created: {submission_path}")

Epoch 1/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 5.2392 | Val Loss: 3.2254
Saved best model with loss: 3.2254


Epoch 2/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 3.2234 | Val Loss: 2.9635
Saved best model with loss: 2.9635


Epoch 3/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 2.9994 | Val Loss: 2.8386
Saved best model with loss: 2.8386


Epoch 4/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 2.8646 | Val Loss: 2.7450
Saved best model with loss: 2.7450


Epoch 5/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 2.7547 | Val Loss: 2.6717
Saved best model with loss: 2.6717


Epoch 6/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 6 | Train Loss: 2.6613 | Val Loss: 2.6243
Saved best model with loss: 2.6243


Epoch 7/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 7 | Train Loss: 2.5880 | Val Loss: 2.5813
Saved best model with loss: 2.5813


Epoch 8/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 8 | Train Loss: 2.5323 | Val Loss: 2.5536
Saved best model with loss: 2.5536


Epoch 9/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 9 | Train Loss: 2.4628 | Val Loss: 2.5273
Saved best model with loss: 2.5273


Epoch 10/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 2.4201 | Val Loss: 2.5053
Saved best model with loss: 2.5053


Epoch 11/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 11 | Train Loss: 2.3709 | Val Loss: 2.4887
Saved best model with loss: 2.4887


Epoch 12/12:   0%|          | 0/45 [00:00<?, ?it/s]

Epoch 12 | Train Loss: 2.3315 | Val Loss: 2.4700
Saved best model with loss: 2.4700

Generating test responses...

Submission created: clinical_reasoning_submission.csv
