Dependencies

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers

In [None]:
pip install datasets


In [None]:
pip install "accelerate>=0.26.0"


In [None]:
!pip install transformers-4.46.2-py3-none-any.whl

In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126


In [None]:
pip show accelerate


In [None]:
!nvcc --version



In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this environment.
print(f"CUDA available: {torch.cuda.is_available()}")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.metrics import classification_report
import json

# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load the tokenizer and a sequence-classification model from the local
# ./Bio_ClinicalBERT checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("./Bio_ClinicalBERT")
model = AutoModelForSequenceClassification.from_pretrained(
    "./Bio_ClinicalBERT",
    num_labels=3,  # assumption carried over from the sample data: 3 chronic diseases
)

# Re-initialise the classification head: weights drawn from
# N(0, initializer_range), bias set to zero. Done under no_grad so the
# in-place writes are not tracked by autograd.
classifier_head = model.classifier
with torch.no_grad():
    classifier_head.weight.normal_(mean=0.0, std=model.config.initializer_range)
    classifier_head.bias.zero_()

# Place the model on the selected device.
model.to(device)

# Custom Trainer to handle non-contiguous tensors (optional, only if issues arise)
class CustomTrainer(Trainer):
    """Trainer that makes every model parameter contiguous before saving."""

    def _save(self, output_dir: str, state_dict=None):
        """Compact non-contiguous parameters, then delegate to ``Trainer._save``.

        Args:
            output_dir: Directory handed through to the parent ``_save``.
            state_dict: Optional state dict handed through unchanged; only the
                live model parameters are compacted here, not this argument.
        """
        needs_compaction = (
            param for param in self.model.parameters() if not param.is_contiguous()
        )
        for param in needs_compaction:
            # Swap in a contiguous copy of the underlying storage.
            param.data = param.contiguous()
        super()._save(output_dir, state_dict)

# Three hand-written notes, one per label, used purely as test input.
data = [
    dict(text="Patient has a history of type 2 diabetes.", label=0),
    dict(text="Hypertension is controlled with medication.", label=1),
    dict(text="Asthma symptoms worsening over the past week.", label=2),
]

# Tabular view of the samples: one row per note, columns "text" and "label".
df = pd.DataFrame(data)

# Tokenization function with max_length
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=128``, and its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Convert the DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Tokenize every row; batched=True hands tokenize_function a batch of rows
# at a time instead of a single example.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Hold out 20% of the rows for evaluation. The split shuffles the data, so
# a fixed seed is passed to make the train/test membership (and therefore
# every downstream prediction and report) reproducible between runs.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Trainer configuration. The visible script never calls train(); these
# arguments are supplied only because constructing a Trainer needs them.
training_args = TrainingArguments(
    # Where checkpoints would be written.
    output_dir="./results",
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Epoch count (recorded later in the catalog).
    num_train_epochs=3,
    # Checkpoint every 500 steps, keeping only the most recent one.
    save_steps=500,
    save_total_limit=1,
)

# Build the trainer around the model and both dataset splits; it is used
# below for prediction only.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run inference on the held-out split with the model as loaded (no
# fine-tuning has taken place).
predictions = trainer.predict(test_dataset)

# predictions.predictions is expected to be a NumPy array of logits with one
# row per note. It already lives in host memory, so the argmax is taken there
# directly instead of copying the logits to the GPU and back again.
preds = torch.as_tensor(predictions.predictions).argmax(dim=1)

# Create the data catalog: run metadata plus one result record per test note.
catalog = {
    "use_case": "Classifying Doctors' Notes for Chronic Diseases",
    "model": "Bio_ClinicalBERT",
    "model_parameters": {
        "num_labels": 3,
        "epochs": training_args.num_train_epochs,
        "batch_size": training_args.per_device_train_batch_size,
    },
    "results": [],
}

# Convert the raw logits into class probabilities so that "confidence_score"
# is a value in [0, 1]. The largest raw logit is unbounded and cannot be read
# as a confidence. The cast to float32 keeps softmax usable even if the
# logits arrive in reduced precision.
class_probabilities = torch.softmax(
    torch.as_tensor(predictions.predictions).float(), dim=1
)
confidence_scores = class_probabilities.max(dim=1).values.tolist()

# Read each dataset column once instead of re-reading it on every iteration,
# and convert the predicted labels to plain Python ints for JSON.
catalog["results"].extend(
    {
        "note": note,
        "true_label": true_label,
        "predicted_label": predicted_label,
        "confidence_score": confidence_score,
    }
    for note, true_label, predicted_label, confidence_score in zip(
        test_dataset["text"],
        test_dataset["label"],
        preds.tolist(),
        confidence_scores,
    )
)

# Persist the catalog as JSON with 4-space indentation.
with open("classification_catalog.json", "w") as catalog_file:
    json.dump(obj=catalog, fp=catalog_file, indent=4)

# Print the per-class classification report. The predictions are brought to
# host memory before being handed to scikit-learn.
y_true = test_dataset["label"]
y_pred = preds.cpu()
print(classification_report(y_true, y_pred))

