In [None]:
%pip install rouge nltk

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


## Imports and Device Setup

In [6]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import xml.etree.ElementTree as ET
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


### Kaggle Dataset paths

In [9]:
# Root of the dataset as mounted under /kaggle/input
KAGGLE_INPUT_PATH = '/kaggle/input/chest-xrays-indiana-university'

# Image directory sits two levels below the dataset root
IMAGES_PATH = os.path.join(os.path.join(KAGGLE_INPUT_PATH, 'images'), 'images_normalized')

# Both CSV files sit directly in the dataset root
REPORTS_PATH, PROJECTIONS_PATH = (
    os.path.join(KAGGLE_INPUT_PATH, csv_name)
    for csv_name in ('indiana_reports.csv', 'indiana_projections.csv')
)

# Echo the resolved locations so a wrong mount point is obvious in the cell output
print(*(KAGGLE_INPUT_PATH, IMAGES_PATH, REPORTS_PATH, PROJECTIONS_PATH))

/kaggle/input/chest-xrays-indiana-university /kaggle/input/chest-xrays-indiana-university/images/images_normalized /kaggle/input/chest-xrays-indiana-university/indiana_reports.csv /kaggle/input/chest-xrays-indiana-university/indiana_projections.csv


## LoRA Implementation

In [10]:
class LoRALayer(nn.Module):
    """Bias-free linear map with a frozen base matrix plus a trainable low-rank update.

    The forward pass computes ``x @ weight.T + (x @ lora_A @ lora_B) * (alpha / rank)``.
    ``weight`` is randomly initialised and excluded from gradient updates;
    ``lora_A`` (small random values) and ``lora_B`` (zeros) are trainable, so
    the low-rank term contributes nothing until ``lora_B`` has been updated.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output.
        rank: inner dimension of the low-rank factors.
        alpha: numerator of the scaling factor applied to the low-rank term.
    """

    def __init__(self, in_features, out_features, rank=16, alpha=32):
        super().__init__()
        self.rank, self.alpha = rank, alpha
        self.scaling = alpha / rank

        # Trainable factors: A maps in_features -> rank, B maps rank -> out_features
        down_init = torch.randn(in_features, rank) * 0.01
        self.lora_A = nn.Parameter(down_init)
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))

        # Base matrix, kept out of gradient updates
        base_init = torch.randn(out_features, in_features)
        self.weight = nn.Parameter(base_init, requires_grad=False)

    def forward(self, x):
        """Return the frozen projection of ``x`` plus the scaled low-rank correction."""
        base_out = x @ self.weight.t()
        low_rank_out = (x @ self.lora_A) @ self.lora_B
        return base_out + low_rank_out * self.scaling

## Dataset Setup

In [12]:
class XRayDataset(Dataset):
    """Pairs each X-ray image with its tokenized report text.

    Args:
        image_paths: indexable collection of image file paths.
        reports: indexable collection of report texts, aligned with
            ``image_paths``; each entry is passed through ``str()``.
        tokenizer: callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            (presumably a Hugging Face tokenizer — confirm against the caller).
        max_length: length passed to the tokenizer for padding/truncation.
        transform: optional callable applied to the RGB PIL image.
    """

    def __init__(self, image_paths, reports, tokenizer, max_length=256, transform=None):
        self.image_paths = image_paths
        self.reports = reports
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform

    def __len__(self):
        """Number of samples, taken from ``image_paths``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return a dict with keys ``image``, ``input_ids``, ``attention_mask``, ``report``.

        If the image cannot be opened or transformed, a zero tensor of shape
        (3, 224, 224) is returned in its place so one bad file does not abort
        the run.
        """
        # Load and process image
        image_path = self.image_paths[idx]
        try:
            # The context manager closes the file handle once the RGB copy exists
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')
            if self.transform is not None:
                image = self.transform(image)
        except Exception:
            # Best-effort fallback. `Exception` (rather than a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            # NOTE(review): the fallback size is hard-coded; it only batches
            # correctly if `transform` also yields 3x224x224 tensors.
            image = torch.zeros(3, 224, 224)

        # Process text
        report = str(self.reports[idx])
        encoded = self.tokenizer(
            report,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'image': image,
            # Drop only the leading batch dimension; a bare squeeze() would also
            # collapse the sequence dimension when max_length == 1.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'report': report
        }

## Feature Extraction with ResNet (CNN)

In [13]:
class CNNEncoder(nn.Module):
    """Image encoder: frozen pretrained ResNet-18 backbone plus a trainable projection.

    The ResNet's final fully connected layer is dropped; its pooled features
    are flattened, projected to ``embed_size`` and batch-normalised.

    Args:
        embed_size: dimensionality of the returned feature vectors.
        use_lora: if True the projection is a ``LoRALayer``, otherwise a
            regular ``nn.Linear``.
    """

    def __init__(self, embed_size=768, use_lora=False):
        super(CNNEncoder, self).__init__()
        # Use ResNet18 for memory efficiency on Kaggle
        resnet = models.resnet18(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)

        # The backbone is only ever run under no_grad, so make the freeze explicit:
        # no gradients for its weights, and eval mode so its BatchNorm layers use
        # (and do not overwrite) the pretrained running statistics.
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.eval()

        if use_lora:
            self.linear = LoRALayer(resnet.fc.in_features, embed_size)
        else:
            self.linear = nn.Linear(resnet.fc.in_features, embed_size)

        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def train(self, mode=True):
        """Switch the trainable head between train/eval while keeping the backbone in eval.

        Without this, ``model.train()`` would put the backbone's BatchNorm layers
        into training mode; they would then normalise with batch statistics and
        update their running statistics even inside ``torch.no_grad()``.
        """
        super().train(mode)
        self.resnet.eval()
        return self

    def forward(self, images):
        """Return features of shape (batch, embed_size) for a batch of images.

        ``images`` is assumed to be a float tensor of shape (batch, 3, H, W) as
        expected by torchvision's ResNet — TODO confirm against the data pipeline.
        """
        # Backbone is frozen: skip building the autograd graph for it
        with torch.no_grad():
            features = self.resnet(images)
        # Collapse everything after the batch dimension into one feature axis
        features = features.reshape(features.size(0), -1)
        features = self.linear(features)
        features = self.bn(features)
        return features

## Adding RoBERTa

In [15]:
class XRayReportGenerator(nn.Module):
    """Main model combining CNN and RoBERTa with optional LoRA.

    An image is encoded to a single vector, projected to the transformer width
    and prepended as one extra "token" in front of the report's word
    embeddings. The combined sequence runs through a 6-layer RoBERTa built
    from a fresh config (not loaded from a pretrained checkpoint), and every
    text position is projected to vocabulary logits.

    Args:
        vocab_size: size of the token vocabulary (embedding rows and logit width).
        embed_size: output width of the image encoder.
        hidden_size: transformer hidden width.
        use_lora: use ``LoRALayer`` for the projections and freeze all RoBERTa
            parameters.
        max_length: longest text sequence (in tokens) that will be passed to
            ``forward``; used to size the position-embedding table.

    NOTE(review): with ``use_lora=True`` the frozen RoBERTa weights are the
    random initial ones, since nothing here loads pretrained weights — confirm
    this is intended.
    NOTE(review): the config does not set ``is_decoder``, so attention appears
    to be bidirectional and each position can see the token it is asked to
    score — confirm against the loss/decoding code before relying on this for
    generation.
    """

    def __init__(self, vocab_size, embed_size=768, hidden_size=768, use_lora=False,
                 max_length=256):
        super(XRayReportGenerator, self).__init__()

        self.use_lora = use_lora

        # Image encoder
        self.encoder = CNNEncoder(embed_size, use_lora=use_lora)

        # RoBERTa numbers positions starting at padding_idx + 1 (= 2 with the
        # default pad_token_id of 1), and the image token takes one extra slot.
        # The table therefore needs max_length + 1 + 2 rows; a 256-row table
        # overflows on full-length (256-token) inputs.
        roberta_position_offset = 2
        max_positions = max_length + 1 + roberta_position_offset

        # Text decoder using RoBERTa
        config = RobertaConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=6,  # Reduced for memory
            num_attention_heads=8,
            intermediate_size=2048,
            max_position_embeddings=max_positions
        )
        self.roberta = RobertaModel(config)

        # Projection layers with optional LoRA
        if use_lora:
            self.image_projection = LoRALayer(embed_size, hidden_size)
            self.output_projection = LoRALayer(hidden_size, vocab_size)
        else:
            self.image_projection = nn.Linear(embed_size, hidden_size)
            self.output_projection = nn.Linear(hidden_size, vocab_size)

        self.dropout = nn.Dropout(0.1)

        # Freeze RoBERTa layers if using LoRA
        if use_lora:
            for param in self.roberta.parameters():
                param.requires_grad = False

    def forward(self, images, input_ids=None, attention_mask=None):
        """Score report tokens conditioned on the image.

        Args:
            images: batch of images accepted by ``CNNEncoder``.
            input_ids: (batch, seq_len) token ids. When omitted, only the
                projected image features are returned.
            attention_mask: optional (batch, seq_len) mask for ``input_ids``;
                when omitted every text token is attended to.

        Returns:
            (batch, seq_len, vocab_size) logits for the text positions when
            ``input_ids`` is given, otherwise (batch, 1, hidden_size) image
            features.
        """
        # Encode images
        image_features = self.encoder(images)
        image_features = self.image_projection(image_features)
        image_features = image_features.unsqueeze(1)  # Add sequence dimension

        if input_ids is None:
            # Inference mode
            return image_features

        # Training mode: use teacher forcing.
        # Look up word embeddings only. The RoBERTa forward below adds position
        # and token-type embeddings, LayerNorm and dropout to `inputs_embeds`
        # itself, so running the full embedding module here would apply them twice.
        text_embeddings = self.roberta.embeddings.word_embeddings(input_ids)

        # Concatenate image features with text embeddings
        combined_embeddings = torch.cat([image_features, text_embeddings], dim=1)

        # Create attention mask for combined sequence (image token is always visible)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        image_mask = attention_mask.new_ones(attention_mask.size(0), 1)
        combined_mask = torch.cat([image_mask, attention_mask], dim=1)

        # Pass through RoBERTa
        outputs = self.roberta(
            inputs_embeds=combined_embeddings,
            attention_mask=combined_mask
        )

        # Project to vocabulary
        hidden_states = self.dropout(outputs.last_hidden_state)
        logits = self.output_projection(hidden_states)

        return logits[:, 1:, :]  # Remove image feature from output

## Inference class for generating reports (currently a repeat of the model definition above)

In [16]:
class XRayReportGenerator(nn.Module):
    """Main model combining CNN and RoBERTa with optional LoRA.

    This cell re-declares the class of the same name from the previous cell;
    being the later definition, it is the one in effect after Run All.

    An image is encoded to a single vector, projected to the transformer width
    and prepended as one extra "token" in front of the report's word
    embeddings. The combined sequence runs through a 6-layer RoBERTa built
    from a fresh config (not loaded from a pretrained checkpoint), and every
    text position is projected to vocabulary logits.

    Args:
        vocab_size: size of the token vocabulary (embedding rows and logit width).
        embed_size: output width of the image encoder.
        hidden_size: transformer hidden width.
        use_lora: use ``LoRALayer`` for the projections and freeze all RoBERTa
            parameters.
        max_length: longest text sequence (in tokens) that will be passed to
            ``forward``; used to size the position-embedding table.

    NOTE(review): with ``use_lora=True`` the frozen RoBERTa weights are the
    random initial ones, since nothing here loads pretrained weights — confirm
    this is intended.
    NOTE(review): the config does not set ``is_decoder``, so attention appears
    to be bidirectional and each position can see the token it is asked to
    score — confirm against the loss/decoding code before relying on this for
    generation.
    """

    def __init__(self, vocab_size, embed_size=768, hidden_size=768, use_lora=False,
                 max_length=256):
        super(XRayReportGenerator, self).__init__()

        self.use_lora = use_lora

        # Image encoder
        self.encoder = CNNEncoder(embed_size, use_lora=use_lora)

        # RoBERTa numbers positions starting at padding_idx + 1 (= 2 with the
        # default pad_token_id of 1), and the image token takes one extra slot.
        # The table therefore needs max_length + 1 + 2 rows; a 256-row table
        # overflows on full-length (256-token) inputs.
        roberta_position_offset = 2
        max_positions = max_length + 1 + roberta_position_offset

        # Text decoder using RoBERTa
        config = RobertaConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=6,  # Reduced for memory
            num_attention_heads=8,
            intermediate_size=2048,
            max_position_embeddings=max_positions
        )
        self.roberta = RobertaModel(config)

        # Projection layers with optional LoRA
        if use_lora:
            self.image_projection = LoRALayer(embed_size, hidden_size)
            self.output_projection = LoRALayer(hidden_size, vocab_size)
        else:
            self.image_projection = nn.Linear(embed_size, hidden_size)
            self.output_projection = nn.Linear(hidden_size, vocab_size)

        self.dropout = nn.Dropout(0.1)

        # Freeze RoBERTa layers if using LoRA
        if use_lora:
            for param in self.roberta.parameters():
                param.requires_grad = False

    def forward(self, images, input_ids=None, attention_mask=None):
        """Score report tokens conditioned on the image.

        Args:
            images: batch of images accepted by ``CNNEncoder``.
            input_ids: (batch, seq_len) token ids. When omitted, only the
                projected image features are returned.
            attention_mask: optional (batch, seq_len) mask for ``input_ids``;
                when omitted every text token is attended to.

        Returns:
            (batch, seq_len, vocab_size) logits for the text positions when
            ``input_ids`` is given, otherwise (batch, 1, hidden_size) image
            features.
        """
        # Encode images
        image_features = self.encoder(images)
        image_features = self.image_projection(image_features)
        image_features = image_features.unsqueeze(1)  # Add sequence dimension

        if input_ids is None:
            # Inference mode
            return image_features

        # Training mode: use teacher forcing.
        # Look up word embeddings only. The RoBERTa forward below adds position
        # and token-type embeddings, LayerNorm and dropout to `inputs_embeds`
        # itself, so running the full embedding module here would apply them twice.
        text_embeddings = self.roberta.embeddings.word_embeddings(input_ids)

        # Concatenate image features with text embeddings
        combined_embeddings = torch.cat([image_features, text_embeddings], dim=1)

        # Create attention mask for combined sequence (image token is always visible)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        image_mask = attention_mask.new_ones(attention_mask.size(0), 1)
        combined_mask = torch.cat([image_mask, attention_mask], dim=1)

        # Pass through RoBERTa
        outputs = self.roberta(
            inputs_embeds=combined_embeddings,
            attention_mask=combined_mask
        )

        # Project to vocabulary
        hidden_states = self.dropout(outputs.last_hidden_state)
        logits = self.output_projection(hidden_states)

        return logits[:, 1:, :]  # Remove image feature from output