In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [16]:
# train.py (Main Kaggle notebook file)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoProcessor, AutoModelForCausalLM
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image
import yaml
import logging
from tqdm.notebook import tqdm
import wandb
from datetime import datetime
import os
import math

# Configuration
CONFIG = {
    'data': {
        'input_path': '/kaggle/input/chest-xrays-indiana-university',
        'output_path': '/kaggle/working',
        'image_dir': 'images/images_normalized',
        'image_size': 224,
        'max_length': 512,
        'batch_size': 8
    },
    'model': {
        'name': 'microsoft/git-base',
        'lora_rank': 4,
        'lora_alpha': 32
    },
    'training': {
        'epochs': 10,
        'learning_rate': 2e-5,
        'weight_decay': 0.01,
        'warmup_steps': 100
    }
}

def prepare_metadata(input_path, output_path):
    """
    Prepare metadata by merging projections and reports data.
    """
    # Read the CSV files
    projections_df = pd.read_csv(os.path.join(input_path, 'indiana_projections.csv'))
    reports_df = pd.read_csv(os.path.join(input_path, 'indiana_reports.csv'))
    
    # Create metadata entries
    metadata_entries = []
    for _, row in projections_df.iterrows():
        uid = row['uid']
        filename = row['filename']
        
        # Get the corresponding report
        report_data = reports_df[reports_df['uid'] == uid]
        if len(report_data) == 0:
            continue
            
        report_data = report_data.iloc[0]
        
        # Combine findings and impression for the report
        report = f"Findings: {report_data['findings']} Impression: {report_data['impression']}"
        
        metadata_entries.append({
            'image_filename': filename,
            'report': report,
            'projection': row['projection'],
            'problems': report_data['Problems'],
            'indication': report_data['indication']
        })
    
    # Create metadata DataFrame
    metadata_df = pd.DataFrame(metadata_entries)
    
    # Save the metadata to the working directory
    metadata_path = os.path.join(output_path, 'metadata.csv')
    metadata_df.to_csv(metadata_path, index=False)
    print(f"Metadata saved to {metadata_path}")
    
    return metadata_path

# Dataset class
class XRayReportDataset(Dataset):
    def __init__(self, data_dir, transform=None, max_length=512):
        self.data_dir = Path(data_dir)
        self.transform = transform
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.max_length = max_length
        
        # Load metadata from working directory
        self.metadata = pd.read_csv(Path(CONFIG['data']['output_path']) / 'metadata.csv')

    def __len__(self):
        return len(self.metadata)
        
    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        # Update image path to use the correct subdirectory
        image_path = self.data_dir / CONFIG['data']['image_dir'] / row['image_filename']
        report = row['report']
        
        # Load and transform image
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
            
        # Process text
        encoding = self.processor(
            text=report,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            legacy=False
        )
        
        return {
            'pixel_values': encoding['pixel_values'].squeeze(),
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'report': report
        }

# Model class with LoRA
class XRayReportGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(CONFIG['model']['name'])
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.lora_mappings = {}
        # Add LoRA layers
        self._add_lora_layers()
        self._freeze_base_model()
    
    def _add_lora_layers(self):
        self.lora_layers = nn.ModuleDict()
        for name, module in self.model.named_modules():
            if "query" in name or "value" in name:
                if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                    # Create a unique key for the layer
                    layer_id = name.replace(".", "_")
                    
                    # Create LoRA layers
                    lora_down = nn.Linear(module.in_features, CONFIG['model']['lora_rank'], bias=False)
                    lora_up = nn.Linear(CONFIG['model']['lora_rank'], module.out_features, bias=False)
                    
                    # Initialize weights
                    nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
                    nn.init.zeros_(lora_up.weight)
                    
                    # Register the layers as ModuleList to ensure proper device placement and parameter tracking
                    self.add_module(f"lora_down_{layer_id}", lora_down)
                    self.add_module(f"lora_up_{layer_id}", lora_up)
                    
                    # Store the mapping between original module and its LoRA layers
                    self.lora_mappings[name] = (f"lora_down_{layer_id}", f"lora_up_{layer_id}")
    
    def _freeze_base_model(self):
        """Freeze base model parameters and unfreeze LoRA parameters"""
        for param in self.model.parameters():
            param.requires_grad = False
            
        # Unfreeze LoRA parameters
        for down, up in self.lora_layers.values():
            for param in down.parameters():
                param.requires_grad = True
            for param in up.parameters():
                param.requires_grad = True
    
    def forward(self, images, input_ids=None, attention_mask=None, labels=None):
        if pixel_values.requires_grad == False:
            pixel_values = pixel_values.detach().requires_grad_(True)
        outputs = self.model(
            pixel_values=images,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True,
            return_dict=True
        )

        hidden_states = outputs.hidden_states[-1]

        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                if name in self.lora_layers:
                    down_name, up_name = self.lora_mappings[name]
                    lora_down = getattr(self, down_name)
                    lora_up = getattr(self, up_name)
                    
                    # Apply LoRA transformation
                    lora_output = lora_up(lora_down(hidden_states))
                    
                    # Combine with original output
                    original_output = module(hidden_states)
                    module.output = original_output + CONFIG['model']['lora_alpha'] * lora_output
        
        return outputs

    def generate(self, images, max_length=100, num_beams=4):
            """Generate text based on image input"""
            return self.model.generate(
                pixel_values=images,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                no_repeat_ngram_size=3
            )

     

# Training function
def train_model(model, train_loader, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=CONFIG['training']['learning_rate'],
        weight_decay=CONFIG['training']['weight_decay']
    )
    
    best_val_loss = float('inf')
    
    for epoch in range(CONFIG['training']['epochs']):
        # Training
        model.train()
        train_loss = 0
        train_steps = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch in progress_bar:
            try:
                # Move batch to device
                pixel_values = batch['pixel_values'].to(device)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                
                # Forward pass
                outputs = model(
                    pixel_values=pixel_values,
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=input_ids
                )
                
                loss = outputs.loss
                
                # Check for anomalous loss
                if not torch.isfinite(loss):
                    logging.warning("Non-finite loss detected, skipping batch")
                    continue
                
                # Backward pass with gradient clipping
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                
                train_loss += loss.item()
                train_steps += 1
                
                # Update progress bar
                progress_bar.set_postfix({'loss': loss.item()})
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                    logging.warning("GPU OOM, skipping batch")
                    continue
                raise e
        
        avg_train_loss = train_loss / train_steps
        
        # Validation
        model.eval()
        val_loss = 0
        val_steps = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validating'):
                images = batch['image'].to(device)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                
                outputs = model(
                    images=images,
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=input_ids
                )
                
                val_loss += outputs.loss.item()
                val_steps += 1
        
        avg_val_loss = val_loss / val_steps
        
        # Log metrics
        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'lora_state_dict': model.lora_layers.state_dict(),
                'epoch': epoch,
                'val_loss': best_val_loss
            }, '/kaggle/working/best_model_lora.pt')

def main():
    #input_path = CONFIG['data']['input_path']
    
    prepare_metadata(
        input_path=CONFIG["data"]["input_path"],
        output_path=CONFIG["data"]["output_path"]
    )
    # Set up transforms
    transform = transforms.Compose([
        transforms.Resize((CONFIG['data']['image_size'], CONFIG['data']['image_size'])),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    
    # Create dataset
    dataset = XRayReportDataset(
        data_dir=CONFIG['data']['input_path'],
        transform=transform,
        max_length=CONFIG['data']['max_length']
    )
    
    # Split dataset
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )
    
    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=False,
        num_workers=2
    )
    
    # Initialize model
    model = XRayReportGenerator()
    
    # Train model
    train_model(model, train_loader, val_loader)

if __name__ == "__main__":
    main()

Metadata saved to /kaggle/working/metadata.csv


Epoch 1:   0%|          | 0/747 [00:00<?, ?it/s]

KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 50, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataset.py", line 420, in __getitems__
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataset.py", line 420, in <listcomp>
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "<ipython-input-16-db388528e34d>", line 120, in __getitem__
    'pixel_values': encoding['pixel_values'].squeeze(),
  File "/usr/local/lib/python3.10/dist-packages/transformers/feature_extraction_utils.py", line 87, in __getitem__
    return self.data[item]
KeyError: 'pixel_values'


In [17]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoModelForCausalLM, AutoProcessor
from pathlib import Path
import pandas as pd
from PIL import Image
import os
import math
from tqdm.notebook import tqdm

CONFIG = {
    'data': {
        'input_path': '/kaggle/input/chest-xrays-indiana-university',
        'output_path': '/kaggle/working',
        'image_dir': 'images/images_normalized',
        'image_size': 224,
        'max_length': 512,
        'batch_size': 8
    },
    'model': {
        'name': 'microsoft/git-base',
        'lora_rank': 4,
        'lora_alpha': 32
    },
    'training': {
        'epochs': 10,
        'learning_rate': 2e-5,
        'weight_decay': 0.01,
        'warmup_steps': 100
    }
}

def prepare_metadata(input_path, output_path):
    projections_df = pd.read_csv(os.path.join(input_path, 'indiana_projections.csv'))
    reports_df = pd.read_csv(os.path.join(input_path, 'indiana_reports.csv'))
    
    metadata_entries = []
    for _, row in projections_df.iterrows():
        report_data = reports_df[reports_df['uid'] == row['uid']]
        if len(report_data) == 0:
            continue
            
        report_data = report_data.iloc[0]
        report = f"Findings: {report_data['findings']} Impression: {report_data['impression']}"
        
        metadata_entries.append({
            'image_filename': row['filename'],
            'report': report,
        })
    
    metadata_df = pd.DataFrame(metadata_entries)
    metadata_path = os.path.join(output_path, 'metadata.csv')
    metadata_df.to_csv(metadata_path, index=False)
    return metadata_path

class XRayReportDataset(Dataset):
    def __init__(self, data_dir, transform=None, max_length=512):
        self.data_dir = Path(data_dir)
        self.transform = transform
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.max_length = max_length
        self.metadata = pd.read_csv(Path(CONFIG['data']['output_path']) / 'metadata.csv')
    
    def __len__(self):
        return len(self.metadata)
        
    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        image_path = self.data_dir / CONFIG['data']['image_dir'] / row['image_filename']
        
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        
        # Process both image and text using the processor
        encoding = self.processor(
            images=image,
            text=row['report'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        
        # Remove batch dimension added by processor
        for k in encoding.keys():
            encoding[k] = encoding[k].squeeze(0)
            
        return encoding

class XRayReportGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(CONFIG['model']['name'])
        self.lora_layers = nn.ModuleDict()
        self._add_lora_layers()
        self._freeze_base_model()
    
    def _add_lora_layers(self):
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                lora_down = nn.Linear(module.in_features, CONFIG['model']['lora_rank'], bias=False)
                lora_up = nn.Linear(CONFIG['model']['lora_rank'], module.out_features, bias=False)
                
                nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
                nn.init.zeros_(lora_up.weight)
                
                self.lora_layers[layer_id] = nn.ModuleList([lora_down, lora_up])
    
    def _freeze_base_model(self):
        for param in self.model.parameters():
            param.requires_grad = False
        
        for layer in self.lora_layers.values():
            for param in layer.parameters():
                param.requires_grad = True
    
    def forward(self, pixel_values, input_ids, attention_mask=None, labels=None):
        outputs = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True
        )
        
        hidden_states = outputs.hidden_states[-1]
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in self.lora_layers:
                    lora_down, lora_up = self.lora_layers[layer_id]
                    lora_output = lora_up(lora_down(hidden_states))
                    module.weight.data += (CONFIG['model']['lora_alpha'] * lora_output)
        
        return outputs

def train_model(model, train_loader, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=CONFIG['training']['learning_rate'],
        weight_decay=CONFIG['training']['weight_decay']
    )
    
    best_val_loss = float('inf')
    
    for epoch in range(CONFIG['training']['epochs']):
        model.train()
        train_loss = 0
        train_steps = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch in progress_bar:
            try:
                batch = {k: v.to(device) for k, v in batch.items()}
                
                outputs = model(
                    pixel_values=batch['pixel_values'],
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['input_ids']
                )
                
                loss = outputs.loss
                if not torch.isfinite(loss):
                    continue
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                
                train_loss += loss.item()
                train_steps += 1
                progress_bar.set_postfix({'loss': loss.item()})
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    torch.cuda.empty_cache()
                    continue
                raise e
        
        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {train_loss/train_steps:.4f}')
        
        if train_loss/train_steps < best_val_loss:
            best_val_loss = train_loss/train_steps
            torch.save({
                'lora_state_dict': model.lora_layers.state_dict(),
                'epoch': epoch,
                'loss': best_val_loss
            }, os.path.join(CONFIG['data']['output_path'], 'best_model_lora.pt'))

def main():
    prepare_metadata(CONFIG['data']['input_path'], CONFIG['data']['output_path'])
    
    transform = transforms.Compose([
        transforms.Resize((CONFIG['data']['image_size'], CONFIG['data']['image_size'])),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    
    dataset = XRayReportDataset(
        data_dir=CONFIG['data']['input_path'],
        transform=transform,
        max_length=CONFIG['data']['max_length']
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=False,
        num_workers=2
    )
    
    model = XRayReportGenerator()
    train_model(model, train_loader, val_loader)

if __name__ == "__main__":
    main()

Epoch 1:   0%|          | 0/747 [00:00<?, ?it/s]

Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the last token (EOS token) of the input_ids and attention_mask tensors will be removed. To test the new behavior, set `legacy=False`as a processor call argument.
Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the last token (EOS token) of the input_ids and attention_mask tensors will be removed. To test the new behavior, set `legacy=False`as a processor call argument.


ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 50, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataset.py", line 420, in __getitems__
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataset.py", line 420, in <listcomp>
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "<ipython-input-17-a4ca362ddd07>", line 78, in __getitem__
    encoding = self.processor(
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/git/processing_git.py", line 124, in __call__
    image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
  File "/usr/local/lib/python3.10/dist-packages/transformers/image_processing_utils.py", line 41, in __call__
    return self.preprocess(images, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/clip/image_processing_clip.py", line 325, in preprocess
    image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/clip/image_processing_clip.py", line 191, in resize
    return resize(
  File "/usr/local/lib/python3.10/dist-packages/transformers/image_transforms.py", line 337, in resize
    do_rescale = _rescale_for_pil_conversion(image)
  File "/usr/local/lib/python3.10/dist-packages/transformers/image_transforms.py", line 158, in _rescale_for_pil_conversion
    raise ValueError(
ValueError: The image to be converted to a PIL image contains values outside the range [0, 1], got [-2.1007792949676514, 2.6051416397094727] which cannot be converted to uint8.


In [18]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
from PIL import Image
import os
import math
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor

CONFIG = {
    'data': {
        'input_path': '/kaggle/input/chest-xrays-indiana-university',
        'output_path': '/kaggle/working',
        'image_dir': 'images/images_normalized',
        'max_length': 512,
        'batch_size': 8
    },
    'model': {
        'name': 'microsoft/git-base',
        'lora_rank': 4,
        'lora_alpha': 32
    },
    'training': {
        'epochs': 10,
        'learning_rate': 2e-5,
        'weight_decay': 0.01
    }
}

def prepare_metadata(input_path, output_path):
    projections_df = pd.read_csv(os.path.join(input_path, 'indiana_projections.csv'))
    reports_df = pd.read_csv(os.path.join(input_path, 'indiana_reports.csv'))
    
    metadata_entries = []
    for _, row in projections_df.iterrows():
        report_data = reports_df[reports_df['uid'] == row['uid']]
        if len(report_data) == 0:
            continue
        report_data = report_data.iloc[0]
        report = f"Findings: {report_data['findings']} Impression: {report_data['impression']}"
        metadata_entries.append({
            'image_filename': row['filename'],
            'report': report,
        })
    
    metadata_df = pd.DataFrame(metadata_entries)
    metadata_path = os.path.join(output_path, 'metadata.csv')
    metadata_df.to_csv(metadata_path, index=False)
    return metadata_path

class XRayReportDataset(Dataset):
    def __init__(self, data_dir, max_length=512):
        self.data_dir = Path(data_dir)
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.max_length = max_length
        self.metadata = pd.read_csv(Path(CONFIG['data']['output_path']) / 'metadata.csv')
    
    def __len__(self):
        return len(self.metadata)
        
    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        image_path = self.data_dir / CONFIG['data']['image_dir'] / row['image_filename']
        image = Image.open(image_path).convert('RGB')
        
        encoding = self.processor(
            images=image,
            text=row['report'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
            legacy=False
        )
        
        return {k: v.squeeze(0) for k, v in encoding.items()}

class XRayReportGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(CONFIG['model']['name'])
        self.lora_layers = nn.ModuleDict()
        self._add_lora_layers()
        self._freeze_base_model()
    
    def _add_lora_layers(self):
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in self.lora_layers:
                lora_down = nn.Linear(module.in_features, CONFIG['model']['lora_rank'], bias=False)
                lora_up = nn.Linear(CONFIG['model']['lora_rank'], module.out_features, bias=False)
                
                nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
                nn.init.zeros_(lora_up.weight)
                
                self.lora_layers[layer_id] = nn.ModuleList([lora_down, lora_up])
    
    def _freeze_base_model(self):
        for param in self.model.parameters():
            param.requires_grad = False
        
        for layer in self.lora_layers.values():
            for param in layer.parameters():
                param.requires_grad = True
    
    def forward(self, pixel_values, input_ids, attention_mask=None, labels=None):
        outputs = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True
        )
        
        hidden_states = outputs.hidden_states[-1]
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in self.lora_layers:
                    lora_down, lora_up = self.lora_layers[layer_id]
                    lora_output = lora_up(lora_down(hidden_states))
                    module.weight.data += (CONFIG['model']['lora_alpha'] * lora_output)
        
        return outputs

def train_model(model, train_loader, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=CONFIG['training']['learning_rate'],
        weight_decay=CONFIG['training']['weight_decay']
    )
    
    best_loss = float('inf')
    
    for epoch in range(CONFIG['training']['epochs']):
        model.train()
        total_loss = 0
        steps = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch in progress_bar:
            try:
                batch = {k: v.to(device) for k, v in batch.items()}
                
                outputs = model(
                    pixel_values=batch['pixel_values'],
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['input_ids']
                )
                
                loss = outputs.loss
                if not torch.isfinite(loss):
                    continue
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                
                total_loss += loss.item()
                steps += 1
                progress_bar.set_postfix({'loss': loss.item()})
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    torch.cuda.empty_cache()
                    continue
                raise e
        
        avg_loss = total_loss / steps
        print(f'Epoch {epoch + 1} - Average loss: {avg_loss:.4f}')
        
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save({
                'lora_state_dict': model.lora_layers.state_dict(),
                'epoch': epoch,
                'loss': best_loss
            }, os.path.join(CONFIG['data']['output_path'], 'best_model_lora.pt'))

def main():
    prepare_metadata(CONFIG['data']['input_path'], CONFIG['data']['output_path'])
    
    dataset = XRayReportDataset(
        data_dir=CONFIG['data']['input_path'],
        max_length=CONFIG['data']['max_length']
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=False,
        num_workers=2
    )
    
    model = XRayReportGenerator()
    train_model(model, train_loader, val_loader)

if __name__ == "__main__":
    main()

Epoch 1:   0%|          | 0/747 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (768) must match the size of tensor b (708) at non-singleton dimension 1

In [19]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoModelForCausalLM, AutoProcessor
from pathlib import Path
import pandas as pd
from PIL import Image
import os
import math
from tqdm.notebook import tqdm

CONFIG = {
    'data': {
        'input_path': '/kaggle/input/chest-xrays-indiana-university',
        'output_path': '/kaggle/working',
        'image_dir': 'images/images_normalized',
        'image_size': 224,
        'max_length': 512,
        'batch_size': 8
    },
    'model': {
        'name': 'microsoft/git-base',
        'lora_rank': 4,
        'lora_alpha': 32
    },
    'training': {
        'epochs': 10,
        'learning_rate': 2e-5,
        'weight_decay': 0.01,
        'warmup_steps': 100
    }
}

def prepare_metadata(input_path, output_path):
    projections_df = pd.read_csv(os.path.join(input_path, 'indiana_projections.csv'))
    reports_df = pd.read_csv(os.path.join(input_path, 'indiana_reports.csv'))
    
    metadata_entries = []
    for _, row in projections_df.iterrows():
        report_data = reports_df[reports_df['uid'] == row['uid']]
        if len(report_data) == 0:
            continue
            
        report_data = report_data.iloc[0]
        report = f"Findings: {report_data['findings']} Impression: {report_data['impression']}"
        
        metadata_entries.append({
            'image_filename': row['filename'],
            'report': report,
        })
    
    metadata_df = pd.DataFrame(metadata_entries)
    metadata_path = os.path.join(output_path, 'metadata.csv')
    metadata_df.to_csv(metadata_path, index=False)
    return metadata_path

class XRayReportDataset(Dataset):
    def __init__(self, data_dir, transform=None, max_length=512):
        self.data_dir = Path(data_dir)
        self.transform = transform
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.max_length = max_length
        self.metadata = pd.read_csv(Path(CONFIG['data']['output_path']) / 'metadata.csv')
    
    def __len__(self):
        return len(self.metadata)
        
    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        image_path = self.data_dir / CONFIG['data']['image_dir'] / row['image_filename']
        
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        
        # Process both image and text using the processor
        encoding = self.processor(
            images=image,
            text=row['report'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        
        # Remove batch dimension added by processor
        for k in encoding.keys():
            encoding[k] = encoding[k].squeeze(0)
            
        return encoding

class XRayReportGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(CONFIG['model']['name'])
        self.lora_layers = nn.ModuleDict()
        self._add_lora_layers()
        self._freeze_base_model()
    
    def _add_lora_layers(self):
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                lora_down = nn.Linear(module.in_features, CONFIG['model']['lora_rank'], bias=False)
                lora_up = nn.Linear(CONFIG['model']['lora_rank'], module.out_features, bias=False)
                
                nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
                nn.init.zeros_(lora_up.weight)
                
                self.lora_layers[layer_id] = nn.ModuleList([lora_down, lora_up])
    
    def _freeze_base_model(self):
        for param in self.model.parameters():
            param.requires_grad = False
        
        for layer in self.lora_layers.values():
            for param in layer.parameters():
                param.requires_grad = True
    
    def forward(self, pixel_values, input_ids, attention_mask=None, labels=None):
        outputs = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True
        )
        
        hidden_states = outputs.hidden_states[-1]
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in self.lora_layers:
                    lora_down, lora_up = self.lora_layers[layer_id]
                    lora_output = lora_up(lora_down(hidden_states))
                    module.weight.data += (CONFIG['model']['lora_alpha'] * lora_output)
        
        return outputs

def train_model(model, train_loader, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=CONFIG['training']['learning_rate'],
        weight_decay=CONFIG['training']['weight_decay']
    )
    
    best_val_loss = float('inf')
    
    for epoch in range(CONFIG['training']['epochs']):
        model.train()
        train_loss = 0
        train_steps = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch in progress_bar:
            try:
                batch = {k: v.to(device) for k, v in batch.items()}
                
                outputs = model(
                    pixel_values=batch['pixel_values'],
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['input_ids']
                )
                
                loss = outputs.loss
                if not torch.isfinite(loss):
                    continue
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                
                train_loss += loss.item()
                train_steps += 1
                progress_bar.set_postfix({'loss': loss.item()})
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    torch.cuda.empty_cache()
                    continue
                raise e
        
        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {train_loss/train_steps:.4f}')
        
        if train_loss/train_steps < best_val_loss:
            best_val_loss = train_loss/train_steps
            torch.save({
                'lora_state_dict': model.lora_layers.state_dict(),
                'epoch': epoch,
                'loss': best_val_loss
            }, os.path.join(CONFIG['data']['output_path'], 'best_model_lora.pt'))

def main():
    prepare_metadata(CONFIG['data']['input_path'], CONFIG['data']['output_path'])
    
    transform = transforms.Compose([
        transforms.Resize((CONFIG['data']['image_size'], CONFIG['data']['image_size'])),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    
    dataset = XRayReportDataset(
        data_dir=CONFIG['data']['input_path'],
        transform=transform,
        max_length=CONFIG['data']['max_length']
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=False,
        num_workers=2
    )
    
    model = XRayReportGenerator()
    train_model(model, train_loader, val_loader)

if __name__ == "__main__":
    main()

Epoch 1:   0%|          | 0/747 [00:00<?, ?it/s]

Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the last token (EOS token) of the input_ids and attention_mask tensors will be removed. To test the new behavior, set `legacy=False`as a processor call argument.
Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the last token (EOS token) of the input_ids and attention_mask tensors will be removed. To test the new behavior, set `legacy=False`as a processor call argument.


ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 50, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataset.py", line 420, in __getitems__
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataset.py", line 420, in <listcomp>
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "<ipython-input-19-a4ca362ddd07>", line 78, in __getitem__
    encoding = self.processor(
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/git/processing_git.py", line 124, in __call__
    image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
  File "/usr/local/lib/python3.10/dist-packages/transformers/image_processing_utils.py", line 41, in __call__
    return self.preprocess(images, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/clip/image_processing_clip.py", line 325, in preprocess
    image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/clip/image_processing_clip.py", line 191, in resize
    return resize(
  File "/usr/local/lib/python3.10/dist-packages/transformers/image_transforms.py", line 337, in resize
    do_rescale = _rescale_for_pil_conversion(image)
  File "/usr/local/lib/python3.10/dist-packages/transformers/image_transforms.py", line 158, in _rescale_for_pil_conversion
    raise ValueError(
ValueError: The image to be converted to a PIL image contains values outside the range [0, 1], got [-2.1007792949676514, 2.640000104904175] which cannot be converted to uint8.


In [24]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
from PIL import Image
import os
import math
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor

CONFIG = {
    'data': {
        'input_path': '/kaggle/input/chest-xrays-indiana-university',
        'output_path': '/kaggle/working',
        'image_dir': 'images/images_normalized',
        'max_length': 512,
        'batch_size': 8
    },
    'model': {
        'name': 'microsoft/git-base',
        'lora_rank': 4,
        'lora_alpha': 32
    },
    'training': {
        'epochs': 10,
        'learning_rate': 2e-5,
        'weight_decay': 0.01
    }
}

def prepare_metadata(input_path, output_path):
    projections_df = pd.read_csv(os.path.join(input_path, 'indiana_projections.csv'))
    reports_df = pd.read_csv(os.path.join(input_path, 'indiana_reports.csv'))
    
    metadata_entries = []
    for _, row in projections_df.iterrows():
        report_data = reports_df[reports_df['uid'] == row['uid']]
        if len(report_data) == 0:
            continue
        report_data = report_data.iloc[0]
        report = f"Findings: {report_data['findings']} Impression: {report_data['impression']}"
        metadata_entries.append({
            'image_filename': row['filename'],
            'report': report,
        })
    
    metadata_df = pd.DataFrame(metadata_entries)
    metadata_path = os.path.join(output_path, 'metadata.csv')
    metadata_df.to_csv(metadata_path, index=False)
    return metadata_path

class XRayReportDataset(Dataset):
    def __init__(self, data_dir, max_length=512):
        self.data_dir = Path(data_dir)
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.max_length = max_length
        self.metadata = pd.read_csv(Path(CONFIG['data']['output_path']) / 'metadata.csv')
    
    def __len__(self):
        return len(self.metadata)
        
    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        image_path = self.data_dir / CONFIG['data']['image_dir'] / row['image_filename']
        image = Image.open(image_path).convert('RGB')
        
        encoding = self.processor(
            images=image,
            text=row['report'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
            legacy=False
        )
        
        return {k: v.squeeze(0) for k, v in encoding.items()}

class XRayReportGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(CONFIG['model']['name'])
        self.lora_layers = nn.ModuleDict()
        self._add_lora_layers()
        self._freeze_base_model()
    
    def _add_lora_layers(self):
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                lora_down = nn.Linear(module.in_features, CONFIG['model']['lora_rank'], bias=False)
                lora_up = nn.Linear(CONFIG['model']['lora_rank'], module.out_features, bias=False)
                
                nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
                nn.init.zeros_(lora_up.weight)
                
                self.lora_layers[layer_id] = nn.ModuleList([lora_down, lora_up])
    
    def _freeze_base_model(self):
        for param in self.model.parameters():
            param.requires_grad = False
        
        for layer in self.lora_layers.values():
            for param in layer.parameters():
                param.requires_grad = True
    
    def forward(self, pixel_values, input_ids, attention_mask=None, labels=None):
        outputs = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True
        )
        
        hidden_states = outputs.hidden_states[-1]
        
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in self.lora_layers:
                    lora_down, lora_up = self.lora_layers[layer_id]
                    lora_output = lora_up(lora_down(hidden_states))
                    
                    # Align dimensions if there is a mismatch
                    if lora_output.size(1) != module.weight.size(1):
                        lora_output = lora_output[:, :module.weight.size(1)]
                    
                    module.weight.data += (CONFIG['model']['lora_alpha'] * lora_output)
        
        return outputs

def train_model(model, train_loader, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=CONFIG['training']['learning_rate'],
        weight_decay=CONFIG['training']['weight_decay']
    )
    
    best_loss = float('inf')
    
    for epoch in range(CONFIG['training']['epochs']):
        model.train()
        total_loss = 0
        steps = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch in progress_bar:
            try:
                batch = {k: v.to(device) for k, v in batch.items()}
                
                outputs = model(
                    pixel_values=batch['pixel_values'],
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['input_ids']
                )
                
                loss = outputs.loss
                if not torch.isfinite(loss):
                    continue
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                
                total_loss += loss.item()
                steps += 1
                progress_bar.set_postfix({'loss': loss.item()})
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    torch.cuda.empty_cache()
                    continue
                raise e
        
        avg_loss = total_loss / steps
        print(f'Epoch {epoch + 1} - Average loss: {avg_loss:.4f}')
        
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save({
                'lora_state_dict': model.lora_layers.state_dict(),
                'epoch': epoch,
                'loss': best_loss
            }, os.path.join(CONFIG['data']['output_path'], 'best_model_lora.pt'))

def main():
    prepare_metadata(CONFIG['data']['input_path'], CONFIG['data']['output_path'])
    
    dataset = XRayReportDataset(
        data_dir=CONFIG['data']['input_path'],
        max_length=CONFIG['data']['max_length']
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=False,
        num_workers=2
    )
    
    model = XRayReportGenerator()
    train_model(model, train_loader, val_loader)

    print(f"Hidden states shape: {hidden_states.shape}")
    print(f"LoRA output shape: {lora_output.shape}")
    print(f"Module weight shape: {module.weight.shape}")

if __name__ == "__main__":
    main()


Epoch 1:   0%|          | 0/747 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (768) must match the size of tensor b (708) at non-singleton dimension 1

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
from PIL import Image
import os
import math
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor

CONFIG = {
    'data': {
        'input_path': '/kaggle/input/chest-xrays-indiana-university',
        'output_path': '/kaggle/working',
        'image_dir': 'images/images_normalized',
        'max_length': 512,
        'batch_size': 8
    },
    'model': {
        'name': 'microsoft/git-base',
        'lora_rank': 4,
        'lora_alpha': 32
    },
    'training': {
        'epochs': 10,
        'learning_rate': 2e-5,
        'weight_decay': 0.01
    }
}

def prepare_metadata(input_path, output_path):
    projections_df = pd.read_csv(os.path.join(input_path, 'indiana_projections.csv'))
    reports_df = pd.read_csv(os.path.join(input_path, 'indiana_reports.csv'))
    
    metadata_entries = []
    for _, row in projections_df.iterrows():  # Fixed variable name and added underscore
        report_data = reports_df[reports_df['uid'] == row['uid']]
        if len(report_data) == 0:
            continue
        report_data = report_data.iloc[0]
        report = f"Findings: {report_data['findings']} Impression: {report_data['impression']}"
        metadata_entries.append({
            'image_filename': row['filename'],
            'report': report,
        })
    
    metadata_df = pd.DataFrame(metadata_entries)
    metadata_path = os.path.join(output_path, 'metadata.csv')
    metadata_df.to_csv(metadata_path, index=False)
    return metadata_path

class XRayReportDataset(Dataset):
    def __init__(self, data_dir, max_length=512):  # Fixed method name
        self.data_dir = Path(data_dir)
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.max_length = max_length
        self.metadata = pd.read_csv(Path(CONFIG['data']['output_path']) / 'metadata.csv')
    
    def __len__(self):  # Fixed method name
        return len(self.metadata)
        
    def __getitem__(self, idx):  # Fixed method name
        row = self.metadata.iloc[idx]
        image_path = self.data_dir / CONFIG['data']['image_dir'] / row['image_filename']
        image = Image.open(image_path).convert('RGB')
        
        encoding = self.processor(
            images=image,
            text=row['report'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
            legacy=False
        )
        
        return {k: v.squeeze(0) for k, v in encoding.items()}

class XRayReportGenerator(nn.Module):
    def __init__(self):  # Fixed method name
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(CONFIG['model']['name'])
        self.lora_layers = nn.ModuleDict()
        self._add_lora_layers()
        self._freeze_base_model()
    
    def _add_lora_layers(self):  # Fixed method name
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                lora_down = nn.Linear(module.in_features, CONFIG['model']['lora_rank'], bias=False)
                lora_up = nn.Linear(CONFIG['model']['lora_rank'], module.out_features, bias=False)
                
                nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
                nn.init.zeros_(lora_up.weight)
                
                self.lora_layers[layer_id] = nn.ModuleList([lora_down, lora_up])
    
    def _freeze_base_model(self):  # Fixed method name
        for param in self.model.parameters():
            param.requires_grad = False
        
        for layer in self.lora_layers.values():
            for param in layer.parameters():
                param.requires_grad = True
    
    def forward(self, pixel_values, input_ids, attention_mask=None, labels=None):
        outputs = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True
        )
        
        # Store original weights
        original_weights = {}
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in self.lora_layers:
                    original_weights[layer_id] = module.weight.data.clone()
                    
                    # Apply LoRA updates
                    lora_down, lora_up = self.lora_layers[layer_id]
                    hidden_states = outputs.hidden_states[-1]
                    lora_output = lora_up(lora_down(hidden_states))
                    
                    if lora_output.shape[-1] != module.weight.shape[-1]:
                        lora_output = lora_output[..., :module.weight.shape[-1]]
                    
                    module.weight.data = original_weights[layer_id] + (CONFIG['model']['lora_alpha'] * lora_output)
        
        # Get final output with modified weights
        final_output = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        # Restore original weights
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in original_weights:
                    module.weight.data = original_weights[layer_id]
        
        return final_output

def train_model(model, train_loader, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=CONFIG['training']['learning_rate'],
        weight_decay=CONFIG['training']['weight_decay']
    )
    
    best_loss = float('inf')
    
    for epoch in range(CONFIG['training']['epochs']):
        model.train()
        total_loss = 0
        steps = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch in progress_bar:
            try:
                batch = {k: v.to(device) for k, v in batch.items()}
                
                outputs = model(
                    pixel_values=batch['pixel_values'],
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['input_ids']
                )
                
                loss = outputs.loss
                if not torch.isfinite(loss):
                    continue
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                
                total_loss += loss.item()
                steps += 1
                progress_bar.set_postfix({'loss': loss.item()})
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    torch.cuda.empty_cache()
                    continue
                raise e
        
        avg_loss = total_loss / steps
        print(f'Epoch {epoch + 1} - Average loss: {avg_loss:.4f}')
        
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save({
                'lora_state_dict': model.lora_layers.state_dict(),
                'epoch': epoch,
                'loss': best_loss
            }, os.path.join(CONFIG['data']['output_path'], 'best_model_lora.pt'))

def main():
    prepare_metadata(CONFIG['data']['input_path'], CONFIG['data']['output_path'])
    
    dataset = XRayReportDataset(
        data_dir=CONFIG['data']['input_path'],
        max_length=CONFIG['data']['max_length']
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=False,
        num_workers=2
    )
    
    model = XRayReportGenerator()
    train_model(model, train_loader, val_loader)

if __name__ == "__main__":  # Fixed conditional
    main()

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
from PIL import Image
import os
import math
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor

CONFIG = {
    'data': {
        'input_path': '/kaggle/input/chest-xrays-indiana-university',
        'output_path': '/kaggle/working',
        'image_dir': 'images/images_normalized',
        'max_length': 512,
        'batch_size': 8
    },
    'model': {
        'name': 'microsoft/git-base',
        'lora_rank': 4,
        'lora_alpha': 32
    },
    'training': {
        'epochs': 10,
        'learning_rate': 2e-5,
        'weight_decay': 0.01
    }
}

def prepare_metadata(input_path, output_path):
    projections_df = pd.read_csv(os.path.join(input_path, 'indiana_projections.csv'))
    reports_df = pd.read_csv(os.path.join(input_path, 'indiana_reports.csv'))
    
    metadata_entries = []
    for _, row in projections_df.iterrows():  # Fixed variable name and added underscore
        report_data = reports_df[reports_df['uid'] == row['uid']]
        if len(report_data) == 0:
            continue
        report_data = report_data.iloc[0]
        report = f"Findings: {report_data['findings']} Impression: {report_data['impression']}"
        metadata_entries.append({
            'image_filename': row['filename'],
            'report': report,
        })
    
    metadata_df = pd.DataFrame(metadata_entries)
    metadata_path = os.path.join(output_path, 'metadata.csv')
    metadata_df.to_csv(metadata_path, index=False)
    return metadata_path

class XRayReportDataset(Dataset):
    def __init__(self, data_dir, max_length=512):  # Fixed method name
        self.data_dir = Path(data_dir)
        self.processor = AutoProcessor.from_pretrained(CONFIG['model']['name'])
        self.max_length = max_length
        self.metadata = pd.read_csv(Path(CONFIG['data']['output_path']) / 'metadata.csv')
    
    def __len__(self):  # Fixed method name
        return len(self.metadata)
        
    def __getitem__(self, idx):  # Fixed method name
        row = self.metadata.iloc[idx]
        image_path = self.data_dir / CONFIG['data']['image_dir'] / row['image_filename']
        image = Image.open(image_path).convert('RGB')
        
        encoding = self.processor(
            images=image,
            text=row['report'],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
            legacy=False
        )
        
        return {k: v.squeeze(0) for k, v in encoding.items()}

class XRayReportGenerator(nn.Module):
    def __init__(self):  # Fixed method name
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(CONFIG['model']['name'])
        self.lora_layers = nn.ModuleDict()
        self._add_lora_layers()
        self._freeze_base_model()
    
    def _add_lora_layers(self):  # Fixed method name
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                lora_down = nn.Linear(module.in_features, CONFIG['model']['lora_rank'], bias=False)
                lora_up = nn.Linear(CONFIG['model']['lora_rank'], module.out_features, bias=False)
                
                nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
                nn.init.zeros_(lora_up.weight)
                
                self.lora_layers[layer_id] = nn.ModuleList([lora_down, lora_up])
    
    def _freeze_base_model(self):  # Fixed method name
        for param in self.model.parameters():
            param.requires_grad = False
        
        for layer in self.lora_layers.values():
            for param in layer.parameters():
                param.requires_grad = True
    
    def forward(self, pixel_values, input_ids, attention_mask=None, labels=None):
        outputs = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True
        )
        
        # Store original weights
        original_weights = {}
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in self.lora_layers:
                    original_weights[layer_id] = module.weight.data.clone()
                    
                    # Apply LoRA updates
                    lora_down, lora_up = self.lora_layers[layer_id]
                    hidden_states = outputs.hidden_states[-1]
                    lora_output = lora_up(lora_down(hidden_states))
                    
                    if lora_output.shape[-1] != module.weight.shape[-1]:
                        lora_output = lora_output[..., :module.weight.shape[-1]]
                    
                    module.weight.data = original_weights[layer_id] + (CONFIG['model']['lora_alpha'] * lora_output)
        
        # Get final output with modified weights
        final_output = self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        # Restore original weights
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and any(x in name for x in ["query", "value"]):
                layer_id = name.replace(".", "_")
                if layer_id in original_weights:
                    module.weight.data = original_weights[layer_id]
        
        return final_output

def train_model(model, train_loader, val_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=CONFIG['training']['learning_rate'],
        weight_decay=CONFIG['training']['weight_decay']
    )
    
    best_loss = float('inf')
    
    for epoch in range(CONFIG['training']['epochs']):
        model.train()
        total_loss = 0
        steps = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch in progress_bar:
            try:
                batch = {k: v.to(device) for k, v in batch.items()}
                
                outputs = model(
                    pixel_values=batch['pixel_values'],
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['input_ids']
                )
                
                loss = outputs.loss
                if not torch.isfinite(loss):
                    continue
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                
                total_loss += loss.item()
                steps += 1
                progress_bar.set_postfix({'loss': loss.item()})
                
            except RuntimeError as e:
                if "out of memory" in str(e):
                    torch.cuda.empty_cache()
                    continue
                raise e
        
        avg_loss = total_loss / steps
        print(f'Epoch {epoch + 1} - Average loss: {avg_loss:.4f}')
        
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save({
                'lora_state_dict': model.lora_layers.state_dict(),
                'epoch': epoch,
                'loss': best_loss
            }, os.path.join(CONFIG['data']['output_path'], 'best_model_lora.pt'))

def main():
    prepare_metadata(CONFIG['data']['input_path'], CONFIG['data']['output_path'])
    
    dataset = XRayReportDataset(
        data_dir=CONFIG['data']['input_path'],
        max_length=CONFIG['data']['max_length']
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=True,
        num_workers=2
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['data']['batch_size'],
        shuffle=False,
        num_workers=2
    )
    
    model = XRayReportGenerator()
    train_model(model, train_loader, val_loader)

if __name__ == "__main__":  # Fixed conditional
    main()

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Epoch 1:   0%|          | 0/747 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (768) must match the size of tensor b (708) at non-singleton dimension 1