<a href="https://colab.research.google.com/github/jeet1912/gitbase_ablationStudy/blob/main/code/mimic_cxr_fft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset Class

In [1]:
from google.colab import drive
from google.colab import auth
from google.cloud import storage

auth.authenticate_user()


In [None]:
class CXR(Dataset):
  def __init__(self, dataframe, processor, max_length):
    super().__init__()
    self.dataframe = dataframe.reset_index(drop=True)
    self.processor = processor
    self.max_length = max_length
    self.prompt = "List pathalogical findings for this chest X-ray:"
    self.storage_client = storage.Client(project='silken-physics-467815-g5')
  def __len__(self):
    return len(self.dataframe)

  def _loadImage(self,subject_id, study_id, dicom_id):
    try:
      bucket_name = "mimic-cxr-jpg-2.1.0.physionet.org"
      image_path = f"files/p{subject_id[:2]}/p{subject_id}/s{study_id}/{dicom_id}.jpg"
      bucket = self.storage_client.bucket(bucket_name, user_project='silken-physics-467815-g5')
      blob = bucket.blob(image_path)
      image_bytes = blob.download_as_bytes()
      image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
      return image
    except Exception as e:
      print(f"Error loading image {image_path}: {str(e)}")
      return None # Return None if image loading fails

  def __getitem__(self, index):
    row = self.dataframe.iloc[index]
    miniReport = str(row['mini_report'])
    subject = str(row['subject_id'])
    study = str(row['study_id'])
    dicom = str(row['dicom_id'])
    image = self._loadImage(subject_id=subject,study_id=study,dicom_id=dicom)
    inputs = self.processor(images=image, text=self.prompt, return_tensors="pt", padding="max_length",
                            truncation=True, max_length=self.max_length)
    labels = self.processor.tokenizer(miniReport, return_tensors="pt", padding="max_length",
                                      truncation=True, max_length=self.max_length)["input_ids"]

    return {
      "pixel_values": inputs["pixel_values"],  # Shape: [1, 3, H, W]
      "input_ids": inputs["input_ids"],        # Shape: [1, max_length]
      "attention_mask": inputs["attention_mask"],  # Shape: [1, max_length]
      "labels": labels                         # Shape: [1, max_length]
    }

In [None]:
processor = AutoProcessor.from_pretrained("microsoft/git-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
train_df =pd.read_csv('./train_split.csv')
test_df =pd.read_csv('./test_split.csv')
val_df=pd.read_csv('./val_split.csv')
max_length = max(len(processor.tokenizer.encode(report)) for report in train_df['mini_report'])
print(f"Max length of mini-reports: {max_length}")

Max length of mini-reports: 92


In [None]:
train_dataset = CXR(train_df, processor, max_length)
val_dataset = CXR(val_df, processor, max_length)
test_dataset = CXR(test_df, processor, max_length)

In [None]:
print(train_dataset[0]['pixel_values'].shape)
print(train_dataset[0]['input_ids'].shape)
print(train_dataset[0]['attention_mask'].shape)
print(train_dataset[0]['labels'].shape)


torch.Size([1, 3, 224, 224])
torch.Size([1, 92])
torch.Size([1, 92])
torch.Size([1, 92])


## Load GIT Model

In [None]:
device = torch.device("cuda")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base", trust_remote_code=True)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [None]:
memory_allocated = torch.cuda.memory_allocated() / (1024 * 1024)
print('Memory Allocated before loading GIT Large :',memory_allocated,'MB')

Memory Allocated before loading GIT Large : 0.0 MB


In [None]:
# to analyze
for name, module in model.named_modules():
  print(name)
vision_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"]
text_modules = ["query", "key", "value", "dense"]

In [None]:
model.to(device)

GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (image_encoder): GitVisionModel(
      (vision_model): GitVisionTransformer(
        (embeddings): GitVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (position_embedding): Embedding(197, 768)
        )
        (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (encoder): GitVisionEncoder(
          (layers): ModuleList(
            (0-11): 12 x GitVisionEncoderLayer(
              (self_attn): GitVisionAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
memory_allocated = torch.cuda.memory_allocated() / (1024 * 1024)
print('Memory Allocated after loading GIT Base :',memory_allocated,'MB')

Memory Allocated after loading GIT Base : 674.91845703125 MB


In [None]:
import gc
del model
torch.cuda.empty_cache()
gc.collect()

364

## Dataloader helper

In [None]:
def collate_fn(batch):
    pixel_values = torch.cat([item["pixel_values"] for item in batch], dim=0)  # [batch_size, 3, H, W]
    input_ids = torch.cat([item["input_ids"] for item in batch], dim=0)        # [batch_size, max_length]
    attention_mask = torch.cat([item["attention_mask"] for item in batch], dim=0)  # [batch_size, max_length]
    labels = torch.cat([item["labels"] for item in batch], dim=0)               # [batch_size, max_length]

    #print(f"Batch shapes: pixel_values={pixel_values.shape}, input_ids={input_ids.shape}, "
    #      f"attention_mask={attention_mask.shape}, labels={labels.shape}")

    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

## Training

In [None]:
import time

In [None]:
batch_size = 64
learning_rate = 1e-5
start_time = time.time()  # Record start time
torch.cuda.empty_cache()
gc.collect()
# Initialize model
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
model.to(device)
print('------------------------------------------------------------------------------------------------------------------------------------------------')
print(f'Hyperparameters are Batch Size = {batch_size}, Learning Rate = {learning_rate} with {model.num_parameters()} trainable parameters')
print('------------------------------------------------------------------------------------------------------------------------------------------------')
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

def make_train_step(model):
    def train_step(batch):
        model.train()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        return loss.item()
    return train_step

def make_val_step(model):
    def val_step(batch):
        model.eval()
        with torch.no_grad():
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            return loss.item()
    return val_step

def for_epochs(epochs, train_loader, val_loader, train_step, val_step, scheduler, early_stop_window=2):
    train_losses = []
    val_losses = []
    memory_usage = []
    best_val_loss = float('inf')
    best_weights = None
    epochs_with_no_improvement = 0

    for epoch in range(epochs):
        # Training
        train_loss = 0.0
        train_samples = 0
        for batch in train_loader:
            loss = train_step(batch)
            batch_size_actual = batch["pixel_values"].size(0)
            train_loss += loss * batch_size_actual
            train_samples += batch_size_actual
            break
        avg_train_loss = train_loss / train_samples
        train_losses.append(avg_train_loss)


        # Validation
        val_loss = 0.0
        val_samples = 0
        with torch.no_grad():
            for batch in val_loader:
                loss = val_step(batch)
                batch_size_actual = batch["pixel_values"].size(0)
                val_loss += loss * batch_size_actual
                val_samples += batch_size_actual
        avg_val_loss = val_loss / val_samples
        val_losses.append(avg_val_loss)

        # Memory tracking
        torch.cuda.synchronize()
        memory_mb = torch.cuda.memory_allocated() / (1024 * 1024)  # Convert to MB
        memory_usage.append(memory_mb)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}, Memory: {memory_mb:.2f} MB")

        # Scheduler and early stopping
        scheduler.step(avg_val_loss)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_weights.pt")
            epochs_with_no_improvement = 0
        else:
            epochs_with_no_improvement += 1
            if epochs_with_no_improvement >= early_stop_window:
                print("Early stopping triggered")
    best_weights = torch.load("best_weights.pt")

    training_time = time.time() - start_time
    print(f"Total Training Time: {training_time:.4f} seconds")
    return train_losses, val_losses, memory_usage, best_weights, training_time

train_step = make_train_step(model)
val_step = make_val_step(model)
train_losses, val_losses, memory_usage, best_weights, training_time = for_epochs(20, train_loader, val_loader, train_step, val_step, scheduler)

del model
torch.cuda.empty_cache()
gc.collect()

------------------------------------------------------------------------------------------------------------------------------------------------
Hyperparameters are Batch Size = 64, Learning Rate = 1e-05 with 176619066 trainable parameters
------------------------------------------------------------------------------------------------------------------------------------------------
Epoch 1/20, Train Loss: 11.004540, Val Loss: 10.256802, Memory: 5581.45 MB


## Testing

In [None]:
# Fix 1: Set tokenizer padding to left side for decoder-only models
processor.tokenizer.padding_side = 'left'

# Load and prepare model
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
model.load_state_dict(best_weights)
model.to(device)
model.eval()

test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loss = 0.0
test_samples = 0
predictions = []
references = []
bleu = load("bleu")

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        batch_size_actual = batch["pixel_values"].size(0)
        test_loss += loss.item() * batch_size_actual
        test_samples += batch_size_actual

        # Generate predictions
        pixel_values = batch["pixel_values"]
        input_ids = batch["input_ids"]

        # Fix 2: Use max_new_tokens instead of max_length to avoid length conflicts
        generated_ids = model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            max_new_tokens=50,  # Generate up to 50 new tokens
            do_sample=False,    # Use greedy decoding for reproducible results
            pad_token_id=processor.tokenizer.pad_token_id
        )

        generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
        predictions.extend([text.replace("List pathalogical findings for this chest X-ray:", "").strip() for text in generated_texts])

        # Decode labels for references
        label_ids = batch["labels"]
        reference_texts = processor.batch_decode(label_ids, skip_special_tokens=True)
        references.extend(reference_texts)
        #break

avg_test_loss = test_loss / test_samples
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
print(f"Test Results with Best Model:")
print(f"Test Loss: {avg_test_loss:.6f}, BLEU Score: {bleu_score['bleu']:.4f}")

del model
torch.cuda.empty_cache()
gc.collect()

KeyboardInterrupt: 