In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Walk that directory tree and count the files it contains; the per-file
# listing from the Kaggle template is intentionally not printed.
import os

a = 0
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Each walk step reports the files of one directory; add them in one go.
    a += len(filenames)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Use the %pip magic so packages are installed into the environment of the running kernel.
%pip install -q transformers torch torchvision datasets pillow

Note: you may need to restart the kernel to use updated packages.


In [21]:
# Import locally so this cell also works on a fresh kernel, where no earlier
# cell has imported torch yet.
import torch

# Hand cached, currently unused GPU memory back to the driver.
torch.cuda.empty_cache()

In [22]:
import gc

# Import locally so this cell also works on a fresh kernel, where no earlier
# cell has imported torch yet.
import torch

# Collect unreachable Python objects first so any tensors they own are freed,
# then release the cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [23]:
import os

In [27]:
import pandas as pd
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

# Load the labelled examples from the attached Kaggle dataset (CSV).
# Downstream code reads the columns image_path, group_id, entity_name and entity_value.
df = pd.read_csv("/kaggle/input/dataasa/valid_data.csv")

# NOTE: no per-entity sampling is applied here. `sampled_df` is just another
# name for the full frame, so every row is used by the dataset below.
sampled_df = df

# Define a custom dataset for loading images and text
class ImageTextDataset(Dataset):
    """Pairs each row's image and text prompt with its tokenized target value.

    Every row of ``dataframe`` must provide the columns ``image_path``,
    ``group_id``, ``entity_name`` and ``entity_value``.

    Args:
        dataframe: Table of examples. Rows are addressed by position, so the
            frame may carry any index (for example after filtering or sampling).
        processor: Callable that turns an image plus text (or text alone) into
            model inputs; its result must expose ``input_ids`` and
            ``attention_mask`` by key.
    """

    def __init__(self, dataframe, processor):
        self.dataframe = dataframe
        self.processor = processor

    def __len__(self):
        """Return the number of examples (rows) in the backing frame."""
        return len(self.dataframe)

    def _cell(self, idx, column):
        """Return the value of ``column`` for the row at *position* ``idx``."""
        # Samplers hand out positions 0..len-1. Label-based ``.loc`` only
        # matches those positions when the frame has a fresh RangeIndex.
        return self.dataframe[column].iloc[idx]

    def __getitem__(self, idx):
        """Build the processed inputs and tokenized target for one example.

        Args:
            idx: Zero-based position of the example.

        Returns:
            A tuple ``(inputs, target_input_ids, target_attention_mask)`` where
            ``inputs`` is the processor output for the image and prompt, and
            the other two come from processing ``entity_value`` as text.
        """
        # Load the image and close the underlying file as soon as it is decoded
        with Image.open(self._cell(idx, "image_path")) as source_image:
            image = source_image.convert("RGB")

        # Create the text input by combining group_id and entity_name
        group_id = self._cell(idx, "group_id")
        entity_name = self._cell(idx, "entity_name")
        text_input = f"Group ID: {group_id}, Entity Name: {entity_name}, generate just the entity value with the unit and nothing else."

        # The target is the entity_value (the value we want to predict)
        entity_value = self._cell(idx, "entity_value")

        # Process the image and text input (fixed length of 50 tokens)
        inputs = self.processor(images=image, text=text_input, padding="max_length", return_tensors="pt", max_length=50, truncation=True)

        # Process the target (entity_value) as input IDs, with attention mask
        target_inputs = self.processor(text=entity_value, padding="max_length", return_tensors="pt", max_length=50, truncation=True)

        # Return processed inputs and target input IDs with attention mask
        return inputs, target_inputs['input_ids'], target_inputs['attention_mask']

# Load the BLIP processor that handles both the images and the text
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)

# Wrap the dataframe so each item yields processed inputs plus tokenized targets
dataset = ImageTextDataset(dataframe=sampled_df, processor=processor)

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Update collate function to handle padding for both image and text inputs, and targets
def collate_fn(batch):
    """Merge per-example items into batched input and target dictionaries.

    Each element of ``batch`` is ``(input_data, target_ids, target_mask)`` with
    a leading dimension of size one on every tensor; that dimension is dropped
    before batching.

    Returns:
        ``(inputs, targets)`` where ``inputs`` has ``input_ids``,
        ``pixel_values`` and ``attention_mask`` and ``targets`` has
        ``input_ids`` and ``attention_mask``. Token ids are right-padded with
        the tokenizer's pad id, masks with 0, and pixel tensors are stacked.
    """
    def _pad(sequences, fill_value):
        # Right-pad a list of 1-D tensors to the longest one in the batch.
        return pad_sequence(sequences, batch_first=True, padding_value=fill_value)

    prompt_ids = [encoded['input_ids'].squeeze(0) for encoded, _, _ in batch]
    images = [encoded['pixel_values'].squeeze(0) for encoded, _, _ in batch]
    prompt_masks = [encoded['attention_mask'].squeeze(0) for encoded, _, _ in batch]
    answer_ids = [ids.squeeze(0) for _, ids, _ in batch]
    answer_masks = [mask.squeeze(0) for _, _, mask in batch]

    pad_id = processor.tokenizer.pad_token_id

    inputs = {
        'input_ids': _pad(prompt_ids, pad_id),
        'pixel_values': torch.stack(images),
        'attention_mask': _pad(prompt_masks, 0),
    }
    targets = {
        'input_ids': _pad(answer_ids, pad_id),
        'attention_mask': _pad(answer_masks, 0),
    }
    return inputs, targets

# Create DataLoader with the custom collate function.
# collate_fn stacks the pixel tensors of every image in a batch, so the batch
# size directly bounds per-step memory. The previous value of 5000 put up to
# 5000 images into a single step. Keep it small and tune it to the
# available GPU memory.
BATCH_SIZE = 16
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)





In [None]:
# transformers.AdamW is deprecated in favour of the PyTorch optimizer.
from torch.optim import AdamW
import torch.nn as nn

checkpoint_dir = '/kaggle/working/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_file = os.path.join(checkpoint_dir, 'best_model.pth')

# Pick the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the BLIP model for conditional generation and put it in training mode
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.to(device)
model.train()

# Define optimizer. eps and weight_decay are set explicitly to the defaults of
# the old transformers.AdamW so the update rule stays the same.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# NOTE(review): the decoder input is the prompt while the labels are the answer
# tokens. BLIP captioning is normally trained with labels aligned to input_ids;
# confirm this pairing is intended.

# Training loop
num_epochs = 10
best_epoch_loss = float("inf")
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for inputs, targets in dataloader:
        # Move inputs and targets to GPU (if available)
        input_ids = inputs['input_ids'].to(device)
        pixel_values = inputs['pixel_values'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        target_ids = targets['input_ids'].to(device)
        target_attention_mask = targets['attention_mask'].to(device)

        # Padding positions must not contribute to the loss: -100 is the index
        # ignored by the cross-entropy loss.
        labels = target_ids.masked_fill(target_attention_mask == 0, -100)

        # Forward pass
        outputs = model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )

        # Compute loss
        loss = outputs.loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise RuntimeError("dataloader produced no batches; nothing to train on")

    # Report the mean loss of the epoch rather than the last batch only
    epoch_loss = running_loss / num_batches

    # Only overwrite the checkpoint when this epoch improved on the best so
    # far, so that 'best_model.pth' really holds the best weights.
    if epoch_loss < best_epoch_loss:
        best_epoch_loss = epoch_loss
        torch.save(model.state_dict(), checkpoint_file)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}")

# Restore the best-performing weights written during this run
if best_epoch_loss != float("inf"):
    model.load_state_dict(torch.load(checkpoint_file, map_location=device))



In [15]:
def predict_entity_value(image_path, group_id, entity_name, model, processor, device):
    """Generate the predicted entity value for a single image.

    Generation is conditioned on the image only: ``model.generate`` receives
    just ``pixel_values``. ``group_id`` and ``entity_name`` stay in the
    signature for caller compatibility but do not influence the output, so no
    prompt is tokenized here.

    Args:
        image_path: Path of the image file to open.
        group_id: Accepted for compatibility; not used.
        entity_name: Accepted for compatibility; not used.
        model: Object exposing ``generate(pixel_values=..., ...)``.
        processor: Callable producing ``pixel_values`` for an image and
            exposing ``tokenizer.decode``.
        device: Device the pixel tensor is moved to before generation.

    Returns:
        The first generated sequence decoded to text with special tokens
        skipped.
    """
    # Load the image and close the underlying file once it is decoded
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Only the image tensor is consumed by generate(), so process the image alone
    pixel_values = processor(images=image, return_tensors="pt")["pixel_values"].to(device)

    # Generate predictions without tracking gradients
    with torch.no_grad():
        generated_ids = model.generate(
            pixel_values=pixel_values,
            max_length=50,  # Maximum length for generated entity_value
            num_beams=5,    # Number of beams for beam search
            early_stopping=True
        )

    # Decode the generated tokens into text (predicted entity_value)
    return processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)


# Put the model in evaluation mode before generating
model.eval()

# Single-image example for the prediction helper
image_path = "/kaggle/input/random-dataset/maximum_weight_recommendation-20240913T170711Z-001/maximum_weight_recommendation/6135h5ZfarL.jpg"
group_id = "801829"
entity_name = "maximum_weight_recommendation"

predicted_value = predict_entity_value(
    image_path=image_path,
    group_id=group_id,
    entity_name=entity_name,
    model=model,
    processor=processor,
    device=device,
)
print(f"Predicted entity value: {predicted_value}")

Predicted entity value: 100 gram gram


In [None]:
# Persist the fine-tuned weights and the matching processor under the Kaggle
# working directory, which is the location preserved as notebook output.
# The previous "/path/to/save/..." was a placeholder that was never filled in.
export_dir = "/kaggle/working/fine-tuned-blip"
model.save_pretrained(export_dir)
processor.save_pretrained(export_dir)

In [57]:
# Set the model to evaluation mode before running the generation example below
model.eval()

# Prompt-conditioned generation helper
def generate_entity_value(image_path, prompt_text, max_length=50):
    """Generate text for an image, passing the tokenized prompt to ``generate``.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image_path: Path of the image file to open.
        prompt_text: Text handed to the processor together with the image.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Read the picture and normalise it to three colour channels
    rgb_image = Image.open(image_path).convert("RGB")

    # Encode image and prompt together, then move the tensors to the target device
    encoded = processor(images=rgb_image, text=prompt_text, return_tensors="pt")
    encoded = encoded.to(device)

    # Collect the tensors and the beam-search settings for generation
    generation_kwargs = {
        name: encoded[name]
        for name in ('pixel_values', 'input_ids', 'attention_mask')
    }
    generation_kwargs.update(max_length=max_length, num_beams=5, early_stopping=True)

    sequences = model.generate(**generation_kwargs)

    # Turn the first generated sequence back into text
    return processor.decode(sequences[0], skip_special_tokens=True)

# Example usage: one image plus a hand-written prompt
image_path = "/kaggle/input/random-dataset/voltage-20240913T163556Z-001/voltage610u3xH0MfL.jpg"
prompt_text = "Group ID: 271537, Entity Name: voltage"

# Run generation and show the decoded text
generated_value = generate_entity_value(image_path=image_path, prompt_text=prompt_text)
print("Generated Entity Value:", generated_value)

Generated Entity Value: group id : 271537, entity name : voltage
