In [1]:
!pip install -q -U keras-nlp
!pip install -q -U keras>=3

In [2]:
import io
import os
import re

# Keras selects its backend once, at import time. The environment variable
# therefore has to be set BEFORE the first `import keras` (keras_nlp is kept
# below it as well); setting it afterwards is silently ignored.
os.environ["KERAS_BACKEND"] = "jax"

import keras
import keras_nlp
import matplotlib
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import PIL
import requests
from PIL import Image

# Make bfloat16 the default float type for Keras.
keras.config.set_floatx("bfloat16")

In [3]:
# Build the PaliGemma causal LM from its named preset and print an overview
# of the resulting model.
preset_name = "pali_gemma_3b_mix_224"
paligemma = keras_nlp.models.PaliGemmaCausalLM.from_preset(preset_name)
paligemma.summary()

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [4]:
import numpy as np
import PIL
import os

# Define crop_and_resize function to crop and resize an image
def crop_and_resize(image, target_size):
    """Center-crop `image` to a square, then resize it to `target_size`.

    The square's side equals the shorter image dimension, and the crop box is
    placed around the image center using integer division.

    Args:
        image: object exposing `.size` as (width, height) plus `.crop(box)`,
            whose result supports `.resize(size)` (used here with PIL images).
        target_size: size forwarded unchanged to `.resize()`.

    Returns:
        The result of resizing the square crop to `target_size`.
    """
    img_w, img_h = image.size
    side = min(img_w, img_h)
    half_side = side // 2
    x0 = img_w // 2 - half_side
    y0 = img_h // 2 - half_side
    crop_box = (x0, y0, x0 + side, y0 + side)
    square = image.crop(crop_box)
    return square.resize(target_size)

# Define read_image function to load, crop, and resize images from the local path
import PIL
import numpy as np

def read_image(image_path, target_size):
    """Load an image file and return it as a center-cropped RGB NumPy array.

    Args:
        image_path: path of the image file to open with PIL.
        target_size: size forwarded to `crop_and_resize`, e.g. (224, 224).

    Returns:
        NumPy array built from the cropped and resized image. The image is
        always converted to RGB first, so the last axis has 3 channels.
    """
    # The context manager closes the underlying file handle, which matters
    # when this function is called for thousands of images in a loop.
    with PIL.Image.open(image_path) as source:
        # Normalize every non-RGB mode up front (palette 'P', bilevel '1',
        # grayscale 'L'/'LA', 'CMYK', 'RGBA', ...). Converting only 'L' left
        # palette/bilevel images as 2-D arrays, which crashed the old channel
        # check, and CMYK had its K channel dropped as if it were alpha.
        rgb = source if source.mode == "RGB" else source.convert("RGB")
        # Crop, resize and materialize the pixels while the file is open.
        return np.array(crop_and_resize(rgb, target_size))


def parse_bbox_and_labels(detokenized_output: str):
    """Extract bounding boxes and labels from detokenized detection output.

    A detection is four consecutive 4-digit location tokens in the order
    y0, x0, y1, x1 (for example ``<loc0256>``), then a space and the label.
    A label ends at `` ;`` or at the end of the text.

    Args:
        detokenized_output: text to scan for detections.

    Returns:
        A ``(boxes, labels)`` tuple. ``boxes`` is a float array of shape
        (N, 4) with rows ``[y0, x0, y1, x1]``, each value divided by 1024.
        ``labels`` is a string array of length N. When nothing matches, the
        arrays are empty with shapes (0, 4) and (0,).
    """
    # Raw strings: in a normal string literal `\d` is an invalid escape
    # sequence, which newer Python versions warn about.
    pattern = (
        r'<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)>'
        r'<loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>'
        r' (?P<label>.+?)( ;|$)'
    )
    boxes, labels = [], []
    for match in re.finditer(pattern, detokenized_output):
        fields = match.groupdict()
        boxes.append(
            [float(fields[key]) / 1024.0 for key in ('y0', 'x0', 'y1', 'x1')]
        )
        labels.append(fields['label'])
    # reshape keeps the box array 2-D even when there are no detections.
    return (
        np.array(boxes, dtype=float).reshape(-1, 4),
        np.array(labels, dtype=str),
    )

def display_boxes(image, boxes, labels, target_image_size):
    """Draw labelled bounding boxes on top of an image and show the figure.

    Args:
        image: image data accepted by ``Axes.imshow``.
        boxes: iterable of ``[y0, x0, y1, x1]`` rows in fractional
            coordinates (presumably the output of ``parse_bbox_and_labels``;
            confirm against the caller).
        labels: one label per box, in the same order as ``boxes``.
        target_image_size: ``(height, width)`` of the displayed image in
            pixels, used to scale the fractional coordinates.
    """
    # Read the size from the argument. The previous code used the notebook
    # global `target_size`, which fails when that global is not defined yet
    # and ignores whatever size the caller passed in.
    img_h, img_w = target_image_size
    fig, ax = plt.subplots()
    ax.imshow(image)
    for box, label in zip(boxes, labels):
        y0, x0, y1, x1 = box
        # Scale vertical coordinates by the height and horizontal ones by the
        # width, so non-square images are drawn correctly too.
        left, top = x0 * img_w, y0 * img_h
        box_w = (x1 - x0) * img_w
        box_h = (y1 - y0) * img_h
        rect = patches.Rectangle(
            (left, top),
            box_w,
            box_h,
            linewidth=1,
            edgecolor='r',
            facecolor='none',
        )
        # Put the label at the box's top-left corner, on the same Axes.
        ax.text(left, top, label, color='red', fontsize=12)
        ax.add_patch(rect)

    plt.show()

def display_segment_output(image, segment_mask, target_image_size):
    """Overlay a segmentation mask on an image and show the figure.

    The mask is stretched to the image size by nearest-neighbour sampling:
    each output pixel takes the value of the mask cell it falls into.

    Args:
        image: image data accepted by ``Axes.imshow``.
        segment_mask: array whose first two axes are (rows, columns). Its
            size is read from ``segment_mask.shape``, so a 64x64 mask behaves
            as before and other sizes are sampled correctly as well.
        target_image_size: ``(height, width)`` of the displayed image in
            pixels.
    """
    img_h, img_w = target_image_size
    mask_h, mask_w = segment_mask.shape[:2]

    # Source row/column for every output pixel. Integer arithmetic avoids
    # float rounding and always stays inside the mask bounds.
    rows = ((np.arange(img_h) * mask_h) // img_h).astype(int)
    cols = ((np.arange(img_w) * mask_w) // img_w).astype(int)
    upscaled = segment_mask[rows[:, np.newaxis], cols]

    fig, ax = plt.subplots()
    ax.imshow(image)
    # Semi-transparent mask on top of the image.
    ax.imshow(upscaled, cmap='jet', alpha=0.5)
    # Shown explicitly, as display_boxes does.
    plt.show()

In [6]:
import pandas as pd
import numpy as np
import PIL
import os
import re  # Import regex for adding space between number and unit
from tqdm import tqdm  # Import tqdm for the progress bar

# Read the test CSV and keep only its first 1000 rows. The rows are expected
# to provide `image_link` and `entity_name` columns.
df = pd.read_csv(
    '/kaggle/input/resr-data/66e31d6ee96cd_student_resource_3/student_resource 3/dataset/test.csv'
).head(1000)

# Size every image is cropped and resized to before inference.
target_size = (224, 224)

# Directory that holds the test image files.
root_folder = "/kaggle/input/resr-data/test/test"

# Collects one extracted entity value per row, in row order.
extracted_texts = []

# Function to add a space between number and unit
def add_space_between_number_and_unit(text):
    """Insert a space wherever a digit is directly followed by an ASCII letter.

    For example '100cm' becomes '100 cm'. Text in which digits and letters
    are already separated is returned unchanged.
    """
    digit_then_letter = re.compile(r'(\d)([a-zA-Z])')
    spaced = digit_then_letter.sub(r'\1 \2', text)
    return spaced

# Ask PaliGemma one question per row and collect exactly one answer string
# per row. Rows that cannot be processed get an empty string, so the list
# stays aligned with the DataFrame.
for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting text"):
    image_link = row.get('image_link', '')  # '' when the column is absent
    entity_name = row['entity_name']

    # No usable link (empty or NaN): nothing to look up.
    if not image_link or pd.isna(image_link):
        extracted_texts.append('')
        continue

    # Images are stored locally under the file name taken from the link.
    image_path = os.path.join(root_folder, os.path.basename(image_link))
    if not os.path.exists(image_path):
        extracted_texts.append('')
        continue

    image_array = read_image(image_path, target_size)

    # PaliGemma prompts are "<task prefix> <question>" terminated by a
    # newline; the documented examples all end the prompt with "\n".
    prompt = f'answer en what is the {entity_name} in the image?\n'

    output = paligemma.generate(
        inputs={
            "images": image_array,
            "prompts": prompt,
        }
    )

    # The generated text is expected to echo the prompt; strip it so that
    # only the answer remains.
    entity_value = output.replace(prompt, '').strip()

    # Normalize "100cm" style answers to "100 cm" before storing them.
    extracted_texts.append(add_space_between_number_and_unit(entity_value))

# Store the extracted answers next to their rows as a new DataFrame column.
df['entity_value'] = extracted_texts

# Write the augmented table, without the pandas index, to a new CSV file.
output_csv = '/kaggle/working/dataset_with_entity_values.csv'
df.to_csv(output_csv, index=False)

# Print the first rows as a quick sanity check.
preview = df.head()
print(preview)


Extracting text: 100%|██████████| 1000/1000 [46:32<00:00,  2.79s/it]

   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
1      1  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
2      2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
3      3  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
4      4  https://m.media-amazon.com/images/I/11gHj8dhhr...    792578   

  entity_name  entity_value  
0      height        100 cm  
1       width            54  
2      height          5.54  
3       depth  6.54 inches.  
4       depth  4.13 inches.  



