In [None]:
# Install the notebook's dependencies with the %pip magic so they land in the
# environment of the running kernel (bare `pip` only works when automagic is on).
%pip install opencv-python Pillow torch torchvision transformers diffusers


Collecting diffusers
  Downloading diffusers-0.30.3-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.30.3-py3-none-any.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffusers
Successfully installed diffusers-0.30.3


In [None]:
# diffusers is already part of the main dependency install; this cell is kept as
# an explicit fallback and uses %pip so it targets the running kernel.
%pip install diffusers




In [None]:
# Use %pip rather than !pip so the package is installed for the kernel's own
# interpreter instead of whichever `pip` happens to be first on the shell PATH.
%pip install segment-anything


Collecting segment-anything
  Downloading segment_anything-1.0-py3-none-any.whl.metadata (487 bytes)
Downloading segment_anything-1.0-py3-none-any.whl (36 kB)
Installing collected packages: segment-anything
Successfully installed segment-anything-1.0


In [None]:
# Download the SAM ViT-H checkpoint (about 2.4 GB). -nc (no-clobber) skips the
# download when the file already exists, so re-running the notebook neither
# fetches it again nor leaves a stray "sam_vit_h_4b8939.pth.1" copy behind.
!wget -nc https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth


--2024-10-03 12:17:36--  https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.78.121, 18.164.78.81, 18.164.78.128, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.78.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2564550879 (2.4G) [binary/octet-stream]
Saving to: ‘sam_vit_h_4b8939.pth’


2024-10-03 12:18:02 (92.4 MB/s) - ‘sam_vit_h_4b8939.pth’ saved [2564550879/2564550879]



# **Task 1: Masking of the object**

In [None]:
import cv2
import numpy as np
import torch
from PIL import Image
from segment_anything import sam_model_registry, SamPredictor
from transformers import CLIPProcessor, CLIPModel

# Image loading helper (PIL)
def load_image(image_path):
    """Open the file at ``image_path`` with PIL and return the image object."""
    pil_image = Image.open(image_path)
    return pil_image

# CLIP loader used for text and image embeddings
def get_clip_model():
    """Load the CLIP model together with its matching processor.

    Returns:
        tuple: ``(model, processor)``, both created with ``from_pretrained``
        from the same checkpoint name.
    """
    checkpoint_name = "openai/clip-vit-base-patch32"
    clip_model = CLIPModel.from_pretrained(checkpoint_name)
    clip_processor = CLIPProcessor.from_pretrained(checkpoint_name)
    return clip_model, clip_processor

# Function to find the bounding box around the object
def find_bounding_box(image, object_class, box_fractions=(0.35, 0.25, 0.65, 0.55)):
    """Return a placeholder bounding box for the object, in pixel coordinates.

    The box is NOT derived from the image content: it is a fixed region given
    as fractions of the image size. In practice it should come from an object
    detection model or a CLIP-based localization step.

    Args:
        image: Object exposing a PIL-style ``size`` attribute, i.e. a
            ``(width, height)`` pair.
        object_class: Text description of the object. Currently unused; kept so
            that callers do not have to change once real localization is added.
        box_fractions: ``(left, top, right, bottom)`` expressed as fractions of
            the image width (left/right) and height (top/bottom). The default
            is the example laptop region used by this notebook.

    Returns:
        list[int]: ``[x_min, y_min, x_max, y_max]`` in pixels.
    """
    # PIL reports size as (width, height). Unpacking it the other way round
    # scales the x coordinates by the height and the y coordinates by the width.
    width, height = image.size

    # A single global CLIP image/text similarity is one scalar and cannot
    # localize anything, so no model is loaded here: the previous version ran
    # CLIP and then discarded the result.
    left, top, right, bottom = box_fractions
    box = [int(left * width), int(top * height), int(right * width), int(bottom * height)]
    return box

# SAM segmentation driven by a bounding-box prompt
def segment_object(image, object_class):
    """Segment the object inside the box returned by ``find_bounding_box``.

    Args:
        image: PIL image to segment.
        object_class: Text description forwarded to ``find_bounding_box``.

    Returns:
        A ``uint8`` array holding 255 where the first predicted mask is
        positive and 0 elsewhere.
    """
    # Build the predictor from the ViT-H checkpoint expected in the working directory.
    predictor = SamPredictor(sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth"))

    # SAM is fed the image as an RGB numpy array.
    rgb_pixels = np.array(image.convert("RGB"))

    # The box is wrapped in an outer list before being handed to the predictor.
    box_prompt = np.array([find_bounding_box(image, object_class)])

    predictor.set_image(rgb_pixels)
    predicted_masks, _scores, _logits = predictor.predict(box=box_prompt, multimask_output=False)

    # Keep only the first mask and rescale it to the 0/255 range.
    return (predicted_masks[0] > 0).astype(np.uint8) * 255

# Function to apply a red border around the segmented object
def apply_border(image, mask, color=(255, 0, 0), thickness=5):
    """Draw the outline of ``mask`` on a copy of ``image``.

    Args:
        image: PIL image (any mode) or an array-like of RGB pixels.
        mask: Single-channel mask whose non-zero pixels mark the object.
        color: Border colour as an RGB triple. Defaults to red.
        thickness: Border thickness in pixels. Defaults to 5.

    Returns:
        numpy.ndarray: Copy of the image pixels with the outer contours of the
        mask drawn on top. The input image is left untouched.
    """
    # Normalize PIL inputs to RGB first. With RGBA the 3-value colour would
    # leave alpha at 0 (an invisible border once saved as PNG), and with
    # grayscale or palette images the border could not be red at all.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    image_np = np.array(image)  # np.array copies, so drawing never touches the input

    # Only the outer contour of each connected mask region is outlined.
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # contourIdx=-1 draws every contour that was found.
    cv2.drawContours(image_np, contours, -1, color, thickness)

    return image_np

# Entry point: segment the object, outline it, write the result
def main(image_path, object_class, output_path):
    """Run the Task 1 pipeline on a single image.

    Args:
        image_path: Path of the image to read.
        object_class: Text description of the object to segment.
        output_path: Path the annotated image is written to.
    """
    source_image = load_image(image_path)

    # Mask first, then the red outline drawn from that mask.
    object_mask = segment_object(source_image, object_class)
    bordered_pixels = apply_border(source_image, object_mask)

    annotated = Image.fromarray(bordered_pixels)
    annotated.save(output_path)
    print(f"Result saved at {output_path}")

# Task 1 run configuration
image_path = "/content/bagpack.jpg"  # input image
object_class = "laptop"  # text description of the object to segment
output_path = "./laptop_with_border.png"  # where the annotated image is written

main(image_path=image_path, object_class=object_class, output_path=output_path)


  state_dict = torch.load(f)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



Result saved at ./laptop_with_border.png


# Summary of Task 1:
This code combines the CLIP and SAM models to segment and highlight an object in an image based on a text description, such as "laptop." CLIP (Contrastive Language-Image Pretraining) is intended to localize the object by comparing the image and text embeddings with cosine similarity; note, however, that in the current code the bounding box passed on to the next stage is a fixed placeholder region, which should eventually be replaced by an object detector or a proper CLIP-based localization step. Given that box, SAM (Segment Anything Model) generates a precise segmentation mask for the object. The mask is then used to draw a red contour around the segmented object. In the given example, if the laptop lies inside the placeholder box, the pipeline segments it and highlights it with a bold red border, making it stand out in the final saved image. Overall, the process shows how a natural-language prompt can drive a visual segmentation task by combining language and vision models.

# **Task 2: Shifting and Inpainting of Object**


In [None]:
import cv2
import numpy as np
import torch
from PIL import Image
from segment_anything import sam_model_registry, SamPredictor
from transformers import CLIPProcessor, CLIPModel
from diffusers import StableDiffusionInpaintPipeline

# Image loading helper (PIL)
def load_image(image_path):
    """Return the PIL image opened from ``image_path``."""
    opened = Image.open(image_path)
    return opened

# CLIP loader used for text and image embeddings
def get_clip_model():
    """Instantiate CLIP and its processor from the same pretrained checkpoint.

    Returns:
        tuple: ``(model, processor)``.
    """
    pretrained_id = "openai/clip-vit-base-patch32"
    return (
        CLIPModel.from_pretrained(pretrained_id),
        CLIPProcessor.from_pretrained(pretrained_id),
    )

# Function to find the best point in the image based on text embedding similarity
def find_best_point(image, object_class, grid_size=6):
    """Locate the image region that best matches ``object_class`` using CLIP.

    The image is cut into a ``grid_size`` x ``grid_size`` grid of crops. Every
    crop is embedded with CLIP and compared with the text embedding, and the
    centre of the highest-scoring crop is returned.

    The previous implementation compared one embedding of the whole image with
    the text, which yields a single similarity value; ``argmax`` over one value
    is always 0, so the returned point was always pixel (0, 0).

    Args:
        image: PIL image to search.
        object_class: Text description of the object to look for.
        grid_size: Number of grid cells per axis. Clamped to at least 1 and to
            at most the number of pixels along that axis.

    Returns:
        tuple[int, int]: ``(x, y)`` pixel coordinates of the best cell's centre.
    """
    model, processor = get_clip_model()

    rgb_image = image.convert("RGB")
    width, height = rgb_image.size  # PIL reports (width, height), not (height, width)

    # Clamp so that every grid cell is at least one pixel wide and tall.
    cols = max(1, min(int(grid_size), width))
    rows = max(1, min(int(grid_size), height))
    x_edges = [i * width // cols for i in range(cols + 1)]
    y_edges = [j * height // rows for j in range(rows + 1)]

    crops = []
    centers = []
    for row in range(rows):
        top, bottom = y_edges[row], y_edges[row + 1]
        for col in range(cols):
            left, right = x_edges[col], x_edges[col + 1]
            crops.append(rgb_image.crop((left, top, right, bottom)))
            centers.append(((left + right) // 2, (top + bottom) // 2))

    # Inference only: no gradients are needed.
    with torch.no_grad():
        image_inputs = processor(images=crops, return_tensors="pt")
        image_features = model.get_image_features(**image_inputs)

        text_inputs = processor(text=object_class, return_tensors="pt")
        text_features = model.get_text_features(**text_inputs)

    # One similarity score per crop; the single text embedding broadcasts over them.
    similarity = torch.nn.functional.cosine_similarity(image_features, text_features)

    best_index = int(torch.argmax(similarity))
    return centers[best_index]

# SAM segmentation driven by a single foreground point prompt
def segment_object(image, object_class):
    """Segment the object at the point returned by ``find_best_point``.

    Args:
        image: PIL image to segment.
        object_class: Text description forwarded to ``find_best_point``.

    Returns:
        A ``uint8`` array holding 255 where the first predicted mask is
        positive and 0 elsewhere.
    """
    # Build the predictor from the ViT-H checkpoint expected in the working directory.
    predictor = SamPredictor(sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth"))

    # SAM is fed the image as an RGB numpy array.
    rgb_pixels = np.array(image.convert("RGB"))

    # One (x, y) prompt, labelled 1 to mark it as foreground.
    point_x, point_y = find_best_point(image, object_class)[:2]
    prompt_coords = np.array([[point_x, point_y]])
    prompt_labels = np.array([1])

    predictor.set_image(rgb_pixels)
    predicted_masks, _scores, _logits = predictor.predict(
        point_coords=prompt_coords,
        point_labels=prompt_labels,
        multimask_output=False,
    )

    # Keep only the first mask and rescale it to the 0/255 range.
    return (predicted_masks[0] > 0).astype(np.uint8) * 255

# Helper: copy the masked object pixels to their shifted position
def _translate_object(image_np, mask, x_shift, y_shift):
    """Move the masked pixels of ``image_np`` onto an otherwise black canvas.

    Args:
        image_np: Image pixels as an array whose first two axes are rows and columns.
        mask: Array with the same rows/columns; non-zero entries mark the object.
        x_shift: Horizontal shift in pixels (positive moves right).
        y_shift: Vertical shift in pixels (positive moves up, i.e. towards row 0).

    Returns:
        tuple: ``(shifted_image, shifted_mask)`` where ``shifted_image`` holds the
        object at its new position and ``shifted_mask`` is a boolean array marking
        the pixels that were written. Pixels shifted outside the image are dropped.
    """
    shifted_image = np.zeros_like(image_np)
    shifted_mask = np.zeros(image_np.shape[:2], dtype=bool)

    rows, cols = np.where(mask > 0)
    new_rows = rows - y_shift
    new_cols = cols + x_shift

    # Discard object pixels that would land outside the image.
    in_bounds = (
        (new_rows >= 0) & (new_rows < image_np.shape[0])
        & (new_cols >= 0) & (new_cols < image_np.shape[1])
    )

    shifted_image[new_rows[in_bounds], new_cols[in_bounds]] = image_np[rows[in_bounds], cols[in_bounds]]
    shifted_mask[new_rows[in_bounds], new_cols[in_bounds]] = True
    return shifted_image, shifted_mask


# Function to shift the object and inpaint the background
def shift_and_inpaint(image_path, object_class, x_shift, y_shift, output_path):
    """Shift the segmented object and inpaint the area it used to occupy.

    Args:
        image_path: Path of the image to read.
        object_class: Text description of the object to move.
        x_shift: Horizontal shift in pixels (positive moves right).
        y_shift: Vertical shift in pixels (positive moves up).
        output_path: Path the final image is written to.
    """
    # Step 1: Load image and perform segmentation.
    # Work in RGB throughout so the source pixels and the inpainting result
    # always have the same number of channels.
    image = load_image(image_path).convert("RGB")
    mask = segment_object(image, object_class)

    # Step 2: Shift the object
    image_np = np.array(image)
    shifted_image, shifted_mask = _translate_object(image_np, mask, x_shift, y_shift)

    # Step 3: Inpaint the original area using Stable Diffusion Inpainting
    inpainting_model = StableDiffusionInpaintPipeline.from_pretrained('stabilityai/stable-diffusion-2-inpainting')
    inpainting_model = inpainting_model.to("cuda" if torch.cuda.is_available() else "cpu")

    # Resize images for inpainting (512x512 for compatibility)
    inpainting_image = image.resize((512, 512))
    mask_resized = Image.fromarray(cv2.resize(mask, (512, 512)))

    # Perform inpainting
    inpainted_image = inpainting_model(prompt="inpaint the background", image=inpainting_image, mask_image=mask_resized).images[0]

    # Step 4: Composite the shifted object onto the inpainted background.
    # Paste the object pixels instead of adding the two images: a saturating
    # sum with weight 1.0 on both would brighten the object with whatever
    # background lies underneath it.
    combined_image = np.array(inpainted_image.convert("RGB").resize(image.size))  # back to original size
    combined_image[shifted_mask] = shifted_image[shifted_mask]

    # Save the final output
    Image.fromarray(combined_image).save(output_path)
    print(f"Output saved at {output_path}")

# Task 2 run configuration (shifting and inpainting)
image_path = "/content/bagpack.jpg"  # input image
object_class = "laptop"  # text description of the object to move
output_path = "./shifted_laptop.png"  # where the result is written

x_shift = 50  # horizontal shift in pixels
y_shift = 20  # vertical shift in pixels

# Run the shift-and-inpaint pipeline with the settings above
shift_and_inpaint(
    image_path=image_path,
    object_class=object_class,
    x_shift=x_shift,
    y_shift=y_shift,
    output_path=output_path,
)


  state_dict = torch.load(f)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Output saved at ./shifted_laptop.png


# Summary of Task 2:
This code elegantly combines CLIP, SAM, and Stable Diffusion models to identify, segment, and shift an object in an image while seamlessly filling in the background using inpainting. First, it uses CLIP to find the best point that matches the given object class (e.g., "laptop"), and SAM segments the object based on this. Once segmented, the object is shifted by a specified number of pixels, and Stable Diffusion inpainting restores the original background where the object was located. In the example, if the image contains a laptop, the code identifies it, shifts it slightly to the right and upward (e.g., 50 pixels horizontally, 20 pixels vertically), and then smoothly inpaints the area where the laptop was, producing a natural and visually cohesive final image. The transformed image is saved with the laptop in its new position, providing a fluid blend of cutting-edge AI models for visual manipulation.






