In [1]:
import requests
from PIL import Image, ImageDraw

import matplotlib.pyplot as plt
import numpy as np
import torch

from transformers import SamModel, SamProcessor
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image, make_image_grid

import app

In [2]:
# SAM checkpoint used for segmentation
model_name = "facebook/sam-vit-base"

# Run on the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor prepares inputs for the model; the model is moved to the
# selected device after loading
processor = SamProcessor.from_pretrained(model_name)
model = SamModel.from_pretrained(model_name)
model = model.to(device)

In [2]:
def mask_to_rgb(mask):
    """
    Build an RGBA overlay from a binary mask so it can be displayed.

    Pixels where the mask equals 1 become semi-transparent green; every
    other pixel stays fully transparent. The result is a uint8 array with
    the same leading shape as the mask plus a trailing axis of length 4.
    """

    # Channel order is [Red, Green, Blue, Alpha]
    highlight = (0, 255, 0, 127)

    # Start from an all-zero (fully transparent) canvas
    overlay = np.zeros((*mask.shape, 4), dtype=np.uint8)

    # Paint only the pixels that are set in the mask
    selected = mask == 1
    overlay[selected] = highlight

    return overlay


def get_processed_inputs(image, input_points):
    """
    Segments the subject of an image with SAM and returns the inverted mask.

    Relies on the module-level `processor`, `model` and `device`.

    Args:
        image: the image to segment, in a form accepted by the SAM processor
            (the notebook passes a PIL image).
        input_points: prompt points lying on the subject, nested as
            [[[x, y], [x, y], ...]] (one image, one list of points).

    Returns:
        A numpy array in which the subject pixels are 0/False and the
        background pixels are 1/True.

    NOTE: only the first image and the first set of prompt points are read
    from the post-processed masks, so this is meant for a single image with
    a single prompt.
    """

    # Use the processor to generate the right inputs for SAM and move them
    # to the same device as the model
    inputs = processor(image, input_points=input_points, return_tensors="pt").to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Post process the outputs of SAM to obtain the masks
    # (the post-processing is done on CPU tensors)
    masks = processor.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu(),
    )

    # Here we select the mask with the highest score as the mask we will
    # use. You can experiment with other selection criteria, for example
    # the largest mask instead of the most confident mask.
    # .item() turns the index into a plain Python int, so the CPU mask
    # tensor is not indexed with a tensor living on another device.
    best_index = outputs.iou_scores.argmax().item()
    best_mask = masks[0][0][best_index]

    # NOTE: we invert the mask with the ~ operator so that the subject
    # pixels have a value of 0 and the background pixels a value of 1.
    # This makes it more convenient to infill the background.
    return ~best_mask.cpu().numpy()

In [None]:
# Load the picture as RGB and scale it to 512x512 pixels
raw_image = Image.open("car.png").convert("RGB").resize((512, 512))

# Prompt points (x, y) that indicate where the car is
input_points = [[[150, 170], [300, 250]]]

# Mark every prompt point on a copy so the original image stays untouched
image_with_stars = raw_image.copy()
draw = ImageDraw.Draw(image_with_stars)
radius = 5  # Marker size
for x, y in input_points[0]:
    bounding_box = (x - radius, y - radius, x + radius, y + radius)
    draw.ellipse(bounding_box, fill="red", outline="black")

# Ask SAM for the mask
mask = get_processed_inputs(raw_image, input_points)

# Turn the mask into a small RGBA overlay for display
overlay = mask_to_rgb(mask)
mask_image = Image.fromarray(overlay).resize((128, 128))

# Show the annotated image next to the mask, without axes
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
for axis, picture in ((ax1, image_with_stars), (ax2, mask_image)):
    axis.imshow(picture)
    axis.axis('off')
plt.show()

### Inpainting

In [None]:
# Load the SDXL inpainting checkpoint through AutoPipelineForInpainting
model_name = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"

# Half precision and model offloading both rely on a GPU. The notebook
# supports a CPU fallback elsewhere, so fall back to float32 here as well
# and skip the offloading when no GPU is present (it will be slow, but it
# runs).
use_gpu = torch.cuda.is_available()
pipeline = AutoPipelineForInpainting.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if use_gpu else torch.float32,
)

if use_gpu:
    # This will make it more efficient on our hardware: sub-models are
    # moved to the GPU only while they are needed
    pipeline.enable_model_cpu_offload()

In [3]:
def inpaint(raw_image, input_mask, prompt, negative_prompt=None, seed=74294536, cfgs=7):
    """
    Repaints the masked region of an image with the inpainting pipeline.

    Relies on the module-level `pipeline`.

    Args:
        raw_image: the image to edit.
        input_mask: the mask selecting the region to regenerate, either a
            numpy array (as returned by get_processed_inputs) or a PIL image.
        prompt: text describing what to generate.
        negative_prompt: optional text describing what to avoid.
        seed: seed for the random generator, for reproducible results.
        cfgs: guidance scale passed to the pipeline.

    Returns:
        The first image produced by the pipeline.
    """

    # Masks coming from SAM are numpy arrays; PIL images are used as they are
    if isinstance(input_mask, Image.Image):
        mask_image = input_mask
    else:
        mask_image = Image.fromarray(input_mask)

    # Use a dedicated generator instead of torch.manual_seed(), which would
    # reseed the global random state of the whole process on every call
    rand_gen = torch.Generator(device="cpu").manual_seed(seed)

    image = pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=raw_image,
        mask_image=mask_image,
        generator=rand_gen,
        guidance_scale=cfgs
    ).images[0]

    return image

In [None]:
# Text guidance: what to paint and what to avoid
prompt = "a car driving on Mars. Studio lights, 1970s"
negative_prompt = "artifacts, low quality, distortion"

image = inpaint(raw_image, mask, prompt=prompt, negative_prompt=negative_prompt)

# Side by side: original image, mask overlay, inpainted result at 512x512
panels = [
    raw_image,
    Image.fromarray(mask_to_rgb(mask)),
    image.resize((512, 512)),
]
fig = make_image_grid(panels, rows=1, cols=3)

plt.imshow(fig)
plt.axis("off")
plt.show()

### Interactive App

In [4]:
# Build the interactive app from the local `app` module, handing it the
# segmentation function and the inpainting function defined in this notebook.
# NOTE(review): the recorded output shows a server starting on a local URL
# plus a public gradio.live share link, so generate_app presumably launches
# the app as well; confirm in app.py.
my_app = app.generate_app(get_processed_inputs, inpaint)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://78713d80a3c3a632c3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://78713d80a3c3a632c3.gradio.live


In [5]:
# Shut the app down once done (the recorded output reports the server on
# port 7860 being closed)
my_app.close()

Closing server running on port: 7860


In [None]:
# image_file = "monalisa.png"
# mask_file = "monalisa_mask.png"

# prompt = "oil painting of a woman, sfumato, renaissance, low details, Da Vinci"
# negative_prompt = "bad anatomy, deformed, ugly, disfigured"
# guidance_scale = 1.5
# rand_gen = torch.manual_seed(74294536)

# init_image = load_image(image_file).resize((512, 512))
# mask_image = load_image(mask_file).resize((512, 512))

# image = pipeline(
#     prompt=prompt, 
#     negative_prompt=negative_prompt, 
#     image=init_image, 
#     mask_image=mask_image, 
#     generator=rand_gen, 
#     guidance_scale=guidance_scale
# ).images[0]


# fig = make_image_grid([init_image, mask_image, image], rows=1, cols=3)
# fig