In [None]:
from google.colab import output

!pip install torch torchvision
!pip install -U git+https://github.com/luca-medeiros/lang-segment-anything.git
!pip install -U llamaapi langchain-experimental

output.clear()

In [None]:
!pip install -q streamlit
!npm install localtunnel

!git clone https://github.com/omriav/blended-latent-diffusion.git
!conda env create -f environment.yaml
!pip install diffusers
!pip install transformers
!pip install accelerate

output.clear()

In [None]:
import os
import locale

# Force UTF-8 as the preferred encoding for code that asks `locale`.
# The replacement keeps the optional `do_setlocale` argument of the stdlib
# function so callers that pass it do not hit a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"
# Directory the app reads the generated image from. `exist_ok=True` makes the
# cell safe to re-run once the directory has been created.
os.makedirs("/content/outputs", exist_ok=True)

In [None]:
# NLP Frontend
%%writefile nlp_frontend.py
import locale
import json
from typing import List

from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator

from llamaapi import LlamaAPI
from langchain_experimental.llms import ChatLlamaAPI


def getpreferredencoding(do_setlocale = True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    ``do_setlocale`` is accepted only to keep the stdlib call signature; its
    value is ignored.
    """
    return "UTF-8"


# Install the stand-in so later lookups through ``locale`` get UTF-8.
setattr(locale, "getpreferredencoding", getpreferredencoding)

# The API token is read from the environment instead of being stored in the
# notebook. Set LLAMA_API_KEY in the process that launches the app.
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and revoked.
import os

_llama_api_key = os.environ.get("LLAMA_API_KEY")
if not _llama_api_key:
    raise RuntimeError(
        "LLAMA_API_KEY is not set; export it before starting the app."
    )

# Client for the Llama API and the chat-model wrapper built on top of it.
llama = LlamaAPI(_llama_api_key)
model = ChatLlamaAPI(client=llama)

class ImageEdit(BaseModel):
    # Structured form of an edit request: what to take out of the image and
    # what to put in. Documented with `#` comments rather than a docstring so
    # the model's generated schema stays exactly as before.

    # Object to remove from the image.
    mask: str = Field(
        description=(
            "The part of the input that signifies what object, entity or noun "
            "needs to removed from a particular image."
        )
    )

    # Object to add to the image.
    subject: str = Field(
        description=(
            "The part of the input that declares what object, entity or noun "
            "needs to be added to the image."
        )
    )

from langchain.chains import create_tagging_chain

# Schema handed to the tagging chain: the two string fields to extract from
# the user's request. The descriptions are part of what the model is given, so
# their wording matters.
schema = {
    "properties": {
        # What should be masked out of (removed from) the image.
        "mask": {
            "type": "string",
            "description": "The part of the input that signifies what object, entity or noun needs to be removed from a particular image.",
        },
        # What should be generated in its place.
        "subject": {
            "type": "string",
            "description": "The part of the input that declares what object, entity or noun needs to be added to the image.",
        }
    }
}

# Tagging chain built from `schema` and the chat model; `prompt_split` below
# calls its `.run()` and reads the "mask" and "subject" entries of the result.
nlp_frontend = create_tagging_chain(schema, model)

def prompt_split(prompt):
    """Split an edit request into what to mask and what to generate.

    Runs ``nlp_frontend`` on ``prompt`` and reads the ``mask`` and ``subject``
    entries of its result. Each entry may be a plain string or a mapping that
    carries the value under an ``enum`` key.

    Args:
        prompt: Free-text edit instruction.

    Returns:
        A ``(mask, subject)`` tuple. If the chain fails or the result does not
        have the expected entries, ``(None, None)`` is returned so callers can
        always unpack the result.
    """
    def _value(field):
        # Plain strings are used as-is; otherwise the value is expected under
        # the "enum" key (a missing key is handled by the caller's except).
        return field if isinstance(field, str) else field['enum']

    try:
        tags = nlp_frontend.run(prompt)
        return _value(tags['mask']), _value(tags['subject'])
    except Exception as e:
        # Best-effort: report what went wrong instead of hiding it, and keep
        # the two-element return shape.
        print(f"Error. Try again. ({e})")
        return None, None
Writing nlp_frontend.py


In [None]:
# LangSAM Model
%%writefile langsam.py
import warnings
import numpy as np
import matplotlib.pyplot as plt
import requests
from PIL import Image
from io import BytesIO
from lang_sam import LangSAM


def download_image(url, timeout=30):
    """Fetch an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The downloaded image converted to RGB.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")

def save_mask(mask_np, filename):
    """Write a mask array to ``filename`` as an 8-bit image.

    The array is multiplied by 255 and cast to ``uint8`` before it is handed
    to PIL for saving.
    """
    scaled = mask_np * 255
    pixels = scaled.astype(np.uint8)
    Image.fromarray(pixels).save(filename)

def display_image_with_masks(image, masks):
    """Show the original image and each mask side by side in one figure.

    Args:
        image: Image to show in the first panel.
        masks: Sequence of mask arrays; each gets its own grayscale panel.
            May be empty, in which case only the original image is shown.
    """
    num_masks = len(masks)

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # panel; otherwise one subplot comes back as a bare Axes object and the
    # indexing below fails when `masks` is empty.
    fig, axes = plt.subplots(1, num_masks + 1, figsize=(15, 5), squeeze=False)
    axes = axes[0]

    axes[0].imshow(image)
    axes[0].set_title("Original Image")
    axes[0].axis('off')

    for i, mask_np in enumerate(masks):
        axes[i+1].imshow(mask_np, cmap='gray')
        axes[i+1].set_title(f"Mask {i+1}")
        axes[i+1].axis('off')

    plt.tight_layout()
    plt.show()

def display_image_with_boxes(image, boxes, logits):
    """Show ``image`` with a red outline and confidence label per box.

    ``boxes`` and ``logits`` are walked in parallel; each box is unpacked as
    (x_min, y_min, x_max, y_max) and each logit is converted with ``.item()``
    and rounded to two decimals for the label.
    """
    fig, ax = plt.subplots()
    ax.imshow(image)
    ax.set_title("Image with Bounding Boxes")
    ax.axis('off')

    for (left, top, right, bottom), score in zip(boxes, logits):
        # .item() turns the logit into a plain scalar before rounding.
        confidence_score = round(score.item(), 2)

        # Outline of the detection.
        outline = plt.Rectangle(
            (left, top),
            right - left,
            bottom - top,
            fill=False,
            edgecolor='red',
            linewidth=2,
        )
        ax.add_patch(outline)

        # Label anchored at the box's top-left corner.
        ax.text(
            left,
            top,
            f"Confidence: {confidence_score}",
            fontsize=8,
            color='red',
            verticalalignment='top',
        )

    plt.show()

def print_bounding_boxes(boxes):
    """Print a header line followed by one numbered line per bounding box."""
    print("Bounding Boxes:")
    for number, box in enumerate(boxes, start=1):
        print(f"Box {number}: {box}")

def print_detected_phrases(phrases):
    """Print a blank line, a header, then one numbered line per phrase."""
    print("\nDetected Phrases:")
    for number, phrase in enumerate(phrases, start=1):
        print("Phrase {}: {}".format(number, phrase))

def print_logits(logits):
    """Print a blank line, a header, then one numbered line per logit."""
    print("\nConfidence:")
    position = 0
    for logit in logits:
        position += 1
        print(f"Logit {position}: {logit}")

def _get_langsam_model():
    """Return a shared LangSAM instance, creating it on first use."""
    cached = getattr(_get_langsam_model, "_instance", None)
    if cached is None:
        cached = LangSAM()
        _get_langsam_model._instance = cached
    return cached


def langsam(image, text_prompt, output_dir="/content"):
    """Segment what ``text_prompt`` describes and save the resulting masks.

    The input is converted to RGB and written to ``<output_dir>/image.jpeg``.
    Every predicted mask is saved as ``<output_dir>/image_mask_<k>.png`` with
    ``k`` starting at 1.

    Args:
        image: Path or file object accepted by ``PIL.Image.open``.
        text_prompt: Text passed to the model to select what to segment.
        output_dir: Directory for the saved image and masks. The default
            matches the absolute paths the app reads from, so the result no
            longer depends on the current working directory.

    Returns:
        The first mask as a numpy array, or ``None`` when nothing matching
        ``text_prompt`` was detected.
    """
    import os  # local import: this module has no top-level `import os`

    # Suppress warning messages
    warnings.filterwarnings("ignore")

    image_pil = Image.open(image).convert("RGB")
    image_pil.save(os.path.join(output_dir, "image.jpeg"))

    # Reuse one model instance across calls instead of rebuilding it each time.
    model = _get_langsam_model()
    masks, boxes, phrases, logits = model.predict(image_pil, text_prompt)

    if len(masks) == 0:
        print(f"No objects of the '{text_prompt}' prompt detected in the image.")
        return None

    # Convert masks to numpy arrays
    masks_np = [mask.squeeze().cpu().numpy() for mask in masks]

    # Save the masks. The display_* and print_* helpers in this module can be
    # called here with image_pil / masks_np / boxes / phrases / logits when
    # debugging.
    for index, mask_np in enumerate(masks_np, start=1):
        save_mask(mask_np, os.path.join(output_dir, f"image_mask_{index}.png"))

    return masks_np[0]

Writing langsam.py


In [None]:
%%writefile app.py
import streamlit as st
import PIL.Image
import os
import subprocess
from nlp_frontend import prompt_split
from langsam import langsam

# Files exchanged between the segmentation step, the editing script and the UI.
IMAGE_PATH = "/content/image.jpeg"
MASK_PATH = "/content/image_mask_1.png"
RESULT_PATH = "/content/outputs/res.jpg"
EDIT_SCRIPT = "/content/blended-latent-diffusion/scripts/text_editing_SD2.py"

st.markdown("<h1 style='text-align: center;'>Text2Scene</h1>", unsafe_allow_html=True)

# Row 1: Text Prompt
text_prompt = st.text_input("Enter a description of the image you want:")

# Row 2: Image Selector
image_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])
if image_file is not None:
    st.image(image_file, caption="Selected Image")

# Button - Bottom of screen
if st.button("Generate Image"):
    # Both inputs are required; stop early with a message instead of failing
    # further down the pipeline.
    if image_file is None or not text_prompt.strip():
        st.error("Please enter a description and choose an image first.")
        st.stop()

    # prompt_split may yield None (or a pair containing None) when the request
    # could not be parsed, so normalize before unpacking.
    split = prompt_split(text_prompt)
    maskprompt, genprompt = split if split is not None else (None, None)
    if not maskprompt or not genprompt:
        st.error("Could not work out what to replace. Please rephrase and try again.")
        st.stop()

    st.write('To mask  - ', maskprompt)
    st.write('To fill  - ', genprompt)

    # langsam returns None when nothing was detected; without this check a
    # mask file left over from an earlier run would be displayed and used.
    output_mask = langsam(image_file, maskprompt)
    if output_mask is None:
        st.warning(f"No '{maskprompt}' was found in the image.")
        st.stop()
    st.image(MASK_PATH, caption="Generated Mask")

    # Remove any earlier result so a failed run cannot show a stale image.
    if os.path.exists(RESULT_PATH):
        os.remove(RESULT_PATH)

    # Pass arguments as a list (no shell) so quotes or shell metacharacters in
    # the generated prompt cannot alter the command.
    completed = subprocess.run(
        [
            "python", EDIT_SCRIPT,
            "--prompt", str(genprompt),
            "--init_image", IMAGE_PATH,
            "--mask", MASK_PATH,
        ]
    )
    if completed.returncode != 0 or not os.path.exists(RESULT_PATH):
        st.error("Image generation failed. Check the logs for details.")
        st.stop()

    st.image(RESULT_PATH, caption="Generated Image")

Writing app.py


In [None]:
# Launch three commands from one shell line: the Streamlit app in the
# background (its output redirected to /content/logs.txt), a localtunnel for
# port 8501 in the background, and a curl that prints the address returned by
# ipv4.icanhazip.com.
# NOTE(review): presumably the printed IP is what the loca.lt landing page
# asks for before forwarding to the app - confirm.
!streamlit run app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

34.87.164.207
[K[?25hnpx: installed 22 in 4.661s
your url is: https://shaky-kids-rest.loca.lt
