# CLIP Interrogator

The CLIP Interrogator is a prompt engineering tool that combines OpenAI's CLIP and Salesforce's BLIP to optimize text prompts to match a given image.



In [76]:
#@title Check GPU
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-26895703-08a8-0229-0429-ca3156b02c0b)


In [77]:
#@title Setup
import os, subprocess

def setup():
    install_cmds = [
        ['pip', 'install', 'gradio'],
        ['pip', 'install', 'open_clip_torch'],
        ['pip', 'install', 'clip-interrogator'],
    ]
    for cmd in install_cmds:
        print(subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode('utf-8'))

setup()


caption_model_name = 'blip-large'
clip_model_name = 'ViT-L-14/openai'

import gradio as gr
from clip_interrogator import Config, Interrogator

config = Config()
config.clip_model_name = clip_model_name
config.caption_model_name = caption_model_name
ci = Interrogator(config)

def image_to_prompt(image, mode):
    ci.config.chunk_size = 2048 if ci.config.clip_model_name == "ViT-L-14/openai" else 1024
    ci.config.flavor_intermediate_count = 2048 if ci.config.clip_model_name == "ViT-L-14/openai" else 1024
    image = image.convert('RGB')
    if mode == 'best':
        return ci.interrogate(image)
    elif mode == 'classic':
        return ci.interrogate_classic(image)
    elif mode == 'fast':
        return ci.interrogate_fast(image)
    elif mode == 'negative':
        return ci.interrogate_negative(image)





Loading caption model blip-large...
Loading CLIP model ViT-L-14/openai...
Loaded CLIP model and data in 9.91 seconds.


In [82]:
import gradio as gr
import requests
import google.auth
import google.auth.transport.requests
from google.oauth2 import service_account
import google.auth
from google.oauth2 import service_account
import requests

def generate(text, endpoint_id,title):
    # Define the required SCOPES
    SCOPES = ['https://www.googleapis.com/auth/cloud-platform']

    # Load credentials from the service account file with the specified SCOPES
    cred = service_account.Credentials.from_service_account_file("/content/platformwenda-vgm-test-apikey.json", scopes=SCOPES)

    # Create an authentication request
    auth_req = google.auth.transport.requests.Request()

    # Refresh the credentials
    cred.refresh(auth_req)

    project_id="platformwenda-1571901528109"

    # Obtain the bearer token
    bearer_token = cred.token

    # Define the base URL for your specific region (us-central1 in this example)
    base_url = "https://us-central1-aiplatform.googleapis.com/v1beta1/projects/{project_id}/locations/us-central1/endpoints/{endpoint_id}:predict"

    # Define the headers with the bearer token and content type
    headers = {
        "Authorization": "Bearer {bearer_token}".format(bearer_token=bearer_token),
        "Content-Type": "application/json"}

    prompt = f"""
    Role: A restaurant Menu manager.
    Task: Use the information in the "Title"  and "Image description" of the dish (provided after <User Input:>) and write the following information.
    1. Ingredients of the dish: Contains list of ingredients
    2. Description of the dish: A small description of the dish
    3. Additional Information (ONLY Choose values present in  <Additional Information list>)
    3. Allergens if any. (ONLY Choose values present in <Allergens list>)

    Lists:
    <Allergens list>: [Cereals,Crustaceans,Egg,Fish,Peanuts,SOYBEAN,Latte,Nuts,Celery,Mustard,Sesame seeds,Sulfur dioxide and sulphites,Shell,Clams]

    <Additional Information list>:
    [Spicy, Vegan, Gluten free, Vegetarian]

    <User Input:>
    Title:
    ```{title}```

    Image description:
    ```{text}```

    """

    # Define the request body for your specific prompt and parameters
    request_body = {
        "instances": [
            {
               "prompt": prompt,
               "max_tokens": 400,
               "temperature": 1.0,
               "top_p": 1.0,
               "top_k": 10,
            }
        ]
    }
    project_id="platformwenda-1571901528109"
    # Create the full URL using the project and endpoint IDs
    full_url = base_url.format(project_id=project_id, endpoint_id=endpoint_id)

    # Send a POST request to the model endpoint
    resp = requests.post(full_url, json=request_body, headers=headers)
    t = resp.json()
    #import pandas as pd
    #df1 = pd.DataFrame.from_dict(df)
    #prediction_text = df1['predictions'][0]
    return t


import re
'''
def extract_text(prediction_text):
    # Find index of "# Title" in the prediction_text
    title_index = prediction_text.find("# Description ")

    # Extract text until "# Description"
    title_match = re.search(r'(?<=# Description )(.+?)(?=# Ingredients)', prediction_text[title_index:], re.DOTALL)
    title = title_match.group(1) if title_match else "Ingredients pattern not found."

    # Find index of "# Ingredients" in the prediction_text
    description_index = prediction_text.find("# Ingredients")

    # Extract text until "# Allergens"
    description_match = re.search(r'(?<=# Ingredients)(.+?)(?=# Allergens)', prediction_text[description_index:], re.DOTALL)
    description = description_match.group(1) if description_match else "Ingredients pattern not found."

    # Find index of "# Ingredients" in the prediction_text
    ingredients_index = prediction_text.find("# Allergens")

    # Extract text until the end
    ingredients = prediction_text[ingredients_index + 13:]

    return title.strip(), description.strip(), ingredients.strip()
'''


def final(image, mode,title):
    text=image_to_prompt(image, mode)
    prediction_text=generate(text,'6854472035788128256',title)
    #title, description, ingredients = extract_text(prediction_text)
    return text,prediction_text





title = "Interactive demo: MenuCopyWriter"
description = ""

examples = [["example01.jpg", "best"],["example02.jpg", "best"]]

iface = gr.Interface(
    fn=final,
    inputs=[
        gr.Image(type='pil', label="Image"),
        gr.Radio(['best', 'fast', 'classic', 'negative'], label='Mode', value='best'),
        "text",
    ],
    outputs=[
        gr.Textbox(label="Prompt"),
        gr.Textbox(label="Description"),
        #gr.Textbox(label="Ingredients")
        #gr.Textbox(label="Allergens")
        #gr.Textbox(label="Additional Information")
    ],
    title=title,
    description=description,
)
iface.launch(inline=True, debug=True)



Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://3d74aea499898531c2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


100%|██████████| 55/55 [00:00<00:00, 131.68it/s]
Flavor chain:  28%|██▊       | 9/32 [00:26<01:07,  2.92s/it]
100%|██████████| 55/55 [00:00<00:00, 131.63it/s]
100%|██████████| 6/6 [00:00<00:00, 95.79it/s]
100%|██████████| 50/50 [00:00<00:00, 137.38it/s]
100%|██████████| 55/55 [00:00<00:00, 143.91it/s]
Flavor chain:  28%|██▊       | 9/32 [00:25<01:06,  2.88s/it]
100%|██████████| 55/55 [00:00<00:00, 150.18it/s]
100%|██████████| 6/6 [00:00<00:00, 107.11it/s]
100%|██████████| 50/50 [00:00<00:00, 152.80it/s]


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://3d74aea499898531c2.gradio.live




In [89]:
def extract_text(prediction_text):
    # Find index of "# Title" in the prediction_text
    title_index = prediction_text.find("# Title: ")

    # Extract text until "# Description"
    title_match = re.search(r'(?<=# Title )(.+?)(?=# Description:)', prediction_text[title_index:], re.DOTALL)
    title = title_match.group(1) if title_match else "Ingredients pattern not found."

    # Find index of "# Ingredients" in the prediction_text
    description_index = prediction_text.find("# Ingredients")

    # Extract text until "# Allergens"
    description_match = re.search(r'(?<=# Ingredients)(.+?)(?=# Allergens)', prediction_text[description_index:], re.DOTALL)
    description = description_match.group(1) if description_match else "Ingredients pattern not found."

    # Find index of "# Ingredients" in the prediction_text
    ingredients_index = prediction_text.find("# Allergens")

    # Extract text until the end
    ingredients = prediction_text[ingredients_index + 13:]

    return title.strip(), description.strip(), ingredients.strip()



In [96]:
f={'predictions': ['Prompt:\nRole: A restaurant Menu manager. \n    Task: Use the information in the "Title"  and "Image description" of the dish (provided after <User Input:>) and write the following information.\n    1. Ingredients of the dish: Contains list of ingredients\n    2. Description of the dish: A small description of the dish\n    3. Additional Information (ONLY Choose values present in  <Additional Information list>)\n    3. Allergens if any. (ONLY Choose values present in <Allergens list>)\n\n    Lists:\n    <Allergens list>: [Cereals,Crustaceans,Egg,Fish,Peanuts,SOYBEAN,Latte,Nuts,Celery,Mustard,Sesame seeds,Sulfur dioxide and sulphites,Shell,Clams]\n\n    <Additional Information list>: \n    [Spicy, Vegan, Gluten free, Vegetarian]\n\n    <User Input:>\n    Title:\n    ```lasgne```\n\n    Image description:\n    ```a close up of a plate of lasagna with cheese and spinach, still from avengers endgame, gazeta, epicurious, real life big mom, highly upvoted, scene from supplizia, website banner, parallel content, gourmet```\nOutput:\nOutput:\n    \n    Title: Lasagna\n    Ingredients: \n    Noodles\n\tParmesan\n\tRicotta\n\tMozzarella\n\tSpinach\n\tGround beef\n\tTomatoes\n    \n    Description:\n    A cheesy lasagna with a layer of spinach, grounded beef, and tangy tomatoes.\n\n    Additional Information:\n    Vegan\n\n    Allergens:\n    None'], 'deployedModelId': '7655747731599654912', 'model': 'projects/417928338756/locations/us-central1/models/mistralai_mixtral-8x7b-instruct-v0_1-1711223280330', 'modelDisplayName': 'mistralai_mixtral-8x7b-instruct-v0_1-1711223280330', 'modelVersionId': '1'}
import pandas as pd
df1 = pd.DataFrame.from_dict(f)
prediction_text = df1['predictions'][0]
prediction_text

'Prompt:\nRole: A restaurant Menu manager. \n    Task: Use the information in the "Title"  and "Image description" of the dish (provided after <User Input:>) and write the following information.\n    1. Ingredients of the dish: Contains list of ingredients\n    2. Description of the dish: A small description of the dish\n    3. Additional Information (ONLY Choose values present in  <Additional Information list>)\n    3. Allergens if any. (ONLY Choose values present in <Allergens list>)\n\n    Lists:\n    <Allergens list>: [Cereals,Crustaceans,Egg,Fish,Peanuts,SOYBEAN,Latte,Nuts,Celery,Mustard,Sesame seeds,Sulfur dioxide and sulphites,Shell,Clams]\n\n    <Additional Information list>: \n    [Spicy, Vegan, Gluten free, Vegetarian]\n\n    <User Input:>\n    Title:\n    ```lasgne```\n\n    Image description:\n    ```a close up of a plate of lasagna with cheese and spinach, still from avengers endgame, gazeta, epicurious, real life big mom, highly upvoted, scene from supplizia, website bann

In [97]:
title, description, ingredients = extract_text(prediction_text)
title

'Ingredients pattern not found.'

### Langchain Chain of thoughts

In [13]:
!pip install --upgrade langchain
!pip install langchain-groq
!pip install wikipedia
!pip install -U duckduckgo-search
!pip install gradio==3.50

#@title Setup
import os, subprocess

def setup():
    install_cmds = [
        ['pip', 'install', 'gradio'],
        ['pip', 'install', 'open_clip_torch'],
        ['pip', 'install', 'clip-interrogator'],
    ]
    for cmd in install_cmds:
        print(subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode('utf-8'))

setup()






In [14]:
caption_model_name = 'blip-large'
clip_model_name = 'ViT-L-14/openai'

import gradio as gr
from clip_interrogator import Config, Interrogator

config = Config()
config.clip_model_name = clip_model_name
config.caption_model_name = caption_model_name
ci = Interrogator(config)

def image_to_prompt(image, mode):
    ci.config.chunk_size = 2048 if ci.config.clip_model_name == "ViT-L-14/openai" else 1024
    ci.config.flavor_intermediate_count = 2048 if ci.config.clip_model_name == "ViT-L-14/openai" else 1024
    image = image.convert('RGB')
    if mode == 'best':
        return ci.interrogate(image)
    elif mode == 'classic':
        return ci.interrogate_classic(image)
    elif mode == 'fast':
        return ci.interrogate_fast(image)
    elif mode == 'negative':
        return ci.interrogate_negative(image)

Loading caption model blip-large...
Loading CLIP model ViT-L-14/openai...
Loaded CLIP model and data in 10.39 seconds.


In [15]:
import gradio as gr
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from google.colab import userdata
from langchain.utilities import WikipediaAPIWrapper
from langchain.tools import DuckDuckGoSearchRun
from langchain.agents import Tool, initialize_agent

In [16]:
llm = ChatGroq(temperature=0.5,
               groq_api_key=API_KEY,
               model_name="mixtral-8x7b-32768",
               max_tokens=20000)

wikipedia = WikipediaAPIWrapper()
search = DuckDuckGoSearchRun()

wikipedia_tool = Tool(
    name='wikipedia',
    func= wikipedia.run,
    description="Useful for when you need to look up a topic, country or person on wikipedia is the best website for fact check and any other details on any subject."
)

duckduckgo_tool = Tool(
    name='DuckDuckGo Search',
    func= search.run,
    description="Useful for when you need to do a search on the internet to find information that another tool can't find. Always be specific with your input."
)

tools = [
    Tool(
        name = "DuckDuckGo Search",
        func=duckduckgo_tool.run,
        description="useful for when you need answer questions from internet"
    )
]

tools.append(wikipedia_tool)

zero_shot_agent = initialize_agent(
    agent="zero-shot-react-description",
    tools=tools,
    llm=llm,
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=10,
)

# Define function to run agent on user input
def prompt(image, mode,title):
    image_description=image_to_prompt(image, mode)
    final_response = zero_shot_agent.run(f"""
    As a restaurant menu manager, your role is to gather essential details about a dish based on its title and description, and format the output in JSON. Please provide the following information:
    1. Ingredients: List all ingredients included in the dish.
    2. Description: Briefly describe the dish.
    3. Additional Information: Choose relevant options from this list - [Cereals, Crustaceans, Egg, Fish, Peanuts, SOYBEAN, Latte, Nuts, Celery, Mustard, Sesame seeds, Sulfur dioxide and sulphites, Shell, Clams].
    4. Allergens: Select from the following options - [Spicy, Vegan, Gluten free, Vegetarian].

    Title:
    <{title}>

    Description:
    <{image_description}>
    """)
    return final_response

title = "Interactive demo: MenuCopyWriterAgent"
description = ""

examples = [["example01.jpg", "best",""],["example02.jpg", "best"]]

# Create Gradio interface
iface = gr.Interface(prompt,
                          inputs=[
                              gr.Image(type='pil', label="Image"),
                              gr.Radio(['best', 'fast', 'classic', 'negative'], label='Mode', value='best'),
                              "text",
    ],
                      outputs="text",
                     title=title,
                     description=description)
                     #examples=examples)
iface.launch(inline=True, debug=True)



Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://4ab278184fc0e8e369.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


100%|██████████| 55/55 [00:00<00:00, 152.87it/s]
Flavor chain:  28%|██▊       | 9/32 [00:25<01:06,  2.89s/it]
100%|██████████| 55/55 [00:00<00:00, 159.68it/s]
100%|██████████| 6/6 [00:00<00:00, 117.37it/s]
100%|██████████| 50/50 [00:00<00:00, 166.03it/s]




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer this question, I need to find the ingredients and other details about the dish "Lasagna" from the internet.

Action: DuckDuckGo Search

Action Input: "Lasagna ingredients"
[0m
Observation: [36;1m[1;3mTransfer the beef mixture to a medium-sized (3- to 4-quart) pot. Add the crushed tomatoes, tomato sauce, and tomato paste to the pot. Add the parsley, oregano, and Italian seasonings, adjusting the amounts to taste. Sprinkle with garlic powder and/or garlic salt, to taste. Sprinkle with red or white wine vinegar. Stir in tomatoes, tomato paste, water, sugar, 3 tablespoons parsley, basil, fennel, 1/2 teaspoon salt and pepper; bring to a boil. Reduce heat; simmer, uncovered, 30 minutes, stirring occasionally. In a small bowl, mix egg, ricotta cheese and remaining 1/4 cup parsley and 1/4 teaspoon salt. Preheat oven to 375°. In a mixing bowl, combine ricotta cheese with egg, remaining 2 tablespoons parsley, and 1/2 teasp



In [17]:
!pip freeze requirements.txt

absl-py==1.4.0
accelerate==0.28.0
aiofiles==23.2.1
aiohttp==3.9.3
aiohttp-cors==0.7.0
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.6.0
anyio==3.7.1
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array-record==0.5.0
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.14.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==0.22.0
bleach==6.1.0
blessed==1.20.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.1
build==1.1.1
CacheControl==0.14.0
cachetools==5.3.3
catalogue==2.0.10
certifi==2024.2.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.85
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
clip-interrogator==0.6.0
cloudpathlib==0.16.0
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.1
colorama==0.4.6
colorcet==3.1.0
colorful==0.5.6
colorlover==0.3.0
colour==0.1.5
community==1.0.0b