# Sheep Age Estimation Notebook

### Introduction:
This notebook covers the creation of a simple application that helps estimate the age of a sheep using just an image of the sheep.\
We use the following plug-in components:
- LLM using ai-mixtral-8x7b-instruct
- Fuyu-8b model from NVIDIA AI Catalog for image analysis and visual reasoning
- Gradio for a simple User Interface to upload images

![UI Example](assets/ui-example.png "UI Example")

### Step 1: Install required packages

In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# Every package is pinned to an exact version except scikit-image, which is
# left unpinned.
%pip install fastapi==0.104.1 uvicorn[standard]==0.24.0 python-multipart==0.0.6 langchain==0.1.9 unstructured[all-docs]==0.11.2 sentence-transformers==2.2.2 llama-index==0.9.22 dataclass-wizard==0.22.2 opencv-python==4.8.0.74 llama-hub==0.0.43 pymilvus==2.3.1 jupyterlab==4.0.8 langchain-core==0.1.29 langchain-nvidia-ai-endpoints==0.1.1 atlassian-python-api==3.41.4 gradio==3.48.0 markdownify==0.12.1 scikit-image

### Step 2: Export the NVIDIA_API_KEY

In [None]:
import getpass
import os

# Reuse a key that is already exported; otherwise prompt for one without
# echoing it into the notebook output.
if os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
    print("Valid NVIDIA_API_KEY already in environment. Delete to reset")
else:
    entered_key = getpass.getpass("NVAPI Key (starts with nvapi-): ")
    # Raise instead of assert so the check is not stripped under `python -O`.
    if not entered_key.startswith("nvapi-"):
        raise ValueError(f"{entered_key[:5]}... is not a valid key")
    os.environ["NVIDIA_API_KEY"] = entered_key

# Bind the module-level name on BOTH paths so code that reads `nvapi_key`
# works whether the key was typed in or was already in the environment.
# (No `global` statement is needed, or legal after an assignment, at module
# level.)
nvapi_key = os.environ["NVIDIA_API_KEY"]

### Step 3: Set the 'ai-mixtral-8x7b-instruct' model as the LLM

In [None]:
#Set up Prerequisites for Image Captioning App User Interface
import os
import io
import IPython.display
from PIL import Image
import base64
import requests
import gradio as gr
import openai, httpx, sys
import json

In [None]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA
# Read the key from the environment, which Step 2 populates on every path,
# instead of relying on a notebook-level variable being bound.
llm = ChatNVIDIA(
    model="ai-mixtral-8x7b-instruct",
    nvidia_api_key=os.environ["NVIDIA_API_KEY"],
    max_tokens=1024,
)

### Step 4: Wrap 'fuyu-8b' model in a tool for later use

In [None]:
from langchain.tools import BaseTool
from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection
import torch
from tempfile import NamedTemporaryFile
from langchain.agents import initialize_agent
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

def fetch_outputs(output):
    """Concatenate the text deltas carried by a streamed chat-completion reply.

    For each element of *output*, everything from the first ``{`` onwards is
    parsed as JSON and ``choices[0]["delta"]["content"]`` is collected.
    Elements that carry no usable text (blank lines, ``data: [DONE]``,
    malformed JSON, chunks without a string ``content``) are skipped.

    Args:
        output: Iterable of decoded response lines, e.g.
            ``'data: {"choices": [{"delta": {"content": "..."}}]}'``.

    Returns:
        The collected text joined together, with every backslash and
        apostrophe removed.
    """
    pieces = []
    for line in output:
        try:
            payload = json.loads(line[line.index('{'):])
            piece = payload['choices'][0]['delta']['content']
        except (ValueError, KeyError, IndexError, TypeError, AttributeError):
            # ValueError: no '{' in the line or invalid JSON.
            # KeyError/IndexError/TypeError: JSON of an unexpected shape.
            # AttributeError: element is not a string at all.
            continue
        # The final chunk of a stream may carry a null content; joining a
        # None would raise, so keep only real text.
        if isinstance(piece, str):
            pieces.append(piece)
    return ''.join(pieces).replace('\\', '').replace('\'', '')

def img2base64_string(img_path):
    """Load an image, shrink it if it is large, and return it as base64 JPEG.

    Args:
        img_path: Path of the image file to open with ``PIL.Image.open``.

    Returns:
        A ``str`` holding the base64 encoding of the image re-encoded as an
        RGB JPEG at quality 85. Images wider or taller than 800 px are first
        reduced with ``thumbnail((800, 800))``.
    """
    # The context manager releases the underlying file handle even when
    # decoding or re-encoding fails.
    with Image.open(img_path) as image:
        if image.width > 800 or image.height > 800:
            image.thumbnail((800, 800))
        buffered = io.BytesIO()
        # JPEG has no alpha channel, hence the conversion to RGB first.
        image.convert("RGB").save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode()


class ImageCaptionTool(BaseTool):
    """LangChain tool that sends an image to the hosted Fuyu-8b endpoint.

    Despite the generic name and description shown to the agent, the prompt
    sent to the model asks specifically for a sheep-age estimate with
    reasoning.
    """

    name: str = "Image captioner from Fuyu"
    description: str = "Use this tool when given the path to an image that you would like to be described. " \
                       "It will return a simple caption describing the image."

    def _run(self, img_path: str) -> str:
        """Query Fuyu-8b about the image at *img_path* and return its reply.

        Args:
            img_path: Path of the image file to analyse.

        Returns:
            The model's reply text as assembled by ``fetch_outputs`` (streamed
            mode) or taken from the JSON body (non-streamed mode).

        Raises:
            ValueError: If the base64-encoded image is 200,000 characters or
                longer.
            KeyError: If ``NVIDIA_API_KEY`` is not set in the environment.
            requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        """
        print("run function in ImageCaptionTool called")
        invoke_url = "https://ai.api.nvidia.com/v1/vlm/adept/fuyu-8b"
        stream = True

        image_b64 = img2base64_string(img_path)

        # Explicit raise (not assert) so the limit still applies under -O.
        if len(image_b64) >= 200_000:
            raise ValueError("To upload larger images, use the assets API (see docs)")

        # Read the key from the environment rather than from a notebook
        # global, so the tool works however the key was provided.
        api_key = os.environ["NVIDIA_API_KEY"]
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Accept": "text/event-stream" if stream else "application/json",
        }

        # NOTE(review): img2base64_string produces JPEG bytes while the data
        # URI below is labelled image/png. Left unchanged here; confirm
        # against the API docs whether the label should be image/jpeg.
        payload = {
            "messages": [
                {
                    "role": "user",
                    "content": f'Estimate the age of the sheep in this image in years, return a numerical estimate and include some reasoning for this estimate in your response. The response should be in the following format: "Age: <numerical estimate> years. Reasoning: <reasoning>". <img src="data:image/png;base64,{image_b64}" />'
                }
            ],
            "max_tokens": 1024,
            "temperature": 0.20,
            "top_p": 0.70,
            "seed": 0,
            "stream": stream,
        }

        # timeout=(connect, read) keeps a stalled connection from hanging the
        # UI forever; `with` returns the connection to the pool when done.
        with requests.post(
            invoke_url,
            headers=headers,
            json=payload,
            stream=stream,
            timeout=(10, 120),
        ) as response:
            # Surface HTTP errors (bad key, quota, oversized payload) instead
            # of silently turning them into an empty caption.
            response.raise_for_status()
            if stream:
                lines = [
                    raw.decode("utf-8")
                    for raw in response.iter_lines()
                    if raw
                ]
                out = fetch_outputs(lines)
            else:
                # NOTE(review): assumes the non-streamed body mirrors the
                # streamed chunks with `message` in place of `delta`;
                # confirm against the API docs.
                out = response.json()["choices"][0]["message"]["content"]

        print(out)
        return out

    def _arun(self, query: str):
        """Async execution is not implemented for this tool."""
        raise NotImplementedError("This tool does not support async")

### Step 5: Initialize our LangChain agent with the ImageCaptionTool

In [None]:
# Assemble the agent: the Fuyu image tool, a windowed chat memory, and the
# LLM configured earlier in the notebook.
tools = [ImageCaptionTool()]

# Windowed buffer memory (k=5) exposed to the prompt under `chat_history`,
# returned as message objects rather than a single string.
conversational_memory = ConversationBufferWindowMemory(
    k=5,
    memory_key='chat_history',
    return_messages=True,
)

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent="chat-conversational-react-description",
    memory=conversational_memory,
    verbose=True,
    # Cap the tool-use loop at five iterations; 'generate' selects the
    # early-stopping strategy used when that cap is reached.
    max_iterations=5,
    early_stopping_method='generate',
    handle_parsing_errors=True,
)

In [None]:
def my_agent(img_path):
    """Run the agent on an uploaded image and return its final answer.

    Args:
        img_path: Filesystem path of the uploaded image. May be ``None`` or
            empty when the form is submitted without an image.

    Returns:
        The ``output`` field of the agent's response, or a short message
        asking for an image when none was supplied.
    """
    # Without this guard the literal text "None" would be handed to the agent
    # as if it were a real file path.
    if not img_path:
        return "Please upload an image of a sheep first."
    response = agent.invoke({"input": f'this is the image path: {img_path}'})
    return response['output']

### Step 6: Wrap the agent in a Gradio UI for easy interaction

In [None]:
import gradio as gr
# One image in, one text box out, backed by `my_agent`. The image component
# uses type="filepath", matching the path-based prompt built in `my_agent`.
# NOTE(review): `debug` is normally an option of `launch()`; confirm that
# passing it to `gr.Interface` has the intended effect.
ImageCaptionApp = gr.Interface(
    fn=my_agent,
    inputs=[gr.Image(label="Upload image", type="filepath")],
    outputs=[gr.Textbox(label="Caption")],
    title="Sheep age estimation using Nvidia API",
    description="Upload an image of a sheep to get an estimate for its age",
    allow_flagging="never",
    debug=True,
)

# Start the app with sharing enabled.
ImageCaptionApp.launch(share=True)