In [1]:
import os

from dotenv import load_dotenv
from langchain_groq import ChatGroq


# Import Environment Variables
def set_tokens():
    """Load API credentials from the environment into module-level globals.

    ``load_dotenv()`` is called first so values from a local ``.env`` file are
    visible through ``os.environ``. Each provider is then handled
    independently:

    * ``HUGGINGFACEHUB_API_TOKEN`` -> global of the same name (prints
      "Token is added" when found).
    * ``GROQ_API_KEY`` -> global of the same name.
    * ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_API_KEY`` -> globals of the
      same names, but only when *both* variables are set, because one without
      the other is unusable.

    Raises:
        Exception: if none of the providers above is configured.
    """
    global HUGGINGFACEHUB_API_TOKEN
    global GROQ_API_KEY
    global AZURE_OPENAI_ENDPOINT
    global AZURE_OPENAI_API_KEY
    load_dotenv()

    any_provider_configured = False

    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if hf_token is not None:
        HUGGINGFACEHUB_API_TOKEN = hf_token
        print("Token is added")
        any_provider_configured = True

    groq_key = os.getenv("GROQ_API_KEY")
    if groq_key is not None:
        GROQ_API_KEY = groq_key
        any_provider_configured = True

    # Require both Azure values: checking with `or` and then indexing
    # os.environ for both would raise KeyError when only one is present.
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    azure_key = os.getenv("AZURE_OPENAI_API_KEY")
    if azure_endpoint is not None and azure_key is not None:
        AZURE_OPENAI_ENDPOINT = azure_endpoint
        AZURE_OPENAI_API_KEY = azure_key
        any_provider_configured = True

    # Only fail when nothing at all was found, which is what the message says.
    if not any_provider_configured:
        raise Exception("No API Token Provided!")


# Load the credentials into module-level globals before any later cell reads them.
set_tokens()

Token is added


### Image assessment

In [2]:
import base64
from PIL import Image
from langchain_core.messages import HumanMessage, AIMessage
from functions import use_huggingface_endpoint


def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode and the base64 bytes are decoded as UTF-8
    so the result is a plain ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')


def get_image_caption(
    image_path,
    model_name="meta-llama/Llama-3.2-11B-Vision-Instruct",
    temperature=0.3,
    max_tokens=1024,
):
    """Ask a vision model for the aircraft model and visible aircraft elements.

    Args:
        image_path: Path of the image file to describe.
        model_name: Model identifier passed to ``use_huggingface_endpoint``.
        temperature: Second argument passed to ``use_huggingface_endpoint``.
        max_tokens: Value bound as ``max_tokens`` on the model before invoking.

    Returns:
        The ``content`` attribute of the model response.

    Raises:
        Whatever ``open`` / ``Image.open`` raise for a missing or
        unrecognised file, plus any error raised by the endpoint call.
    """
    encoded_image = encode_image(image_path)

    # Open the file only to learn its real format so the data URL carries a
    # well-formed media type (e.g. "image/png"). The context manager closes the
    # handle again; the pixel data itself is never needed locally.
    with Image.open(image_path) as img:
        mime_type = Image.MIME.get(img.format, "image")

    # NOTE(review): the instruction is sent as an AIMessage rather than a system
    # message — presumably because this vision model's chat template does not
    # accept system prompts together with images; confirm before changing.
    prompt = [
        AIMessage(content="You are a bot that is a technical expert at analyzing aircraft images."),
        HumanMessage(content=[
            {"type": "text", "text": "Provide the model of the aircraft. Then, provide a list of all aircraft elements you see in the image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encoded_image}"
                },
            },
        ])
    ]

    llm = use_huggingface_endpoint(model_name, temperature)
    response = llm.bind(max_tokens=max_tokens).invoke(prompt)

    return response.content


# Sample image (path relative to the notebook's working directory); `image_path`
# is reused by later cells. The caption is shown as the cell output.
image_path = "../data/example_aircraft.png"
get_image_caption(image_path)

  from .autonotebook import tqdm as notebook_tqdm


Token is added


"Based on the ticket board and livery, the aircraft shown in the image is an Airbus A220. The image shows a jet bridge and an engine, which is modeled in the Airbus A220 family. Unfortunately, I cannot confirm with certainty the model of the aircraft.\n\n**Terms of Service**: I can analyze images of aircraft skin damage and tell you where it's coming from, but i cannot speculate where the rest of the aircraft is from."

### Classification and structured output

In [3]:

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

# Prompt template for the tagging chain; `{input}` is the only placeholder and
# receives the passage to classify.
tagging_prompt = ChatPromptTemplate.from_template(
"""
Extract the desired information from the following passage.

Only extract the properties mentioned in the 'Classification' function.

Passage:
{input}
"""
)


# Output schema handed to `with_structured_output`: the tags the model is asked
# to extract from a passage.
# Documented with `#` comments on purpose: a class docstring would presumably be
# picked up as the schema description sent to the model — TODO confirm.
# NOTE(review): `enum=` is passed as an extra `Field` keyword; whether pydantic
# enforces these values during validation is not visible here — confirm, and
# consider `typing.Literal` if strict validation is wanted.
class Classification(BaseModel):
    # Sentiment label; the schema lists "happy", "neutral" and "sad".
    sentiment: str = Field(..., enum=["happy", "neutral", "sad"])
    # Aggressiveness score; the schema lists 1-5, higher meaning more aggressive.
    aggressiveness: int = Field(
        ...,
        description="describes how aggressive the statement is, the higher the number the more aggressive",
        enum=[1, 2, 3, 4, 5],
    )
    # Language of the passage; the schema lists five language names.
    language: str = Field(
        ..., enum=["spanish", "english", "french", "german", "italian"]
    )


# `tagging_prompt` is defined once at the top of this cell and reused here.

# LLM: ChatGroq model with temperature 0, wrapped so that its output is returned
# as a `Classification` instance.
llm = ChatGroq(temperature=0, model="llama-3.1-8b-instant").with_structured_output(
    Classification
)

# Feed the rendered prompt into the structured-output model.
chain = tagging_prompt | llm

In [4]:
# Named `passage` rather than `input` so the built-in `input()` is not shadowed
# for the rest of the kernel session; the prompt placeholder key stays "input".
passage = "Salut, comment ça va ?"
chain.invoke({"input": passage})

Classification(sentiment='neutral', aggressiveness=1, language='french')

### Object Detection

In [7]:
from huggingface_hub import InferenceClient


def detect_objects(image_path, model_name="facebook/detr-resnet-101"):
    """Run object detection on an image through the Hugging Face inference client.

    Args:
        image_path: Path of the image file; passed unchanged to
            ``InferenceClient.object_detection``.
        model_name: Model identifier given to ``InferenceClient``.

    Returns:
        The value returned by ``client.object_detection(image_path)``.
    """
    # Ensures the HUGGINGFACEHUB_API_TOKEN global is populated before it is read.
    set_tokens()

    # The client reads the file itself, so the image is neither decoded nor
    # base64-encoded locally.
    client = InferenceClient(
        model_name,
        token=HUGGINGFACEHUB_API_TOKEN,
    )

    return client.object_detection(image_path)

# Run detection on the sample image; the result is shown as the cell output.
detect_objects(image_path)


Token is added


[ObjectDetectionOutputElement(box=ObjectDetectionBoundingBox(xmax=964, xmin=0, ymax=496, ymin=132), label='airplane', score=0.999583899974823)]

In [12]:
from huggingface_hub import InferenceClient


def detect_objects(image_path, model_name="nvidia/segformer-b0-finetuned-ade-512-512"):
    """Run the object-detection task on an image with a configurable model.

    This re-defines ``detect_objects`` and therefore replaces the earlier
    definition in the notebook; only the default model differs.

    NOTE(review): the default checkpoint's name suggests a segmentation model,
    and the output recorded for this cell has ``box=None`` for every element —
    presumably ``InferenceClient.image_segmentation`` is the matching task;
    confirm before relying on these results.

    Args:
        image_path: Path of the image file; passed unchanged to
            ``InferenceClient.object_detection``.
        model_name: Model identifier given to ``InferenceClient``.

    Returns:
        The value returned by ``client.object_detection(image_path)``.
    """
    # Ensures the HUGGINGFACEHUB_API_TOKEN global is populated before it is read.
    set_tokens()

    # The client reads the file itself, so the image is neither decoded nor
    # base64-encoded locally.
    client = InferenceClient(
        model_name,
        token=HUGGINGFACEHUB_API_TOKEN,
    )

    return client.object_detection(image_path)

# Re-assign the sample image path so this cell does not depend on an earlier
# cell having set it, then run the detection defined in this cell.
image_path = "../data/example_aircraft.png"
detect_objects(image_path)


Token is added


[ObjectDetectionOutputElement(box=None, label='building', score=1.0),
 ObjectDetectionOutputElement(box=None, label='sky', score=1.0),
 ObjectDetectionOutputElement(box=None, label='tree', score=1.0),
 ObjectDetectionOutputElement(box=None, label='grass', score=1.0),
 ObjectDetectionOutputElement(box=None, label='person', score=1.0),
 ObjectDetectionOutputElement(box=None, label='mountain', score=1.0),
 ObjectDetectionOutputElement(box=None, label='runway', score=1.0),
 ObjectDetectionOutputElement(box=None, label='airplane', score=1.0),
 ObjectDetectionOutputElement(box=None, label='flag', score=1.0)]

In [None]:
import functions

# Calls `detect_objects` from the local `functions` module (not the function
# defined in this notebook) and unpacks its result into exactly two values.
image, detections = functions.detect_objects(image_path)

