## Image Descriptions with Gemini Vision

Generate detailed textual descriptions for the extracted page images using Gemini 2.5 Flash.

**Prerequisites:**
- Run notebook 06-01 first to extract images
- Google API key set in .env file

**Output:**
- Markdown descriptions saved to `data/rag-data/images_desc/{company}/{document}/page_X.md`

### 1. Setup and Imports

In [1]:
from dotenv import load_dotenv
load_dotenv()

from pathlib import Path
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from PIL import Image
import base64
import io

### 2. Configuration

In [2]:
# Gemini model used to describe each page image
MODEL_NAME = "gemini-2.5-flash"

# Input root: page images extracted beforehand (see prerequisites)
IMAGES_DIR = "data/rag-data/images"
# Output root: one markdown description per image, grouped by company/document
OUTPUT_DESC_DIR = "data/rag-data/images_desc"

### 3. Initialize Gemini Model

In [3]:
# Single chat-model client, created once and reused for every description request.
# NOTE(review): presumably authenticates with the Google API key loaded from .env -- confirm.
model = ChatGoogleGenerativeAI(model=MODEL_NAME)

### 4. Description Generation Function

In [4]:
# Instruction text sent alongside every page image. It asks the model for a
# concise, numbers-first summary of charts, tables and text so the resulting
# description is useful for retrieval.
describe_image_prompt = """Analyze this financial document page and extract meaningful data in a concise format.

For charts and graphs:
- Identify the metric being measured
- List key data points and values
- Note significant trends (growth, decline, stability)

For tables:
- Extract column headers and key rows
- Note important values and totals

For text:
- Summarize key facts and numbers only
- Skip formatting, headers, and navigation elements

Be direct and factual. Focus on numbers, trends, and insights that would be useful for retrieval."""

In [5]:
def generate_image_description(image_path: Path):
    """Describe one page image with the module-level Gemini chat model.

    The image is re-encoded as PNG in memory, base64-encoded, and sent as a
    ``data:image/png`` URL together with ``describe_image_prompt`` in a single
    human message.

    Args:
        image_path: Location of the image file to describe.

    Returns:
        ``response.content`` from the model, unchanged.
        NOTE(review): LangChain types message content as either a string or a
        list of content parts; callers that persist the result as text assume
        a string -- confirm for the model in use.

    Raises:
        Whatever ``Image.open``, ``Image.save`` or ``model.invoke`` raise;
        nothing is caught here.
    """
    buffered = io.BytesIO()

    # The context manager releases the underlying file handle even when
    # decoding or re-encoding fails part-way through.
    with Image.open(image_path) as image:
        # Re-encode so the payload always matches the image/png MIME type below.
        image.save(buffered, format="PNG")

    image_base64 = base64.b64encode(buffered.getvalue()).decode()

    message = HumanMessage(
        content=[
            {"type": "text", "text": describe_image_prompt},
            {"type": "image_url", "image_url": f"data:image/png;base64,{image_base64}"},
        ]
    )

    response = model.invoke([message])
    return response.content

### 5. Process All Images

In [6]:
def generate_and_save_description(image_path: Path):
    """Write a markdown description for one image unless it already exists.

    The output location mirrors the input layout: the image's grandparent
    directory name is used as the company, its parent directory name as the
    document, and the file keeps the image's stem with an ``.md`` suffix.

    Returns:
        True when a new description file was written. False when the file was
        already present, or when generation/writing failed (the error is
        printed and otherwise swallowed so a batch run can continue).
    """
    document_dir = image_path.parent

    # <output root>/<company>/<document>/ -- created up front, even on a skip
    target_dir = Path(OUTPUT_DESC_DIR, document_dir.parent.name, document_dir.name)
    target_dir.mkdir(parents=True, exist_ok=True)

    target_file = target_dir / (image_path.stem + ".md")

    # An existing file means this page was handled in an earlier run
    if target_file.exists():
        return False

    try:
        target_file.write_text(
            generate_image_description(image_path),
            encoding='utf-8',
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller move on
        print(f"Error: {image_path.name} - {e}")
        return False

    return True

In [None]:
# Collect every extracted page image anywhere beneath the images root
images_path = Path(IMAGES_DIR)
image_files = [*images_path.rglob("page_*.png")]

# Describe each page; only newly written descriptions count toward the total
# (already-described pages and failures both report False)
total_generated = 0
for image_file in image_files:
    total_generated += 1 if generate_and_save_description(image_file) else 0
    print(f"Done: {image_file}")

print(f"\nTotal generated: {total_generated} descriptions")