## Image Descriptions with Gemini Vision

Generate detailed textual descriptions for extracted images using Gemini 2.5 Flash (the `gemini-2.5-flash` model configured below).

**Prerequisites:**
- Run notebook 06-01 first to extract images
- Google API key set in the `.env` file (read as `GOOGLE_API_KEY` by `langchain_google_genai`)

**Output:**
- Markdown descriptions saved to `data/rag-data/images_desc/{company}/{document}/page_X.md`

### 1. Setup and Imports

In [None]:
from dotenv import load_dotenv
load_dotenv()

from pathlib import Path
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from PIL import Image
import base64
import io

### 2. Configuration

In [2]:
# Paths (relative, so they resolve against the notebook's working directory)
IMAGES_DIR = "data/rag-data/images"  # root scanned for per-company image folders
OUTPUT_DESC_DIR = "data/rag-data/images_desc"  # root under which .md descriptions are written

# Model configuration
MODEL_NAME = "gemini-2.5-flash"  # model identifier handed to ChatGoogleGenerativeAI

### 3. Initialize Gemini Model

In [None]:
# Shared chat-model client; generate_image_description() sends its requests through it.
# NOTE(review): no API key is passed explicitly here — presumably it is picked up
# from the environment populated by load_dotenv(); confirm.
model = ChatGoogleGenerativeAI(model=MODEL_NAME)

### 4. Description Generation Function

In [None]:
# Instruction text sent alongside every page image; it asks the model for a
# self-contained description covering charts, tables, text and layout.
describe_image_prompt = """Analyze this financial document page image and provide a detailed description.

Focus on:
- Charts and graphs: describe data trends, axes labels, and key insights
- Tables: describe structure and key data points
- Text content: summarize main points
- Visual elements: describe layout and important visual information

Provide a comprehensive description that would help someone understand the content without seeing the image."""

In [None]:
def generate_image_description(image_path: Path) -> str:
    """Generate a detailed description of an image using Gemini Vision.

    The image is loaded with Pillow, re-encoded as PNG, embedded in a base64
    data URL and sent to ``model`` together with ``describe_image_prompt``.

    Args:
        image_path: Location of the image file to describe.

    Returns:
        The model's reply as plain text. When the reply arrives as a list of
        content parts, the text parts are concatenated in order.

    Raises:
        Whatever ``Image.open`` or ``model.invoke`` raise (missing or
        unreadable file, API errors); nothing is caught here so the caller
        decides how to handle a failed page.
    """
    buffered = io.BytesIO()
    # Use the image as a context manager so the underlying file handle is
    # released right away instead of lingering across a long batch run.
    with Image.open(image_path) as image:
        image.save(buffered, format="PNG")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()

    message = HumanMessage(
        content=[
            {"type": "text", "text": describe_image_prompt},
            {"type": "image_url", "image_url": f"data:image/png;base64,{image_base64}"},
        ]
    )

    response = model.invoke([message])

    content = response.content
    if isinstance(content, str):
        return content

    # Message content may also be a list of parts (plain strings or typed
    # dicts). Callers write the result with write_text(), so flatten it to a
    # single string and ignore non-text parts.
    text_parts = []
    for part in content:
        if isinstance(part, str):
            text_parts.append(part)
        elif (
            isinstance(part, dict)
            and part.get("type") == "text"
            and isinstance(part.get("text"), str)
        ):
            text_parts.append(part["text"])
    return "".join(text_parts)

### 5. Process All Images

In [None]:
def process_company_images(company_dir: Path):
    """Describe every page image found under one company's folder.

    Each sub-directory of ``company_dir`` is treated as a document. For every
    ``page_*.png`` inside it, a Markdown description is written to
    ``OUTPUT_DESC_DIR/<company>/<document>/<page stem>.md`` unless that file
    is already present. A failure on one page is printed and the run moves on
    to the next page.

    Returns the number of descriptions written during this call.
    """
    written = 0

    for document_dir in company_dir.iterdir():
        if document_dir.is_dir():
            target_dir = Path(OUTPUT_DESC_DIR, company_dir.name, document_dir.name)
            target_dir.mkdir(parents=True, exist_ok=True)

            for page_png in document_dir.glob("page_*.png"):
                markdown_path = target_dir / (page_png.stem + ".md")

                # Pages described by an earlier run are left untouched.
                if not markdown_path.exists():
                    try:
                        text = generate_image_description(page_png)
                        markdown_path.write_text(text, encoding='utf-8')
                    except Exception as err:
                        # Best effort: report the page and keep going.
                        print(f"Error: {page_png.name} - {err}")
                    else:
                        written += 1

    return written

In [None]:
# Gather one directory per company under the extracted-images root.
images_path = Path(IMAGES_DIR)
company_dirs = list(filter(Path.is_dir, images_path.iterdir()))

# Describe each company's pages, printing a per-company tally along the way.
total_descriptions = 0
for company_dir in company_dirs:
    count = process_company_images(company_dir)
    print(f"{company_dir.name}: {count} descriptions")
    total_descriptions = total_descriptions + count

print(f"\nTotal: {total_descriptions} descriptions")