In [52]:
!pip install instructor

Collecting instructor
  Downloading instructor-1.10.0-py3-none-any.whl.metadata (11 kB)
Collecting diskcache>=5.6.3 (from instructor)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting docstring-parser<1.0,>=0.16 (from instructor)
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting openai<2.0.0,>=1.70.0 (from instructor)
  Downloading openai-1.99.9-py3-none-any.whl.metadata (29 kB)
Downloading instructor-1.10.0-py3-none-any.whl (119 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
Downloading docstring_parser-0.17.0-py3-none-any.whl (36 kB)
Downloading openai-1.99.9-py3-none-any.whl (786 kB)
   ---------------------------------------- 0.0/786.8 kB ? eta -:--:--
   --------------------------------------- 786.8/786.8 kB 11.4 MB/s eta 0:00:00
Installing collected packages: docstring-parser, diskcache, openai, instructor
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.

In [2]:
from typing import Dict, Any
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
# from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options import smolvlm_picture_description
from docling.document_converter import DocumentConverter, PdfFormatOption





In [3]:
# PDF pipeline configuration that is later handed to the DocumentConverter.
pipeline_options = PdfPipelineOptions()

# Text recognition: OCR is switched on (for scanned documents), GPU use is switched off.
pipeline_options.do_ocr = True
pipeline_options.ocr_options.use_gpu = False

# Tables: extract table structure, with cell matching turned on.
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Picture descriptions: the imported smolvlm_picture_description options are
# attached here, while the do_picture_description switch remains commented out.
# NOTE(review): presumably these options are inert until that switch is enabled - confirm.
pipeline_options.picture_description_options = smolvlm_picture_description

# Optional enrichments that are currently disabled:
# pipeline_options.do_formula_enrichment = True
# pipeline_options.generate_picture_images = True
# pipeline_options.images_scale = 2
# pipeline_options.do_picture_classification = True
# pipeline_options.do_picture_description = True

In [4]:
# Converter whose PDF inputs are processed with the `pipeline_options` object.
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)

In [5]:
# Path of the pitch deck, given relative to the notebook's working directory.
pdf_path = "CHEELIZZA PIZZA INDIA LTD - INVESTMENT DECK.pdf"
# Run the conversion; every later cell reads the parsed document from `result.document`.
result = converter.convert(pdf_path)



In [9]:
# Show the top-level field names of the converted document's model dump.
print(result.document.model_dump().keys())

dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'form_items', 'pages'])


In [7]:
# Show the document's `name` field.
print(result.document.name)

CHEELIZZA PIZZA INDIA LTD - INVESTMENT DECK


In [10]:
# Show the document's `origin` record (mimetype, binary hash, filename, uri).
print(result.document.origin)

mimetype='application/pdf' binary_hash=2797362585584270075 filename='CHEELIZZA PIZZA INDIA LTD - INVESTMENT DECK.pdf' uri=None


In [40]:
# Show the field names of the first picture item; indexing [0] requires at least one picture.
print(result.document.pictures[0].model_dump().keys())

dict_keys(['self_ref', 'parent', 'children', 'content_layer', 'label', 'prov', 'captions', 'references', 'footnotes', 'image', 'annotations'])


In [41]:
# Show the first picture item in full; indexing [0] requires at least one picture.
print(result.document.pictures[0])

self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.PICTURE: 'picture'> prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=315.87042236328125, t=402.30165100097656, r=909.1106567382812, b=158.37625122070312, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))] captions=[] references=[] footnotes=[] image=None annotations=[]


In [42]:
# Export the converted document to a Markdown string.
# NOTE(review): `text` is not referenced again in the visible cells - confirm it is still needed.
text = result.document.export_to_markdown()

In [44]:
# Render the converted document as a single Markdown string.
full_text = result.document.export_to_markdown()

# Collect each detected table as {"caption": <str>, "data": <list of row dicts>}.
tables = []
for table in result.document.tables:
    table_data = {
        # Docling items expose a `captions` reference list rather than a `caption`
        # attribute, so a plain getattr(table, 'caption', '') always produced ''.
        # caption_text() resolves those references against the document; the
        # hasattr guard keeps the previous best-effort fallback.
        "caption": (
            table.caption_text(result.document)
            if hasattr(table, "caption_text")
            else getattr(table, "caption", "")
        ),
        # One dict per table row; left empty when the item cannot export a DataFrame.
        "data": (
            table.export_to_dataframe().to_dict("records")
            if hasattr(table, "export_to_dataframe")
            else []
        ),
    }
    tables.append(table_data)

# Summary figures describing the converted document.
metadata = {
    "page_count": len(result.document.pages),
    # The document model has a `name` field but no `title` field, so fall back to
    # `name` instead of always reporting an empty title.
    "title": getattr(result.document, "title", "") or result.document.name,
    "tables_count": len(tables),
    "has_images": len(result.document.pictures) > 0,
    # Whitespace-separated token count of the Markdown export.
    "word_count": len(full_text.split()) if full_text else 0,
}

# Bundle everything the prompt template consumes.
extracted_content = {
    "full_text": full_text,
    "tables": tables,
    "metadata": metadata,
}

In [45]:
# Take the file name from the converted document's origin record instead of
# repeating the path literal, so the prompt always names the file that was
# actually converted.
filename = result.document.origin.filename

In [49]:
from jinja2 import Environment, FileSystemLoader
import os

# Prompt templates are loaded from the "prompts" directory one level above the
# working directory.
template_dir = "./../prompts"
jinja_env = Environment(loader=FileSystemLoader(template_dir))
template = jinja_env.get_template("ipo_analysis_prompt.j2")

# Render the IPO-analysis prompt; the context is passed as one mapping, which
# render() treats the same as the equivalent keyword arguments.
prompt = template.render({
    "filename": filename,
    "full_text": extracted_content.get("full_text", ""),
    "tables": extracted_content.get("tables", []),
    "metadata": extracted_content.get("metadata", {}),
})

In [50]:
# Print the fully rendered prompt for inspection (this embeds the whole Markdown export).
print(prompt)

You are an expert investment banker and IPO advisor analyzing a pitch deck for IPO readiness. Your task is to thoroughly evaluate the company based on the provided pitch deck content and provide a comprehensive assessment.

## Company Information
**Filename:** CHEELIZZA PIZZA INDIA LTD - INVESTMENT DECK.pdf
**Document Length:** 1637 words
**Pages:** 27

## Extracted Content
## IZZA

<!-- image -->

## SAY CHEESE!!

DOESN'T IT BRING A SMILE ON YOUR FACE?

## Current Indian F&amp;B landscape

<!-- image -->

<!-- image -->

Future growth will be led by home grown brands

<!-- image -->

## Pizza - Continues to Lead &amp; Grow

<!-- image -->

<!-- image -->

Pizza remains one of the most consume cuisine in the country

<!-- image -->

<!-- image -->

## West and North India

Are Potentially Large Markets For A Vegetarian Only Restaurant Chain

North and Central

West

Preferred Eat-out Format

Preferred Type

Preferred Cuisine

#Cities in Top 20

Affordable Casual Dining  (ACDR) Affordab

In [53]:
import instructor

In [None]:
# Read the Gemini API key from the environment and fail fast when it is missing.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY environment variable is required")

# NOTE(review): `Client` is not imported in any visible cell; presumably it is
# google.genai.Client - add that import to the imports cell and confirm.
genai_client = Client(api_key=api_key)

# Wrap the client with Instructor. `Mode` was never imported on its own either,
# so it is reached through the already-imported `instructor` package to avoid a
# NameError.
model = instructor.from_genai(
    client=genai_client,
    mode=instructor.Mode.GENAI_STRUCTURED_OUTPUTS,
)

In [None]:
# Send the rendered prompt as a single user message and request a
# StructuredAnalysis result from "gemini-2.5-flash", with max_retries=3.
# NOTE(review): StructuredAnalysis is not defined in the visible cells - confirm where it comes from.
response = model.create(
    model="gemini-2.5-flash",
    response_model=StructuredAnalysis,
    messages=[{"role": "user", "content": prompt}],
    max_retries=3,
)