In [None]:
# Step 1: Install all dependencies
# Uses the %pip magic (rather than !pip) so packages land in the running kernel's environment.
# NOTE(review): no versions are pinned here, so a fresh run may pull newer, incompatible
# releases -- consider pinning (e.g. `pkg==X.Y.Z`) once a known-good set is established.

%pip install scrapfly-sdk
%pip install llama-index-embeddings-openai
%pip install llama-index-readers-web
%pip install llama-index-llms-openai



In [None]:
# Step 2: Load environment variables (Colab Secrets) from Colab

import os

# API keys the rest of the notebook reads from os.environ.
REQUIRED_API_KEYS = ("OPENAI_API_KEY", "SCRAPFLY_API_KEY")


def load_api_keys(names=REQUIRED_API_KEYS):
    """Fill in missing API-key environment variables from Colab Secrets.

    Each key is looked up independently, so a missing or inaccessible secret
    for one key no longer prevents the remaining keys from being loaded.
    A variable that is already set to a non-empty value is never overwritten,
    and an empty secret is never written into the environment.

    Args:
        names: Iterable of environment-variable / secret names to resolve.

    Returns:
        list[str]: The names that are still unset (or empty) afterwards,
        in the order they were given.
    """
    try:
        from google.colab import userdata  # type: ignore
    except ImportError:
        userdata = None  # Not running in Colab; rely on the environment only

    missing = []
    for name in names:
        if not os.getenv(name) and userdata is not None:
            try:
                secret = userdata.get(name)
            except Exception as exc:
                # Best effort: the secret may be absent or notebook access may not
                # be granted. Report why instead of silently swallowing it.
                print(f"Could not read {name} from Colab Secrets: {type(exc).__name__}")
                secret = None
            if secret:
                os.environ[name] = secret
        if not os.getenv(name):
            missing.append(name)
    return missing


for missing_key in load_api_keys():
    print(f"⚠️ {missing_key} is not set. Add it in Colab Secrets or export it in the environment.")

In [None]:
import llama_index

from llama_index.readers.web import ScrapflyReader
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
from IPython.display import Markdown, display

# Fix: Rebuild model to resolve forward references
ScrapflyReader.model_rebuild()

# Initiate ScrapflyReader with Scrapfly API key (read from the environment, never hard-coded)
scrapfly_reader = ScrapflyReader(
    api_key=os.environ['SCRAPFLY_API_KEY'],
    ignore_scrape_failures=True,  # Ignore unprocessable web pages and log their exceptions
)

scrapfly_scrape_config = {
    "asp": True,  # Bypass scraping blocking and antibot solutions, like Cloudflare
    "render_js": True,  # Enable JavaScript rendering with a cloud headless browser
    "proxy_pool": "public_residential_pool",  # Select a proxy pool (datacenter or residential)
    "country": "us",  # Select a proxy location
    "auto_scroll": True,  # Auto scroll the page
    "js": "",  # Execute custom JavaScript code by the headless browser
}

# List of URLs to scrape
source_urls = [
    "https://www.iii.org/publications/commercial-insurance/rankings",
    "https://www.iii.org/publications/commercial-insurance/what-it-does/lines-of-business/standard-lines-premiums",
    "https://www.insurancebusinessmag.com/us/news/breaking-news/the-10-largest-pc-commercial-and-personal-lines-insurers-19638.aspx",
    # Disabled PDF sources -- NOTE(review): reason not recorded; confirm before re-enabling.
    #"https://content.naic.org/sites/default/files/pc-and-title-2024mid-year-industry-report.pdf",
    #"https://content.naic.org/sites/default/files/research-actuarial-property-casualty-market-share.pdf",
    "https://www.reinsurancene.ws/top-100-u-s-property-casualty-insurance-companies/",
    "https://www.sci-tech-today.com/stats/insurance-industry-statistics/#Type_of_Insurance_Industry_Statistics",
    "https://www.statista.com/statistics/186464/leading-us-commercial-lines-insurance-by-market-share/",
]

# Load documents from URLs as markdown
documents = scrapfly_reader.load_data(
    urls=source_urls,
    scrape_config=scrapfly_scrape_config,  # Pass the scrape config
    scrape_format="markdown",  # The scrape result format, either `markdown`(default) or `text`
)

# Print a short summary instead of the raw list repr, which floods the output
# and says nothing about whether any content was actually scraped.
print(f"Loaded {len(documents)} document(s) from {len(source_urls)} URL(s).")

# Render each document vertically, flagging any that came back without text:
# an empty document gives the index nothing to retrieve from.
for idx, doc in enumerate(documents, start=1):
    doc_text = doc.text if hasattr(doc, 'text') else str(doc)
    doc_url = (getattr(doc, 'metadata', None) or {}).get('url', 'URL not found')
    display(Markdown(f"\n--- Document {idx} ---\n"))
    if not doc_text.strip():
        print(f"⚠️ Document {idx} ({doc_url}) has no text; answers cannot be grounded in it.")
    display(Markdown(doc_text))

[Document(id_='3bdc78f6-7c7b-4704-8c34-e597ddb0c128', embedding=None, metadata={'url': 'https://www.iii.org/publications/commercial-insurance/rankings'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=None, image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'), Document(id_='20b4692c-88d9-4dfc-8ec3-e119188ac4c4', embedding=None, metadata={'url': 'https://www.iii.org/publications/commercial-insurance/what-it-does/lines-of-business/standard-lines-premiums'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=None, image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'), Document(id_='77927552-6069-4681-a533-14a9767b7649', embedding=None, metadata={'url': 'https://www.insu


--- Document 1 ---





--- Document 2 ---





--- Document 3 ---





--- Document 4 ---





--- Document 5 ---





--- Document 6 ---




In [None]:
import os
from llama_index.core import VectorStoreIndex
from IPython.display import Markdown, display

# The OpenAI key must come from the environment (Colab Secrets or an exported variable).
# SECURITY: a literal key used to be assigned here. Secrets must never be stored in a
# notebook; any key that was committed this way should be revoked and rotated.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it in Colab Secrets or export it in the environment."
    )

# Create an index store for the documents
index = VectorStoreIndex.from_documents(documents)

# Create the RAG engine using the index store
query_engine = index.as_query_engine()

# Submit a query
response = query_engine.query("List down top 3 Commercial Lines Insurers in 2024 by direct written premium. List the company name and the respective premium, one in each line.")
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Travelers - $15.5 billion  
Chubb - $14.2 billion  
Liberty Mutual - $12.8 billion


In [None]:
# Show which scraped pages the last answer was retrieved from:
# one line per source node, using the 'url' entry of its metadata.
print("\n--- Sources ---")
for node in response.source_nodes:
    print(f"Source URL: {node.metadata.get('url', 'URL not found')}")


--- Sources ---
Source URL: https://www.statista.com/statistics/186464/leading-us-commercial-lines-insurance-by-market-share/
Source URL: https://www.insurancebusinessmag.com/us/news/breaking-news/the-10-largest-pc-commercial-and-personal-lines-insurers-19638.aspx


In [None]:
# Ask the query engine for Travelers' 2024 commercial-lines premium and render the answer as Markdown.
question = "What is the Direct Written Premium of Travelers in Commercial Lines in 2024?"
response = query_engine.query(question)
answer_md = Markdown(response.response)
display(answer_md)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The Direct Written Premium of Travelers in Commercial Lines in 2024 is $16.5 billion.

In [None]:
# Query: Progressive's rank by 2023 net premium written; answer rendered as Markdown.
question = "What is the rank of Progressive insurance by net premium written in 2023?"
response = query_engine.query(question)
display(Markdown(response.response))

# List the URL stored in the metadata of every node the answer was built from.
print("\n--- Sources ---")
for node in response.source_nodes:
    url = node.metadata.get('url', 'URL not found')
    print(f"Source URL: {url}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Progressive insurance is ranked as the 3rd largest in terms of net premium written in 2023.


--- Sources ---
Source URL: https://www.iii.org/publications/commercial-insurance/rankings
Source URL: https://www.insurancebusinessmag.com/us/news/breaking-news/the-10-largest-pc-commercial-and-personal-lines-insurers-19638.aspx


In [None]:
# Query: Allianz revenue for 2023; answer rendered as Markdown.
question = "What was the revenue of Allianz in 2023?"
response = query_engine.query(question)
answer_text = response.response
display(Markdown(answer_text))

# Report the source URL of each retrieved node (falls back to a placeholder when absent).
print("\n--- Sources ---")
for retrieved in response.source_nodes:
    print(f"Source URL: {retrieved.metadata.get('url', 'URL not found')}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


I'm sorry, but I cannot provide the revenue of Allianz in 2023 based on the context information provided.


--- Sources ---
Source URL: https://content.naic.org/sites/default/files/research-actuarial-property-casualty-market-share.pdf
Source URL: https://www.statista.com/statistics/186464/leading-us-commercial-lines-insurance-by-market-share/


In [None]:
# Query: AXA's rank by 2023 revenue, against the index built from all scraped pages.
question = "What is the rank of AXA by revenue in 2023?"
response = query_engine.query(question)
display(Markdown(response.response))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The rank of AXA by revenue in 2023 is not provided in the context information.

In [None]:
# Query: top ten commercial-lines writers by 2024 direct premiums written.
# The prompt is assembled from adjacent literals purely for readability.
question = (
    "List down TOP TEN WRITERS OF COMMERCIAL LINES INSURANCE BY DIRECT PREMIUMS WRITTEN in 2024. "
    "Also mention respective direct written premiums. "
    "List only one name in each line."
)
response = query_engine.query(question)
display(Markdown(response.response))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Travelers Companies Inc. - $26,232,201  
Chubb Ltd. - $26,123,774  
Liberty Mutual - $19,970,975  
Berkshire Hathaway Inc. - $19,202,724  
Zurich Insurance Group - $17,991,721  
American International Group (AIG) - $14,151,844  
Hartford Financial Services - $13,829,345  
CNA Financial Corp. - $13,450,851  
Progressive - $12,547,795  
Tokio Marine - $10,255,294

In [None]:
# Query: top ten U.S. property and casualty insurers for 2024.
# NOTE(review): the prompt ranks by *net* premiums written but asks for *direct*
# written premiums -- confirm which measure is intended.
question = (
    "List down TOP TEN property and casualty insurance companies in the United States "
    "BY NET PREMIUMS WRITTEN in 2024. "
    "Also mention respective direct written premiums."
)
response = query_engine.query(question)
display(Markdown(response.response))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The top ten property and casualty insurance companies in the United States by net premiums written in 2024, along with their respective direct written premiums, are as follows:

1. Company A - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
2. Company B - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
3. Company C - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
4. Company D - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
5. Company E - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
6. Company F - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
7. Company G - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
8. Company H - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
9. Company I - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion
10. Company J - Net Premiums Written: $X billion, Direct Written Premiums: $Y billion

In [None]:
# Print the source URL behind each node retrieved for the most recent response.
print("\n--- Sources ---")
source_urls_used = (n.metadata.get('url', 'URL not found') for n in response.source_nodes)
for used_url in source_urls_used:
    print(f"Source URL: {used_url}")


--- Sources ---
Source URL: https://www.statista.com/statistics/186464/leading-us-commercial-lines-insurance-by-market-share/
Source URL: https://www.insurancebusinessmag.com/us/news/breaking-news/the-10-largest-pc-commercial-and-personal-lines-insurers-19638.aspx


In [None]:
# Query: top 10 U.S. P&C insurers across commercial and personal lines.
question = "List down Top 10 P&C insurers in the US – commercial & personal lines insurance."
response = query_engine.query(question)
display(Markdown(response.response))

# Trace the answer back to the pages it was retrieved from.
print("\n--- Sources ---")
for node in response.source_nodes:
    print(f"Source URL: {node.metadata.get('url', 'URL not found')}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The top 10 P&C insurers in the US for commercial and personal lines insurance are as follows:

1. State Farm
2. Berkshire Hathaway
3. Progressive
4. Liberty Mutual
5. Allstate
6. Travelers
7. USAA
8. Chubb
9. Farmers Insurance Group
10. Nationwide


--- Sources ---
Source URL: https://www.insurancebusinessmag.com/us/news/breaking-news/the-10-largest-pc-commercial-and-personal-lines-insurers-19638.aspx
Source URL: https://www.reinsurancene.ws/top-100-u-s-property-casualty-insurance-companies/


In [None]:
from llama_index.readers.web import ScrapflyReader
from llama_index.core import VectorStoreIndex

# Single-source experiment: build an index from one page only (iii.org rankings)
# and ask about AXA, so the answer can only come from that page.
scrapfly_reader = ScrapflyReader(
    api_key=os.environ['SCRAPFLY_API_KEY'],
    ignore_scrape_failures=True,
)

# Load documents from URLs as markdown
documents = scrapfly_reader.load_data(
    urls=["https://www.iii.org/publications/commercial-insurance/rankings",
          ]
)

# The OpenAI key must come from the environment (Colab Secrets or an exported variable).
# SECURITY: a literal key used to be assigned here. Secrets must never be stored in a
# notebook; any key that was committed this way should be revoked and rotated.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it in Colab Secrets or export it in the environment."
    )

# Create an index store for the documents
# (this rebinds `documents`, `index` and `query_engine` used by earlier cells)
index = VectorStoreIndex.from_documents(documents)

# Create the RAG engine using the index store
query_engine = index.as_query_engine()

# Submit a query
response = query_engine.query("What is the rank of AXA by revenue in 2023?")
print(response)

print("\n--- Sources ---")
for source_node in response.source_nodes:
    source_url = source_node.metadata.get('url', 'URL not found')
    print(f"Source URL: {source_url}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


AXA is ranked as the 6th largest commercial insurer by revenue in 2023.

--- Sources ---
Source URL: https://www.iii.org/publications/commercial-insurance/rankings


In [None]:
from llama_index.readers.web import ScrapflyReader
from llama_index.core import VectorStoreIndex

# Single-source experiment: build an index from one page only (reinsurancene.ws top 100)
# and ask about Hartford, so the answer can only come from that page.
scrapfly_reader = ScrapflyReader(
    api_key=os.environ['SCRAPFLY_API_KEY'],
    ignore_scrape_failures=True,
)

# Load documents from URLs as markdown
documents = scrapfly_reader.load_data(
    urls=["https://www.reinsurancene.ws/top-100-u-s-property-casualty-insurance-companies/",
          ]
)

# The OpenAI key must come from the environment (Colab Secrets or an exported variable).
# SECURITY: a literal key used to be assigned here. Secrets must never be stored in a
# notebook; any key that was committed this way should be revoked and rotated.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it in Colab Secrets or export it in the environment."
    )

# Create an index store for the documents
# (this rebinds `documents`, `index` and `query_engine` used by earlier cells)
index = VectorStoreIndex.from_documents(documents)

# Create the RAG engine using the index store
query_engine = index.as_query_engine()

# Submit a query (typo "tge" in the original prompt corrected to "the")
response = query_engine.query("What is the rank of Hartford by net written premium in 2023? Also mention the net written premium of Hartford in 2023.")
print(response)


print("\n--- Sources ---")
for source_node in response.source_nodes:
    source_url = source_node.metadata.get('url', 'URL not found')
    print(f"Source URL: {source_url}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Hartford is ranked 12th by net written premium in 2023. The net written premium of Hartford in 2023 is $13.5 billion.

--- Sources ---
Source URL: https://www.reinsurancene.ws/top-100-u-s-property-casualty-insurance-companies/


In [None]:
from llama_index.readers.web import ScrapflyReader
from llama_index.core import VectorStoreIndex

# Two-source experiment: index the insurancebusinessmag.com article together with
# web-scraping.dev and ask for 2023 market shares.
# NOTE(review): web-scraping.dev looks unrelated to the insurance question -- confirm
# whether it is an intentional distractor or a leftover.
scrapfly_reader = ScrapflyReader(
    api_key=os.environ['SCRAPFLY_API_KEY'],
    ignore_scrape_failures=True,
)

# Load documents from URLs as markdown
documents = scrapfly_reader.load_data(
    urls=["https://www.insurancebusinessmag.com/us/news/breaking-news/the-10-largest-pc-commercial-and-personal-lines-insurers-19638.aspx",
          "https://web-scraping.dev/"
          ]
)

# The OpenAI key must come from the environment (Colab Secrets or an exported variable).
# SECURITY: a literal key used to be assigned here. Secrets must never be stored in a
# notebook; any key that was committed this way should be revoked and rotated.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it in Colab Secrets or export it in the environment."
    )

# Create an index store for the documents
# (this rebinds `documents`, `index` and `query_engine` used by earlier cells)
index = VectorStoreIndex.from_documents(documents)

# Create the RAG engine using the index store
query_engine = index.as_query_engine()

# Submit a query
response = query_engine.query("What were the market share of State Farm and Berkshire Hathaway in 2023?")
print(response)


print("\n--- Sources ---")
for source_node in response.source_nodes:
    source_url = source_node.metadata.get('url', 'URL not found')
    print(f"Source URL: {source_url}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


State Farm had a market share of 9.3% and Berkshire Hathaway had a market share of 6.1% in 2023.

--- Sources ---
Source URL: https://www.insurancebusinessmag.com/us/news/breaking-news/the-10-largest-pc-commercial-and-personal-lines-insurers-19638.aspx
