In [1]:
#Installing  Lib
!pip uninstall duckduckgo-search -y
!pip install ddgs
!pip install beautifulsoup4
!pip install openpyxl

[0mCollecting ddgs
  Downloading ddgs-9.10.0-py3-none-any.whl.metadata (12 kB)
Collecting primp>=0.15.0 (from ddgs)
  Downloading primp-1.0.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting fake-useragent>=2.2.0 (from ddgs)
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Collecting socksio==1.* (from httpx[brotli,http2,socks]>=0.28.1->ddgs)
  Downloading socksio-1.0.0-py3-none-any.whl.metadata (6.1 kB)
Downloading ddgs-9.10.0-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fake_useragent-2.2.0-py3-none-any.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading socksio-1.0.0-py3-none-any.whl (12 kB)
Downloading primp-1.0.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
import requests
from bs4 import BeautifulSoup
from ddgs import DDGS
import pandas as pd
import hashlib
import os
import re
from datetime import datetime

EXCEL_FILE = "financial_data.xlsx"

In [3]:
#Parsing query to extarct period and other details
def parse_user_query(query):
    kpis = ["Stock", "Revenue", "EBITDA", "Profit", "earnings per share"]
    periods = ["jan", "Feb", "Q3", "Q4", "FY", "2026", "2024", "2025"]

    found_kpi = next((k for k in kpis if k.lower() in query.lower()), None)
    found_period = next((p for p in periods if p.lower() in query.lower()), None)

    company = query.split()[0]

    return {
        "company": company,
        "kpi": found_kpi,
        "period": found_period
    }

In [4]:
#it will use duck go and search 5 url based on query
from ddgs import DDGS

def search_web(query, max_results=5):
    urls = []
    with DDGS() as ddgs:
        for r in ddgs.text(query):
            urls.append(r["href"])
            if len(urls) >= max_results:
                break
    return urls

In [5]:
#scraps 5  webpage and extracts raw readable text.
def scrape_page(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0"
        }
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text(separator=" ", strip=True)
        return text
    except Exception as e:
        print("Scrape error:", e)
        return None

In [6]:
#Store the raw data from webpage into excel or other formats based on user requirements
def store_document_excel(company, kpi, period, value, url):

    new_data = {
        "company": company,
        "kpi": kpi,
        "period": period,
        "value": value,
        "url": url,
        "timestamp": datetime.now()
    }

    if os.path.exists(EXCEL_FILE):
        df = pd.read_excel(EXCEL_FILE)

        # Deduplicate by URL + KPI + period
        duplicate = df[
            (df["url"] == url) &
            (df["kpi"] == kpi) &
            (df["period"] == period)
        ]

        if not duplicate.empty:
            print("Duplicate skipped")
            return

        df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)
    else:
        df = pd.DataFrame([new_data])

    df.to_excel(EXCEL_FILE, index=False)
    print("Stored:", url)

In [7]:
#Applied regex to extract kpi like ebitda,revenue earned per share
def extract_financial_value(content, kpi):
    # This regex looks for the KPI,
    # then an optional currency symbol, optional space,
    # then the numeric part (allowing for thousands separators and decimals),
    # and finally an optional scale word (billion/million/B/M/T), followed by a word boundary.
    pattern = rf"{kpi}.*?(\$|€|₹|£)?\s*(\d{{1,3}}(?:[.,]\d{{3}})*(?:[.,]\d+)?\s*(?:billion|million|B|M|T)?)\b"
    matches = re.findall(pattern, content, re.IGNORECASE)

    if matches:
        # Group 1: currency symbol (e.g., '$')
        # Group 2: the numeric part including its scale (e.g., '100,000.5 billion')
        currency, value_with_scale = matches[0]
        return f"{currency}{value_with_scale.strip()}"

    return None

In [8]:
#connecting everything user query,parsing,finding websites for scrapping
def financial_agent(user_query):

    print("User Query:", user_query)

    parsed = parse_user_query(user_query)
    print("Parsed:", parsed)

    if not parsed["kpi"]:
        print("KPI not detected in query.")
        return

    search_query = f"{parsed['company']} {parsed['kpi']} {parsed['period']} financial results"
    print("Search Query:", search_query)

    urls = search_web(search_query)
    print("URLs Found:", len(urls))

    structured_results = []

    for url in urls:
        print("Scraping:", url)

        content = scrape_page(url)
        if not content:
            continue

        value = extract_financial_value(content, parsed["kpi"])

        if value:
            store_document_excel(
                parsed["company"],
                parsed["kpi"],
                parsed["period"],
                value,
                url
            )

            structured_results.append({
                "company": parsed["company"],
                "kpi": parsed["kpi"],
                "period": parsed["period"],
                "value": value,
                "source": url
            })

    return structured_results

In [9]:
#type any prompt
results = financial_agent("apple revenue feb 2025")
results

User Query: apple revenue feb 2025
Parsed: {'company': 'apple', 'kpi': 'Revenue', 'period': 'Feb'}
Search Query: apple Revenue Feb financial results




URLs Found: 5
Scraping: https://www.apple.com/newsroom/2026/01/apple-reports-first-quarter-results/
Stored: https://www.apple.com/newsroom/2026/01/apple-reports-first-quarter-results/
Scraping: https://investor.apple.com/investor-relations/default.aspx
Scraping: https://www.reuters.com/business/apple-sales-profit-beat-wall-street-estimates-amid-staggering-iphone-demand-2026-01-29/
Scraping: https://www.macrotrends.net/stocks/charts/AAPL/apple/revenue
Scraping: https://finance.yahoo.com/quote/AAPL/financials/


[{'company': 'apple',
  'kpi': 'Revenue',
  'period': 'Feb',
  'value': '026',
  'source': 'https://www.apple.com/newsroom/2026/01/apple-reports-first-quarter-results/'}]

In [10]:
from google.colab import files

if os.path.exists(EXCEL_FILE):
    print(f"Downloading {EXCEL_FILE}...")
    files.download(EXCEL_FILE)
else:
    print("Excel file not found. Please run the financial_agent() function to generate it.")

Downloading financial_data.xlsx...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>