In [22]:
# Environment & Utilities
import os
import json
import time
from dotenv import load_dotenv
from pprint import pprint
from typing import List


# Web Scraping
import bs4
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# LangChain Components
from langchain_community.document_loaders import WebBaseLoader, BSHTMLLoader
from langchain_core.output_parsers import PydanticOutputParser, StrOutputParser
from langchain_core.prompts import PromptTemplate, load_prompt
from langchain_openai import ChatOpenAI

# Search Utility
from langchain_community.utilities import SerpAPIWrapper

# data validation
from pydantic import BaseModel, Field

In [23]:
# Set up LangSmith tracing
# NOTE(review): this import binds the name `logging` to the langchain_teddynote
# module, so the standard-library `logging` module is not reachable under that
# name anywhere below in this notebook.
from langchain_teddynote import logging


# "EQL Task" is the project name echoed back in the cell output.
logging.langsmith("EQL Task")

LangSmith 추적을 시작합니다.
[프로젝트명]
EQL Task


In [24]:
# Load API keys via python-dotenv (presumably from a local .env file — confirm
# which keys the OpenAI / SerpAPI / LangSmith clients expect).
load_dotenv()

True

In [None]:
# Locations of the prompt files that `load_prompt` reads further down.
# Each path can be supplied through an environment variable of the same name,
# so the notebook does not have to be edited per machine. When the variable is
# not set, the original placeholder string is used unchanged.
SCRAP_PROMPT_PATH = os.environ.get("SCRAP_PROMPT_PATH", "Set your local path")
REPORT_PROMPT_PATH = os.environ.get("REPORT_PROMPT_PATH", "Set your local path")

# =============== Scrap Chain ===============

In [26]:
# Scrap stockwik


def get_rendered_html(url, wait_seconds=5):
    """Return the page source of ``url`` after opening it in headless Chrome.

    Args:
        url: Address to open in the browser.
        wait_seconds: Fixed pause, in seconds, between requesting the page and
            reading its source. Defaults to 5, the delay used originally.

    Returns:
        The value of ``driver.page_source`` read after the pause.

    Raises:
        Any exception raised while navigating or reading the page is
        propagated to the caller; the browser is shut down first.
    """
    options = Options()
    options.add_argument("--headless")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Fixed pause before the source is read.
        time.sleep(wait_seconds)

        return driver.page_source
    finally:
        # Always release the browser, even when navigation fails; otherwise
        # every failed call would leave a Chrome process running.
        driver.quit()

In [27]:
# Render the Stockwik press-release page, cache the HTML on disk, then load it
STOCKWIK_URL = "https://www.stockwik.se/pressmeddelanden"
STOCKWIK_HTML_PATH = "stockwik_rendered.html"

# Fetch before opening the file: "w" mode truncates on open, so scraping inside
# the `with` block would leave an empty cache file behind if the browser call
# raised.
stockwik_html = get_rendered_html(STOCKWIK_URL)

with open(STOCKWIK_HTML_PATH, "w", encoding="utf-8") as f:
    f.write(stockwik_html)

# NOTE(review): the loader records the local file name as the document's
# `source` metadata, which is what later shows up as the Stockwik "link".
loader = BSHTMLLoader(file_path=STOCKWIK_HTML_PATH, open_encoding="utf-8")
stockwik_docs = loader.load()

print(f"✅ Loaded {len(stockwik_docs)} documents")

✅ Loaded 1 documents


In [28]:
# Display the loaded documents (metadata plus the full page text).
stockwik_docs

[Document(metadata={'source': 'stockwik_rendered.html', 'title': 'Pressmeddelanden | Stockwik.se'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPressmeddelanden | Stockwik.se\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ntop of pageSkip to Main Content\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIN ENGLISHSTARTOM OSSVÅR PROCESSBOLAG OCH SEGMENTBOLAGSSTYRNINGKARRIÄRFINANSIELLTAKTIENPRESSMore...Use tab to navigate through the menu items.Pressmeddelanden* anger från och med 160703 regulatorisk information i enlighet med MAR, svensk lag, börsens regelverk och bolagsstyrningskodens krav.\n2025\n250214\xa0Stockwik publicerar bokslutskommité 4Q24 januari-december *\n250214 Stockwik publishes year end report 4Q24 January-December (ENG) *\n\u200b\n\u200b\n2024\n241121 VD och Styrelseordförande ökar i Stockwik\n241108\xa0Stockwik publicerar 3Q24 *\n241108\xa0Stockwik publishes 3Q24 (ENG) *\n240823\xa0Stockwik publicerar 2Q24 *\n240823\xa0

In [29]:
# Print the complete text of the single Stockwik document.
print(stockwik_docs[0].page_content)






















































Pressmeddelanden | Stockwik.se




















top of pageSkip to Main Content















IN ENGLISHSTARTOM OSSVÅR PROCESSBOLAG OCH SEGMENTBOLAGSSTYRNINGKARRIÄRFINANSIELLTAKTIENPRESSMore...Use tab to navigate through the menu items.Pressmeddelanden* anger från och med 160703 regulatorisk information i enlighet med MAR, svensk lag, börsens regelverk och bolagsstyrningskodens krav.
2025
250214 Stockwik publicerar bokslutskommité 4Q24 januari-december *
250214 Stockwik publishes year end report 4Q24 January-December (ENG) *
​
​
2024
241121 VD och Styrelseordförande ökar i Stockwik
241108 Stockwik publicerar 3Q24 *
241108 Stockwik publishes 3Q24 (ENG) *
240823 Stockwik publicerar 2Q24 *
240823 Stockwik publishes 2Q24 (ENG) *
240822 Stockwik tidigarelägger publicering av kvartalsrapporter
240822 Stockwik brings forward publication of quarterly reports (ENG)
240515 Stockwik Kommuniké från Stockwiks årsstämma 2024 *
240515 Stockwik 

In [None]:
# Scrape the AcouSort and Carlsberg news listings.
# Only <div> elements whose class is one of the two listed names are parsed.
loader = WebBaseLoader(
    web_paths=[
        "https://acousort.com/investors/press-releases/",
        "https://www.carlsberggroup.com/news-room/latest-news/",
    ],
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            "div",
            attrs={"class": ["mfn-list", "news-list__result-records"]},
        ),
    },
    header_template={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    },
)

# Fetch both pages; one document per URL is expected.
docs = loader.load()

# Report how many documents came back.
print(f"✅ Loaded {len(docs)} documents")

✅ Loaded 2 documents


In [31]:
# Merge both scrape results into one list: web-loaded pages first, Stockwik last.
scrapped_info = [*docs, *stockwik_docs]

# Number of documents that will go through the scrap chain.
len(scrapped_info)

3

In [32]:
# Print every scraped document in full, with a divider line between documents.
# Looping over the list (instead of indexing 0, 1 and 2 by hand) keeps the cell
# working when the number of scraped sources changes.
divider = "===" * 10

for position, scraped_doc in enumerate(scrapped_info):
    if position:
        print(divider)
    print(scraped_doc.page_content)

2025


2025-03-26 09:45

AcouSort's CEO answers questions about the company's rights issue and gives his view on the future




In connection with AcouSort's announcement of its plan to carry out a rights issue, several shareholders have contacted the company with questions regarding...




2025-03-25 07:40

AcouSort announces the outcome of the exercise of warrants of series TO 2


Regulatory



The Board of Directors of AcouSort AB (publ) ("AcouSort" or the "Company") today announces the outcome of the exercise of warrants of series TO 2 (the...




2025-03-19 09:45

AcouSort in new collaboration to improve sepsis treatment




AcouSort’s business strategy relies on early development collaborations to integrate its separation technology in partnering companies' products and...




2025-03-12 12:30

AcouSort to explore South Korean opportunities at the KIMES exhibition




AcouSort has been invited by EU Business Hub Japan and Korea to participate in their Healthcare and Medical Equip

In [None]:
# Define output schema and parser for report extraction


# One extracted report entry. Every field is a plain string; each Field
# description below is embedded in the parser's format instructions (see the
# printed schema further down), so it is part of what the model is told.
# NOTE(review): documented with `#` comments only — verify whether a class
# docstring would be added to the generated schema before introducing one.
class ReportScraper(BaseModel):

    # Company the report belongs to.
    company: str = Field(description="Company name")

    # Title of the report as found in the scraped content.
    report_name: str = Field(description="report name")

    # Free-text label; the recorded runs returned "annual".
    report_type: str = Field(description="The quarterly or annual report")

    # No URL validation is applied; the recorded runs returned the listing page
    # URL, or the local HTML file name for the Stockwik document.
    link: str = Field(description="report link")

    # No date format is enforced; the recorded runs contain both "2025-02-14"
    # and "06/02/2025".
    date: str = Field(description="report release date")


# PydanticOutputParser
# The parser targets a single ReportScraper, so each chain call yields one entry.
parser = PydanticOutputParser(pydantic_object=ReportScraper)

In [34]:
# Define collection schema for multiple report entries
# Container model holding the per-document ReportScraper results in one object;
# it is populated from the list of chain responses further down.
class ReportScraperCollection(BaseModel):

    # Parsed report entries, in the order they were produced.
    reports: List[ReportScraper]

In [35]:
# Check the format instructions the parser generates from the ReportScraper
# schema; this is the text later injected into the scrap prompt.
print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"company": {"description": "Company name", "title": "Company", "type": "string"}, "report_name": {"description": "report name", "title": "Report Name", "type": "string"}, "report_type": {"description": "The quarterly or annual report", "title": "Report Type", "type": "string"}, "link": {"description": "report link", "title": "Link", "type": "string"}, "date": {"description": "report release date", "title": "Date", "type": "string"}}, "required": ["company", "report_name", "report_type", "link", "date"]}
```


In [36]:
# Read the scrap-chain prompt from disk and pre-fill its `format` variable with
# the parser's format instructions, leaving only the run-time inputs open.
scrap_prompt = load_prompt(SCRAP_PROMPT_PATH, encoding="utf-8").partial(
    format=parser.get_format_instructions()
)

# Show the resulting template.
scrap_prompt

PromptTemplate(input_variables=['question', 'scrapped_content'], input_types={}, partial_variables={'format': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"company": {"description": "Company name", "title": "Company", "type": "string"}, "report_name": {"description": "report name", "title": "Report Name", "type": "string"}, "report_type": {"description": "The quarterly or annual report", "title": "Report Type", "type": "string"}, "link": {"description": "report link", "title": "Link", "type": "string"}, "date": {"description": "report release date", "title": "Date

In [37]:
# Build scrap chain: prompt → GPT-4o → structured parser.
# The model is selected with the `model=` keyword, matching how the report
# chain further down constructs its ChatOpenAI instance.
scrap_chain = scrap_prompt | ChatOpenAI(model="gpt-4o", temperature=0) | parser

In [None]:
# Send each scraped document through the scrap chain and keep the parsed results.
responses = []

for doc in scrapped_info:
    # The whole document object is passed (not only its text), so its metadata
    # is formatted into the prompt as well.
    chain_input = {
        "scrapped_content": doc,
        "question": "Please only extract key information.",
    }
    response = scrap_chain.invoke(chain_input)

    pprint(response)
    responses.append(response)

ReportScraper(company='AcouSort AB', report_name='Year end report for AcouSort AB 1 January – 31 December 2024', report_type='annual', link='https://acousort.com/investors/press-releases/', date='2025-02-14')
ReportScraper(company='Carlsberg Group', report_name='FY 2024 Financial Statement', report_type='annual', link='https://www.carlsberggroup.com/news-room/latest-news/', date='06/02/2025')
ReportScraper(company='Stockwik', report_name='Stockwik publishes year end report 4Q24 January-December', report_type='annual', link='stockwik_rendered.html', date='2025-02-14')


In [39]:
# Integrate into a single object
# Wraps the list of per-document ReportScraper results in the collection model.
collection = ReportScraperCollection(reports=responses)

In [40]:
# Pretty-print the collection as nested Python dicts.
pprint(collection.model_dump(), indent=2, width=120)

{ 'reports': [ { 'company': 'AcouSort AB',
                 'date': '2025-02-14',
                 'link': 'https://acousort.com/investors/press-releases/',
                 'report_name': 'Year end report for AcouSort AB 1 January – 31 December 2024',
                 'report_type': 'annual'},
               { 'company': 'Carlsberg Group',
                 'date': '06/02/2025',
                 'link': 'https://www.carlsberggroup.com/news-room/latest-news/',
                 'report_name': 'FY 2024 Financial Statement',
                 'report_type': 'annual'},
               { 'company': 'Stockwik',
                 'date': '2025-02-14',
                 'link': 'stockwik_rendered.html',
                 'report_name': 'Stockwik publishes year end report 4Q24 January-December',
                 'report_type': 'annual'}]}


In [41]:
# Same data as JSON text; ensure_ascii=False keeps non-ASCII characters
# (such as the "–" in report names) readable instead of escaping them.
print(json.dumps(collection.model_dump(), indent=2, ensure_ascii=False))

{
  "reports": [
    {
      "company": "AcouSort AB",
      "report_name": "Year end report for AcouSort AB 1 January – 31 December 2024",
      "report_type": "annual",
      "link": "https://acousort.com/investors/press-releases/",
      "date": "2025-02-14"
    },
    {
      "company": "Carlsberg Group",
      "report_name": "FY 2024 Financial Statement",
      "report_type": "annual",
      "link": "https://www.carlsberggroup.com/news-room/latest-news/",
      "date": "06/02/2025"
    },
    {
      "company": "Stockwik",
      "report_name": "Stockwik publishes year end report 4Q24 January-December",
      "report_type": "annual",
      "link": "stockwik_rendered.html",
      "date": "2025-02-14"
    }
  ]
}


# =============== Report Chain ===============

### SERP API

In [42]:
# Set Serpapi params
# NOTE(review): "gl" / "hl" are presumably the country and interface-language
# codes of the search (here "de" and "en"), and "num" the requested number of
# results, passed as a string — confirm against the SerpAPI documentation.
params = {"engine": "google", "gl": "de", "hl": "en", "num": "10"}

# Search client configured with the parameters above.
search = SerpAPIWrapper(params=params)

In [43]:
# Spot check: company name of the first extracted report.
collection.reports[0].company

'AcouSort AB'

In [44]:
# Spot check of the query format ("<company> <report name>") on the first report.
query = f"{collection.reports[0].company} {collection.reports[0].report_name}"
query

'AcouSort AB Year end report for AcouSort AB 1 January – 31 December 2024'

In [45]:
# Build one search query per extracted report: "<company> <report name>".
queries = [f"{report.company} {report.report_name}" for report in collection.reports]

# Check queries
for query in queries:
    print(query)

AcouSort AB Year end report for AcouSort AB 1 January – 31 December 2024
Carlsberg Group FY 2024 Financial Statement
Stockwik Stockwik publishes year end report 4Q24 January-December


In [46]:
# Query SerpAPI once per report and keep the results in the same order as
# `queries`, so positions line up with `collection.reports`.
search_results = []

for query in queries:
    print(f"🔍 Running SERP search for: {query}")
    search_results.append(search.run(query))

🔍 Running SERP search for: AcouSort AB Year end report for AcouSort AB 1 January – 31 December 2024
🔍 Running SERP search for: Carlsberg Group FY 2024 Financial Statement
🔍 Running SERP search for: Stockwik Stockwik publishes year end report 4Q24 January-December


In [47]:
# SERP search results: one entry per query, in query order.
pprint(search_results, indent=2, width=120)

[ '["AcouSort announces that the Company\'s Board of Directors has decided to bring forward the year-end report for '
  '2024 to Friday, February 14, instead of February ...", "On February 10, AcouSort announces that the Company\'s '
  'Board of Directors has decided to bring forward the year-end report for 2024 to Friday, February 14, ...", '
  "'ACOUSORT | Year-End Report January 1 – December 31, 2024. Summary of the year-end report. SIGNIFICANT EVENTS "
  'DURING THE FOURTH QUARTER. • On ...\', "On February 10, AcouSort announces that the Company\'s Board of Directors '
  'has decided to bring forward the year-end report for 2024 to Friday, ...", \'2025-02-14 06:00 Year end report for '
  'AcouSort AB 1 January – 31 December 2024 Regulatory. The “Company” or “AcouSort” refers to AcouSort AB (publ) with '
  "...', 'AcouSort AB: Year end report for AcouSort AB 1 January - 31 December 2024 ... AcouSort AB - Interim Report "
  "for the period July - September 2024 ...', '17.02. Year end

In [48]:
# Merge SERP search results into one string (one result per line)
# NOTE(review): `search_result_string` is only printed here; the report cell
# further down reads `search_results[i]` per company instead.
search_result_string = "\n".join(search_results)

print(search_result_string)

["AcouSort announces that the Company's Board of Directors has decided to bring forward the year-end report for 2024 to Friday, February 14, instead of February ...", "On February 10, AcouSort announces that the Company's Board of Directors has decided to bring forward the year-end report for 2024 to Friday, February 14, ...", 'ACOUSORT | Year-End Report January 1 – December 31, 2024. Summary of the year-end report. SIGNIFICANT EVENTS DURING THE FOURTH QUARTER. • On ...', "On February 10, AcouSort announces that the Company's Board of Directors has decided to bring forward the year-end report for 2024 to Friday, ...", '2025-02-14 06:00 Year end report for AcouSort AB 1 January – 31 December 2024 Regulatory. The “Company” or “AcouSort” refers to AcouSort AB (publ) with ...', 'AcouSort AB: Year end report for AcouSort AB 1 January - 31 December 2024 ... AcouSort AB - Interim Report for the period July - September 2024 ...', '17.02. Year end report for AcouSort AB 1 January – 31 December 

In [49]:
# Load prompt template for report generation
# The template's input variables (shown in the output) are company,
# report_name, report_type, release_date and additional_information.
report_prompt = load_prompt(REPORT_PROMPT_PATH, encoding="utf-8")
report_prompt

PromptTemplate(input_variables=['additional_information', 'company', 'release_date', 'report_name', 'report_type'], input_types={}, partial_variables={}, template='You are an expert at summarizing key information about companies using scraped data.  \nYour task is to write a report based on the provided scraped information.  \nPlease follow the specified format when creating your summary.\n\n\n\n#Information:\n- Company: {company}\n- Report name: {report_name}\n- Report type: {report_type}\n- Release date: {release_date}\n- Company brief information: {additional_information}\n\n#Format(in markdown format):\n1.Company:\n- (Comany name)\n\n2.Report info:\n- title: (Report name)\n- type: (Report type)\n- release date: (Release date)\n\n3.Additional info:\n- (Summary of Company information)\n\n#Answer:\n')

In [50]:
# Build the report chain: prompt → gpt-4-turbo → string output
# StrOutputParser makes the chain return the model's reply as a plain string.
report_chain = (
    report_prompt | ChatOpenAI(model="gpt-4-turbo", temperature=0) | StrOutputParser()
)

In [51]:
# Generate final summary reports for each company using the report chain and SERP results

# Reports and search results are paired by position. Fail fast, before any LLM
# call is made, if the two lists are out of step (for example after re-running
# only one of the earlier cells), rather than pairing a company with another
# company's search results or failing part-way through the loop.
if len(search_results) != len(collection.reports):
    raise ValueError(
        f"Expected one search result per report, got {len(search_results)} "
        f"search results for {len(collection.reports)} reports"
    )

report_responses = []

for report, search_result in zip(collection.reports, search_results):
    response = report_chain.invoke(
        {
            "company": report.company,
            "report_name": report.report_name,
            "report_type": report.report_type,
            "release_date": report.date,
            "additional_information": search_result,
        }
    )

    report_responses.append(response)
    print(f"\n📄 Report Summary for {report.company}:\n")
    print(response)


📄 Report Summary for AcouSort AB:

1. Company:
- AcouSort AB

2. Report info:
- title: Year end report for AcouSort AB 1 January – 31 December 2024
- type: annual
- release date: 2025-02-14

3. Additional info:
- AcouSort AB is an innovative medical technology company that specializes in developing critical components for use in diagnostics, analytics, and cell therapy instrumentation. The company's Board of Directors decided to reschedule the release of the 2024 year-end report to February 14, 2025, advancing it from a later unspecified date. This decision was announced on February 10, 2025.

📄 Report Summary for Carlsberg Group:

1. Company:
- Carlsberg Group

2. Report info:
- title: FY 2024 Financial Statement
- type: annual
- release date: 06/02/2025

3. Additional info:
- Carlsberg Group reported a revenue growth of 1.9% reaching DKK 75,011 million in FY 2024. The company achieved an organic revenue per hectoliter increase of 2%, with contributions from all regions. The organic 