In [5]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from urllib.parse import urljoin
import pandas as pd
from firecrawl import FirecrawlApp
import os

# It's recommended to set the API key as an environment variable
# For this example, we'll use the provided key directly, but this is not best practice
FIRECRAWL_API_KEY = os.getenv('FIRECRAWL_API_KEY')
# Initialize FirecrawlApp
# It will automatically use the FIRECRAWL_API_KEY environment variable if set,
# otherwise you can pass it as an argument: FirecrawlApp(api_key="YOUR_API_KEY")
app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)

BASE_URL_DRUG_SAFETY = "https://www.gov.uk/drug-safety-update"

def scrape_drug_safety_updates():
    """
    Scrapes the list of drug safety updates, including dates, and then each linked page for detailed info using Firecrawl.
    Returns a pandas DataFrame.
    """
    listing_data = []

    print(f"Scraping listing page with requests: {BASE_URL_DRUG_SAFETY}")
    response = requests.get(BASE_URL_DRUG_SAFETY)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    alerts = []
    for li in soup.select('ul.gem-c-document-list li'):
        link = li.find('a')
        # Find the date text directly within the list item
        # Only proceed if a link is found
        if link and isinstance(link, Tag): # Ensure link is a Tag object
            date_text = li.get_text(strip=True).replace(link.get_text(strip=True), '').strip()

            if date_text:
                title = link.get_text(strip=True)
                href_attr = link.get('href') # Use .get() to safely retrieve attribute
                if href_attr and isinstance(href_attr, str): # Ensure href_attr is a string
                    full_url = urljoin(BASE_URL_DRUG_SAFETY, href_attr)

                    # Assuming the date format is consistent, you might need to adjust this
                    # if the format varies.
                    published_date = pd.to_datetime(date_text, errors='coerce')

                    alerts.append({'publish_date': published_date, 'title': title, 'url': full_url})
    listing_data = alerts

    print(listing_data)
    df = pd.DataFrame(listing_data)
    return df



In [4]:
df = scrape_drug_safety_updates()

Scraping listing page with requests: https://www.gov.uk/drug-safety-update
[{'publish_date': NaT, 'title': 'Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK RSV vaccine): be alert to a small risk of Guillain-Barré syndrome following vaccination in older adults', 'url': 'https://www.gov.uk/drug-safety-update/abrysvov-pfizer-rsv-vaccine-and-arexvyv-gsk-rsv-vaccine-be-alert-to-a-small-risk-of-guillain-barre-syndrome-following-vaccination-in-older-adults'}, {'publish_date': NaT, 'title': 'IXCHIQ Chikungunya vaccine: temporary suspension in people aged 65 years or older', 'url': 'https://www.gov.uk/drug-safety-update/ixchiq-chikungunya-vaccine-temporary-suspension-in-people-aged-65-years-or-older'}, {'publish_date': NaT, 'title': 'Valproate (Belvo, Convulex, Depakote, Dyzantil, Epilim, Epilim Chrono or Chronosphere, Episenta, Epival, and Syonell▼): updated safety and educational materials to support patient discussion on reproductive risks', 'url': 'https://www.gov.uk/drug-safety-update/valpr

In [5]:
df

Unnamed: 0,publish_date,title,url
0,NaT,Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK...,https://www.gov.uk/drug-safety-update/abrysvov...
1,NaT,IXCHIQ Chikungunya vaccine: temporary suspensi...,https://www.gov.uk/drug-safety-update/ixchiq-c...
2,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...
3,NaT,Thiopurines and intrahepatic cholestasis of pr...,https://www.gov.uk/drug-safety-update/thiopuri...
4,NaT,"Kaftrio▼ (Ivacaftor, tezacaftor, elexacaftor):...",https://www.gov.uk/drug-safety-update/kaftriov...
5,NaT,Short-acting beta 2 agonists (SABA) (salbutamo...,https://www.gov.uk/drug-safety-update/short-ac...
6,NaT,Fezolinetant▼(Veoza): risk of liver injury; ne...,https://www.gov.uk/drug-safety-update/fezoline...
7,NaT,Prolonged-release opioids: Removal of indicati...,https://www.gov.uk/drug-safety-update/prolonge...
8,NaT,Letters and medicine recalls sent to healthcar...,https://www.gov.uk/drug-safety-update/letters-...
9,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...


In [4]:
from firecrawl import FirecrawlApp, ScrapeOptions
from tqdm import tqdm
import time

app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))

detail = []
for index, row in tqdm(df.iterrows(),total=49):
    url = row['url']
    scrape_status = app.scrape_url(url, formats=['markdown'])
    markdown_text = scrape_status.markdown
    detail.append(markdown_text)
    time.sleep(10)
df['detail'] = detail

  2%|██▋                                                                                                                              | 1/49 [00:23<18:26, 23.05s/it]


KeyboardInterrupt: 

In [8]:
df

Unnamed: 0,publish_date,title,url,detail
0,NaT,Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK...,https://www.gov.uk/drug-safety-update/abrysvov...,## Cookies on GOV.UK\n\nWe use some essential ...
1,NaT,IXCHIQ Chikungunya vaccine: temporary suspensi...,https://www.gov.uk/drug-safety-update/ixchiq-c...,## Cookies on GOV.UK\n\nWe use some essential ...
2,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...,## Cookies on GOV.UK\n\nWe use some essential ...
3,NaT,Thiopurines and intrahepatic cholestasis of pr...,https://www.gov.uk/drug-safety-update/thiopuri...,## Cookies on GOV.UK\n\nWe use some essential ...
4,NaT,"Kaftrio▼ (Ivacaftor, tezacaftor, elexacaftor):...",https://www.gov.uk/drug-safety-update/kaftriov...,## Cookies on GOV.UK\n\nWe use some essential ...
5,NaT,Short-acting beta 2 agonists (SABA) (salbutamo...,https://www.gov.uk/drug-safety-update/short-ac...,## Cookies on GOV.UK\n\nWe use some essential ...
6,NaT,Fezolinetant▼(Veoza): risk of liver injury; ne...,https://www.gov.uk/drug-safety-update/fezoline...,## Cookies on GOV.UK\n\nWe use some essential ...
7,NaT,Prolonged-release opioids: Removal of indicati...,https://www.gov.uk/drug-safety-update/prolonge...,## Cookies on GOV.UK\n\nWe use some essential ...
8,NaT,Letters and medicine recalls sent to healthcar...,https://www.gov.uk/drug-safety-update/letters-...,## Cookies on GOV.UK\n\nWe use some essential ...
9,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...,## Cookies on GOV.UK\n\nWe use some essential ...


In [11]:
df.to_csv('data.csv', index=False)

In [2]:
!pip install firecrawl

Collecting firecrawl
  Downloading firecrawl-2.15.0-py3-none-any.whl.metadata (7.2 kB)
Collecting python-dotenv (from firecrawl)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting websockets (from firecrawl)
  Using cached websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting pydantic (from firecrawl)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting aiohttp (from firecrawl)
  Downloading aiohttp-3.12.14-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.6 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp->firecrawl)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp->firecrawl)
  Downloading aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->firecrawl)
  Using cached frozenlist-1.7.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->firecrawl)
  Downloadi

In [18]:
from firecrawl import FirecrawlApp, JsonConfig
from pydantic import BaseModel, Field
from typing import Optional
import time

In [19]:
class ExtractSchema(BaseModel):
    document_download_url: Optional[str] = Field(
        None,
        description="The full URL for the 'Download Document' link on the page. It should end with .pdf"
    )
    therapeutic_area: Optional[str] = Field(
        None,
        description="List of therapeutic areas that the alert applies to for example: General Practice, Pharmacy, Surgery etc"
    )

def extract_url(url: str, api_key: str) -> dict:
    app = FirecrawlApp(api_key=api_key)
    
    json_config = JsonConfig(schema=ExtractSchema)
    
    result = app.scrape_url(
        url,
        formats=["json"],
        json_options=json_config,
        only_main_content=True,
        timeout=120000
    )
    
    return result.json

In [50]:
df = pd.read_csv('../data/data.csv')
df.head()

Unnamed: 0,publish_date,title,url,detail,alert_pdf,pdf,therapeutic
0,2025-07-07,Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK...,https://www.gov.uk/drug-safety-update/abrysvov...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"Anaesthesia and intensive care, General practi..."
1,2025-06-18,IXCHIQ Chikungunya vaccine: temporary suspensi...,https://www.gov.uk/drug-safety-update/ixchiq-c...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"Immunology and vaccination, Infection preventi..."
2,2025-06-10,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"General practice, Neurology, Obstetrics, gynae..."
3,2025-05-15,Thiopurines and intrahepatic cholestasis of pr...,https://www.gov.uk/drug-safety-update/thiopuri...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"Cancer, Dermatology, GI, hepatology and pancre..."
4,2025-05-07,"Kaftrio▼ (Ivacaftor, tezacaftor, elexacaftor):...",https://www.gov.uk/drug-safety-update/kaftriov...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"General practice, Pharmacy, Respiratory diseas..."


In [24]:
pdf = []
therapeutic = []
from tqdm import tqdm

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    url = row['url']
    output = extract_url(url, os.getenv('FIRECRAWL_API_KEY'))

    time.sleep(8)
    pdf.append(output['document_download_url'])
    therapeutic.append(output['therapeutic_area'])

df['pdf'] = pdf
df['therapeutic'] = therapeutic

 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 42/50 [08:49<01:40, 12.62s/it]


HTTPError: Failed to parse Firecrawl error response as JSON. Status code: 502

In [36]:
pdf


['https://assets.publishing.service.gov.uk/media/686bb9d3fe1a249e937cbd64/DSU_RSV_vaccine_-_final.pdf',
 'https://assets.publishing.service.gov.uk/media/6852c872ff16d05c5e6aa6bb/IXCHIQ_-_DRUG_SAFETY_UPDATE_-_FINAL.pdf',
 'https://assets.publishing.service.gov.uk/media/6846e91f4d039a010411f102/Valproate_RMM_DSU_-_Final.pdf',
 'https://assets.publishing.service.gov.uk/media/6825b71d85c0250741b014a1/DSU_Thiopurines_and_intrahepatic_cholestasis_of_pregnancy_-_FINAL.pdf',
 'https://assets.publishing.service.gov.uk/media/681a1db1fb59a222d4f17334/Kaftrio_DSU_FINAL_v2.pdf',
 'https://assets.publishing.service.gov.uk/media/680a0f63382965132de1aa69/SABA_DSU_FINAL_PDF.pdf',
 'https://assets.publishing.service.gov.uk/media/67f79c8049d2ffceecef2fee/Veoza_DSU_PDF_Final.pdf',
 'https://assets.publishing.service.gov.uk/media/67d164ca0c569e0d48fb0a0a/Final_Prolonged-release_opioids_DSU.pdf',
 'https://assets.publishing.service.gov.uk/media/67bde2e65d6b30e896ac6d35/Veoza_DHPC.pdf',
 'https://assets.publ

In [35]:
pdf.append(None)

In [37]:
df['pdf'] = pdf

In [38]:
df.head()

Unnamed: 0,publish_date,title,url,detail,alert_pdf,pdf
0,2025-07-07,Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK...,https://www.gov.uk/drug-safety-update/abrysvov...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...
1,2025-06-18,IXCHIQ Chikungunya vaccine: temporary suspensi...,https://www.gov.uk/drug-safety-update/ixchiq-c...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...
2,2025-06-10,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...
3,2025-05-15,Thiopurines and intrahepatic cholestasis of pr...,https://www.gov.uk/drug-safety-update/thiopuri...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...
4,2025-05-07,"Kaftrio▼ (Ivacaftor, tezacaftor, elexacaftor):...",https://www.gov.uk/drug-safety-update/kaftriov...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...


In [46]:
therapeutic.append(None)

In [47]:
df['therapeutic'] = therapeutic

In [48]:
df.head()

Unnamed: 0,publish_date,title,url,detail,alert_pdf,pdf,therapeutic
0,2025-07-07,Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK...,https://www.gov.uk/drug-safety-update/abrysvov...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"Anaesthesia and intensive care, General practi..."
1,2025-06-18,IXCHIQ Chikungunya vaccine: temporary suspensi...,https://www.gov.uk/drug-safety-update/ixchiq-c...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"Immunology and vaccination, Infection preventi..."
2,2025-06-10,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"General practice, Neurology, Obstetrics, gynae..."
3,2025-05-15,Thiopurines and intrahepatic cholestasis of pr...,https://www.gov.uk/drug-safety-update/thiopuri...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"Cancer, Dermatology, GI, hepatology and pancre..."
4,2025-05-07,"Kaftrio▼ (Ivacaftor, tezacaftor, elexacaftor):...",https://www.gov.uk/drug-safety-update/kaftriov...,## Cookies on GOV.UK\n\nWe use some essential ...,,https://assets.publishing.service.gov.uk/media...,"General practice, Pharmacy, Respiratory diseas..."


In [49]:
df.to_csv('../data/data.csv', index=False)