In [1]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from urllib.parse import urljoin
import pandas as pd
from firecrawl import FirecrawlApp
import os

# It's recommended to set the API key as an environment variable
# For this example, we'll use the provided key directly, but this is not best practice
FIRECRAWL_API_KEY = os.getenv('FIRECRAWL_API_KEY')
# Initialize FirecrawlApp
# It will automatically use the FIRECRAWL_API_KEY environment variable if set,
# otherwise you can pass it as an argument: FirecrawlApp(api_key="YOUR_API_KEY")
app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)


BASE_URL_DRUG_SAFETY = "https://www.gov.uk/drug-safety-update"

def scrape_detailed_alert_info_with_firecrawl(url):
    """
    Scrapes a detailed alert page using Firecrawl to get markdown content.
    """
    try:
        scraped_data = app.scrape_url(url, params={'pageOptions': {'onlyMainContent': True}})
        if scraped_data and 'markdown' in scraped_data:
            # The detailed title is usually the first line of the markdown
            lines = scraped_data['markdown'].split('\n')
            detailed_title = lines[0].strip('# ') if lines else ""
            detailed_content = scraped_data['markdown']
            return {'detailed_title': detailed_title, 'detailed_content': detailed_content}
        else:
            print(f"Firecrawl failed to return markdown for {url}")
            return {'detailed_title': None, 'detailed_content': None}
    except Exception as e:
        print(f"An error occurred while scraping with Firecrawl for {url}: {e}")
        return {'detailed_title': None, 'detailed_content': None}

def scrape_drug_safety_updates():
    """
    Scrapes the list of drug safety updates, including dates, and then each linked page for detailed info using Firecrawl.
    Returns a pandas DataFrame.
    """
    listing_data = []

    print(f"Scraping listing page with requests: {BASE_URL_DRUG_SAFETY}")
    response = requests.get(BASE_URL_DRUG_SAFETY)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    alerts = []
    for li in soup.select('ul.gem-c-document-list li'):
        link = li.find('a')
        # Find the date text directly within the list item
        # Only proceed if a link is found
        if link and isinstance(link, Tag): # Ensure link is a Tag object
            date_text = li.get_text(strip=True).replace(link.get_text(strip=True), '').strip()

            if date_text:
                title = link.get_text(strip=True)
                href_attr = link.get('href') # Use .get() to safely retrieve attribute
                if href_attr and isinstance(href_attr, str): # Ensure href_attr is a string
                    full_url = urljoin(BASE_URL_DRUG_SAFETY, href_attr)

                    # Assuming the date format is consistent, you might need to adjust this
                    # if the format varies.
                    published_date = pd.to_datetime(date_text, errors='coerce')

                    alerts.append({'publish_date': published_date, 'title': title, 'url': full_url})
    listing_data = alerts

    print(listing_data)
    df = pd.DataFrame(listing_data)
    return df



In [2]:
df = scrape_drug_safety_updates()

Scraping listing page with requests: https://www.gov.uk/drug-safety-update
[{'publish_date': NaT, 'title': 'Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK RSV vaccine): be alert to a small risk of Guillain-Barré syndrome following vaccination in older adults', 'url': 'https://www.gov.uk/drug-safety-update/abrysvov-pfizer-rsv-vaccine-and-arexvyv-gsk-rsv-vaccine-be-alert-to-a-small-risk-of-guillain-barre-syndrome-following-vaccination-in-older-adults'}, {'publish_date': NaT, 'title': 'IXCHIQ Chikungunya vaccine: temporary suspension in people aged 65 years or older', 'url': 'https://www.gov.uk/drug-safety-update/ixchiq-chikungunya-vaccine-temporary-suspension-in-people-aged-65-years-or-older'}, {'publish_date': NaT, 'title': 'Valproate (Belvo, Convulex, Depakote, Dyzantil, Epilim, Epilim Chrono or Chronosphere, Episenta, Epival, and Syonell▼): updated safety and educational materials to support patient discussion on reproductive risks', 'url': 'https://www.gov.uk/drug-safety-update/valpr

In [3]:
df

Unnamed: 0,publish_date,title,url
0,NaT,Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK...,https://www.gov.uk/drug-safety-update/abrysvov...
1,NaT,IXCHIQ Chikungunya vaccine: temporary suspensi...,https://www.gov.uk/drug-safety-update/ixchiq-c...
2,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...
3,NaT,Thiopurines and intrahepatic cholestasis of pr...,https://www.gov.uk/drug-safety-update/thiopuri...
4,NaT,"Kaftrio▼ (Ivacaftor, tezacaftor, elexacaftor):...",https://www.gov.uk/drug-safety-update/kaftriov...
5,NaT,Short-acting beta 2 agonists (SABA) (salbutamo...,https://www.gov.uk/drug-safety-update/short-ac...
6,NaT,Fezolinetant▼(Veoza): risk of liver injury; ne...,https://www.gov.uk/drug-safety-update/fezoline...
7,NaT,Prolonged-release opioids: Removal of indicati...,https://www.gov.uk/drug-safety-update/prolonge...
8,NaT,Letters and medicine recalls sent to healthcar...,https://www.gov.uk/drug-safety-update/letters-...
9,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...


In [4]:
from tqdm import tqdm
import time

app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))

detail = []
for index, row in tqdm(df.iterrows(),total=49):
    url = row['url']
    scrape_status = app.scrape_url(url, formats=['markdown'])
    markdown_text = scrape_status.markdown
    detail.append(markdown_text)
    time.sleep(10)
df['detail'] = detail

  2%|██▋                                                                                                                              | 1/49 [00:23<18:26, 23.05s/it]


KeyboardInterrupt: 

In [None]:
from firecrawl import FirecrawlApp, ScrapeOptions

In [8]:
df

Unnamed: 0,publish_date,title,url,detail
0,NaT,Abrysvo▼ (Pfizer RSV vaccine) and Arexvy▼ (GSK...,https://www.gov.uk/drug-safety-update/abrysvov...,## Cookies on GOV.UK\n\nWe use some essential ...
1,NaT,IXCHIQ Chikungunya vaccine: temporary suspensi...,https://www.gov.uk/drug-safety-update/ixchiq-c...,## Cookies on GOV.UK\n\nWe use some essential ...
2,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...,## Cookies on GOV.UK\n\nWe use some essential ...
3,NaT,Thiopurines and intrahepatic cholestasis of pr...,https://www.gov.uk/drug-safety-update/thiopuri...,## Cookies on GOV.UK\n\nWe use some essential ...
4,NaT,"Kaftrio▼ (Ivacaftor, tezacaftor, elexacaftor):...",https://www.gov.uk/drug-safety-update/kaftriov...,## Cookies on GOV.UK\n\nWe use some essential ...
5,NaT,Short-acting beta 2 agonists (SABA) (salbutamo...,https://www.gov.uk/drug-safety-update/short-ac...,## Cookies on GOV.UK\n\nWe use some essential ...
6,NaT,Fezolinetant▼(Veoza): risk of liver injury; ne...,https://www.gov.uk/drug-safety-update/fezoline...,## Cookies on GOV.UK\n\nWe use some essential ...
7,NaT,Prolonged-release opioids: Removal of indicati...,https://www.gov.uk/drug-safety-update/prolonge...,## Cookies on GOV.UK\n\nWe use some essential ...
8,NaT,Letters and medicine recalls sent to healthcar...,https://www.gov.uk/drug-safety-update/letters-...,## Cookies on GOV.UK\n\nWe use some essential ...
9,NaT,"Valproate (Belvo, Convulex, Depakote, Dyzantil...",https://www.gov.uk/drug-safety-update/valproat...,## Cookies on GOV.UK\n\nWe use some essential ...


In [11]:
df.to_csv('data.csv', index=False)