In [1]:
import requests
from bs4 import BeautifulSoup

# Site root; prepended to the site-relative hrefs found while crawling.
BASE_URL = "https://press.un.org"
# English landing page where the crawl starts.
SEED_URL = BASE_URL + "/en"
# href of the section link used to recognise a press-release page.
PRESS_RELATIVE_URL = "/en/press-release"

def is_press_release(soup):
    """Tell whether a parsed page is a press release.

    The page qualifies when it contains an ``<a>`` element with
    ``hreflang="en"`` whose ``href`` equals ``PRESS_RELATIVE_URL``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        bool: True if such an anchor exists, False otherwise.
    """
    return soup.find('a', hreflang='en', href=PRESS_RELATIVE_URL) is not None

def save_html_to_file(html, filename):
    """Write ``html`` to ``filename`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file at that path
    is overwritten.

    Args:
        html: Text to store.
        filename: Destination path.
    """
    with open(filename, mode='w', encoding='utf-8') as handle:
        handle.write(html)

def get_press_releases_with_crisis(seed_url, part_number, limit=10, timeout=30):
    """Breadth-first crawl collecting press releases that mention "crisis".

    Starting at ``seed_url``, pages are fetched in FIFO order. A page counts
    as a match when ``is_press_release`` accepts it and its text contains
    "crisis" (case-insensitive). Each match is saved with
    ``save_html_to_file`` as ``"{part_number}_{k}.txt"``, where ``k`` is the
    1-based position of the match. Only site-relative links (href starting
    with ``/``) are followed, resolved by prefixing ``BASE_URL``.

    Args:
        seed_url: URL where the crawl starts.
        part_number: Prefix used in the saved file names.
        limit: Stop once this many matching pages have been found.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list: URLs of the matching press releases, in discovery order.
    """
    # Function-scope import so this definition does not depend on an edit
    # to the cell's import lines.
    from collections import deque

    queue = deque([seed_url])
    # Every URL ever queued. Recording at enqueue time (rather than after a
    # successful fetch) guarantees each URL is requested at most once, even
    # when it fails or is linked from many pages.
    seen = {seed_url}
    press_releases = []

    while queue and len(press_releases) < limit:
        url = queue.popleft()

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Best-effort crawl: a network failure on one page should not
            # abort the whole run.
            continue
        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        if is_press_release(soup) and "crisis" in soup.get_text().lower():
            press_releases.append(url)
            save_html_to_file(response.text, f"{part_number}_{len(press_releases)}.txt")

        # Extract links for further crawling
        for link in soup.find_all('a', href=True):
            href = link['href']
            if not href.startswith('/'):
                continue
            full_link = BASE_URL + href
            if full_link not in seen:
                seen.add(full_link)
                queue.append(full_link)

    return press_releases

# Crawl from the seed page; with part_number=1 matches are saved as 1_<k>.txt.
press_releases = get_press_releases_with_crisis(SEED_URL, part_number=1, limit=10)
for release_url in press_releases:
    print(release_url)




https://press.un.org/en/2023/sgsm21967.doc.htm
https://press.un.org/en/2023/sgsm21947.doc.htm
https://press.un.org/en/2023/dsgsm1874.doc.htm
https://press.un.org/en/2023/sgsm21952.doc.htm
https://press.un.org/en/2023/sgsm21876.doc.htm
https://press.un.org/en/2023/sgsm21852.doc.htm
https://press.un.org/en/2023/sgsm21806.doc.htm
https://press.un.org/en/2023/dsgsm1848.doc.htm
https://press.un.org/en/2023/sgsm21765.doc.htm
https://press.un.org/en/2023/sgsm21767.doc.htm


In [2]:
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# Entry point of the crawl; also used below to keep the crawl inside the press room.
seed_url = 'https://www.europarl.europa.eu/news/en/press-room'
# Number of matching press releases to collect before stopping.
min_links = 10

# FIFO frontier of pages still to fetch.
urls = [seed_url]
# Every URL that has ever been queued, so nothing is queued twice.
seen = set(urls)
# URLs of the press releases that matched.
opened_press = set()

def save_html_to_file(html, filename):
    """Store ``html`` in ``filename`` as UTF-8 text, overwriting any existing file.

    Args:
        html: Text to write.
        filename: Path of the file to create or replace.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.write(html)

# Breadth-first crawl of the press room. Stops once `min_links` matching
# press releases have been saved or the frontier is exhausted.
press_count = 0
# Seconds to wait on a single request, so one stalled server cannot hang the crawl.
FETCH_TIMEOUT_SECONDS = 30

while len(urls) > 0 and press_count < min_links:
    curr_url = urls.pop(0)
    try:
        req = urllib.request.Request(curr_url, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT_SECONDS) as resp:
            webpage = resp.read()
        soup = BeautifulSoup(webpage, 'html.parser')
    except Exception as exc:
        # Best-effort crawl: skip pages that cannot be fetched or parsed.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working.
        print(f"Skipping {curr_url}: {exc}")
        continue

    # A plenary-session press release carries both category labels.
    # `string=` is the current name of the deprecated `text=` keyword.
    tag1 = soup.find('span', class_="ep_name", string='Plenary session')
    tag2 = soup.find('span', class_="ep_name", string='Press Releases')
    if tag1 and tag2 and 'crisis' in soup.get_text().lower():
        press_count += 1
        opened_press.add(curr_url)
        save_html_to_file(str(soup), f"2_{press_count}.txt")

    for a_tag in soup.find_all('a', href=True):
        # Relative hrefs must be resolved against the page they appear on,
        # not against the seed.
        child_url = urllib.parse.urljoin(curr_url, a_tag['href'])
        # Prefix test keeps the crawl inside the press room; a substring
        # test would also accept foreign URLs that merely embed the seed.
        if child_url not in seen and child_url.startswith(seed_url):
            seen.add(child_url)
            urls.append(child_url)

print("European Parliament press releases containing the word crisis")
for link in opened_press:
    print(link)


  tag1 = soup.find('span', class_="ep_name", text='Plenary session')
  tag2 = soup.find('span', class_="ep_name", text='Press Releases')


European Parliament press releases containing the word crisis
https://www.europarl.europa.eu/news/en/press-room/20230310IPR77232/minimum-income-schemes-increasing-support-accessibility-and-inclusion
https://www.europarl.europa.eu/news/en/press-room/20210422IPR02615/civil-protection-faster-eu-response-to-large-scale-emergencies
https://www.europarl.europa.eu/news/en/press-room/20221209IPR64426/eu-long-term-budget-needs-urgent-revision-to-cope-with-current-crises
https://www.europarl.europa.eu/news/en/press-room/20221209IPR64427/holodomor-parliament-recognises-soviet-starvation-of-ukrainians-as-genocide
https://www.europarl.europa.eu/news/en/press-room/20230707IPR02421/parliament-adopts-new-rules-to-boost-energy-savings
https://www.europarl.europa.eu/news/en/press-room/20210304IPR99207/parliament-gives-green-light-for-new-eu4health-programme
https://www.europarl.europa.eu/news/en/press-room/20230210IPR74806/green-deal-industrial-plan-securing-the-eu-s-clean-tech-leadership
https://www.eu