In [1]:
import os
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Date window to query and the listing-URL template it is substituted into
# (placeholders, in order: page number, start date, end date).
start_date = "2006-07-02"
end_date = "2006-07-12"
base_url = "https://rip.ie/death-notice/s/all?page={}&start={}%2B00%3A00%3A00&end={}&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes"
page_count = 10  # manually changed every time
# Disabled sanity check on the expected volume:
# rough_count_deaths = page_count*40
# print(str(rough_count_deaths) + " deaths between " + start_date + " and " + end_date)

# One listing URL per results page, for page numbers 1..page_count inclusive.
entry_urls = [
    base_url.format(page_number, start_date, end_date)
    for page_number in range(1, page_count + 1)
]

In [3]:
def re_url(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        The response text when the server answers with status code 200.
        ``None`` on any other status code or when the request raises; in both
        cases a message describing the failure is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text

        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None
    except Exception as e:
        # Deliberately broad: a single bad URL is reported and skipped rather
        # than aborting a long-running scrape.
        print(f"Error while processing {url}: {str(e)}")
        return None

In [4]:
def scrape_html(html):
    """Parse ``html`` with BeautifulSoup's 'html.parser' and return the resulting tree."""
    return BeautifulSoup(html, 'html.parser')

In [5]:
def get_url(soup):
    """Collect absolute notice URLs from a parsed listing page.

    Looks up every ``<a>`` element with class ``showdown-dn-link`` and resolves
    its ``href`` against ``https://rip.ie``.

    Args:
        soup: Parsed listing page exposing ``find_all``.

    Returns:
        List of absolute URLs, in document order. Anchors without an ``href``
        are skipped instead of raising ``KeyError``.
    """
    url_list = []
    for anchor in soup.find_all('a', class_='showdown-dn-link'):
        href = anchor.get('href')
        if not href:
            continue
        # urljoin leaves an already-absolute href untouched and inserts the
        # separating slash for a relative one, unlike plain concatenation.
        url_list.append(urljoin("https://rip.ie", href))
    return url_list

In [6]:
def get_individual_date(soup):
    """Read the published and death dates from a parsed notice page.

    Args:
        soup: Parsed notice page exposing ``find``.

    Returns:
        Dict with the keys "Published Date" and "Death Date" (in that order).
        Each value is the text of the matching ``<p>`` element, or ``None``
        when that element is not found.
    """
    # Output key paired with the CSS class of the <p> element that carries it.
    field_classes = (
        ("Published Date", 'DeathNotice_dates-published-date__M0A_i'),
        ("Death Date", 'DeathNotice_dates-death-date__bR7g_'),
    )

    date_list = {}
    for field, css_class in field_classes:
        element = soup.find('p', class_=css_class)
        date_list[field] = element.get_text() if element else None
    return date_list

In [7]:
def get_obituary(soup):
    """Extract and tidy the obituary text from a parsed notice page.

    Args:
        soup: Parsed notice page exposing ``find``.

    Returns:
        The cleaned text of the description ``<div>``, or ``None`` when that
        element is not found. Cleaning strips surrounding whitespace, drops
        stray "Â" characters, normalises "nee"/"nÃ©e" to "née", turns newlines
        into spaces and maps "â" to a right single quote.
    """
    description = soup.find('div', class_='DeathNotice_description__sY_tC word-break')
    if not description:
        return None

    text = description.get_text().strip()
    # Presumably left over from mis-decoded UTF-8 — TODO confirm against the raw pages.
    text = text.replace("Â", "")
    # Whole-word match only: a plain substring replace also rewrote words that
    # merely contain "nee" (e.g. "pioneer", "engineer", "need").
    text = re.sub(r"\bnee\b", "née", text)
    text = text.replace("nÃ©e", "née")
    text = text.replace("\n", " ")
    text = text.replace("â", "’")
    return text

In [8]:
def get_page_data(entry_urls):
    """Scrape every notice linked from the given listing pages into a DataFrame.

    For each listing URL the page is downloaded, the notice links are
    extracted, and each notice page is downloaded once to read its dates and
    obituary text. Name, city, location and ID are taken from the notice URL
    itself via a regular expression.

    Args:
        entry_urls: Iterable of listing-page URLs.

    Returns:
        pandas.DataFrame with the columns "Name", "ID", "City", "Location",
        "Published Date", "Death Date" and "Obituary Text", one row per notice
        that was both matched by the URL pattern and successfully downloaded.
        The columns are present even when no rows were collected.
    """
    columns = [
        "Name", "ID", "City", "Location",
        "Published Date", "Death Date", "Obituary Text",
    ]
    url_pattern = re.compile(
        r'https://rip.ie/death-notice/([^/]+)-([^/]+)-([^/]+)(?:-([^/]+))?-(\d+)'
    )

    data_list = []
    for entry_url in entry_urls:
        page_data = re_url(entry_url)
        if not page_data:
            # re_url has already reported the failure; move on to the next page.
            continue

        for u in get_url(scrape_html(page_data)):
            # Check the URL shape first so unusable links cost no download.
            m = url_pattern.match(u)
            if not m:
                print("Pattern did not match.")
                continue

            notice_html = re_url(u)
            if notice_html is None:
                # Parsing None would raise and abort the whole run; skip instead.
                print(f"Skipping {u}: notice page could not be retrieved.")
                continue

            # Download and parse each notice once and reuse the tree for both
            # extractions.
            notice_soup = scrape_html(notice_html)
            dates = get_individual_date(notice_soup)
            obituary_text = get_obituary(notice_soup)

            data_list.append({
                "Name": m.group(1),
                "ID": m.group(5),
                "City": m.group(2).capitalize(),
                "Location": m.group(3).capitalize(),
                "Published Date": dates["Published Date"],
                "Death Date": dates["Death Date"],
                "Obituary Text": obituary_text,
            })

    # Passing the columns keeps the schema stable when nothing was scraped.
    return pd.DataFrame(data_list, columns=columns)

In [None]:
# Run the scraper over every listing URL built in the configuration cell,
# then show the resulting table as this cell's output.
rip_df = get_page_data(entry_urls=entry_urls)
rip_df

In [None]:
# NOTE: absolute, machine-specific location — adjust before running elsewhere.
output_directory = '/Users/idilbilgic/Desktop/STAGE4.1/COMP30170/suicide_rates_IE/output_monthly'
# Create the folder when it is missing so the export cannot fail on a
# non-existent directory after the scrape has finished.
os.makedirs(output_directory, exist_ok=True)
rip_df.to_csv(os.path.join(output_directory, 'rip_output_2006_07.csv'), index=False)

In [None]:
# Disabled alternative: write the CSV to the current working directory instead.
#rip_df.to_csv('rip_output_2006.csv', index=False)