In [1]:
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd
import os

In [2]:
def re_url(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely. Defaults to 30.

    Returns
    -------
    str or None
        ``response.text`` when the status code is 200. ``None`` for any other
        status code or when the request raises; in both cases a message is
        printed instead of an exception being propagated, so callers must
        check for ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text

        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a long scrape.
        print(f"Error while processing {url}: {str(e)}")
        return None

In [3]:
def dates_between(start_date, end_date, max_pages=None):
    """Collect the listing-page URLs that can be fetched for a date range.

    Pages are requested one after another, starting at page 1, and each URL
    whose fetch succeeds is recorded. Iteration stops at the first page for
    which ``re_url`` returns a falsy value (``None`` or an empty body).

    Parameters
    ----------
    start_date, end_date : str
        Values substituted verbatim into the ``start`` and ``end`` query
        parameters of the listing URL.
    max_pages : int or None, optional
        Upper bound on the number of pages to probe. ``None`` (the default)
        means no bound.

    Returns
    -------
    list of str
        The listing-page URLs, in page order.

    Notes
    -----
    NOTE(review): termination relies on ``re_url`` eventually failing. If the
    site answers out-of-range pages with status 200, pass ``max_pages`` to
    keep the loop finite.
    """
    base_url = "https://rip.ie/death-notice/s/all?page={}&start={}%2B00%3A00%3A00&end={}&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes"
    entry_urls = []
    p = 1

    while max_pages is None or p <= max_pages:
        page_url = base_url.format(p, start_date, end_date)
        # Fetch inside the loop: the previous version requested page 1 once
        # and then looped on that same response forever.
        if not re_url(page_url):
            break
        entry_urls.append(page_url)
        p += 1

    return entry_urls

In [4]:
def scrape_html(html):
    """Parse ``html`` with BeautifulSoup's ``html.parser`` backend.

    Parameters
    ----------
    html : str
        Markup to parse.

    Returns
    -------
    BeautifulSoup
        The parsed document tree.
    """
    return BeautifulSoup(html, features='html.parser')

In [5]:
def get_url(soup):
    """Return absolute URLs for every ``showdown-dn-link`` anchor in ``soup``.

    Each anchor's ``href`` is prefixed with ``https://rip.ie``; the result
    keeps the order in which ``find_all`` yields the anchors.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to search.

    Returns
    -------
    list of str
        One absolute URL per matching anchor (empty when none match).
    """
    return [
        "https://rip.ie" + link['href']
        for link in soup.find_all('a', class_='showdown-dn-link')
    ]

In [6]:
def get_name(soup):
    """Return the text of the notice's name heading, or ``None`` if absent.

    The heading is the first ``<h1>`` carrying the CSS class
    ``DeathNotice_person-name__lkvex``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed death-notice page.

    Returns
    -------
    str or None
        ``get_text()`` of the heading, or ``None`` when no heading is found.
    """
    heading = soup.find('h1', class_='DeathNotice_person-name__lkvex')
    if not heading:
        return None
    return heading.get_text()

In [7]:
def get_individual_date(soup):
    """Read the published and death date texts from a death-notice page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed death-notice page.

    Returns
    -------
    dict
        Keys ``"Published Date"`` and ``"Death Date"`` (in that order). Each
        value is the ``get_text()`` of the matching ``<p>`` element, or
        ``None`` when that element is not found.
    """
    # (output key, CSS class of the <p> element holding that date)
    css_class_by_label = (
        ("Published Date", 'DeathNotice_dates-published-date__M0A_i'),
        ("Death Date", 'DeathNotice_dates-death-date__bR7g_'),
    )

    dates = {}
    for label, css_class in css_class_by_label:
        node = soup.find('p', class_=css_class)
        dates[label] = node.get_text() if node else None
    return dates

In [8]:
def get_obituary(soup):
    """Extract and tidy the obituary text of a death-notice page.

    The text is taken from the first ``<div>`` whose class attribute is
    ``DeathNotice_description__sY_tC word-break``. It is stripped, a few
    mis-decoded character sequences are repaired, the standalone word
    ``nee`` is normalised to ``née`` and newlines become spaces.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed death-notice page.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when the description element is absent.
    """
    description = soup.find('div', class_='DeathNotice_description__sY_tC word-break')
    if not description:
        return None

    text = description.get_text().strip()
    text = text.replace("Â", "")
    # Match "nee" only as a whole word. A plain substring replace also
    # rewrote ordinary words such as "needed" or "engineer". The match stays
    # case-sensitive, exactly as before.
    text = re.sub(r"\bnee\b", "née", text)
    text = text.replace("nÃ©e", "née")
    text = text.replace("\n", " ")
    # NOTE(review): this also replaces any genuine "â" in the text; the
    # literal is kept as-is because the exact mis-decoded sequence cannot be
    # confirmed from here.
    text = text.replace("â", "’")
    return text

In [9]:
def get_loc(url_list):
    """Extract city and location slugs from death-notice URLs.

    Each URL is matched against the notice URL pattern. URLs that do not
    match are skipped.

    Parameters
    ----------
    url_list : iterable of str
        Death-notice URLs.

    Returns
    -------
    dict
        Maps the trailing numeric ID captured by the pattern (as a string) to
        ``{"City": <group 2>, "Location": <group 3>}``. If two URLs share an
        ID, the later one wins.
    """
    notice_url = re.compile(
        r'https://rip.ie/death-notice/([^/]+)-([^/]+)-([^/]+)(?:-([^/]+))?-(\d+)'
    )

    # Lazily match every URL; unmatched ones produce None and are filtered out.
    matches = (notice_url.match(candidate) for candidate in url_list)
    return {
        found.group(5): {"City": found.group(2), "Location": found.group(3)}
        for found in matches
        if found
    }

In [10]:
def get_page_data(page_data):
    """Scrape every death notice linked from one listing page.

    Parameters
    ----------
    page_data : str or None
        HTML of a listing page. A falsy value yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        One row per notice that could be downloaded, with the columns
        ``Name``, ``ID``, ``City``, ``Location``, ``Published Date``,
        ``Death Date`` and ``Obituary Text``. Notices whose page could not be
        fetched are skipped. ``City`` and ``Location`` are ``None`` when
        ``get_loc`` has no entry for the notice's ID.
    """
    data_list = []
    if page_data:
        url_list = get_url(scrape_html(page_data))
        # get_loc takes the URL list and keys its result by notice ID, so
        # parse all URLs once and look each notice up by ID below.
        loc_by_id = get_loc(url_list)

        for u in url_list:
            notice_html = re_url(u)
            if not notice_html:
                # re_url has already printed the failure; nothing to parse.
                continue

            s = scrape_html(notice_html)
            # The ID is whatever follows the last hyphen of the notice URL.
            url_id = u.split("-")[-1]
            loc = loc_by_id.get(url_id, {})
            dates = get_individual_date(s)

            data_list.append({
                "Name": get_name(s),
                "ID": url_id,
                "City": loc.get("City"),
                "Location": loc.get("Location"),
                "Published Date": dates["Published Date"],
                "Death Date": dates["Death Date"],
                "Obituary Text": get_obituary(s),
            })

    return pd.DataFrame(data_list)

In [None]:
# dates_between returns listing-page URLs, whereas get_page_data expects the
# HTML of a single listing page. Download each page, scrape it, and stack the
# per-page frames into one.
listing_urls = dates_between("today", "today")
page_frames = [get_page_data(re_url(listing_url)) for listing_url in listing_urls]
# pd.concat raises on an empty list, so fall back to an empty frame.
rip_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
rip_df

In [None]:
# Folder that receives the CSV export.
output_directory = '/Users/idilbilgic/Desktop/STAGE4.1/COMP30170/suicide_rates_IE/output_monthly'
# Write the scraped notices without the DataFrame index column.
output_csv_path = os.path.join(output_directory, 'rip_output_2006_07.csv')
rip_df.to_csv(output_csv_path, index=False)

In [None]:
#rip_df.to_csv('rip_output_2006.csv', index=False)