In [1]:
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd
import os

In [2]:
def re_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error while processing {url}: {str(e)}")
        return None

In [3]:
def scrape_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup

In [4]:
def get_url(soup):
    url_list = []
    anchors = soup.find_all('a', class_='showdown-dn-link')
    urls = [anchor['href'] for anchor in anchors]
    for url in urls:
        url_list.append("https://rip.ie" + url)
    return url_list

In [5]:
def get_ID(url):
    url_pattern = r'https://rip.ie/death-notice/([^/]+)-([^/]+)-([^/]+)(?:-([^/]+))?-(\d+)'
    m = re.match(url_pattern, url)
    if m:
        return m.group(5)  
    else:
        print("Pattern did not match.")

In [6]:
def get_name(soup):
    name_val = soup.find('h1', class_='DeathNotice_person-name__lkvex')
    if name_val:
        name = name_val.get_text()
        name = name.strip()
        name = name.replace("Â", "")
        name = name.replace("nee", "née")
        name = name.replace("nÃ©e", "née")
        name = name.replace("\n", " ")
        name = name.replace("â", "’")
        return name
    else:
        return None

In [7]:
def get_loc(soup):
    locations = {}
    loc_val = soup.find('div', class_='DeathNotice_tags-item__Fp1X4')
    if loc_val:
        raw_loc = loc_val.get_text()
        loc = [word.strip() for word in raw_loc.split(",") + raw_loc.split("/")]
        city_town = loc[-1]
        locations["City"] = city_town.split(",")[-1]
        locations["Town"] = city_town.split(",")[:-1]
        return locations
    else:
        return None

In [8]:
def get_dates(soup):
    date_list = {}

    published_date_value = soup.find('p', class_='DeathNotice_dates-published-date__M0A_i')
    if published_date_value:
        date_list["Published Date"] = published_date_value.get_text()
    else:
        date_list["Published Date"] = None

    death_date_value = soup.find('p', class_='DeathNotice_dates-death-date__bR7g_')
    if death_date_value:
        date_list["Death Date"] = death_date_value.get_text()
    else:
        date_list["Death Date"] = None

    return date_list

In [9]:
def get_obituary(soup):
    description = soup.find('div', class_='DeathNotice_description__sY_tC word-break')
    if description:
        text = description.get_text()
        text = text.strip()
        text = text.replace("Â", "")
        text = text.replace("nee", "née")
        text = text.replace("nÃ©e", "née")
        text = text.replace("\n", " ")
        text = text.replace("â", "’")
        text = text.replace("Å", "")
        text = text.replace("Ä", "")
        text = text.replace("", "")
        return text
    else:
        return None

In [10]:
def get_page_data(url_list):
    data_list = []
        
    for u in url_list:
        s = scrape_html(re_url(u))
        identity = get_ID(u)
        name = get_name(s)
        loc = get_loc(s)
        dates = get_dates(s)
        obituary_text = get_obituary(s)

        data_list.append({
            "Name": name,
            "ID": identity,
            "City": loc["City"],
            "Town": loc["Town"],
            "Published Date": dates["Published Date"],
            "Death Date": dates["Death Date"],
            "Obituary Text": obituary_text
        })

    df = pd.DataFrame(data_list)
    return df

In [16]:
def get_entry_urls(page_number):
    entry_urls = []
    for p in range(1, page_number+1):
        base_url = "https://rip.ie/death-notice/s/all?page=" + str(p) + "&start=today+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=ASC&view=list"
        entry_urls.append(base_url)
    return entry_urls

In [17]:
get_entry_urls(2)

['https://rip.ie/death-notice/s/all?page=1&start=today+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=ASC&view=list',
 'https://rip.ie/death-notice/s/all?page=2&start=today+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=ASC&view=list']

In [18]:
def get_url_list(entry_urls):
    for entry_url in entry_urls:
        page_data = re_url(entry_url)
        if page_data:
            url_list = get_url(scrape_html(page_data))
    return url_list

In [19]:
get_url_list(get_entry_urls(2))

['https://rip.ie/death-notice/denis-frawley-dublin-clonsilla-533645',
 'https://rip.ie/death-notice/ita-feely-dublin-terenure-533644',
 'https://rip.ie/death-notice/ita-feely-laois-abbeyleix-533644',
 'https://rip.ie/death-notice/anthony-tony-mcnamara-tipperary-carrick-on-suir-533643',
 'https://rip.ie/death-notice/myles-byrne-offaly-edenderry-533642',
 'https://rip.ie/death-notice/peter-pierre-kiernan-dublin-ringsend-533641',
 'https://rip.ie/death-notice/denis-bucke-limerick-feohanagh-533640',
 'https://rip.ie/death-notice/denis-bucke-limerick-ballagh-533640',
 'https://rip.ie/death-notice/john-cliffe-waterford-abbeyside-533639',
 'https://rip.ie/death-notice/michael-mike-carty-dublin-carpenterstown-533638',
 'https://rip.ie/death-notice/michael-mike-carty-galway-newbridge-533638',
 'https://rip.ie/death-notice/john-mccormack-westmeath-delvin-533637',
 'https://rip.ie/death-notice/catherine-kitty-daly-cork-charleville-533636',
 'https://rip.ie/death-notice/anthony-tony-odonovan-sligo

In [15]:
rip_df = get_page_data(get_url_list(get_entry_urls(2)))
rip_df

Unnamed: 0,Name,ID,City,Town,Published Date,Death Date,Obituary Text
0,Denis Frawley,533645,Dublin,[Clonsilla],Monday 23rd October 2023,Saturday 21st October 2023,"FRAWLEY, Denis (Clonsilla, Dublin 15 and forme..."
1,Ita Feely(née Deegan),533644,Laois,[Abbeyleix],Monday 23rd October 2023,Sunday 22nd October 2023,"FEELY (née Deegan), Ita, 22nd October 2023, Te..."
2,Ita Feely(née Deegan),533644,Laois,[Abbeyleix],Monday 23rd October 2023,Sunday 22nd October 2023,"FEELY (née Deegan), Ita, 22nd October 2023, Te..."
3,Anthony (Tony) McNamara,533643,Tipperary,"[Collins Park, Carrick-on-Suir]",Monday 23rd October 2023,Saturday 21st October 2023,The death has occurred of Anthony (Tony) McNam...
4,Myles BYRNE,533642,Offaly,"[Gilroy Avenue, Edenderry]",Monday 23rd October 2023,Thursday 19th October 2023,"Leicester, England & formerly Gilroy Avenue, E..."
5,Peter (Pierre) KIERNAN,533641,Dublin,[Ringsend],Monday 23rd October 2023,Thursday 19th October 2023,"KIERNAN (Ringsend, D4) â October 19th 2023 su..."
6,Denis Bucke,533640,Limerick,[Ballagh],Monday 23rd October 2023,Sunday 22nd October 2023,"Feohanagh, Co. Limerick and formerly of Dromde..."
7,Denis Bucke,533640,Limerick,[Ballagh],Monday 23rd October 2023,Sunday 22nd October 2023,"Feohanagh, Co. Limerick and formerly of Dromde..."
8,John Cliffe,533639,Waterford,"[Windsor and formerly of The Burgery, Abbeyside]",Monday 23rd October 2023,Monday 5th June 2023,"Late John Cliffe, Windsor, Berkshire and forme..."
9,Michael (Mike) CARTY,533638,Galway,[Newbridge],Monday 23rd October 2023,Saturday 21st October 2023,"CARTY, Michael (Mike) (Carpenterstown, Castlek..."
