### Starter code:

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Sample data
data = """
<title>Death Notice of Catherine (Cathy) Lyndon (née Lawlor) (Dublin 7, Dublin) | rip.ie</title>
<meta name="description" content="The death has occurred of Catherine (Cathy) Lyndon (née Lawlor) of Dublin 7, Dublin Ireland, on 21/09/2023. You can view the full death notice and add your condolences here."/>
<meta property="og:title" content="Death Notice of Catherine (Cathy) Lyndon (née Lawlor) (Dublin 7, Dublin) | rip.ie"/>
<meta property="og:description" content="The death has occurred of Catherine (Cathy) Lyndon (née Lawlor) of Dublin 7, Dublin Ireland, on 21/09/2023. You can view the full death notice and add your condolences here."/>
<meta property="og:image" content="https://img-dedicated.rip.ie/w188/Cathy_Lyndon_24e22a9de72b03e77fb49fddaca282b636db7c9b26cb0255.jpeg"/>
<meta property="og:image:height" content="282"/>
<meta property="og:image:width" content="188"/>
<meta property="og:type" content="website"/>
<meta name="twitter:card" content="summary_large_image"/>
<link rel="preload" as="image" imageSrcSet="https://img-dedicated.rip.ie/assets/rip-logo.svg 1x, https://img-dedicated.rip.ie/assets/rip-logo.svg 2x"/>
<link rel="preload" as="image" imageSrcSet="https://img-dedicated.rip.ie/assets/rip-logo-small.svg 1x, https://img-dedicated.rip.ie/assets/rip-logo-small.svg 2x"/>
<meta name="next-head-count" content="13"/>
<meta name="application-name" content="RIP.ie"/>
<meta name="apple-mobile-web-app-capable" content="yes"/>
<meta name="apple-mobile-web-app-status-bar-style" content="default"/>
<meta name="apple-mobile-web-app-title" content="RIP.ie"/>
<meta name="format-detection" content="telephone=no"/>
<meta name="referrer" content="always"/>
<meta name="mobile-web-app-capable" content="yes"/>
<meta name="theme-color" content="#d2ae38"/>
<link rel="apple-touch-icon" href="https://img-dedicated.rip.ie/assets/icon-180x180.png"/>
<link rel="apple-touch-icon" sizes="152x152" href="https://img-dedicated.rip.ie/assets/icon-152x152.jpg"/>
<link rel="apple-touch-icon" sizes="180x180" href="https://img-dedicated.rip.ie/assets/icon-180x180.jpg"/>
<link rel="apple-touch-icon" sizes="167x167" href="https://img-dedicated.rip.ie/assets/icon-167x167.jpg"/>
"""

In [3]:
soup = BeautifulSoup(data, 'html.parser')

title = soup.find('title').text
pattern = r"Death Notice of ([A-Za-z]+) \(.+?\) ([A-Za-z]+) \(.+?\) \(([^)]+)\) | rip.ie"
match = re.search(pattern, title)

if match:
    first_name = match.group(1)
    last_name = match.group(2)
    address = match.group(3)

    print("First Name:", first_name)
    print("Last Name:", last_name)
    print("Address:", address)
else:
    print("No match found.")

url_meta = soup.find('meta', property='og:url')
if url_meta:
    url = url_meta.get('content')
else:
    url = "URL not found"

First Name: Catherine
Last Name: Lyndon
Address: Dublin 7, Dublin


### Page py page scraping:

In [4]:
import pandas as pd

In [5]:
def scrape(entry_url):
    response = requests.get(entry_url)
    if response.status_code != 200:
        print(f"Failed to fetch data for URL: {entry_url}")
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [6]:
entry_urls = [
    "https://rip.ie/death-notice/s/all?page=1&start=2023-09-03+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes",
    "https://rip.ie/death-notice/s/all?page=2&start=2023-09-03+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes",
    "https://rip.ie/death-notice/s/all?page=3&start=2023-09-03+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes"
    # Add more entry URLs here
]

In [7]:
def get_table(soup):
    data_list = []
    
    parent_div = soup.find('div', class_="sc-hHLeRK fnSgVj rdt_TableBody", role='rowgroup')
    
    if parent_div:
        for row_div in parent_div.find_all('div', class_='sc-jqUVSM hjBMSB rdt_TableRow', role="row"):
            cleaned_text = row_div.text.replace('\xa0', '')
            data = cleaned_text.strip()
            pattern = r'([A-Z][a-z]*)([A-Z][a-z]*)*([A-Z][a-z]*)([A-Z][a-z]*)*(\d{2}/\d{2}/\d{2})'
            match = re.search(pattern, data)
            
            if match:
                first_name = match.group(1)
                #middle_name = match.group()
                #last_name = match.group()
                location = match.group(2)
                city = match.group(3)
                published_date = match.group(5)
                
                #print(match.group(0))
                data_list.append({
                    "First Name": first_name,
                    #"Middle Name": middle_name,
                    #"Last Name": last_name,
                    "Location": location,
                    "City": city,
                    "Date": published_date
                })

    df = pd.DataFrame(data_list)
    return df

In [8]:
for url in entry_urls:
    result_df = get_table(scrape(url))
result_df

Unnamed: 0,First Name,Location,City,Date
0,Knocknaheeny,,Cork,09/10/23
1,Wilhelm,,Carlow,09/10/23
2,Wilhelm,Castledermot,Kildare,09/10/23
3,Marie,Balally,Dublin,09/10/23
4,Denis,Inchigeela,Cork,09/10/23
5,Anne,Castlebar,Mayo,09/10/23
6,Angela,Finglas,Dublin,09/10/23
7,Road,,Dublin,09/10/23
8,Tommy,Ballyjamesduff,Cavan,09/10/23
9,Marie,Newbridge,Kildare,09/10/23


Issues:

In [9]:
html_data = """
<div class="mx-n-md-4"><div class="relative"><div class="custom-datatable"><div class="sc-dmRaPn embYUS"><div class="sc-fLlhyt gcPjVa"><div class="sc-bczRLJ jNMKxq rdt_Table" role="table"><div class="sc-gsnTZi bZtlLv rdt_TableHead" role="rowgroup"><div class="sc-dkzDqf eOtTIX rdt_TableHeadRow" role="row"><div data-column-id="1" class="sc-hKMtZM sc-eCYdqJ sc-iqcoie jYPiPR hfWhjw qVZpv rdt_TableCol"><div data-column-id="1" data-sort-id="1" role="columnheader" tabindex="0" class="sc-crXcEl jNENvb rdt_TableCol_Sortable"><div data-column-id="1" class="sc-evZas dFsTSm">Name</div><span class="asc __rdt_custom_sort_icon__"></span></div></div><div data-column-id="2" class="sc-hKMtZM sc-eCYdqJ sc-iqcoie jYPiPR gHBJQB qVZpv rdt_TableCol"><div data-column-id="2" data-sort-id="2" role="columnheader" tabindex="0" class="sc-crXcEl jNENvb rdt_TableCol_Sortable"><div data-column-id="2" class="sc-evZas dFsTSm">Town</div><span class="asc __rdt_custom_sort_icon__"></span></div></div><div data-column-id="3" class="sc-hKMtZM sc-eCYdqJ sc-iqcoie jYPiPR gfYsxk qVZpv rdt_TableCol"><div data-column-id="3" data-sort-id="3" role="columnheader" tabindex="0" class="sc-crXcEl jNENvb rdt_TableCol_Sortable"><div data-column-id="3" class="sc-evZas dFsTSm">County</div><span class="asc __rdt_custom_sort_icon__"></span></div></div><div data-column-id="4" class="sc-hKMtZM sc-eCYdqJ sc-iqcoie jYPiPR iPQAkn qVZpv rdt_TableCol" width="95px"><div data-column-id="4" data-sort-id="4" role="columnheader" tabindex="0" class="sc-crXcEl jNENvb rdt_TableCol_Sortable"><div data-column-id="4" class="sc-evZas dFsTSm">Published</div><span class="asc __rdt_custom_sort_icon__"></span></div></div></div></div><div class="sc-hHLeRK fnSgVj rdt_TableBody" role="rowgroup"><div id="row-0" role="row" class="sc-jqUVSM hjBMSB rdt_TableRow"><div id="cell-1-undefined" data-column-id="1" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo hfWhjw dNvvkQ rdt_TableCell" data-tag="allowRowEvents"><a href="/death-notice/oliver-ollie-clery-waterford-waterford-city-532166" data-cy="dn-link" target="" style="height:100%;width:100%" class="flex align-items-center showdown-dn-link justify-content-between"><div class="font-13 font-md-12 flex align-items-center"><div class="flex-inline flex-wrap name-cell"><b class="">CLERY</b><span>,<!-- --> </span>Oliver (ollie)</div></div><div class="flex align-items-center pl-1"></div></a></div><div id="cell-2-undefined" data-column-id="2" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo gHBJQB dNvvkQ rdt_TableCell" data-tag="allowRowEvents"><div class="font-13 font-md-12 font-weight-500 w-100 word-break">Waterford City</div></div><div id="cell-3-undefined" data-column-id="3" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo gfYsxk dNvvkQ rdt_TableCell" data-tag="allowRowEvents"><div class="font-13 font-md-12 font-weight-500 w-100 word-break">Waterford</div></div><div id="cell-4-undefined" data-column-id="4" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo iPQAkn dNvvkQ rdt_TableCell" data-tag="allowRowEvents" width="95px"><div class="font-13 font-md-12 font-weight-500">09/10/23</div></div></div><div id="row-1" role="row" class="sc-jqUVSM hjBMSB rdt_TableRow"><div id="cell-1-undefined" data-column-id="1" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo hfWhjw dNvvkQ rdt_TableCell" data-tag="allowRowEvents"><a href="/death-notice/paddy-como-connolly-wexford-enniscorthy-532165" data-cy="dn-link" target="" style="height:100%;width:100%" class="flex align-items-center showdown-dn-link justify-content-between"><div class="font-13 font-md-12 flex align-items-center"><div class="flex-inline flex-wrap name-cell"><b class="">CONNOLLY</b><span>,<!-- --> </span>Paddy (Como)</div></div><div class="flex align-items-center pl-1"></div></a></div><div id="cell-2-undefined" data-column-id="2" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo gHBJQB dNvvkQ rdt_TableCell" data-tag="allowRowEvents"><div class="font-13 font-md-12 font-weight-500 w-100 word-break">Enniscorthy</div></div><div id="cell-3-undefined" data-column-id="3" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo gfYsxk dNvvkQ rdt_TableCell" data-tag="allowRowEvents"><div class="font-13 font-md-12 font-weight-500 w-100 word-break">Wexford</div></div><div id="cell-4-undefined" data-column-id="4" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo iPQAkn dNvvkQ rdt_TableCell" data-tag="allowRowEvents" width="95px"><div class="font-13 font-md-12 font-weight-500">09/10/23</div></div></div><div id="row-2" role="row" class="sc-jqUVSM hjBMSB rdt_TableRow"><div id="cell-1-undefined" data-column-id="1" role="gridcell" class="sc-hKMtZM sc-eCYdqJ sc-jSMfEi cLRkKo hfWhjw dNvvkQ rdt_TableCell" data-tag="allowRowEvents"><a href="/death-notice/dolly-lennon-kilkenny-dunbell-532164" data-cy="dn-link" target="" style="height:100%;width:100%" class="flex align-items-center showdown-dn-link justify-content-between"><div class="font-13 font-md-12 flex align-items-center"><div class="flex-inline flex-wrap name-cell">
"""

In [10]:
soup = BeautifulSoup(html_data, 'html.parser')

# Find all anchor tags with the specific class
anchors = soup.find_all('a', class_='showdown-dn-link')

# Extract the 'href' attribute from each anchor
urls = [anchor['href'] for anchor in anchors]

# Print the extracted URLs
for url in urls:
    print("https://rip.ie" + url)

https://rip.ie/death-notice/oliver-ollie-clery-waterford-waterford-city-532166
https://rip.ie/death-notice/paddy-como-connolly-wexford-enniscorthy-532165
https://rip.ie/death-notice/dolly-lennon-kilkenny-dunbell-532164


### Notes:

In [11]:
def scrape_entry_data(entry_url):
    
    response = requests.get(entry_url)
    if response.status_code != 200:
        print(f"Failed to fetch data for URL: {entry_url}")
        return
    
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    parent_div = soup.find('div', class_="sc-hHLeRK fnSgVj rdt_TableBody", role='rowgroup')
    
    if parent_div:
        for row_div in parent_div.find_all('div', class_='sc-jqUVSM hjBMSB rdt_TableRow', role="row"):
            cleaned_text = row_div.text.replace('\xa0', '')
            data = cleaned_text.strip().split(",")
            print(data)
    else:
        print("Parent div not found.")
        
    print("---")

In [12]:
for url in entry_urls:
    scrape_entry_data(url)

['Buckley', 'Derry (Jeremiah)KnocknaheenyCork09/10/23']
['Seitz', 'Kilian WilhelmCarlow09/10/23']
['Seitz', 'Kilian WilhelmCastledermotKildare09/10/23']
['HITCHCOCK(née Seery)', 'MarieBalallyDublin09/10/23']
['MANNING', 'DenisInchigeelaCork09/10/23']
['Ward(née Martin)', 'AnneCastlebarMayo09/10/23']
['Keogh(née Gibbons)', 'AngelaFinglasDublin09/10/23']
['Connery', 'MichaelSouth Circular RoadDublin09/10/23']
['Kerrigan', 'TommyBallyjamesduffCavan09/10/23']
['McNALLY(née McHugh)', 'MarieNewbridgeKildare09/10/23']
['McNALLY(née McHugh)', 'MarieLeitrim09/10/23']
['Standley', 'KenKilmallockLimerick09/10/23']
['McKenna(née Hoey)', 'EileenCollonLouth09/10/23']
['McKenna(née Hoey)', 'EileenKnockbridgeLouth09/10/23']
["O'Neill", 'William (Bill)AdareLimerick09/10/23']
['HARKNESS(née Moore)', 'Philomena (Phil)KilmainhamDublin09/10/23']
['Slattery', 'JohnFreshfordKilkenny09/10/23']
['Slattery', 'JohnGathabawnKilkenny09/10/23']
['Moore', 'SeanAthloneWestmeath09/10/23']
['Dooley(née Barry)', 'Rita (