### Starter code:

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Sample data
data = """
<title>Death Notice of Catherine (Cathy) Lyndon (née Lawlor) (Dublin 7, Dublin) | rip.ie</title>
<meta name="description" content="The death has occurred of Catherine (Cathy) Lyndon (née Lawlor) of Dublin 7, Dublin Ireland, on 21/09/2023. You can view the full death notice and add your condolences here."/>
<meta property="og:title" content="Death Notice of Catherine (Cathy) Lyndon (née Lawlor) (Dublin 7, Dublin) | rip.ie"/>
<meta property="og:description" content="The death has occurred of Catherine (Cathy) Lyndon (née Lawlor) of Dublin 7, Dublin Ireland, on 21/09/2023. You can view the full death notice and add your condolences here."/>
<meta property="og:image" content="https://img-dedicated.rip.ie/w188/Cathy_Lyndon_24e22a9de72b03e77fb49fddaca282b636db7c9b26cb0255.jpeg"/>
<meta property="og:image:height" content="282"/>
<meta property="og:image:width" content="188"/>
<meta property="og:type" content="website"/>
<meta name="twitter:card" content="summary_large_image"/>
<link rel="preload" as="image" imageSrcSet="https://img-dedicated.rip.ie/assets/rip-logo.svg 1x, https://img-dedicated.rip.ie/assets/rip-logo.svg 2x"/>
<link rel="preload" as="image" imageSrcSet="https://img-dedicated.rip.ie/assets/rip-logo-small.svg 1x, https://img-dedicated.rip.ie/assets/rip-logo-small.svg 2x"/>
<meta name="next-head-count" content="13"/>
<meta name="application-name" content="RIP.ie"/>
<meta name="apple-mobile-web-app-capable" content="yes"/>
<meta name="apple-mobile-web-app-status-bar-style" content="default"/>
<meta name="apple-mobile-web-app-title" content="RIP.ie"/>
<meta name="format-detection" content="telephone=no"/>
<meta name="referrer" content="always"/>
<meta name="mobile-web-app-capable" content="yes"/>
<meta name="theme-color" content="#d2ae38"/>
<link rel="apple-touch-icon" href="https://img-dedicated.rip.ie/assets/icon-180x180.png"/>
<link rel="apple-touch-icon" sizes="152x152" href="https://img-dedicated.rip.ie/assets/icon-152x152.jpg"/>
<link rel="apple-touch-icon" sizes="180x180" href="https://img-dedicated.rip.ie/assets/icon-180x180.jpg"/>
<link rel="apple-touch-icon" sizes="167x167" href="https://img-dedicated.rip.ie/assets/icon-167x167.jpg"/>
"""

In [3]:
soup = BeautifulSoup(data, 'html.parser')

title = soup.find('title').text
pattern = r"Death Notice of ([A-Za-z]+) \(.+?\) ([A-Za-z]+) \(.+?\) \(([^)]+)\) | rip.ie"
match = re.search(pattern, title)

if match:
    first_name = match.group(1)
    last_name = match.group(2)
    address = match.group(3)

    print("First Name:", first_name)
    print("Last Name:", last_name)
    print("Address:", address)
else:
    print("No match found.")

url_meta = soup.find('meta', property='og:url')
if url_meta:
    url = url_meta.get('content')
else:
    url = "URL not found"

First Name: Catherine
Last Name: Lyndon
Address: Dublin 7, Dublin


### Page py page scraping:

In [4]:
import pandas as pd

In [41]:
def scrape(entry_url):
    response = requests.get(entry_url)
    if response.status_code != 200:
        print(f"Failed to fetch data for URL: {entry_url}")
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [42]:
entry_urls = [
    "https://rip.ie/death-notice/s/all?page=1&start=2023-09-03+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes",
    "https://rip.ie/death-notice/s/all?page=2&start=2023-09-03+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes",
    "https://rip.ie/death-notice/s/all?page=3&start=2023-09-03+00%3A00%3A00&end=today&sortField=a.createdAtCastToDate&sortDir=DESC&view=boxes"
    # Add more entry URLs here
]

In [43]:
def get_table(soup):
    data_list = []
    
    parent_div = soup.find('div', class_="sc-hHLeRK fnSgVj rdt_TableBody", role='rowgroup')
    
    if parent_div:
        for row_div in parent_div.find_all('div', class_='sc-jqUVSM hjBMSB rdt_TableRow', role="row"):
            cleaned_text = row_div.text.replace('\xa0', '')
            data = cleaned_text.strip()
            pattern = r'([A-Z][a-z]*)([A-Z][a-z]*)*([A-Z][a-z]*)([A-Z][a-z]*)*(\d{2}/\d{2}/\d{2})'
            match = re.search(pattern, data)
            
            if match:
                first_name = match.group(1)
                #middle_name = match.group()
                #last_name = match.group()
                location = match.group(2)
                city = match.group(3)
                published_date = match.group(5)
                
                #print(match.group(0))
                data_list.append({
                    "First Name": first_name,
                    #"Last Name": last_name,
                    #"Middle Name": middle_name,
                    "Location": location,
                    "City": city,
                    "Date": published_date
                })

    df = pd.DataFrame(data_list)
    return df

In [44]:
for url in entry_urls:
    result_df = get_table(scrape(url))
result_df

Unnamed: 0,First Name,Location,City,Date
0,J,Butlerstown,Waterford,06/10/23
1,Teresa,Tuam,Galway,06/10/23
2,Kilnamona,,Clare,06/10/23
3,Francesca,Athlone,Westmeath,06/10/23
4,Francesca,Athlone,Roscommon,06/10/23
5,Joseph,Lisnagry,Limerick,06/10/23
6,Eoin,Ballivor,Meath,06/10/23
7,Sandra,Strabane,Tyrone,06/10/23
8,Johnston,,Donegal,06/10/23
9,Pat,Donaghmede,Dublin,06/10/23


Issues:

### Notes:

In [10]:
def scrape_entry_data(entry_url):
    
    response = requests.get(entry_url)
    if response.status_code != 200:
        print(f"Failed to fetch data for URL: {entry_url}")
        return
    
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    parent_div = soup.find('div', class_="sc-hHLeRK fnSgVj rdt_TableBody", role='rowgroup')
    
    if parent_div:
        for row_div in parent_div.find_all('div', class_='sc-jqUVSM hjBMSB rdt_TableRow', role="row"):
            cleaned_text = row_div.text.replace('\xa0', '')
            data = cleaned_text.strip().split(",")
            print(data)
    else:
        print("Parent div not found.")
        
    print("---")

In [11]:
for url in entry_urls:
    scrape_entry_data(url)

['Deane(née Jennings)', 'OliveDunmanwayCork05/10/23']
['Deane(née Jennings)', 'OliveRossmoreCork05/10/23']
['McMANUS', 'PatrickDundalkLouth05/10/23']
['Donnelly(née Doheny)', 'NoraThurlesTipperary05/10/23']
['Donnelly(née Doheny)', 'NoraDunleerLouth05/10/23']
['Mitchell', 'Georgina MaryErrillLaois05/10/23']
['Barry', 'BillyLislevaneCork05/10/23']
['Fitzgerald', 'SeánDungarvanWaterford05/10/23']
['KERRISK', 'DECLANCastlemaineKerry05/10/23']
["O'Hara(née Brennan)", 'BettyAclareSligo05/10/23']
['Moore', 'DanielDungarvanWaterford05/10/23']
['Wijers', 'Paulus WillemCastlebarMayo05/10/23']
["O' Connor(née Reidy)", 'MauraKanturkCork05/10/23']
['Tierney(née Smyth)', 'MayBirrOffaly05/10/23']
['Walshe', 'Edward (Ned)DonardWicklow05/10/23']
['Dillon(née Gaffney)', 'BridieGlencarLeitrim05/10/23']
['Diffley(née Murray)', 'MaryStrokestownRoscommon05/10/23']
['Diffley(née Murray)', 'MaryTarmonbarryRoscommon05/10/23']
['Treacey', 'Mary Loreto JaneRanelaghDublin05/10/23']
['Gleeson(née Moran)', 'Marian