In [33]:
from bs4 import BeautifulSoup
from obittools import ROOT_DIR
import os
import json
import csv


class ObitMetadataRow:
    """Flat record of the metadata extracted from one obituary page.

    Every field except ``obit_id`` and ``obit_page_type`` starts as ``None``
    and is filled in by whichever page parser recognises the page layout.
    ``get_dict()`` turns the record into a plain dict whose keys follow the
    order of ``_FIELD_NAMES``.
    """

    # Field names in the exact order get_dict() emits them. The order is
    # significant: callers use the dict's keys as the output column order.
    _FIELD_NAMES = (
        "obit_id",
        "obit_page_type",
        "obituary_paragraph",
        "sympathy_cta",
        "full_name",
        "first_name",
        "middle_name",
        "last_name",
        "nick_name",
        "maiden_name",
        "prefix",
        "suffix",
        "additional_prefix",
        "additional_suffix",
        "birth_date",
        "death_date",
        "from_to_years",
        "address_country",
        "address_locality",
        "address_region",
        "address_postal_code",
        "affiliated_source",
        "obitwriter_source",
        "ad_status",
        "affiliate",
        "affiliate_type",
        "analytic_page_category",
        "days_to_first_service",
        "days_to_last_service",
        "first_to_file",
        "merged_obit_parents",
        "notice_type",
        "page_name",
        "product_name",
        "obit_publish_date",
        "donations_view_type",
        "is_ad_free",
        "is_consumer_direct_obituary",
        "legacy_pro_plan_type",
        "partner_city",
        "partner_state",
        "partner_country",
        "partner_display_name",
        "church",
    )

    # Literal markup fragments that get_dict() replaces with a single space.
    _HTML_TAGS = ("<br />", "<p>", "</p>", "<strong>", "</strong>", "<em>", "</em>")

    def __init__(self, obit_id, obit_page_type):
        """Create an empty row for ``obit_id`` tagged with ``obit_page_type``."""
        for field_name in self._FIELD_NAMES:
            setattr(self, field_name, None)
        self.obit_id = obit_id
        self.obit_page_type = obit_page_type

    def get_dict(self):
        """Return the row as a dict keyed by field name, in ``_FIELD_NAMES`` order.

        String values have the fragments in ``_HTML_TAGS`` replaced by a space
        and are then stripped of leading/trailing whitespace. Non-string
        values (``None``, booleans, numbers, lists, ...) are passed through
        unchanged.
        """
        return_dict = {}
        for field_name in self._FIELD_NAMES:
            value = getattr(self, field_name)
            # isinstance (rather than an exact type comparison) so that str
            # subclasses are cleaned up as well.
            if isinstance(value, str):
                for tag in self._HTML_TAGS:
                    value = value.replace(tag, " ")
                value = value.strip()
            return_dict[field_name] = value
        return return_dict


In [34]:
# JSON key -> ObitMetadataRow attribute, one table per section of the payload.
_HYPERNOVA_NAME_FIELDS = {
    "firstName": "first_name",
    "lastName": "last_name",
    "middleName": "middle_name",
    "fullName": "full_name",
    "nickName": "nick_name",
    "maidenName": "maiden_name",
    "prefix": "prefix",
    "suffix": "suffix",
    "additionalSuffix": "additional_suffix",
    "additionalPrefix": "additional_prefix",
}
_HYPERNOVA_PERSON_FIELDS = {
    "birthDate": "birth_date",
    "deathDate": "death_date",
}
_HYPERNOVA_ADDRESS_FIELDS = {
    "country": "address_country",
    "locality": "address_locality",
    "region": "address_region",
    "postalCode": "address_postal_code",
}
_HYPERNOVA_DIMENSION_FIELDS = {
    "ObitwriterSource": "obitwriter_source",
    "AdStatus": "ad_status",
    "Affiliate": "affiliate",
    "AffiliateType": "affiliate_type",
    "AnalyticPageCategory": "analytic_page_category",
    "DaysToFirstService": "days_to_first_service",
    "DaysToLastService": "days_to_last_service",
    "FirstToFile": "first_to_file",
    "MergedObitParents": "merged_obit_parents",
    "NoticeType": "notice_type",
    "ObitPublishDate": "obit_publish_date",
    "PageName": "page_name",
    "ProductName": "product_name",
}
_HYPERNOVA_TOP_LEVEL_FIELDS = {
    "affiliatedSource": "affiliated_source",
    "donationsViewType": "donations_view_type",
    "isAdFree": "is_ad_free",
    "isConsumerDirectObituary": "is_consumer_direct_obituary",
    "legacyProPlanType": "legacy_pro_plan_type",
}
# NOTE(review): every other key in this payload is camelCase; confirm that the
# partner object really uses "display_name" and not "displayName".
_HYPERNOVA_PARTNER_FIELDS = {
    "city": "partner_city",
    "state": "partner_state",
    "country": "partner_country",
    "display_name": "partner_display_name",
}


def _copy_present_fields(metadata_row, source, field_map):
    """Set ``metadata_row.<attr>`` for every ``field_map`` key found in ``source``."""
    for source_key, attr_name in field_map.items():
        if source_key in source:
            setattr(metadata_row, attr_name, source[source_key])


def parse_page_hypernova_rendered(obit_id, metadata_script_block, rendered_page_block):
    """Extract obituary metadata from a hypernova-rendered page.

    Args:
        obit_id: Identifier stored on the resulting row.
        metadata_script_block: Element whose ``.text`` holds the page's JSON
            payload wrapped in 4 leading and 3 trailing characters.
        rendered_page_block: Element searched for the rendered
            ``ObituaryParagraph`` and ``NonAffiliateSympathyCta`` components.

    Returns:
        The populated row as a dict (see ``ObitMetadataRow.get_dict``).

    Raises:
        json.JSONDecodeError: If the sliced script text is not valid JSON.
    """
    metadata_row = ObitMetadataRow(obit_id, "hypernova_rendered")

    # Visible text comes from the rendered markup; a component that appears
    # more than once is reported and left unset rather than guessed at.
    obit_text_blocks = rendered_page_block.find_all("div", {"data-component": "ObituaryParagraph"})
    if len(obit_text_blocks) > 1:
        print(f"ERROR: {obit_id} has more than one obituary paragraph")
    elif len(obit_text_blocks) == 1:
        metadata_row.obituary_paragraph = obit_text_blocks[0].get_text(separator=" ")
    cta_blocks = rendered_page_block.find_all("div", {"data-component": "NonAffiliateSympathyCta"})
    if len(cta_blocks) > 1:
        print(f"ERROR: {obit_id} has more than one non-affiliate cta")
    elif len(cta_blocks) == 1:
        metadata_row.sympathy_cta = cta_blocks[0].get_text(separator=" ")

    # Presumably the payload is wrapped as "<!--{...}-->", hence dropping
    # 4 leading and 3 trailing characters -- TODO confirm against a sample page.
    metadata_dict = json.loads(metadata_script_block.text[4:-3])

    if "contentModules" in metadata_dict and "name" in metadata_dict["contentModules"]:
        _copy_present_fields(
            metadata_row, metadata_dict["contentModules"]["name"], _HYPERNOVA_NAME_FIELDS
        )

    if "schemas" in metadata_dict and "personSchema" in metadata_dict["schemas"]:
        person_schema = metadata_dict["schemas"]["personSchema"]
        _copy_present_fields(metadata_row, person_schema, _HYPERNOVA_PERSON_FIELDS)
        if "address" in person_schema:
            _copy_present_fields(metadata_row, person_schema["address"], _HYPERNOVA_ADDRESS_FIELDS)

    if "customDimensions" in metadata_dict:
        _copy_present_fields(
            metadata_row, metadata_dict["customDimensions"], _HYPERNOVA_DIMENSION_FIELDS
        )

    _copy_present_fields(metadata_row, metadata_dict, _HYPERNOVA_TOP_LEVEL_FIELDS)

    if "partner" in metadata_dict:
        # Must target the partner_* attributes: those are the ones
        # ObitMetadataRow.get_dict() exports. Writing to plain city/state/...
        # silently dropped the partner data from the output.
        _copy_present_fields(metadata_row, metadata_dict["partner"], _HYPERNOVA_PARTNER_FIELDS)

    return metadata_row.get_dict()

def redux_helper(metadata_row, metadata_dict):
    """Copy fields from a redux "person" object onto ``metadata_row``.

    Only keys that are present in ``metadata_dict`` are copied; attributes for
    absent keys are left untouched. The same ``metadata_row`` object is
    returned for convenience.
    """

    def copy_present(section, pairs):
        # Assign row.<attr> = section[key] for each (key, attr) pair found.
        for key, attr in pairs:
            if key in section:
                setattr(metadata_row, attr, section[key])

    if "displayText" in metadata_dict:
        # Prefer the sanitized text; fall back to the raw text.
        for text_key in ("fullSanitized", "text"):
            if text_key in metadata_dict["displayText"]:
                metadata_row.obituary_paragraph = metadata_dict["displayText"][text_key]
                break

    if "name" in metadata_dict:
        copy_present(
            metadata_dict["name"],
            (
                ("first", "first_name"),
                ("last", "last_name"),
                ("middle", "middle_name"),
                ("nick", "nick_name"),
                ("maiden", "maiden_name"),
                ("prefix", "prefix"),
                ("suffix", "suffix"),
                ("full", "full_name"),
            ),
        )

    copy_present(
        metadata_dict,
        (
            ("dateOfBirth", "birth_date"),
            ("dateOfDeath", "death_date"),
            ("fromToYears", "from_to_years"),
        ),
    )

    if "location" in metadata_dict:
        place = metadata_dict["location"]
        copy_present(place, (("country", "address_country"), ("city", "address_locality")))
        # The full state name wins; the state code is only a fallback.
        for region_key in ("state", "stateCode"):
            if region_key in place:
                metadata_row.address_region = place[region_key]
                break

    copy_present(metadata_dict, (("church", "church"),))

    if "customDimensions" in metadata_dict:
        copy_present(metadata_dict["customDimensions"], (("firstToFile", "first_to_file"),))

    # if "affiliates" in metadata_dict:
    #     active_affiliates = [affiliate for affiliate in metadata_dict["affiliates"] if affiliate["active"] == "Active"]
    #     if len(active_affiliates) != 1:
    #         print(f"{obit_id} has {len(active_affiliates)} active affiliates")
    if "obituaries" in metadata_dict and len(metadata_dict["obituaries"]) > 0:
        site_names = []
        for obituary in metadata_dict["obituaries"]:
            site_names.append(obituary["gaSitename"])
        metadata_row.affiliate = " / ".join(site_names)
        # Every entry must carry a creation date; the first one is kept.
        creation_dates = []
        for obituary in metadata_dict["obituaries"]:
            creation_dates.append(obituary["dateCreated"])
        metadata_row.obit_publish_date = creation_dates[0]

    return metadata_row

def parse_redux_preloaded(obit_id, metadata_script_block):
    """Parse a page whose data lives in a ``__PRELOADED_STATE__`` script.

    The JSON is whatever follows the first ``.__PRELOADED_STATE__ = `` marker
    in the script text, whitespace-stripped and minus its last character
    (presumably a trailing ``;`` -- confirm against a sample page). The person
    record is read from ``personStore.person``.
    """
    script_text = metadata_script_block.get_text()
    state_source = script_text.split(".__PRELOADED_STATE__ = ")[1].strip()
    preloaded_state = json.loads(state_source[:-1])
    blank_row = ObitMetadataRow(obit_id, "redux_preloaded")
    filled_row = redux_helper(blank_row, preloaded_state["personStore"]["person"])
    return filled_row.get_dict()

def parse_redux_initial(obit_id, metadata_script_block):
    """Parse a page whose data lives in an ``__INITIAL_STATE__`` script.

    The first 27 characters and the last character of the stripped script
    text are discarded before JSON decoding. 27 is the length of
    ``window.__INITIAL_STATE__ = ``; presumably the script starts with exactly
    that assignment and ends with ``;`` -- confirm against a sample page.
    The person record is read from ``personStore``.
    """
    stripped_script = metadata_script_block.get_text().strip()
    initial_state = json.loads(stripped_script[27:-1])
    blank_row = ObitMetadataRow(obit_id, "redux_initial")
    filled_row = redux_helper(blank_row, initial_state["personStore"])
    return filled_row.get_dict()

def parse_person_94(obit_id, person_block):
    """Scrape name, years and obituary text from a ``person_94`` layout.

    Elements are located by fixed ``data-reactid`` values; a field is only
    filled when exactly one matching element exists. Link targets inside the
    obituary container are collected, the links are removed from the tree,
    and the collected URLs are appended to the remaining text.
    """
    row = ObitMetadataRow(obit_id, "person_94")

    def sole_match(tag, css_class, react_id):
        # Return the element only when the lookup is unambiguous.
        found = person_block.find_all(tag, {"class": css_class, "data-reactid": react_id})
        return found[0] if len(found) == 1 else None

    heading = sole_match("h1", "name", "119")
    if heading is not None:
        row.full_name = heading.get_text()

    years = sole_match("span", "date", "120")
    if years is not None:
        row.from_to_years = years.get_text()

    container = sole_match("div", "container", "556")
    if container is not None:
        anchors = container.find_all("a", href=True)
        joined_urls = " ".join(anchor["href"] for anchor in anchors)
        # Drop the anchors so their link text is not part of the paragraph.
        for anchor in anchors:
            anchor.decompose()
        row.obituary_paragraph = container.get_text() + joined_urls

    return row.get_dict()

def parse_person_125(obit_id, person_block):
    """Scrape name, years and obituary text from a ``person_125`` layout.

    Same procedure as the ``person_94`` layout except that the obituary
    container is the one with ``data-reactid="536"``. Each field is filled
    only when exactly one matching element is found.
    """
    row = ObitMetadataRow(obit_id, "person_125")

    # (tag, attribute filter, row attribute) for the plain-text fields.
    text_targets = (
        ("h1", {"class": "name", "data-reactid": "119"}, "full_name"),
        ("span", {"class": "date", "data-reactid": "120"}, "from_to_years"),
    )
    for tag, attrs, row_attr in text_targets:
        hits = person_block.find_all(tag, attrs)
        if len(hits) == 1:
            setattr(row, row_attr, hits[0].get_text())

    containers = person_block.find_all("div", {"class": "container", "data-reactid": "536"})
    if len(containers) != 1:
        return row.get_dict()

    body = containers[0]
    links = body.find_all("a", href=True)
    hrefs = [link["href"] for link in links]
    # Remove the links before reading the text so only their URLs survive.
    for link in links:
        link.decompose()
    row.obituary_paragraph = body.get_text() + " ".join(hrefs)
    return row.get_dict()



In [None]:
from tqdm.notebook import tqdm

# Driver cell: parse every saved obituary page of one collection and dump the
# extracted rows to <collection>/extract_data_dump.csv.
collection_name = "final"

collection_path = os.path.join(ROOT_DIR, "collections", collection_name)
metadata_dir = os.path.join(collection_path, "metadata")
html_files = [filename for filename in os.listdir(metadata_dir) if filename.endswith(".html")]

# File names bucketed by the page layout that matched (or by none matching).
missing = []
increment_redux1, increment_redux2, increment_person_div, increment_person_div2 = [], [], [], []
hypernova = []
metadata_rows = []

for html_file in tqdm(html_files):
    # The obituary id is the part of the file name before the first underscore.
    obit_id = html_file.split("_")[0]
    with open(os.path.join(metadata_dir, html_file), "r") as f:
        soup = BeautifulSoup(f, "html.parser")

    metadata_blocks = soup.find_all("script", {"type": "application/json", "data-hypernova-key": "ObituaryPage"})
    rendered_page_blocks = soup.find_all("div", {"data-hypernova-key": "ObituaryPage"})
    if len(metadata_blocks) == 1 and len(rendered_page_blocks) == 1:
        # ~96% of sample: react hypernova format
        hypernova.append(html_file)
        metadata_rows.append(parse_page_hypernova_rendered(obit_id, metadata_blocks[0], rendered_page_blocks[0]))
        continue

    # Inline scripts (no type/src/id) are where the redux state is embedded;
    # scan them once and reuse the result for both redux variants.
    inline_scripts = soup.find_all("script", {"type": None, "src": None, "id": None})

    redux_data_blocks = [block for block in inline_scripts if "window.__PRELOADED_STATE__" in block.get_text()]
    if len(redux_data_blocks) == 1:
        # ~3% of sample: redux data format, type 1 PRELOADED STATE
        metadata_rows.append(parse_redux_preloaded(obit_id, redux_data_blocks[0]))
        increment_redux1.append(html_file)
        continue

    redux_data_blocks = [block for block in inline_scripts if "window.__INITIAL_STATE__" in block.get_text()]
    if len(redux_data_blocks) == 1:
        # ~0.6% of sample: redux data format, type 2 INITIAL STATE
        metadata_rows.append(parse_redux_initial(obit_id, redux_data_blocks[0]))
        increment_redux2.append(html_file)
        continue

    person_html_blocks = soup.find_all("div", {"class": "Person", "data-reactid": "94"})
    if len(person_html_blocks) == 1:
        # ~0.01% of sample: person div format
        metadata_rows.append(parse_person_94(obit_id, person_html_blocks[0]))
        increment_person_div.append(html_file)
        continue

    person_html_blocks = soup.find_all("div", {"class": "Person", "data-reactid": "125"})
    if len(person_html_blocks) == 1:
        # ~0.01% of sample: person div format2
        metadata_rows.append(parse_person_125(obit_id, person_html_blocks[0]))
        increment_person_div2.append(html_file)
        continue

    # ~0.2% of sample: hard to parse
    missing.append(html_file)

if metadata_rows:
    # newline="" is required by the csv module; without it some platforms
    # write an extra blank line after every row.
    with open(os.path.join(collection_path, "extract_data_dump.csv"), "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=metadata_rows[0].keys())
        writer.writeheader()
        writer.writerows(metadata_rows)
else:
    # Nothing parsed: leave any existing dump untouched instead of truncating
    # it and then failing on metadata_rows[0].
    print("No metadata rows extracted; extract_data_dump.csv was not written")

# print("missing")
# print(sorted(missing))
# print("increment_redux1")
# print(increment_redux1)
# print("increment_redux2")
# print(increment_redux2)
print("increment_person_div")
print(increment_person_div)
print("increment_person_div2")
print(increment_person_div2)
# print("hypernova")
# print(hypernova)

  0%|          | 0/107193 [00:00<?, ?it/s]