#### Website information

This notebook is used primarily to gather data from some websites for the project.The data is then saved to the `data` folder for further processing

#### Imports

In [66]:
import concurrent.futures as cf
from collections import defaultdict
from IPython.display import clear_output
import re

from urllib.request import urlopen
from urllib.parse import urljoin

from bs4 import BeautifulSoup

import lxml
import pandas as pd


import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
)
from tqdm.notebook import tqdm



import helper_functions as hf

In [None]:
# suppress warnings
pd.options.mode.chained_assignment = None
# adjust pandas display options
pd.set_option('display.max_columns', 50)
pd.options.display.max_rows = 100
pd.set_option('display.max_colwidth', 100)

In [61]:
def get_driver():
    """
    Returns a selenium webdriver object.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

#### Info about Zurich districts

In [6]:
# savve url of website
zurich_district_url = "https://www.zuerich.com/en/visit/about-zurich/zurichs-districts"
# get html content and parse it
with urlopen(zurich_district_url) as response:
    zurich_districts_html = response.read()
zurich_districts_soup = BeautifulSoup(zurich_districts_html, "lxml")

# select all elements with id starting with 's-' and a number between 1 and 12
pattern = re.compile(r"s-[1-9]|s-1[0-2]")
elements = zurich_districts_soup.find_all(id=pattern)

In [10]:
# create a dataframe with the information of the districts
districts = {element.find("h2").text: element.find("p").text for element in elements}
districts_df = pd.DataFrame.from_dict(districts, orient="index", columns=["desc"])


# make the index into a column and split it into district number and district name
districts_df = districts_df.reset_index()
districts_df = (
    districts_df["index"]
    .str.split("–", expand=True)
    .rename({0: "district_number", 1: "district_name"}, axis=1)
    .join(districts_df)
    .drop("index", axis=1)
)
# strip the whitespace from the columns
districts_df["district_number"] = districts_df["district_number"].str.strip()
# create regex to get the number from the district_number column
regex_pattern = re.compile(r"([\d]+)")

# create a new column with the district number
districts_df["district"] = (
    districts_df["district_number"].str.extract(regex_pattern).astype("category")
)
districts_df.drop("district_number", axis=1, inplace=True)

districts_df["district_name"] = districts_df["district_name"].str.strip()
districts_df["desc"] = districts_df["desc"].str.strip()

# save to csv
districts_df.to_csv("../data/zurich_districts.csv", index=False)

In [11]:
districts_df.info()
display(districts_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   district_name  12 non-null     object  
 1   desc           12 non-null     object  
 2   district       12 non-null     category
dtypes: category(1), object(2)
memory usage: 728.0+ bytes


Unnamed: 0,district_name,desc,district
0,"Old Town, City Center",The most central district encompasses the hist...,1
1,Left Shore of the Lake,The architecturally attractive Enge Train Stat...,2
2,Creative Quarter of Wiedikon,"Once mainly a working-class neighborhood, the ...",3
3,Creative Quarter of Langstrasse,The district around Langstrasse was long regar...,4
4,Creative Quarter of Zürich-West,In the quarter where huge machines once clatte...,5
5,University Quarter,"University buildings, Jugendstil villas, green...",6
6,On the Zürichberg,"The hillside location, little traffic, and fab...",7
7,Right Shore of the Lake,This district starts behind the Opera House an...,8
8,At the Foot of the Uetliberg,"For a long time, this district was solely know...",9
9,Right of the Limmat,This district boasts a fantastic location: in ...,10


In [12]:
# create a styler object and set wrap parameter to true
districts_df_styler = districts_df.style.set_properties(
    **{"white-space": "pre-wrap"})
districts_df_styler.format({"description": lambda x: x})

Unnamed: 0,district_name,desc,district
0,"Old Town, City Center","The most central district encompasses the historical Old Town on both banks of the River Limmat, as well as the area to the south bordering on the lake basin. It is home to the prestigious Bahnhofstrasse, magnificent guild houses, imposing churches, and the famous Opera House. The best view of District 1 is to be had from the top of the Karlsturm tower.",1
1,Left Shore of the Lake,"The architecturally attractive Enge Train Station built out of Ticino granite, the Museum Rietberg with its beautiful park, the Seebad Enge lido, and the Rote Fabrik give this district a Mediterranean feel.",2
2,Creative Quarter of Wiedikon,"Once mainly a working-class neighborhood, the district below the Uetliberg is now a popular residential area with cozy cafés, boutiques, and vintage stores. The Houdini movie theater also lies on Wiedikon territory.",3
3,Creative Quarter of Langstrasse,"The district around Langstrasse was long regarded as a den of iniquity. Nowadays, it is known for its diverse restaurants and never-sleeping nightlife, with bars such as the Olé and the Club Zukunft.",4
4,Creative Quarter of Zürich-West,"In the quarter where huge machines once clattered away, now nightclubs, cultural institutions, and universities cluster around the Prime Tower. Converted structures such as the Viadukt and designer stores like the Freitag Tower give the neighborhood its characteristic trendy atmosphere.",5
5,University Quarter,"University buildings, Jugendstil villas, green areas, and the Dynamo cultural center characterize Zurich’s District 6. Thanks to its tranquility and closeness to the city center, it is a much sought-after residential area.",6
6,On the Zürichberg,"The hillside location, little traffic, and fabulous views mean than living here does not come cheap. Neighbors include Zurich Zoo and the luxury hotel, The Dolder Grand.",7
7,Right Shore of the Lake,"This district starts behind the Opera House and stretches as far as the open-air bathing facility, Seebad Tiefenbrunnen. It features boutiques, villas, the Chinawiese recreational area, and the Seebad Utoquai outdoor swimming bath.",8
8,At the Foot of the Uetliberg,"For a long time, this district was solely known for the Letzigrund Stadium. In recent years, however, the quarter around Altstetten has become THE place to be, with creative club and gastronomy concepts.",9
9,Right of the Limmat,"This district boasts a fantastic location: in summer, everyone meets at the Unterer Letten and Oberer Letten riverside lidos to swim and bask in the sun. In addition, the view over Zurich from the Waid quarter is unbeatable.",10


#### Info for Dog Breeds from the FCI website

In [14]:
def process_breed(breed_link):
    """Get the breed info from a breed link and process it for info we want"""
    soup = BeautifulSoup(urlopen(breed_link).read(), "html.parser")
    breed_info = {
        "breed_text": soup.find("h2").text.strip() if soup.find("h2") else None,
        "link": breed_link,
        "breed_group": soup.find(id="ContentPlaceHolder1_GroupeHyperLink").text.strip()
        if soup.find(id="ContentPlaceHolder1_GroupeHyperLink")
        else None,
        "breed_translations": [
            row.find_all("span")[0].text
            for row in soup.find(class_="racesgridview").find_all("tr")[1:]
        ]
        if soup.find(class_="racesgridview")
        else None,
        "subsection": next(
            (
                cells[1].text.strip()
                for row in soup.find_all(class_="racetable")[0].find_all("tr")
                if (cells := row.find_all("td"))
                and "subsection" in cells[0].text.lower()
            ),
            None,
        ),
        "section": next(
            (
                cells[1].text.strip()
                for row in soup.find_all(class_="racetable")[0].find_all("tr")
                if (cells := row.find_all("td")) and "section" in cells[0].text.lower()
            ),
            None,
        ),
        "date_of_acceptance": next(
            (
                cells[1].text.strip()
                for row in soup.find_all(class_="racetable")[0].find_all("tr")
                if (cells := row.find_all("td"))
                and "date of acceptance" in cells[0].text.lower()
            ),
            None,
        ),
        "country_of_origin": [
            cells[1].text.strip()
            for row in soup.find_all(class_="racetable")[1].find_all("tr")
            if (cells := row.find_all("td"))
            and "country of origin" in cells[0].text.lower()
        ],
        "varieties": [
            spans[0].text
            for variety in soup.find(class_="varietes").find_all(class_="variete")
            if (spans := variety.find_all("span"))
        ]
        if soup.find(class_="varietes")
        else [],
    }
    return breed_info


def process_letter(letter, link):
    """Get the breeds for a letter and process them"""
    letter_page = urlopen(link)
    soup = BeautifulSoup(letter_page, "html.parser")

    # Get the breeds on this page and process them
    breeds_element = soup.find("ul", {"class": "listeraces"})
    breed_links = [urljoin(link, a["href"]) for a in breeds_element.find_all("a")]
    breed_info_futures = []
    with cf.ThreadPoolExecutor() as executor:
        breed_info_futures = [
            executor.submit(process_breed, breed_link)
            for breed_link in tqdm(
                breed_links, total=len(breed_links), desc=f"Processing {letter}"
            )
        ]
    return breed_info_futures


def get_fci_breed_data():
    """Get breed data from the FCI website for processing"""
    fci_nonmenclature_url = "https://fci.be/en/Nomenclature/Default.aspx"

    fci_response = urlopen(fci_nonmenclature_url)
    fci_content = fci_response.read()
    fci_soup = BeautifulSoup(fci_content, "html.parser")

    letters_element = fci_soup.find("ul", {"class": "initiales"})

    # we have 'D':href=../../nomenclature/races.aspx?init=D
    # we want https://fci.be/en/nomenclature/races.aspx?init=D so use urljoin
    letter_link = {
        a.text: urljoin(fci_nonmenclature_url, a["href"])
        for a in letters_element.find_all("a")
    }

    with cf.ThreadPoolExecutor() as executor:
        letter_futures = {
            executor.submit(process_letter, letter, link)
            for letter, link in letter_link.items()
        }
        fci_breed_data = []
        for future in tqdm(
            cf.as_completed(letter_futures),
            total=len(letter_futures),
            desc="Processing letters",
        ):
            breed_info_futures = future.result()
            for breed_future in cf.as_completed(breed_info_futures):
                fci_breed_data.append(breed_future.result())
        return fci_breed_data

In [17]:
fci_breed_data = get_fci_breed_data()
clear_output()

In [19]:
# put in dataframe and save to json to avoid having to scrape again
fci_breeds_df = pd.DataFrame(fci_breed_data)
fci_breeds_df.to_json("../data/fci_breeds_raw.json", orient="records")

##### Read in data

Read in the raw data to be cleaned

In [76]:
# read in the data and take a look
fci_breeds_df = pd.read_json("../data/fci_breeds_raw.json", orient="records")
fci_breeds_df.sample()

Unnamed: 0,breed_text,link,breed_group,breed_translations,subsection,section,date_of_acceptance,country_of_origin,varieties
316,WEST HIGHLAND WHITE TERRIER\r\n\t\t\t(85),https://fci.be/en/nomenclature/WEST-HIGHLAND-W...,n°3 - Terriers,"[WEST HIGHLAND WHITE TERRIER, WEST HIGHLAND WH...",,Small sized Terriers,10/29/1954,[GREAT BRITAIN],[]


In [77]:
# get the number in ()
fci_breeds_df["fci_num"] = fci_breeds_df["breed_text"].str.extract(
    r"\((\d+)\)", expand=False
)
# get the name before the \r\n\t
fci_breeds_df["breed"] = fci_breeds_df["breed_text"].str.extract(r"(.*?)\r\n\t")
# add the value in the breed column to the list in the translations column in each respective row
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["breed_translations"] + [x["breed"]], axis=1
)
# convert to lower case
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda x: [i.lower() for i in x]
)
# english version is the first translation
fci_breeds_df["breed_en"] = fci_breeds_df["breed_translations"].apply(
    lambda x: x[0].lower()
)

We extracted some data which we placed in a separate column but we still left the original column for reference. Looking at the dataframe you will notice that each breed is in its own row, and some columns such as `breed_translation`, `alt_names`,  `country_of_origin` are actually a list in the dataframe column. This is something that we need to be aware about during our data extraction and cleaning process.

In [78]:
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains(r"belgian", case=False, regex=True)
]

Unnamed: 0,breed_text,link,breed_group,breed_translations,subsection,section,date_of_acceptance,country_of_origin,varieties,fci_num,breed,alt_names,breed_en
197,CHIEN DE BERGER BELGE\r\n\t\t\t(15),https://fci.be/en/nomenclature/BELGIAN-SHEPHER...,n°1 - Sheepdogs and Cattledogs (except Swiss C...,"[BELGIAN SHEPHERD DOG, CHIEN DE BERGER BELGE, ...",,Sheepdogs,1/1/1956,[BELGIUM],"[a) Groenendael, b) Laekenois, c) Malinois, d)...",15,CHIEN DE BERGER BELGE,"[belgian shepherd dog, chien de berger belge, ...",belgian shepherd dog


We'll refine our data to improve the accuracy of our fuzzy matching function. This includes adding colloquial breed names to the `alt_names` column, which serves as a comprehensive reference for matching.

In [79]:
# edit all breeds with '- haired' in the name to remove the space and the '-'
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"- ?haired", "haired", regex=True
)
# add the 'breed_en' breed to the list in alt_names column
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["alt_names"] + [x["breed_en"]], axis=1
)

# create a column for the number of varieties from the varieties column
fci_breeds_df["n_varieties"] = fci_breeds_df["varieties"].transform(len)
# clean up the letter-numbering in the varieties column 'a)'
fci_breeds_df["varieties"] = fci_breeds_df["varieties"].apply(
    lambda x: [re.sub(r"^[a-z]\) ", "", i).lower() for i in x]
)

# extract the group number and name from the group column
fci_breeds_df["group_num"] = fci_breeds_df["breed_group"].str.extract(r"(\d+)")
fci_breeds_df["group_name"] = (
    fci_breeds_df["breed_group"].str.split("-", n=1, expand=True)[1].str.strip()
)

# if there is a breed name with 'pointing dog' in its alt_names, also add the breed name with 'pointer' in its name
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"pointing dog", "pointer", regex=True
)
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["alt_names"] + [x["breed_en"]], axis=1
)

# display only 3 of the the breeds with varieties
fci_breeds_df[fci_breeds_df["n_varieties"] > 0][
    ["breed", "varieties", "alt_names", "breed_en"]
].sample(3)

Unnamed: 0,breed,varieties,alt_names,breed_en
344,ST.BERNHARDSHUND - BERNHARDINER,"[short-haired, long-haired]","[st. bernard, chien du mont saint-bernard - sa...",st. bernard
295,PERRO SIN PELO DEL PERÚ,"[large, medium-sized, miniature]","[peruvian hairless dog, chien nu du perou, per...",peruvian hairless dog
329,SALUKI,"[fringed, smooth]","[saluki, saluki, saluki, saluki, saluki, saluk...",saluki


Some of the `varieties` are just variations in size, coat-color, -hair-length of the same breed. Still, some variations are so popular that they are referred to by this variation name. We will add these variations to the their `alt_names` list.These include:
- swiss hound
- small swiss hound
- german spitz
- belgian shepherd dog
- continental toy spaniel
- chinese crested dog

In [80]:
# add the varieties to the alt_names column of some popular breeds


popular_breeds = [
    "swiss hound",
    "small swiss hound",
    "german spitz",
    "belgian shepherd dog",
    "continental toy spaniel",
    "chinese crested dog",
]


popular_names_mask = fci_breeds_df["breed_en"].isin(popular_breeds)


fci_breeds_df.loc[popular_names_mask, "alt_names"] = (
    fci_breeds_df.loc[popular_names_mask, "alt_names"]
    + fci_breeds_df.loc[popular_names_mask, "varieties"]
)

fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("belgian shepherd dog"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("belgian shepherd dog"), "alt_names"
].apply(
    lambda x: x
    + [
        "belgian sheepdog",
        "belgian tervuren",
        "belgian malinois",
        "belgian groenendael",
        "belgian laekenois",
    ]
)

fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("toll"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("toll"), "alt_names"
].apply(
    lambda x: x + ["toller"]
)

fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("vallhund"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("vallhund"), "alt_names"
].apply(
    lambda x: x + ["westgotenspitz"]
)
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("bernese"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("bernese"), "alt_names"
].apply(
    lambda x: x + ["durbachler"]
)
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("shetland"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("shetland"), "alt_names"
].apply(
    lambda x: x + ["sheltie"]
)
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("german spitz"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("german spitz"), "alt_names"
].apply(
    lambda x: x
    + [
        "wolfsspitz",
        "keeshond",
        "kleinspitz",
        "mittelspitz",
        "grossspitz",
        "zwergspitz",
    ]
)

In [81]:
fci_breeds_df["no_accent"] = fci_breeds_df["alt_names"].apply(
    lambda x: [hf.remove_accents(i) for i in x]
)
# add the no_accent to the alt_names column and remove duplicates
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"] + fci_breeds_df["no_accent"]
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda x: [i.lower() for i in x]
)
# reduce the duplicates within each alt_names list
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].transform(set)

In [82]:
# remove the '()' from the alt_names
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda x: [i.replace("(", "").replace(")", "") for i in x]
)

In [83]:
fci_breeds_df.sample(3)

Unnamed: 0,breed_text,link,breed_group,breed_translations,subsection,section,date_of_acceptance,country_of_origin,varieties,fci_num,breed,alt_names,breed_en,n_varieties,group_num,group_name,no_accent
335,SAMOIEDSKAÏA SABAKA\r\n\t\t\t(212),https://fci.be/en/nomenclature/SAMOYED-212.html,n°5 - Spitz and primitive types,"[SAMOYED, SAMOYEDE, SAMOJEDE, SAMOYEDO]",,Nordic Sledge Dogs,6/26/1959,"[NORTHERN RUSSIA, SIBERIA]",[],212,SAMOIEDSKAÏA SABAKA,"[samoyede, samojede, samoyed, samoyedo, samoie...",samoyed,0,5,Spitz and primitive types,"[samoyed, samoyede, samojede, samoyedo, samoie..."
330,SHAR PEI\r\n\t\t\t(309),https://fci.be/en/nomenclature/SHAR-PEI-309.html,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[SHAR PEI, SHAR PEI, SHAR PEI, SHAR PEI]",Mastiff type,Molossian type,6/30/1981,[CHINA],[],309,SHAR PEI,[shar pei],shar pei,0,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[shar pei, shar pei, shar pei, shar pei, shar ..."
157,CHIEN DE BERGER DES PYRENEES A FACE RASE\r\n\t...,https://fci.be/en/nomenclature/PYRENEAN-SHEEPD...,n°1 - Sheepdogs and Cattledogs (except Swiss C...,"[PYRENEAN SHEEPDOG - SMOOTH FACED, CHIEN DE BE...",,Sheepdogs,1/26/1955,[FRANCE],[],138,CHIEN DE BERGER DES PYRENEES A FACE RASE,"[pyrenaen-hutehund mit kurzhaarigem gesicht, c...",pyrenean sheepdog - smooth faced,0,1,Sheepdogs and Cattledogs (except Swiss Cattled...,"[pyrenean sheepdog - smooth faced, chien de be..."


In [84]:
fci_breeds_df.country_of_origin.apply(
    lambda x: any(re.search("Switzerland", i, re.IGNORECASE) for i in x)
).sum()

10

In [85]:
# save to json
fci_breeds_df.to_json("../data/fci_breeds.json", orient="records")

#### Info for Dog Breeds for AKC website


In [59]:
akc_home_url = "https://www.akc.org/dog-breeds/"

In [62]:
def get_breed_link_info(link):
    """Get the breed link info from the AKC website"""
    breed_link = {}
    driver = get_driver()
    try:
        driver.get(link)
        WebDriverWait(driver, 30).until(
            ec.presence_of_element_located(
                (By.XPATH, "//select[@class='custom-select__select']/option")
            )
        )
        # navigate_to_akc_homepage(driver, link)
        options = driver.find_elements(
            By.XPATH, "//select[@class='custom-select__select']/option"
        )
        for option in options:
            breed_link[option.text] = option.get_attribute("value")
        print(f"Number of breeds: {len(breed_link)}")

        return breed_link
    except TimeoutException:
        print(f"Timeout while loading page: {link}")
    except Exception as e:
        print(f"An error occurred while loading page: {link}\n{e}")
    finally:
        driver.quit()


# get the links for each breed
breed_link = get_breed_link_info(akc_home_url)

Number of breeds: 289


In [63]:
# Remove the empty values
akc_breed_link_dict = {
    key.lower(): value for (key, value) in breed_link.items() if value != ""
}
len(akc_breed_link_dict)

288

In [64]:
def get_soup(page_source):
    return BeautifulSoup(page_source, "lxml")


def get_breed_info(page_source):
    """Function to get the breed info from the AKC website."""
    breed_metadata = defaultdict(str)

    soup = get_soup(page_source)

    def extract_text(tag, class_name):
        """Helper function to extract text from a tag with error handling."""
        try:
            return soup.find(tag, {"class": class_name}).text
        except AttributeError:
            return ""

    # Extract breed, temperment, popularity rank,  year recognized, group
    breed_metadata["breed_page"] = extract_text("h1", "page-header__title")
    breed_metadata["temperment"] = extract_text(
        "p", "breed-page__intro__temperment")
    breed_metadata["popularity"] = extract_text(
        "span", "breed-page__popularity__custom-label"
    )
    breed_metadata["year_recognized"] = extract_text(
        "p", "breed-page__popularity__ranking-title"
    )
    breed_metadata["group"] = extract_text(
        "a", "breed-page__intro__group__tooltip")

    # Extract height, weight, life expectancy
    try:
        element = soup.find(
            "div", {"class": "breed-page__hero__overview__icon-block-wrap"}
        )
        attribute_map = {
            "height": "height",
            "weight": "weight",
            "life expectancy": "life_expectancy",
        }
        for ele in element:
            for attribute, key in attribute_map.items():
                if attribute in ele.text.lower():
                    breed_metadata[key] = ele.text
    except AttributeError:
        breed_metadata["height"] = ""
        breed_metadata["weight"] = ""
        breed_metadata["life_expectancy"] = ""

    return breed_metadata
# Constants
SELECTORS = [
    "h1.page-header__title",
    "p.breed-page__intro__temperment",
    "div.breed-page__hero__overview__icon-block",
    "div.breed-page__intro__group",
]


def get_page_source(driver, link):
    """Navigate to the page and return the page source."""
    try:
        driver.get(link)
        # Wait for the specific elements to be loaded
        for selector in SELECTORS:
            WebDriverWait(driver, 30).until(
                ec.visibility_of_element_located((By.CSS_SELECTOR, selector))
            )
        # Get the webpage's HTML content
        return driver.page_source
    except TimeoutException:
        print(f"Timeout while loading page: {link}")
        return None
    except Exception as e:
        print(f"An error occurred while loading page: {link}\n{e}")
        return None


def get_contents(breed, link):
    breed_info = defaultdict(str)

    with get_driver() as driver:
        try:
            page_source = get_page_source(driver, link)
            if page_source is not None:
                breed_info = get_breed_info(page_source)
                breed_info["breed"] = breed
                breed_info["link"] = link
            return breed_info
        except TimeoutException:
            print(f"Timeout while loading page: {link}")
        except AttributeError:
            print(f"Attribute error | {breed}")
        except Exception as e:
            print(f"An error occurred while loading page: {link}\n{e}")
        finally:
            driver.quit()


def get_akc_breed_data():
    with cf.ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(get_contents, breed, link)
            # for breed, link in sample_dict.items()
            for breed, link in akc_breed_link_dict.items()
        }
        breed_data = list(
            tqdm(
                (future.result() for future in cf.as_completed(futures)),
                total=len(akc_breed_link_dict),
                # total=len(sample_dict),
            )
        )
        return breed_data

In [68]:
breed_data = get_akc_breed_data()

  0%|          | 0/288 [00:00<?, ?it/s]

Timeout while loading page: https://www.akc.org/dog-breeds/braque-saint-germain/
Timeout while loading page: https://www.akc.org/dog-breeds/st-bernard/
Timeout while loading page: https://www.akc.org/dog-breeds/slovakian-wirehaired-pointer/


In [69]:
breed_data = [x for x in breed_data if x is not None]

akc_physical_traits = pd.DataFrame(breed_data)
akc_physical_traits["breed"] = (
    akc_physical_traits["breed"]
    .str.lower()
    .str.replace("-", " ")
    .str.replace("'", "")
    # .apply(remove_accents)
)
akc_physical_traits.sample()

Unnamed: 0,breed_page,temperment,popularity,year_recognized,group,height,weight,life_expectancy,breed,link
146,Jagdterrier,intelligent / courageous / hardworking,,,Foundation Stock Service »,Height13-16 inches,Weight17-22 pounds,Life Expectancy10-12 years,jagdterrier,https://www.akc.org/dog-breeds/jagdterrier/


In [70]:
def parse_range(s):
    """"""
    if isinstance(s, str):
        numbers = list(map(float, re.findall(r"\d+(?:\.\d+)?", s)))
        if numbers:
            return numbers + [None] * (2 - len(numbers))
    return [None, None]


def get_upper_lower_bound(dataframe, column):
    """Parse a range string and return the lower and upper bounds."""
    dataframe[column + "_ll"], dataframe[column + "_ul"] = zip(
        *dataframe[column].map(parse_range)
    )
    dataframe[column + "_ul"].fillna(dataframe[column + "_ll"], inplace=True)


# add in the columns of the upper and lower bounds for height, weight, and life expectancy
get_upper_lower_bound(akc_physical_traits, "weight")
get_upper_lower_bound(akc_physical_traits, "height")
get_upper_lower_bound(akc_physical_traits, "life_expectancy")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column + "_ul"].fillna(dataframe[column + "_ll"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column + "_ul"].fillna(dataframe[column + "_ll"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because 

In [71]:
akc_physical_traits["group"] = akc_physical_traits["group"].str.strip(" »")
akc_physical_traits["breed_page"] = akc_physical_traits["breed_page"].str.lower()
akc_physical_traits["year_accepted"] = akc_physical_traits[
    "year_recognized"
].str.extract(r"(\d{4})")
akc_physical_traits["is_recognized_breed"] = akc_physical_traits[
    "year_accepted"
].notna()

In [72]:
# create a function which takes the column names and creates a column with a unique list of those column values in each row
def create_alt_names(dataframe, list_of_columns):
    """creates a column with a unique list of those column values in each row"""
    dataframe["alt_names"] = dataframe[list_of_columns].apply(
        lambda row: set(x for x in row if pd.notna(x)), axis=1
    )

In [73]:
create_alt_names(akc_physical_traits, ["breed_page", "breed"])

In [74]:
print(f"{akc_physical_traits.shape=}")
akc_physical_traits.isna().sum()
akc_physical_traits.loc[akc_physical_traits["weight_ll"].isna()]

akc_physical_traits.shape=(288, 19)


Unnamed: 0,breed_page,temperment,popularity,year_recognized,group,height,weight,life_expectancy,breed,link,weight_ll,weight_ul,height_ll,height_ul,life_expectancy_ll,life_expectancy_ul,year_accepted,is_recognized_breed,alt_names
60,,,,,,,,,,,,,,,,,,False,{}
71,cane corso,intelligent / affectionate / majestic,18 of 201,The Cane Corso was recognized as a breed by th...,Working Group,WeightProportionate to height,WeightProportionate to height,Life Expectancy9-12 years,cane corso,https://www.akc.org/dog-breeds/cane-corso/,,,,,9.0,12.0,2010.0,True,{cane corso}
234,,,,,,,,,,,,,,,,,,False,{}
250,,,,,,,,,,,,,,,,,,False,{}
252,spinone italiano,sociable / patient / docile,112 of 201,The Spinone Italiano was recognized as a breed...,Sporting Group,Height23-27 inches (male)22-25 inches (female),WeightIn direct proportion to size and structu...,Life Expectancy10-12 years,spinone italiano,https://www.akc.org/dog-breeds/spinone-italiano/,,,23.0,27.0,10.0,12.0,2000.0,True,{spinone italiano}


In [86]:
akc_physical_traits.sample(5)

Unnamed: 0,breed_page,temperment,popularity,year_recognized,group,height,weight,life_expectancy,breed,link,weight_ll,weight_ul,height_ll,height_ul,life_expectancy_ll,life_expectancy_ul,year_accepted,is_recognized_breed,alt_names
95,drentsche patrijshond,loyal / intelligent / friendly,,,Foundation Stock Service,Height23-25 (males)21.5 - 23.5 (females),Weight70-73 (males)55-60 (females),Life Expectancy11-14 years,drentsche patrijshond,https://www.akc.org/dog-breeds/drentsche-patri...,70.0,73.0,23.0,25.0,11.0,14.0,,False,{drentsche patrijshond}
25,basset fauve de bretagne,sociable / lively / smart,,,Miscellaneous Class,Height12.5-15.5 inches,Weight23-39 pounds,Life Expectancy13-16 years,basset fauve de bretagne,https://www.akc.org/dog-breeds/basset-fauve-de...,23.0,39.0,12.5,15.5,13.0,16.0,,False,{basset fauve de bretagne}
276,vizsla,energetic / affectionate / gentle,33 of 201,The Vizsla was recognized as a breed by the AK...,Sporting Group,Height22-24 inches (male)21-23 inches (female),Weight55-60 pounds (male)44-55 pounds (female),Life Expectancy12-14 years,vizsla,https://www.akc.org/dog-breeds/vizsla/,55.0,60.0,22.0,24.0,12.0,14.0,1960.0,True,{vizsla}
206,portuguese podengo pequeno,playful / charming / lively,171 of 201,The Portuguese Podengo Pequeno was recognized ...,Hound Group,Height8-12 inches,Weight9-13 pounds,Life Expectancy12-15 years,portuguese podengo pequeno,https://www.akc.org/dog-breeds/portuguese-pode...,9.0,13.0,8.0,12.0,12.0,15.0,2013.0,True,{portuguese podengo pequeno}
8,airedale terrier,friendly / courageous / clever,65 of 201,The Airedale Terrier was recognized as a breed...,Terrier Group,Height23 inches,Weight50-70 pounds,Life Expectancy11-14 years,airedale terrier,https://www.akc.org/dog-breeds/airedale-terrier/,50.0,70.0,23.0,23.0,11.0,14.0,1888.0,True,{airedale terrier}


In [75]:
akc_physical_traits.to_json("../data/akc_breeds.json", orient="records")