#### Website information

This notebook is used primarily to gather data from some websites for the project.The data is then saved to the `data` folder for further processing

#### Imports

In [1]:
# Standard library imports
import concurrent.futures as cf  # For concurrent execution of tasks
from collections import defaultdict  # For creating a dictionary with default values
from pathlib import Path  # For working with file paths
import re  # For regular expressions
from urllib.request import urlopen  # For opening URLs
from urllib.parse import urljoin  # For joining URLs

# Third party imports
from bs4 import BeautifulSoup  # For web scraping
from IPython.display import clear_output  # For clearing the output in Jupyter Notebook
import lxml  # For parsing HTML using lxml library
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
from selenium import webdriver  # For automated web browsing
from selenium.webdriver.chrome.options import Options  # For configuring Chrome options
from selenium.webdriver.common.by import By  # For locating elements by various strategies
from selenium.webdriver.support.wait import WebDriverWait  # For waiting until certain conditions are met
from selenium.webdriver.support import expected_conditions as ec  # For defining expected conditions
from selenium.common.exceptions import (  # For handling exceptions in Selenium
  TimeoutException,
  NoSuchElementException,
)
from tqdm.notebook import tqdm  # For creating progress bars in Jupyter Notebook

# Local application imports
import helper_functions as hf  # For importing custom helper functions

# Clear the output in Jupyter Notebook
clear_output()

In [2]:
# suppress warnings
pd.options.mode.chained_assignment = None
# adjust pandas display options
pd.set_option("display.max_columns", 50)
pd.options.display.max_rows = 100
pd.set_option("display.max_colwidth", 100)

In [3]:
def get_driver():
    """
    Returns a selenium webdriver object.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

#### Info about Zurich districts

In [4]:
# savve url of website
zurich_district_url = "https://www.zuerich.com/en/visit/about-zurich/zurichs-districts"
# get html content and parse it
with urlopen(zurich_district_url) as response:
    zurich_districts_html = response.read()
zurich_districts_soup = BeautifulSoup(zurich_districts_html, "lxml")

# select all elements with id starting with 's-' and a number between 1 and 12
pattern = re.compile(r"s-[1-9]|s-1[0-2]")
elements = zurich_districts_soup.find_all(id=pattern)

In [5]:
# create a dataframe with the information of the districts
districts = {element.find("h2").text: element.find(
    "p").text for element in elements}
districts_df = pd.DataFrame.from_dict(
    districts, orient="index", columns=["desc"])


# make the index into a column and split it into district number and district name
districts_df = districts_df.reset_index()
districts_df = (
    districts_df["index"]
    .str.split("–", expand=True)
    .rename({0: "district_number", 1: "district_name"}, axis=1)
    .join(districts_df)
    .drop("index", axis=1)
)
# strip the whitespace from the columns
districts_df["district_number"] = districts_df["district_number"].str.strip()
# create regex to get the number from the district_number column
regex_pattern = re.compile(r"([\d]+)")

# create a new column with the district number
districts_df["district"] = (
    districts_df["district_number"].str.extract(
        regex_pattern).astype("int")
)
districts_df.drop("district_number", axis=1, inplace=True)

districts_df["district_name"] = districts_df["district_name"].str.strip()
districts_df["desc"] = districts_df["desc"].str.strip()
districts_df['link'] = districts_df['district'].apply(
    lambda x: f"{zurich_district_url}#s-{x}"
)


In [6]:
districts_df.info()
display(districts_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   district_name  12 non-null     object
 1   desc           12 non-null     object
 2   district       12 non-null     int32 
 3   link           12 non-null     object
dtypes: int32(1), object(3)
memory usage: 464.0+ bytes


Unnamed: 0,district_name,desc,district,link
0,"Old Town, City Center","The most central district encompasses the historical Old Town on both banks of the River Limmat,...",1,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-1
1,Left Shore of the Lake,"The architecturally attractive Enge Train Station built out of Ticino granite, the Museum Rietbe...",2,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-2
2,Creative Quarter of Wiedikon,"Once mainly a working-class neighborhood, the district below the Uetliberg is now a popular resi...",3,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-3
3,Creative Quarter of Langstrasse,"The district around Langstrasse was long regarded as a den of iniquity. Nowadays, it is known fo...",4,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-4
4,Creative Quarter of Zürich-West,"In the quarter where huge machines once clattered away, now nightclubs, cultural institutions, a...",5,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-5
5,University Quarter,"University buildings, Jugendstil villas, green areas, and the Dynamo cultural center characteriz...",6,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-6
6,On the Zürichberg,"The hillside location, little traffic, and fabulous views mean than living here does not come ch...",7,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-7
7,Right Shore of the Lake,This district starts behind the Opera House and stretches as far as the open-air bathing facilit...,8,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-8
8,At the Foot of the Uetliberg,"For a long time, this district was solely known for the Letzigrund Stadium. In recent years, how...",9,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-9
9,Right of the Limmat,"This district boasts a fantastic location: in summer, everyone meets at the Unterer Letten and O...",10,https://www.zuerich.com/en/visit/about-zurich/zurichs-districts#s-10


In [7]:
# create a folder data if it does not exist yet
def save_to_data(df: pd.DataFrame, filename: str, data_dir: str = '../data'):
  """Saves the dataframe to the data folder as a csv file. 
    If the folder does not exist, it will be created."""
  # create the full path
  df_path = Path(data_dir) / filename
  # check if the data folder exists
  if not df_path.parent.exists():
    df_path.parent.mkdir(parents=True)
  # save the dataframe as file according to the extension
  ext = df_path.suffix.lstrip('.')
  match ext:
    case 'csv':
      df.to_csv(df_path, index=False)
    case 'json':
      df.to_json(df_path, orient='records')
    case _:
      raise ValueError(f"Extension {ext} not supported.")


# save the dataframe to the data folder
save_to_data(districts_df, 'zurich_districts.csv')

# districts_df.to_csv("../data/zurich_districts.csv", index=False)

#### Info for Dog Breeds from the FCI website

In [8]:
def process_breed(breed_link):
  """Get the breed info from a breed link and process it for info we want"""
  soup = BeautifulSoup(urlopen(breed_link).read(), "html.parser")
  breed_info = {
      "breed_text":
      soup.find("h2").text.strip() if soup.find("h2") else None,
      "link":
      breed_link,
      "breed_group":
      soup.find(id="ContentPlaceHolder1_GroupeHyperLink").text.strip()
      if soup.find(id="ContentPlaceHolder1_GroupeHyperLink") else None,
      "breed_translations": [
          row.find_all("span")[0].text
          for row in soup.find(class_="racesgridview").find_all("tr")[1:]
      ] if soup.find(class_="racesgridview") else None,
      "subsection":
      next(
          (cells[1].text.strip()
           for row in soup.find_all(class_="racetable")[0].find_all("tr")
           if (cells := row.find_all("td"))
           and "subsection" in cells[0].text.lower()),
          None,
      ),
      "section":
      next(
          (cells[1].text.strip()
           for row in soup.find_all(class_="racetable")[0].find_all("tr")
           if (cells := row.find_all("td"))
           and "section" in cells[0].text.lower()),
          None,
      ),
      "date_of_acceptance":
      next(
          (cells[1].text.strip()
           for row in soup.find_all(class_="racetable")[0].find_all("tr")
           if (cells := row.find_all("td"))
           and "date of acceptance" in cells[0].text.lower()),
          None,
      ),
      "country_of_origin": [
          cells[1].text.strip()
          for row in soup.find_all(class_="racetable")[1].find_all("tr")
          if (cells := row.find_all("td"))
          and "country of origin" in cells[0].text.lower()
      ],
      "varieties": [
          spans[0].text for variety in soup.find(class_="varietes").find_all(
              class_="variete") if (spans := variety.find_all("span"))
      ] if soup.find(class_="varietes") else [],
  }
  return breed_info


def process_letter(letter, link):
  """Get the breeds for a letter and process them"""
  letter_page = urlopen(link)
  soup = BeautifulSoup(letter_page, "html.parser")

  # Get the breeds on this page and process them
  breeds_element = soup.find("ul", {"class": "listeraces"})
  breed_links = [
      urljoin(link, a["href"]) for a in breeds_element.find_all("a")
  ]
  breed_info_futures = []
  with cf.ThreadPoolExecutor() as executor:
    breed_info_futures = [
        executor.submit(process_breed, breed_link) for breed_link in tqdm(
            breed_links, total=len(breed_links), desc=f"Processing {letter}")
    ]
  return breed_info_futures


def get_fci_breed_data():
  """Get breed data from the FCI website for processing"""
  fci_nonmenclature_url = "https://fci.be/en/Nomenclature/Default.aspx"

  fci_response = urlopen(fci_nonmenclature_url)
  fci_content = fci_response.read()
  fci_soup = BeautifulSoup(fci_content, "html.parser")

  letters_element = fci_soup.find("ul", {"class": "initiales"})

  # we have 'D':href=../../nomenclature/races.aspx?init=D
  # we want https://fci.be/en/nomenclature/races.aspx?init=D so use urljoin
  letter_link = {
      a.text: urljoin(fci_nonmenclature_url, a["href"])
      for a in letters_element.find_all("a")
  }

  with cf.ThreadPoolExecutor() as executor:
    letter_futures = {
        executor.submit(process_letter, letter, link)
        for letter, link in letter_link.items()
    }
    fci_breed_data = []
    for future in tqdm(
        cf.as_completed(letter_futures),
        total=len(letter_futures),
        desc="Processing letters",
    ):
      breed_info_futures = future.result()
      for breed_future in cf.as_completed(breed_info_futures):
        fci_breed_data.append(breed_future.result())
    return fci_breed_data

In [9]:
fci_breed_data = get_fci_breed_data()
clear_output()

In [10]:

# put in dataframe and save to json to avoid having to scrape again
fci_breeds_df = pd.DataFrame(fci_breed_data)
# save to json
save_to_data(fci_breeds_df, 'fci_breeds_raw.json')
# fci_breeds_df.to_json("../data/fci_breeds_raw.json", orient="records")

##### Read in data

Read in the raw data to be cleaned

In [11]:
# read in the data and take a look
fci_breeds_df = pd.read_json("../data/fci_breeds_raw.json", orient="records")
fci_breeds_df.sample()
fci_breeds_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 356 entries, 0 to 355
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   breed_text          356 non-null    object 
 1   link                356 non-null    object 
 2   breed_group         356 non-null    object 
 3   breed_translations  356 non-null    object 
 4   subsection          0 non-null      float64
 5   section             355 non-null    object 
 6   date_of_acceptance  0 non-null      float64
 7   country_of_origin   356 non-null    object 
 8   varieties           356 non-null    object 
dtypes: float64(2), object(7)
memory usage: 27.8+ KB


In [12]:
# get the number in ()
fci_breeds_df["fci_num"] = fci_breeds_df["breed_text"].str.extract(
    r"\((\d+)\)", expand=False
)
# get the name before the \r\n\t
fci_breeds_df["breed"] = fci_breeds_df["breed_text"].str.extract(r"(.*?)\r\n\t")
# add the value in the breed column to the list in the translations column in each respective row
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["breed_translations"] + [x["breed"]], axis=1
)
# convert to lower case
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda x: [i.lower() for i in x]
)
# english version is the first translation
fci_breeds_df["breed_en"] = fci_breeds_df["breed_translations"].apply(
    lambda x: x[0].lower()
)

We extracted some data which we placed in a separate column but we still left the original column for reference. Looking at the dataframe you will notice that each breed is in its own row, and some columns such as `breed_translation`, `alt_names`,  `country_of_origin` are actually a list in the dataframe column. This is something that we need to be aware about during our data extraction and cleaning process.

In [13]:
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains(r"belgian", case=False, regex=True)
]

Unnamed: 0,breed_text,link,breed_group,breed_translations,subsection,section,date_of_acceptance,country_of_origin,varieties,fci_num,breed,alt_names,breed_en
197,CHIEN DE BERGER BELGE\r\n\t\t\t(15),https://fci.be/en/nomenclature/BELGIAN-SHEPHERD-DOG-15.html,n°1 - Sheepdogs and Cattledogs (except Swiss Cattledogs),"[BELGIAN SHEPHERD DOG, CHIEN DE BERGER BELGE, BELGISCHER SCHÄFERHUND, PERRO DE PASTOR BELGA]",,Sheepdogs,,[],"[a) Groenendael, b) Laekenois, c) Malinois, d) Tervueren]",15,CHIEN DE BERGER BELGE,"[belgian shepherd dog, chien de berger belge, belgischer schäferhund, perro de pastor belga, chi...",belgian shepherd dog


We'll refine our data to improve the accuracy of our fuzzy matching function. This includes adding colloquial breed names to the `alt_names` column, which serves as a comprehensive reference for matching.

In [14]:
# edit all breeds with '- haired' in the name to remove the space and the '-'
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"- ?haired", "haired", regex=True
)
# add the 'breed_en' breed to the list in alt_names column
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["alt_names"] + [x["breed_en"]], axis=1
)

# create a column for the number of varieties from the varieties column
fci_breeds_df["n_varieties"] = fci_breeds_df["varieties"].transform(len)
# clean up the letter-numbering in the varieties column 'a)'
fci_breeds_df["varieties"] = fci_breeds_df["varieties"].apply(
    lambda x: [re.sub(r"^[a-z]\) ", "", i).lower() for i in x]
)

# extract the group number and name from the group column
fci_breeds_df["group_num"] = fci_breeds_df["breed_group"].str.extract(r"(\d+)")
fci_breeds_df["group_name"] = (
    fci_breeds_df["breed_group"].str.split("-", n=1, expand=True)[1].str.strip()
)

# if there is a breed name with 'pointing dog' in its alt_names, also add the breed name with 'pointer' in its name
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"pointing dog", "pointer", regex=True
)
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["alt_names"] + [x["breed_en"]], axis=1
)

# display only 3 of the the breeds with varieties
fci_breeds_df[fci_breeds_df["n_varieties"] > 0][
    ["breed", "varieties", "alt_names", "breed_en"]
].sample(3)

Unnamed: 0,breed,varieties,alt_names,breed_en
296,PODENGO PORTUGUÊS,"[small, medium-sized, large]","[portuguese warren hound-portuguese podengo, chien de garenne portugais, portugiesischer podengo...",portuguese warren hound-portuguese podengo
115,DEUTSCHER PINSCHER,"[self coloured: deer red, reddish-brown to dark red brown, black and tan]","[german pinscher, pinscher allemand, deutscher pinscher, pinscher alemán, deutscher pinscher, ge...",german pinscher
132,DEUTSCHE DOGGE,"[fawnb) brindle, blackd) harlequin, blue]","[great dane, dogue allemand, deutsche dogge, gran danés, deutsche dogge, great dane, great dane]",great dane


Some of the `varieties` are just variations in size, coat-color, -hair-length of the same breed. Still, some variations are so popular that they are referred to by this variation name. We will add these variations to the their `alt_names` list.These include:
- swiss hound
- small swiss hound
- german spitz
- belgian shepherd dog
- continental toy spaniel
- chinese crested dog

In [15]:
# add the varieties to the alt_names column of some popular breeds


popular_breeds = [
    "swiss hound",
    "small swiss hound",
    "german spitz",
    "belgian shepherd dog",
    "continental toy spaniel",
    "chinese crested dog",
]


popular_names_mask = fci_breeds_df["breed_en"].isin(popular_breeds)


fci_breeds_df.loc[popular_names_mask, "alt_names"] = (
    fci_breeds_df.loc[popular_names_mask, "alt_names"]
    + fci_breeds_df.loc[popular_names_mask, "varieties"]
)

fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("belgian shepherd dog"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("belgian shepherd dog"), "alt_names"
].apply(
    lambda x: x
    + [
        "belgian sheepdog",
        "belgian tervuren",
        "belgian malinois",
        "belgian groenendael",
        "belgian laekenois",
    ]
)

fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("toll"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("toll"), "alt_names"
].apply(
    lambda x: x + ["toller"]
)

fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("vallhund"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("vallhund"), "alt_names"
].apply(
    lambda x: x + ["westgotenspitz"]
)
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("bernese"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("bernese"), "alt_names"
].apply(
    lambda x: x + ["durbachler"]
)
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("shetland"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("shetland"), "alt_names"
].apply(
    lambda x: x + ["sheltie"]
)
fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("german spitz"), "alt_names"
] = fci_breeds_df.loc[
    fci_breeds_df["breed_en"].str.contains("german spitz"), "alt_names"
].apply(
    lambda x: x
    + [
        "wolfsspitz",
        "keeshond",
        "kleinspitz",
        "mittelspitz",
        "grossspitz",
        "zwergspitz",
    ]
)

In [16]:
fci_breeds_df["no_accent"] = fci_breeds_df["alt_names"].apply(
    lambda x: [hf.remove_accents(i) for i in x]
)
# add the no_accent to the alt_names column and remove duplicates
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"] + fci_breeds_df["no_accent"]
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda x: [i.lower() for i in x]
)
# reduce the duplicates within each alt_names list
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].transform(set)

In [17]:
# remove the '()' from the alt_names
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda x: [i.replace("(", "").replace(")", "") for i in x]
)

In [18]:
fci_breeds_df.sample(3)

Unnamed: 0,breed_text,link,breed_group,breed_translations,subsection,section,date_of_acceptance,country_of_origin,varieties,fci_num,breed,alt_names,breed_en,n_varieties,group_num,group_name,no_accent
288,ZWERGSCHNAUZER\r\n\t\t\t(183),https://fci.be/en/nomenclature/MINIATURE-SCHNAUZER-183.html,n°2 - Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs,"[MINIATURE SCHNAUZER, SCHNAUZER NAIN, ZWERGSCHNAUZER, SCHNAUZER MINIATURA]",,Pinscher and Schnauzer type,,[],"[pepper and salt, pure black with black undercoat, black and silver, pure white with white under...",183,ZWERGSCHNAUZER,"[miniature schnauzer, schnauzer nain, zwergschnauzer, schnauzer miniatura]",miniature schnauzer,4,2,Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs,"[miniature schnauzer, schnauzer nain, zwergschnauzer, schnauzer miniatura, zwergschnauzer, minia..."
110,DOGO ARGENTINO\r\n\t\t\t(292),https://fci.be/en/nomenclature/DOGO-ARGENTINO-292.html,n°2 - Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs,"[DOGO ARGENTINO, DOGUE ARGENTIN, ARGENTINISCHE DOGGE, DOGO ARGENTINO]",,Molossian type,,[],[],292,DOGO ARGENTINO,"[dogue argentin, argentinische dogge, dogo argentino]",dogo argentino,0,2,Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs,"[dogo argentino, dogue argentin, argentinische dogge, dogo argentino, dogo argentino, dogo argen..."
177,BASENJI\r\n\t\t\t(43),https://fci.be/en/nomenclature/BASENJI-43.html,n°5 - Spitz and primitive types,"[BASENJI, BASENJI, BASENJI, BASENJI]",,Primitive type,,[],[],43,BASENJI,[basenji],basenji,0,5,Spitz and primitive types,"[basenji, basenji, basenji, basenji, basenji, basenji, basenji]"


In [19]:
# # stopped working as the website has changed but not necessary for the project
# fci_breeds_df.country_of_origin.apply(
#     lambda x: any(re.search("Switzerland", i, re.IGNORECASE) for i in x)
# ).sum()

In [20]:
# save to json
save_to_data(fci_breeds_df, 'fci_breeds.json')
# fci_breeds_df.to_json("../data/fci_breeds.json", orient="records")

#### Info for Dog Breeds for AKC website


In [21]:
akc_home_url = "https://www.akc.org/dog-breeds/"

In [22]:
def get_breed_link_info(link):
    """Get the breed link info from the AKC website"""
    breed_link = {}
    driver = get_driver()
    try:
        driver.get(link)
        WebDriverWait(driver, 30).until(
            ec.presence_of_element_located(
                (By.XPATH, "//select[@class='custom-select__select']/option")
            )
        )
        # navigate_to_akc_homepage(driver, link)
        options = driver.find_elements(
            By.XPATH, "//select[@class='custom-select__select']/option"
        )
        for option in options:
            breed_link[option.text] = option.get_attribute("value")
        print(f"Number of breeds: {len(breed_link)}")

        return breed_link
    except TimeoutException:
        print(f"Timeout while loading page: {link}")
    except Exception as e:
        print(f"An error occurred while loading page: {link}\n{e}")
    finally:
        driver.quit()


# get the links for each breed
breed_link = get_breed_link_info(akc_home_url)

Number of breeds: 289


In [23]:
# Remove the empty values
akc_breed_link_dict = {
    key.lower(): value for (key, value) in breed_link.items() if value != ""
}
len(akc_breed_link_dict)

288

In [24]:
def get_soup(page_source):
    return BeautifulSoup(page_source, "lxml")


def get_breed_info(page_source):
    """Function to get the breed info from the AKC website."""
    breed_metadata = defaultdict(str)

    soup = get_soup(page_source)

    def extract_text(tag, class_name):
        """Helper function to extract text from a tag with error handling."""
        try:
            return soup.find(tag, {"class": class_name}).text
        except AttributeError:
            return ""

    # Extract breed, temperment, popularity rank,  year recognized, group
    breed_metadata["breed_page"] = extract_text("h1", "page-header__title")
    breed_metadata["temperment"] = extract_text(
        "p", "breed-page__intro__temperment")
    breed_metadata["popularity"] = extract_text(
        "span", "breed-page__popularity__custom-label"
    )
    breed_metadata["year_recognized"] = extract_text(
        "p", "breed-page__popularity__ranking-title"
    )
    breed_metadata["group"] = extract_text(
        "a", "breed-page__intro__group__tooltip")

    # Extract height, weight, life expectancy
    try:
        element = soup.find(
            "div", {"class": "breed-page__hero__overview__icon-block-wrap"}
        )
        attribute_map = {
            "height": "height",
            "weight": "weight",
            "life expectancy": "life_expectancy",
        }
        for ele in element:
            for attribute, key in attribute_map.items():
                if attribute in ele.text.lower():
                    breed_metadata[key] = ele.text
    except AttributeError:
        breed_metadata["height"] = ""
        breed_metadata["weight"] = ""
        breed_metadata["life_expectancy"] = ""

    return breed_metadata
# Constants
SELECTORS = [
    "h1.page-header__title",
    "p.breed-page__intro__temperment",
    "div.breed-page__hero__overview__icon-block",
    "div.breed-page__intro__group",
]


def get_page_source(driver, link):
    """Navigate to the page and return the page source."""
    try:
        driver.get(link)
        # Wait for the specific elements to be loaded
        for selector in SELECTORS:
            WebDriverWait(driver, 30).until(
                ec.visibility_of_element_located((By.CSS_SELECTOR, selector))
            )
        # Get the webpage's HTML content
        return driver.page_source
    except TimeoutException:
        print(f"Timeout while loading page: {link}")
        return None
    except Exception as e:
        print(f"An error occurred while loading page: {link}\n{e}")
        return None


def get_contents(breed, link):
    breed_info = defaultdict(str)

    with get_driver() as driver:
        try:
            page_source = get_page_source(driver, link)
            if page_source is not None:
                breed_info = get_breed_info(page_source)
                breed_info["breed"] = breed
                breed_info["link"] = link
            return breed_info
        except TimeoutException:
            print(f"Timeout while loading page: {link}")
        except AttributeError:
            print(f"Attribute error | {breed}")
        except Exception as e:
            print(f"An error occurred while loading page: {link}\n{e}")
        finally:
            driver.quit()


def get_akc_breed_data():
    with cf.ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(get_contents, breed, link)
            # for breed, link in sample_dict.items()
            for breed, link in akc_breed_link_dict.items()
        }
        breed_data = list(
            tqdm(
                (future.result() for future in cf.as_completed(futures)),
                total=len(akc_breed_link_dict),
                # total=len(sample_dict),
            )
        )
        return breed_data

In [25]:
breed_data = get_akc_breed_data()

  0%|          | 0/288 [00:00<?, ?it/s]

Timeout while loading page: https://www.akc.org/dog-breeds/braque-saint-germain/
Timeout while loading page: https://www.akc.org/dog-breeds/slovakian-wirehaired-pointer/


In [26]:
breed_data = [x for x in breed_data if x is not None]

akc_physical_traits = pd.DataFrame(breed_data)
akc_physical_traits["breed"] = (
    akc_physical_traits["breed"]
    .str.lower()
    .str.replace("-", " ")
    .str.replace("'", "")
    # .apply(remove_accents)
)
akc_physical_traits.sample()

Unnamed: 0,breed_page,temperment,popularity,year_recognized,group,height,weight,life_expectancy,breed,link
87,Deutscher Wachtelhund,friendly / versatile / determined,,,Foundation Stock Service »,Height18-21 inches,Weight40-55 pounds,Life Expectancy12-14 years,deutscher wachtelhund,https://www.akc.org/dog-breeds/deutscher-wachtelhund/


In [27]:
def parse_range(s):
    """"""
    if isinstance(s, str):
        numbers = list(map(float, re.findall(r"\d+(?:\.\d+)?", s)))
        if numbers:
            return numbers + [None] * (2 - len(numbers))
    return [None, None]


def get_upper_lower_bound(dataframe, column):
    """Parse a range string and return the lower and upper bounds."""
    dataframe[column + "_ll"], dataframe[column + "_ul"] = zip(
        *dataframe[column].map(parse_range)
    )
    dataframe[column + "_ul"].fillna(dataframe[column + "_ll"], inplace=True)


# add in the columns of the upper and lower bounds for height, weight, and life expectancy
get_upper_lower_bound(akc_physical_traits, "weight")
get_upper_lower_bound(akc_physical_traits, "height")
get_upper_lower_bound(akc_physical_traits, "life_expectancy")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column + "_ul"].fillna(dataframe[column + "_ll"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[column + "_ul"].fillna(dataframe[column + "_ll"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because 

In [28]:
akc_physical_traits["group"] = akc_physical_traits["group"].str.strip(" »")
akc_physical_traits["breed_page"] = akc_physical_traits["breed_page"].str.lower()
akc_physical_traits["year_accepted"] = akc_physical_traits[
    "year_recognized"
].str.extract(r"(\d{4})")
akc_physical_traits["is_recognized_breed"] = akc_physical_traits[
    "year_accepted"
].notna()

In [29]:
# create a function which takes the column names and creates a column with a unique list of those column values in each row
def create_alt_names(dataframe, list_of_columns):
    """creates a column with a unique list of those column values in each row"""
    dataframe["alt_names"] = dataframe[list_of_columns].apply(
        lambda row: set(x for x in row if pd.notna(x)), axis=1
    )

In [30]:
create_alt_names(akc_physical_traits, ["breed_page", "breed"])

In [31]:
print(f"{akc_physical_traits.shape=}")
akc_physical_traits.isna().sum()
akc_physical_traits.loc[akc_physical_traits["weight_ll"].isna()]

akc_physical_traits.shape=(288, 19)


Unnamed: 0,breed_page,temperment,popularity,year_recognized,group,height,weight,life_expectancy,breed,link,weight_ll,weight_ul,height_ll,height_ul,life_expectancy_ll,life_expectancy_ul,year_accepted,is_recognized_breed,alt_names
62,,,,,,,,,,,,,,,,,,False,{}
68,cane corso,intelligent / affectionate / majestic,18 of 201,The Cane Corso was recognized as a breed by the AKC in 2010.,Working Group,WeightProportionate to height,WeightProportionate to height,Life Expectancy9-12 years,cane corso,https://www.akc.org/dog-breeds/cane-corso/,,,,,9.0,12.0,2010.0,True,{cane corso}
251,,,,,,,,,,,,,,,,,,False,{}
255,spinone italiano,sociable / patient / docile,112 of 201,The Spinone Italiano was recognized as a breed by the AKC in 2000.,Sporting Group,Height23-27 inches (male)22-25 inches (female),WeightIn direct proportion to size and structure of a dog,Life Expectancy10-12 years,spinone italiano,https://www.akc.org/dog-breeds/spinone-italiano/,,,23.0,27.0,10.0,12.0,2000.0,True,{spinone italiano}


In [32]:
akc_physical_traits.sample(5)

Unnamed: 0,breed_page,temperment,popularity,year_recognized,group,height,weight,life_expectancy,breed,link,weight_ll,weight_ul,height_ll,height_ul,life_expectancy_ll,life_expectancy_ul,year_accepted,is_recognized_breed,alt_names
51,boston terrier,friendly / bright / amusing,24 of 201,The Boston Terrier was recognized as a breed by the AKC in 1893.,Non-Sporting Group,Height10-12 inches,Weight12-25 pounds,Life Expectancy11-13 years,boston terrier,https://www.akc.org/dog-breeds/boston-terrier/,12.0,25.0,10.0,12.0,11.0,13.0,1893.0,True,{boston terrier}
38,belgian tervuren,alert / intelligent / courageous,104 of 201,The Belgian Tervuren was recognized as a breed by the AKC in 1959.,Herding Group,Height24-26 inches (male)22-24 inches (female),Weight55-75 pounds (male)45-60 pounds (female),Life Expectancy12-14 years,belgian tervuren,https://www.akc.org/dog-breeds/belgian-tervuren/,55.0,75.0,24.0,26.0,12.0,14.0,1959.0,True,{belgian tervuren}
13,american staffordshire terrier,good-natured / confident / smart,87 of 201,The American Staffordshire Terrier was recognized as a breed by the AKC in 1936.,Terrier Group,Height18-19 inches (male)17-18 inches (female),Weight55-70 pounds (male)40-55 pounds (female),Life Expectancy12-16 years,american staffordshire terrier,https://www.akc.org/dog-breeds/american-staffordshire-terrier/,55.0,70.0,18.0,19.0,12.0,16.0,1936.0,True,{american staffordshire terrier}
23,basenji,independent / smart / poised,91 of 201,The Basenji was recognized as a breed by the AKC in 1944.,Hound Group,Height17 inches (male)16 inches (female),Weight24 pounds (male)22 pounds (female),Life Expectancy13-14 years,basenji,https://www.akc.org/dog-breeds/basenji/,24.0,22.0,17.0,16.0,13.0,14.0,1944.0,True,{basenji}
159,korean jindo dog,alert / loyal / intelligent,,,Foundation Stock Service,Height19-22 (males)17-20 (females),Weight40 -50 (males)30-40 (females),Life Expectancy14-15 years,korean jindo dog,https://www.akc.org/dog-breeds/korean-jindo-dog/,40.0,50.0,19.0,22.0,14.0,15.0,,False,{korean jindo dog}


In [None]:
# akc_physical_traits.to_json("../data/akc_breeds.json", orient="records")
# save to json
save_to_data(akc_physical_traits, 'akc_breeds.json')