In [None]:
# Standard library imports
import itertools as it
from typing import Callable, Optional
from contextlib import contextmanager
import concurrent.futures as cf


# import json
import re
from collections import defaultdict
from functools import partial
from pathlib import Path
from typing import Optional
from urllib.request import urlopen
from urllib.parse import urljoin
import unicodedata

# Third-party imports
from bs4 import BeautifulSoup
from fiona.io import ZipMemoryFile
from lxml import etree
import lxml
from matplotlib import pyplot as plt
import geopandas as gpd
import geoviews as gv
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from thefuzz import fuzz
from thefuzz import process
from tqdm.notebook import tqdm
from IPython.display import clear_output

# Local application imports
from translate_app import translate_list_to_dict

In [None]:
# Notebook-wide pandas configuration: silence the chained-assignment warning
# (pandas' default is "warn") and raise the display limits.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)
pd.options.display.max_colwidth = 100

In [None]:
def convert_to_snake_case(item):
    """Convert a camelCase, PascalCase, hyphenated or spaced name to snake_case.

    Args:
        item: The string to convert.

    Returns:
        The lower-cased string with word boundaries marked by underscores.
        A digit that directly follows a letter is also split off
        ("column1" -> "column_1").
    """
    # Normalise hyphens and whitespace first. Doing this before the camelCase
    # rules keeps a separator followed by a capital ("Hello World",
    # "some-Name") from receiving a second underscore.
    text = re.sub(r"[-\s]", "_", item)
    # Add _ before a capitalised word, unless an underscore is already there
    text = re.sub(r"([^_])([A-Z][a-z]+)", r"\1_\2", text)
    # Add _ before an uppercase letter that follows a lowercase letter or digit
    text = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", text)
    # Add _ between a letter and a digit
    text = re.sub(r"([a-zA-Z])([0-9])", r"\1_\2", text)
    return text.lower()

In [None]:
@contextmanager
def start_driver():
    """Context manager that yields a headless Chrome WebDriver.

    The driver is quit when the ``with`` block exits, whether normally or
    through an exception.
    """
    headless_options = Options()
    headless_options.add_argument("--headless")
    browser = webdriver.Chrome(options=headless_options)
    try:
        yield browser
    finally:
        browser.quit()


def get_driver():
    """Create and return a headless Chrome WebDriver.

    The caller owns the returned driver and is responsible for calling
    ``quit()`` on it.
    """
    opts = Options()
    opts.add_argument("--headless")
    return webdriver.Chrome(options=opts)

In [None]:
def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is decomposed with Unicode NFKD normalisation and then encoded
    to ASCII with errors ignored, so combining accents are stripped
    ("é" -> "e") and any character without an ASCII decomposition is
    dropped entirely.
    """
    decomposed = unicodedata.normalize("NFKD", input_str)
    ascii_bytes = decomposed.encode("ASCII", "ignore")
    return ascii_bytes.decode()


remove_accents("résuméö")

In [None]:
def sanitize_df_column_names(df):
    """Translate the column names of ``df`` and convert them to snake_case.

    The frame is modified in place and also returned. Translation is
    delegated to ``translate_list_to_dict`` and the case conversion to
    ``convert_to_snake_case``.
    """
    original_names = df.columns.tolist()
    translations = translate_list_to_dict(original_names)
    # Apply the translations first, then snake-case whatever names result
    df.rename(columns=translations, inplace=True)
    df.columns = list(map(convert_to_snake_case, df.columns))
    return df


def rename_keys(d, prefix="zurich_gdf_"):
    """Return a new dict whose keys are ``prefix`` plus the item's position.

    The original keys are discarded; values keep their iteration order.
    """
    return {f"{prefix}{position}": value for position, value in enumerate(d.values())}

In [None]:
# Match a breed name against a collection of candidate names with thefuzz
def match_breed_name(name, choices, scorer=fuzz.token_sort_ratio):
    """Return ``(best_choice, score)`` for ``name`` among ``choices``.

    An exact member of ``choices`` is returned immediately with score 100;
    otherwise the best candidate reported by ``process.extractOne`` with the
    given ``scorer`` is returned.
    """
    if name not in choices:
        best = process.extractOne(name, choices, scorer=scorer)
        return best[0], best[1]
    return name, 100

In [None]:
def find_breed_match(
    input_breed: str,
    breeds_df: pd.DataFrame,
    scoring_functions: list[Callable[[str, str], int]],
    threshold: int = 90,
) -> Optional[str]:
    """Find the best-matching FCI breed for ``input_breed``.

    Args:
        input_breed: The breed name to look up.
        breeds_df: Frame with a ``breed_en`` column and an ``alt_names``
            column holding an iterable of alternative names per row.
        scoring_functions: Callables ``f(input_breed, alt_name) -> score``.
            The highest score over all functions and alternative names is
            the score of a row.
        threshold: Minimum score a row needs to be accepted.

    Returns:
        The ``breed_en`` of the best-scoring row (on ties the later row
        wins), or ``np.nan`` when no row reaches ``threshold``.
    """
    max_score = threshold
    best_match = np.nan

    for breed_name, alternative_names in zip(
        breeds_df["breed_en"], breeds_df["alt_names"]
    ):
        # Best score over every (alternative name, scoring function) pair.
        # ``default=None`` covers rows without alternative names and an empty
        # ``scoring_functions`` list, where a bare max() would raise.
        current_score = max(
            (
                scoring_function(input_breed, alt_name)
                for alt_name in alternative_names
                for scoring_function in scoring_functions
            ),
            default=None,
        )
        if current_score is None:
            continue

        if current_score >= max_score:
            best_match, max_score = breed_name, current_score
            # A score of 100 is a perfect match, so stop early. The check
            # sits inside the update branch so that ``threshold=100`` does
            # not end the search before any row has actually matched.
            if max_score == 100:
                break

    return best_match


def apply_fuzzy_matching_to_breed_column(
    dataframe: pd.DataFrame,
    breed_column: str,
    fci_df: pd.DataFrame,
    scoring_functions: list[Callable[[str, str], int]],
    threshold: int = 90,
) -> pd.Series:
    """Run ``find_breed_match`` on every value of ``dataframe[breed_column]``.

    Returns a Series aligned with ``dataframe`` that holds the matched
    breed (or the no-match value of ``find_breed_match``) for each row.
    """

    def _match_single_breed(breed):
        return find_breed_match(breed, fci_df, scoring_functions, threshold=threshold)

    breed_values = dataframe[breed_column]
    return breed_values.apply(_match_single_breed)

#### Info about Zurich districts



In [None]:
# URL of the page that describes Zurich's districts
zurich_districts_url = "https://www.zuerich.com/en/visit/about-zurich/zurichs-districts"

# Download the raw HTML. The context manager closes the HTTP response once the
# body has been read instead of leaving the connection open.
with urlopen(zurich_districts_url) as zurich_response:
    zurich_html_content = zurich_response.read()

In [None]:
# Build the BeautifulSoup tree from the downloaded page with the lxml parser
zurich_soup = BeautifulSoup(markup=zurich_html_content, features="lxml")

In [None]:
# Select the district sections: ids that start with "s-" followed by a number
# from 1 to 12. BeautifulSoup applies a compiled pattern with ``search``, so the
# pattern is anchored at the start and must not be followed by another digit;
# otherwise ids such as "s-13" or "news-1" would be selected as well.
pattern = re.compile(r"^s-(?:1[0-2]|[1-9])(?!\d)")
elements = zurich_soup.find_all(id=pattern)

In [None]:
# Map each district heading to the text of its first paragraph
districts = {element.find("h2").text: element.find("p").text for element in elements}
districts_df = pd.DataFrame.from_dict(districts, orient="index", columns=["desc"])

# Turn the heading index into a column and split it into the part before the
# en dash (district number) and the part after it (district name). ``n=1``
# splits on the first en dash only, so a name that itself contains an en dash
# stays in one piece instead of spilling into an extra, unnamed column.
districts_df = districts_df.reset_index()
heading_parts = (
    districts_df["index"]
    .str.split("–", n=1, expand=True)
    .rename({0: "district_number", 1: "district_name"}, axis=1)
)

# Pattern for the numeric part of the heading
regex_pattern = re.compile(r"([\d]+)")

# Assemble the final table from stripped text columns; the district number is
# kept as a categorical column.
districts_df = pd.DataFrame(
    {
        "district_name": heading_parts["district_name"].str.strip(),
        "desc": districts_df["desc"].str.strip(),
        "district": (
            heading_parts["district_number"]
            .str.strip()
            .str.extract(regex_pattern, expand=False)
            .astype("category")
        ),
    }
)

# Add a column with the length of the description
districts_df["desc_length"] = districts_df["desc"].str.len()

# Rich display of the frame as the cell output (instead of a plain-text print)
districts_df

In [None]:
districts_df.info()

In [None]:
# Wrap long text inside the table cells instead of truncating it
styler = districts_df.style.set_properties(**{"white-space": "pre-wrap"})

# Show the description text unchanged. The column is called "desc"; the
# previous key "description" matched no column of districts_df, so the
# formatter was never applied to the description column.
formatted_df = styler.format({"desc": lambda x: x})
formatted_df

In [None]:
# Persist the district table without the positional index
districts_df.to_csv(Path("../data/zurich_districts.csv"), index=False)

#### Info for Dog breeds from hunde-zauber.de

In [None]:
hz_url = "https://hunde-zauber.de/liste-aller-hunderassen-von-a-bis-z/"
hz_size_weight_url = "https://hunde-zauber.de/hund-gewicht-groesse-tabelle/"

# Download and parse the breed list page. The ``with`` block closes the HTTP
# response as soon as the body has been read.
with urlopen(hz_url) as hz_response:
    hz_html_content = hz_response.read()
hz_soup = BeautifulSoup(hz_html_content, "lxml")

# Same for the size/weight table page
with urlopen(hz_size_weight_url) as hz_size_weight_response:
    hz_size_weight_html_content = hz_size_weight_response.read()
hz_size_weight_soup = BeautifulSoup(hz_size_weight_html_content, "lxml")

In [None]:
hz_size_weight_tree = etree.HTML(hz_size_weight_html_content)
# Header cells of the first table header row. The path is relative (".//th"):
# a path starting with "//" searches the whole document even when it is
# evaluated on an element.
header = hz_size_weight_tree.xpath("//table/thead/tr")
column_headers = [th.text for th in header[0].xpath(".//th")]
# Body rows only. Selecting "tbody/tr" directly keeps the header row out of the
# data, so no leading row has to be skipped afterwards.
body = hz_size_weight_tree.xpath("//table/tbody")
rows = hz_size_weight_tree.xpath("//table/tbody/tr")
row_data = [[td.text for td in row.xpath(".//td")] for row in rows]
# Convert the nested list into a dataframe
hz_size_weight_df = pd.DataFrame(row_data, columns=column_headers)
hz_size_weight_df

In [None]:
# Collect every string that needs translating: the values of the first column
# plus the column headers themselves
first_column_values = hz_size_weight_df.iloc[:, 0].tolist()
german_to_translate = first_column_values + hz_size_weight_df.columns.tolist()
translated_dict = translate_list_to_dict(german_to_translate)

# Translate the headers, keeping the original text where no translation exists
hz_size_weight_df.columns = [
    translated_dict.get(name, name) for name in hz_size_weight_df.columns
]
# Add the translated first-column values as a new "breed_en" column
hz_size_weight_df["breed_en"] = hz_size_weight_df.iloc[:, 0].map(
    lambda german: translated_dict.get(german, german)
)

In [None]:
# Replace the column labels with fixed names: breed (German), height/weight for
# females and males, breed (English)
hz_size_weight_df = hz_size_weight_df.set_axis(
    [
        "breed_de",
        "f_height_cm",
        "f_weight_kg",
        "m_height_cm",
        "m_weight_kg",
        "breed_en",
    ],
    axis=1,
)


def split_column(df, column):
    """Extract the first two numbers of a text column into two new columns.

    Args:
        df: Frame containing ``column``; it is not modified.
        column: Name of a string column such as "25-30 cm".

    Returns:
        A new frame with the same index as ``df`` and the columns
        ``{column}_low`` and ``{column}_high`` (as strings). Rows that do not
        contain two separate numbers are NaN in both columns.
    """
    # The ``\D`` after the first group forces the first number to be matched
    # in full. Without it a single value such as "40 cm" was split into the
    # digits 4 and 0.
    bounds = df[column].str.extract(r"(\d+)\D.*?(\d+)")
    bounds.columns = [f"{column}_low", f"{column}_high"]
    return bounds


columns_to_split = ["f_height_cm", "f_weight_kg", "m_height_cm", "m_weight_kg"]
numbers_df = pd.concat(
    [split_column(hz_size_weight_df, column) for column in columns_to_split], axis=1
)
# Attach the low/high bounds to the main table. The join result used to be
# built and then discarded, so the bounds never reached hz_size_weight_df.
# All existing columns are kept; the bound columns are added.
# Note: after this join the cell must be re-run together with the cells that
# rebuild hz_size_weight_df, because the frame now has more columns.
hz_size_weight_df = hz_size_weight_df.join(numbers_df)
hz_size_weight_df["breed_de"] = hz_size_weight_df["breed_de"].str.lower()
hz_size_weight_df["breed_en"] = hz_size_weight_df["breed_en"].str.lower()

In [None]:
# Persist the size/weight table as a list of JSON records
hz_size_weight_df.to_json(Path("../data/hz_breeds_size.json"), orient="records")

In [None]:
# hz_size_weight_df

In [None]:
# Load the breed list in a headless browser and read the text of every table
# cell once a table is present on the page
with start_driver() as driver:
    driver.get(hz_url)
    table_is_present = EC.presence_of_element_located((By.XPATH, "//table"))
    WebDriverWait(driver, 10).until(table_is_present)
    cells = driver.find_elements(By.XPATH, "//table//td")
    cell_orig = [table_cell.text for table_cell in cells]

In [None]:
cell_orig

In [None]:
breed_de = pd.DataFrame(cell_orig)
# Each cell looks like '142. English Pointer': an optional "<number>." prefix
# followed by the breed name
pattern = re.compile(r"(\d+\.)?(.*)")
number_and_name = breed_de[0].str.extract(pattern)
number_and_name.columns = ["breed_number", "breed_de"]
# Cells without a leading number are dropped from the extracted parts; after
# index alignment they are NaN in the main frame and removed just below
breed_de[["breed_number", "breed_de"]] = number_and_name.dropna()
breed_de = breed_de.dropna()

# "142." -> 142
breed_de["breed_number"] = breed_de["breed_number"].str.strip(".").astype("int")
breed_de["breed_de"] = breed_de["breed_de"].str.strip().str.lower()
# Translate the names and store them lower-cased as well
german_to_english = translate_list_to_dict(breed_de["breed_de"].tolist())
breed_de["breed_en"] = breed_de["breed_de"].map(german_to_english).str.lower()
breed_de

In [None]:
# Persist the English/German breed names as a list of JSON records
breed_names_only = breed_de[["breed_en", "breed_de"]]
breed_names_only.to_json(Path("../data/hz_breeds.json"), orient="records")

In [None]:
# Breed names that appear in only one of the two hunde-zauber tables
diff_set = set(breed_de["breed_de"].tolist()) ^ set(
    hz_size_weight_df["breed_de"].tolist()
)

# Stack the affected rows of both tables (names only) and show them sorted
only_in_list = breed_de.loc[breed_de["breed_de"].isin(diff_set)]
only_in_size_table = hz_size_weight_df.loc[hz_size_weight_df["breed_de"].isin(diff_set)]
diff_set_df = pd.concat(
    [
        only_in_list[["breed_de", "breed_en"]],
        only_in_size_table[["breed_de", "breed_en"]],
    ],
)
diff_set_df.sort_values(by="breed_de")

#### Info for Dog breeds from FCI

In [None]:
fci_url = "https://www.fci.be/en/Nomenclature/educationGroupe.aspx"
# Download the page; the ``with`` block closes the HTTP response after reading
with urlopen(fci_url) as fci_response:
    fci_html_content = fci_response.read()

fci_parsed_html = etree.HTML(fci_html_content)

In [None]:
# Map the text of every element with class "nom" to its href attribute
elements = fci_parsed_html.xpath("//*[@class='nom']")
breed_groups = {node.text: node.get("href") for node in elements}

In [None]:
# One row per scraped entry: the link text as "breed", the href as "link"
fci_breeds_df = pd.DataFrame(
    {"breed": list(breed_groups.keys()), "link": list(breed_groups.values())}
)

# Pattern for the content of the right-most pair of parentheses
regex_pattern = re.compile(r"\((?=[^()]*\))([^()]+)\)$")

fci_breeds_df.sample(3)

In [None]:
# Split at the first "(": the text before it becomes "breed_orig", the rest
# "breed_en"
name_parts = fci_breeds_df["breed"].str.split("(", n=1, expand=True)
fci_breeds_df[["breed_orig", "breed_en"]] = name_parts
fci_breeds_df.sample(3)

It turned out that this page lists only 33 breeds.

In [None]:
# Keep only the letters, hyphens, dots and spaces in front of a ")"
only_letters_pattern = r"\(?([A-Za-z-\.\s]+)\)"
english_name = fci_breeds_df["breed_en"].str.extract(only_letters_pattern, expand=False)
# Fall back to the original name where no English name could be extracted
fci_breeds_df["breed_en"] = english_name.fillna(fci_breeds_df["breed_orig"])
# Normalise both name columns: trim whitespace and lower-case
fci_breeds_df["breed_orig"] = fci_breeds_df["breed_orig"].str.strip().str.lower()
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.strip().str.lower()

In [None]:
# Resolve each href against the page it was scraped from, so that "weblink" is
# a complete URL with a scheme. Plain concatenation with "www.fci.be" gave
# scheme-less URLs and cannot handle relative hrefs such as "../../page.aspx".
fci_breeds_df["weblink"] = fci_breeds_df["link"].apply(
    lambda href: urljoin(fci_url, href)
)


# fci_breeds_df

#### Get all the breeds from the FCI individually

In [None]:
# def click_element(driver, elements, index):
#     try:
#         elements[index].click()
#     except StaleElementReferenceException:
#         elements = driver.find_element(
#             By.CLASS_NAME, elements[0].get_attribute("class")
#         )
#         elements[index].click()


# def get_fci_breed_info(driver, breeds, n2):
#     breed_text = breeds[n2].text
#     breed_ref = breeds[n2].get_attribute("href")
#     click_element(driver, breeds, n2)

#     WebDriverWait(driver, 10).until(
#         EC.presence_of_element_located((By.ID, "ContentPlaceHolder1_GroupeHyperLink"))
#     )
#     breed_group = driver.find_element(By.ID, "ContentPlaceHolder1_GroupeHyperLink").text
#     table = driver.find_element(By.CLASS_NAME, "racesgridview")
#     breed_translations = [
#         row.find_elements(By.TAG_NAME, "span")[0].text
#         for row in table.find_elements(By.TAG_NAME, "tr")[1:]
#     ]

#     table2 = driver.find_elements(By.CLASS_NAME, "racetable")
#     left_rows2 = table2[0].find_elements(By.TAG_NAME, "tr")
#     right_rows2 = table2[1].find_elements(By.TAG_NAME, "tr")

#     breed_section = None
#     breed_subsection = None
#     breed_date_of_acceptance = None
#     breed_country_of_origin = None

#     for row in left_rows2:
#         cells = row.find_elements(By.TAG_NAME, "td")
#         if len(cells) >= 2:
#             if "subsection" in cells[0].text.lower():
#                 breed_subsection = cells[1].text
#             elif "section" in cells[0].text.lower():
#                 breed_section = cells[1].text
#             elif "date of acceptance" in cells[0].text.lower():
#                 breed_date_of_acceptance = cells[1].text

#     breed_country_of_origin = [
#         cells[1].text
#         for row in right_rows2
#         if len(cells := row.find_elements(By.TAG_NAME, "td")) >= 2
#         and "country of origin" in cells[0].text.lower()
#     ]

#     try:
#         table3 = driver.find_element(By.CLASS_NAME, "varietes")
#         breed_varieties = [
#             spans[0].text
#             for variety in table3.find_elements(By.CLASS_NAME, "variete")
#             if (spans := variety.find_elements(By.TAG_NAME, "span"))
#         ]
#     except NoSuchElementException:
#         breed_varieties = []

#     return (
#         breed_text,
#         breed_ref,
#         breed_group,
#         breed_translations,
#         breed_section,
#         breed_subsection,
#         breed_date_of_acceptance,
#         breed_country_of_origin,
#         breed_varieties,
#     )


# def get_fci_breeds(link):
#     name_link_list = []
#     driver = None
#     try:
#         driver = get_driver()
#         driver.get(link)
#         WebDriverWait(driver, 10).until(
#             EC.presence_of_element_located((By.CLASS_NAME, "initiales"))
#         )
#         letters = driver.find_element(By.CLASS_NAME, "initiales").find_elements(
#             By.TAG_NAME, "a"
#         )

#         for n, _ in enumerate(letters):
#             click_element(driver, letters, n)
#             WebDriverWait(driver, 10).until(
#                 EC.presence_of_element_located((By.CLASS_NAME, "listeraces"))
#             )
#             breeds = driver.find_element(By.CLASS_NAME, "listeraces").find_elements(
#                 By.TAG_NAME, "a"
#             )

#             for n2, _ in enumerate(breeds):
#                 breed_info = get_fci_breed_info(driver, breeds, n2)
#                 name_link_list.append(breed_info)
#                 driver.back()

#     except Exception as e:
#         print(f"An error occurred: {e}")
#     finally:
#         if driver is not None:
#             driver.quit()

#     return name_link_list

In [None]:
def _text_or_none(node):
    """Return the stripped text of a parsed element, or None if it is missing."""
    return node.text.strip() if node is not None else None


def _racetable_rows(soup, table_index):
    """Return the <tr> rows of the n-th "racetable" element, or [] if absent."""
    tables = soup.find_all(class_="racetable")
    if table_index >= len(tables):
        return []
    return tables[table_index].find_all("tr")


def _row_values(rows, label, exclude=None):
    """Yield the second-cell text of rows whose first cell contains ``label``.

    Rows with fewer than two cells are skipped, as are rows whose first cell
    also contains ``exclude``.
    """
    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        heading = cells[0].text.lower()
        if label in heading and not (exclude and exclude in heading):
            yield cells[1].text.strip()


def process_breed(breed_link):
    """Download one FCI breed page and extract its details.

    Args:
        breed_link: Absolute URL of the breed page.

    Returns:
        A dict with the keys ``breed_text``, ``link``, ``breed_group``,
        ``breed_translations`` (list, or None when the translations table is
        missing), ``subsection``, ``section``, ``date_of_acceptance`` (text or
        None), ``country_of_origin`` (list) and ``varieties`` (list).
    """
    # Close the HTTP response as soon as the body has been read
    with urlopen(breed_link) as response:
        soup = BeautifulSoup(response.read(), "html.parser")

    translations_table = soup.find(class_="racesgridview")
    varieties_table = soup.find(class_="varietes")
    # The first "racetable" is read for section/subsection/date, the second
    # one for the country of origin
    left_rows = _racetable_rows(soup, 0)
    right_rows = _racetable_rows(soup, 1)

    breed_info = {
        "breed_text": _text_or_none(soup.find("h2")),
        "link": breed_link,
        "breed_group": _text_or_none(
            soup.find(id="ContentPlaceHolder1_GroupeHyperLink")
        ),
        # First <span> of every row after the header row
        "breed_translations": [
            spans[0].text
            for row in translations_table.find_all("tr")[1:]
            if (spans := row.find_all("span"))
        ]
        if translations_table is not None
        else None,
        "subsection": next(_row_values(left_rows, "subsection"), None),
        # "section" is a substring of "subsection", so those rows are excluded
        # explicitly; otherwise a subsection row could be reported as section.
        "section": next(
            _row_values(left_rows, "section", exclude="subsection"), None
        ),
        "date_of_acceptance": next(
            _row_values(left_rows, "date of acceptance"), None
        ),
        "country_of_origin": list(_row_values(right_rows, "country of origin")),
        "varieties": [
            spans[0].text
            for variety in varieties_table.find_all(class_="variete")
            if (spans := variety.find_all("span"))
        ]
        if varieties_table is not None
        else [],
    }
    return breed_info

In [None]:
def process_letter(letter, link):
    """Scrape all breed pages linked from one initial-letter page.

    Args:
        letter: The initial shown in the progress-bar description.
        link: Absolute URL of the letter page.

    Returns:
        A list of futures, one per breed link, each resolving to the dict
        produced by ``process_breed``. The list is empty when the page has no
        "listeraces" list.
    """
    # Close the HTTP response as soon as the page has been parsed
    with urlopen(link) as letter_page:
        soup = BeautifulSoup(letter_page.read(), "html.parser")

    # Get the breeds on this page and process them
    breeds_element = soup.find("ul", {"class": "listeraces"})
    if breeds_element is None:
        # Nothing to submit; avoids an AttributeError on the missing list
        return []
    # Only anchors that actually carry an href can be resolved to a URL
    breed_links = [
        urljoin(link, a["href"]) for a in breeds_element.find_all("a", href=True)
    ]
    with cf.ThreadPoolExecutor() as executor:
        breed_info_futures = [
            executor.submit(process_breed, breed_link)
            for breed_link in tqdm(
                breed_links, total=len(breed_links), desc=f"Processing {letter}"
            )
        ]
    # Leaving the ``with`` block waits for all submitted tasks, so the returned
    # futures are already finished
    return breed_info_futures


def get_fci_breed_data():
    """Scrape the FCI nomenclature and return one dict per breed.

    The index page lists one link per initial letter. Every letter page is
    handled by ``process_letter`` in a thread pool, and the results of the
    breed futures it returns are collected into a single list. The order of
    the list follows task completion, not the alphabet.
    """
    fci_nomenclature_url = "https://fci.be/en/Nomenclature/Default.aspx"

    # Close the HTTP response as soon as the page has been parsed
    with urlopen(fci_nomenclature_url) as fci_response:
        fci_soup = BeautifulSoup(fci_response.read(), "html.parser")

    letters_element = fci_soup.find("ul", {"class": "initiales"})
    # The hrefs are relative, e.g. 'D': ../../nomenclature/races.aspx?init=D
    # urljoin resolves them to https://fci.be/en/nomenclature/races.aspx?init=D
    letter_link = {
        a.text: urljoin(fci_nomenclature_url, a["href"])
        for a in letters_element.find_all("a")
    }

    fci_breed_data = []
    with cf.ThreadPoolExecutor() as executor:
        letter_futures = {
            executor.submit(process_letter, letter, link)
            for letter, link in letter_link.items()
        }
        for future in tqdm(
            cf.as_completed(letter_futures),
            total=len(letter_futures),
            desc="Processing letters",
        ):
            breed_info_futures = future.result()
            for breed_future in cf.as_completed(breed_info_futures):
                fci_breed_data.append(breed_future.result())
    return fci_breed_data


fci_breed_data = get_fci_breed_data()
clear_output()

In [None]:
# fci_breed_data

In [None]:
# NOTE(review): "fci_breed_df" (without the "s") is never read in the visible
# cells; possibly a typo for fci_breeds_df — confirm before removing.
fci_breed_df = pd.DataFrame()
# One row per scraped breed; keep an unprocessed copy on disk
fci_breeds_df = pd.DataFrame(data=fci_breed_data)
fci_breeds_df.to_json(Path("../data/fci_breeds_raw.json"), orient="records")

In [None]:
# Reload the raw scrape from disk and show one random row
fci_breeds_df = pd.read_json(Path("../data/fci_breeds_raw.json"))
fci_breeds_df.sample(n=1)

In [None]:
breed_text = fci_breeds_df["breed_text"]
# FCI number: the digits inside the parentheses
fci_breeds_df["fci_num"] = breed_text.str.extract(r"\((\d+)\)", expand=False)
# Breed name: the text in front of the first "\r\n\t"
fci_breeds_df["breed"] = breed_text.str.extract(r"(.*?)\r\n\t", expand=False)
# Alternative names: all translations plus the breed name, lower-cased
fci_breeds_df["alt_names"] = pd.Series(
    [
        [name.lower() for name in translations + [breed]]
        for translations, breed in zip(
            fci_breeds_df["breed_translations"], fci_breeds_df["breed"]
        )
    ],
    index=fci_breeds_df.index,
)
# The English name is the first translation
fci_breeds_df["breed_en"] = fci_breeds_df["breed_translations"].apply(
    lambda translations: translations[0].lower()
)

In [None]:
# fci_breeds_df.loc[
#     fci_breeds_df["breed_en"].str.contains(r"point", case=False, regex=True)
# ]

In [None]:
# Rewrite "-haired" / "- haired" in the English name to "haired"
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"- ?haired", "haired", regex=True
)
# Append the (possibly rewritten) English name to each row's alt_names
fci_breeds_df["alt_names"] = pd.Series(
    [
        names + [breed_en]
        for names, breed_en in zip(fci_breeds_df["alt_names"], fci_breeds_df["breed_en"])
    ],
    index=fci_breeds_df.index,
)

# Number of varieties listed for each breed
fci_breeds_df["n_varieties"] = fci_breeds_df["varieties"].map(len)
# Strip a leading letter enumeration such as "a) " and lower-case the varieties
fci_breeds_df["varieties"] = fci_breeds_df["varieties"].map(
    lambda varieties: [re.sub(r"^[a-z]\) ", "", variety).lower() for variety in varieties]
)

In [None]:
# Split the group label into its number (first run of digits) and its name
# (everything after the first "-", trimmed)
group_label = fci_breeds_df["breed_group"]
fci_breeds_df["group_num"] = group_label.str.extract(r"(\d+)", expand=False)
group_parts = group_label.str.split("-", n=1, expand=True)
fci_breeds_df["group_name"] = group_parts[1].str.strip()
# fci_breeds_df

In [None]:
# Rename "pointing dog" to "pointer" in the English name and add the resulting
# name to alt_names as well
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"pointing dog", "pointer", regex=True
)
fci_breeds_df["alt_names"] = pd.Series(
    [
        names + [breed_en]
        for names, breed_en in zip(fci_breeds_df["alt_names"], fci_breeds_df["breed_en"])
    ],
    index=fci_breeds_df.index,
)

In [None]:
# Show three random breeds that have at least one variety
has_varieties = fci_breeds_df["n_varieties"] > 0
fci_breeds_df.loc[
    has_varieties, ["breed", "varieties", "alt_names", "breed_en"]
].sample(3)

Some of the `varieties` are just variations in size, coat color, or hair length of the same breed. Still, some variations are so popular that they are referred to by the variation name. We will add these variations to their `alt_names` list. These include:
- swiss hound
- small swiss hound
- german spitz
- belgian shepherd dog
- continental toy spaniel
- chinese crested dog

In [None]:
# Add the varieties to the alt_names column of some popular breeds
popular_breeds = [
    "swiss hound",
    "small swiss hound",
    "german spitz",
    "belgian shepherd dog",
    "continental toy spaniel",
    "chinese crested dog",
]

popular_names_mask = fci_breeds_df["breed_en"].isin(popular_breeds)

# Element-wise list concatenation: alt_names + varieties for the selected rows
fci_breeds_df.loc[popular_names_mask, "alt_names"] = (
    fci_breeds_df.loc[popular_names_mask, "alt_names"]
    + fci_breeds_df.loc[popular_names_mask, "varieties"]
)

# Additional names, keyed by a substring of ``breed_en``. The entries are
# applied in this order.
extra_alt_names = {
    "belgian shepherd dog": [
        "belgian sheepdog",
        "belgian tervuren",
        "belgian malinois",
        "belgian groenendael",
        "belgian laekenois",
    ],
    "vallhund": ["westgotenspitz"],
    "bernese": ["durbachler"],
    "shetland": ["sheltie"],
    "german spitz": [
        "wolfsspitz",
        "keeshond",
        "kleinspitz",
        "mittelspitz",
        "grossspitz",
        "zwergspitz",
    ],
}


def add_alt_names(df, breed_pattern, extra_names):
    """Append ``extra_names`` to ``alt_names`` where ``breed_en`` matches.

    Args:
        df: Frame with ``breed_en`` and list-valued ``alt_names`` columns;
            it is modified in place.
        breed_pattern: Pattern passed to ``str.contains`` on ``breed_en``.
        extra_names: List of names appended to every matching row.
    """
    matches = df["breed_en"].str.contains(breed_pattern)
    df.loc[matches, "alt_names"] = df.loc[matches, "alt_names"].apply(
        lambda names: names + extra_names
    )


for breed_pattern, extra_names in extra_alt_names.items():
    add_alt_names(fci_breeds_df, breed_pattern, extra_names)

In [None]:
# Accent-free variant of every alternative name
fci_breeds_df["no_accent"] = fci_breeds_df["alt_names"].apply(
    lambda names: [remove_accents(name) for name in names]
)
# Combine both lists, lower-case every name and keep each name once per row
combined_names = fci_breeds_df["alt_names"] + fci_breeds_df["no_accent"]
fci_breeds_df["alt_names"] = combined_names.apply(
    lambda names: {name.lower() for name in names}
)

In [None]:
# Delete "(" and ")" from every alternative name; each row becomes a list
parentheses = str.maketrans("", "", "()")
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda names: [name.translate(parentheses) for name in names]
)

In [None]:
fci_breeds_df.sample(3)

In [None]:
# Persist the cleaned FCI breed table as a list of JSON records
fci_breeds_df.to_json(Path("../data/fci_breeds.json"), orient="records")

In [None]:
fci_breeds_df.sample(3)

#### From AKC dog breeds page

In [None]:
akc_home_url = "https://www.akc.org/dog-breeds/"


def get_breed_link_info(link):
    """Read the breed selector on ``link`` into a ``{option text: value}`` dict.

    Args:
        link: URL of a page containing the ``custom-select__select`` element.

    Returns:
        A dict mapping the text of every ``<option>`` to its ``value``
        attribute. On a timeout or any other error the problem is printed and
        whatever was collected so far (possibly an empty dict) is returned, so
        callers can always iterate the result.
    """
    option_xpath = "//select[@class='custom-select__select']/option"
    breed_link = {}
    driver = get_driver()
    try:
        driver.get(link)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, option_xpath))
        )
        for option in driver.find_elements(By.XPATH, option_xpath):
            breed_link[option.text] = option.get_attribute("value")
        print(f"Number of breeds: {len(breed_link)}")
    except TimeoutException:
        print(f"Timeout while loading page: {link}")
    except Exception as e:
        print(f"An error occurred while loading page: {link}\n{e}")
    finally:
        driver.quit()
    return breed_link


# get the links for each breed
breed_link = get_breed_link_info(akc_home_url)

In [None]:
# Build an "alt_names" column that holds, for each row, the distinct
# non-missing values of the given columns
def create_alt_names(dataframe, list_of_columns):
    """Add an ``alt_names`` column of sets to ``dataframe`` in place.

    Each set contains the values of ``list_of_columns`` in that row, with
    missing values left out. Nothing is returned.
    """

    def _unique_present(row):
        return {value for value in row if pd.notna(value)}

    selected = dataframe[list_of_columns]
    dataframe["alt_names"] = selected.apply(_unique_present, axis=1)

In [None]:
breed_link

#### AKC physical traits

In [None]:
# Lower-case the breed names and leave out options with an empty link value
akc_breed_link_dict = dict(
    (name.lower(), url) for name, url in breed_link.items() if url != ""
)
len(akc_breed_link_dict)

In [None]:
def get_breed_info(page_source):
    """Extract breed metadata from the HTML of an AKC breed page.

    Args:
        page_source: HTML source of the breed page.

    Returns:
        A ``defaultdict(str)`` with the keys ``breed_page``, ``temperment``,
        ``popularity``, ``year_recognized`` and ``group`` (empty string when
        the element is missing), plus ``height``, ``weight`` and
        ``life_expectancy`` for every trait found in the overview block. When
        that block is missing, all three are set to "".
    """
    breed_metadata = defaultdict(str)

    soup = BeautifulSoup(page_source, "html.parser")

    def extract_text(tag, class_name):
        """Return the text of the first matching element, or "" if absent."""
        node = soup.find(tag, {"class": class_name})
        return node.text if node is not None else ""

    # Extract breed, temperament, popularity rank, year recognized, group.
    # ("temperment" is spelled as in the page's class name and is also the
    # key used in the result.)
    breed_metadata["breed_page"] = extract_text("h1", "page-header__title")
    breed_metadata["temperment"] = extract_text("p", "breed-page__intro__temperment")
    breed_metadata["popularity"] = extract_text(
        "span", "breed-page__popularity__custom-label"
    )
    breed_metadata["year_recognized"] = extract_text(
        "p", "breed-page__popularity__ranking-title"
    )
    breed_metadata["group"] = extract_text("a", "breed-page__intro__group__tooltip")

    # Extract height, weight, life expectancy
    attribute_map = {
        "height": "height",
        "weight": "weight",
        "life expectancy": "life_expectancy",
    }

    def clear_physical_traits():
        """Set all three physical traits to the empty string."""
        for key in attribute_map.values():
            breed_metadata[key] = ""

    element = soup.find(
        "div", {"class": "breed-page__hero__overview__icon-block-wrap"}
    )
    if element is None:
        # Iterating ``None`` raises TypeError, which the AttributeError handler
        # below would not catch, so the missing block is handled explicitly.
        clear_physical_traits()
    else:
        try:
            for ele in element:
                for attribute, key in attribute_map.items():
                    if attribute in ele.text.lower():
                        breed_metadata[key] = ele.text
        except AttributeError:
            clear_physical_traits()

    return breed_metadata

In [None]:
# CSS selectors that must be present before a breed page counts as loaded
SELECTORS = [
    "h1.page-header__title",
    "p.breed-page__intro__temperment",
    "div.breed-page__hero__overview__icon-block-wrap",
    "div.breed-page__intro__group",
]


def navigate_to_page(driver, link):
    """Open ``link`` with ``driver`` and return the page's HTML source.

    Waits up to 20 seconds for each selector in ``SELECTORS`` to be present.
    On a timeout or any other error a message is printed and None is
    returned.
    """
    page_html = None
    try:
        driver.get(link)
        # Block until every required element has been loaded
        for selector in SELECTORS:
            element_present = EC.presence_of_element_located(
                (By.CSS_SELECTOR, selector)
            )
            WebDriverWait(driver, 20).until(element_present)
        page_html = driver.page_source
    except TimeoutException:
        print(f"Timeout while loading page: {link}")
    except Exception as e:
        print(f"An error occurred while loading page: {link}\n{e}")
    return page_html


def get_contents(breed, link):
    """Scrape one AKC breed page and return its metadata.

    A new driver is started for the call and always quit afterwards. Returns
    the mapping produced by ``get_breed_info`` with an added ``"breed"``
    entry, or None when the page could not be loaded or an error occurred.
    """
    driver = get_driver()
    try:
        page_source = navigate_to_page(driver, link)
        if page_source is None:
            return None
        breed_info = get_breed_info(page_source)
        breed_info["breed"] = breed
        return breed_info
    except TimeoutException:
        print(f"Timeout while loading page: {link}")
    except Exception as e:
        print(f"An error occurred while loading page: {link}\n{e}")
    finally:
        driver.quit()
    return None


# Scrape every breed page in a thread pool and collect the results in the
# order in which the tasks finish
with cf.ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(get_contents, breed, link)
        for breed, link in akc_breed_link_dict.items()
    }
    breed_data = []
    for finished in tqdm(cf.as_completed(futures), total=len(akc_breed_link_dict)):
        breed_data.append(finished.result())

In [None]:
# breed_data

In [None]:
# Drop the pages that could not be scraped (get_contents returned None)
breed_data = list(filter(lambda record: record is not None, breed_data))

akc_physical_traits = pd.DataFrame(breed_data)
# Normalise the breed name: lower-case, hyphens to spaces, apostrophes removed,
# accents stripped
breed_names = akc_physical_traits["breed"].str.lower()
breed_names = breed_names.str.replace("-", " ")
breed_names = breed_names.str.replace("'", "")
akc_physical_traits["breed"] = breed_names.apply(remove_accents)


def parse_range(s):
    """Extract the first one or two numbers from a range string.

    Args:
        s: Text such as ``"10-12 inches"``. Non-string values (``None``,
            ``NaN``, numbers) are treated as missing.

    Returns:
        A list of exactly two items ``[lower, upper]``. Each item is a float,
        or ``None`` when the text does not contain that many numbers.
    """
    if not isinstance(s, str):
        return [None, None]
    # Only the first two numbers are kept: text holding several ranges (for
    # example separate male and female figures) would otherwise produce a
    # longer list than the two bounds callers unpack.
    numbers = [float(match) for match in re.findall(r"\d+(?:\.\d+)?", s)][:2]
    if not numbers:
        return [None, None]
    return numbers + [None] * (2 - len(numbers))


def get_upper_lower_bound(dataframe, column):
    """Add numeric lower/upper bound columns parsed from a range column.

    Each value of ``dataframe[column]`` is passed through ``parse_range``.
    Two columns are written to ``dataframe`` in place: ``<column>_ll`` (lower
    bound) and ``<column>_ul`` (upper bound). When only one number was found,
    the upper bound is set equal to the lower bound.

    Args:
        dataframe: Frame to modify in place.
        column: Name of the column holding the range text.

    Returns:
        None.
    """
    parsed = [parse_range(value)[:2] for value in dataframe[column]]
    # Explicit float dtype turns None into NaN and also works for an empty frame.
    lower = pd.Series([bounds[0] for bounds in parsed], dtype="float64")
    upper = pd.Series([bounds[1] for bounds in parsed], dtype="float64")
    # Fill before assigning: an in-place fillna on dataframe[...] is a chained
    # assignment and is not guaranteed to write back to the frame.
    upper = upper.fillna(lower)
    # Assign by position so the frame's own index labels do not matter.
    dataframe[column + "_ll"] = lower.to_numpy()
    dataframe[column + "_ul"] = upper.to_numpy()


# Add the lower (_ll) and upper (_ul) bound columns for weight, height and
# life expectancy.
for range_column in ("weight", "height", "life_expectancy"):
    get_upper_lower_bound(akc_physical_traits, range_column)

In [None]:
# Trim spaces and the "»" character from both ends of the group label.
akc_physical_traits["group"] = akc_physical_traits["group"].str.strip(" »")
akc_physical_traits["breed_page"] = akc_physical_traits["breed_page"].str.lower()
# Keep the first run of four digits found in year_recognized as the year.
akc_physical_traits["year_accepted"] = akc_physical_traits[
    "year_recognized"
].str.extract(r"(\d{4})")
# A breed is flagged as recognized when a year could be extracted.
akc_physical_traits["is_recognized_breed"] = akc_physical_traits[
    "year_accepted"
].notna()

# create_alt_names is defined elsewhere in the notebook; presumably it builds the
# alt_names column from the two listed columns — confirm against its definition.
create_alt_names(akc_physical_traits, ["breed_page", "breed"])

In [None]:
# Print the frame's dimensions, then show the number of missing values per column.
print(f"{akc_physical_traits.shape=}")
akc_physical_traits.isna().sum()

In [None]:
# Spot-check three random rows. Only the last expression of a cell is rendered
# automatically, so this one is shown explicitly with display() (available as
# a builtin inside IPython/Jupyter kernels).
display(akc_physical_traits.sample(3))

# Rows whose popularity text is exactly "1 of 201".
akc_physical_traits.query('popularity=="1 of 201"')

In [None]:

# NOTE(review): columns_to_drop is never applied in this cell, so the JSON written
# below still contains these columns — confirm whether they were meant to be dropped.
columns_to_drop = ['height', 'weight', 'year_recognized', 'life_expectancy', ]

# Save the dataframe to a JSON file, one record per row.
akc_physical_traits.to_json("../data/akc_breeds.json", orient='records')

#### Matching breeds across the breed lists

In [None]:
# thefuzz scoring functions handed to apply_fuzzy_matching_to_breed_column in the
# matching cells below. How that helper combines them is defined elsewhere.
fuzz_funcs = [
    fuzz.WRatio,
    fuzz.UWRatio,
    fuzz.UQRatio,
    fuzz.token_set_ratio,
    fuzz.token_sort_ratio,
    # fuzz.partial_token_sort_ratio,
]

In [None]:
# FCI breeds: load the saved records and turn each alt_names entry into a set
# so the names can be unioned with those from the other lists.
fci_breeds_df = pd.read_json("../data/fci_breeds.json", orient="records").assign(
    alt_names=lambda frame: frame["alt_names"].transform(set)
)
# Show one random breed with its fields listed vertically.
fci_breeds_df.sample().T

In [None]:
# AKC breeds: load the saved records, copy the breed name into breed_en (the
# column used for matching), turn alt_names into sets without None entries,
# and add an empty match_breed column to be filled by the matching step.
new_akc_df = pd.read_json("../data/akc_breeds.json", orient="records").assign(
    breed_en=lambda frame: frame["breed"].copy(),
    alt_names=lambda frame: frame["alt_names"].apply(
        lambda names: {name for name in names if name is not None}
    ),
    match_breed=None,
)
# Show one random breed with its fields listed vertically.
new_akc_df.sample().T

In [None]:
# (rows, columns) of the AKC frame after loading.
new_akc_df.shape

In [None]:
# Match each AKC breed to an FCI breed, fold the AKC alternative names into the
# matching FCI rows, and append the AKC breeds that found no match.

akc_nan_mask = new_akc_df["match_breed"].isna()
print(f"Nan count:{akc_nan_mask.sum()} |Column: match_breed ")
# Fill match_breed for the AKC rows that are still unmatched.
# apply_fuzzy_matching_to_breed_column is defined elsewhere; presumably it returns,
# per row, the matched FCI breed_en or a missing value — confirm.
new_akc_df.loc[akc_nan_mask, "match_breed"] = apply_fuzzy_matching_to_breed_column(
    new_akc_df.loc[akc_nan_mask], "breed_en", fci_breeds_df, fuzz_funcs
)[akc_nan_mask]

# Recompute the mask: it now marks the AKC breeds that found no FCI match.
akc_nan_mask = new_akc_df["match_breed"].isna()
print(f"Nan count:{akc_nan_mask.sum()} |Column: match_breed ")
# Group for the case where one FCI breed matches more than one AKC breed
# (e.g. Belgian Shepherd Dog); the alt_names sets of grouped rows are unioned.
akc_grouped = (
    new_akc_df.groupby("match_breed")["alt_names"]
    .apply(lambda x: set.union(*x))
    .reset_index()
)
# Left-merge so every FCI breed is kept. Both frames carry an alt_names column,
# which the suffixes split into alt_names_fci and alt_names_akc.
fci_akc_breeds = fci_breeds_df.merge(
    akc_grouped,
    how="left",
    left_on="breed_en",
    right_on="match_breed",
    suffixes=("_fci", "_akc"),
)
# Add any name variation from the AKC side to the alt_names column; FCI rows
# without an AKC match keep their own names only.
fci_akc_breeds["alt_names"] = fci_akc_breeds.apply(
    lambda row: row["alt_names_fci"].union(row["alt_names_akc"])
    if pd.notnull(row["match_breed"])
    else row["alt_names_fci"],
    axis=1,
)
# Append the unmatched AKC breeds to the fci_akc_breeds dataframe, keeping only
# the name columns, then sort by breed name.
fci_akc_breeds = (
    pd.concat(
        [
            fci_akc_breeds[["breed_en", "alt_names"]],
            new_akc_df[akc_nan_mask][["breed_en", "alt_names"]],
        ]
    )
    .sort_values("breed_en")
    .reset_index(drop=True)
)

In [None]:
# unrecognized_breeds = [
#     "waller",
#     "tamaskan",
#     "ratonero bodeguero",
#     "bobtail",
#     "elo",
#     "bardino majorero",
# ]
# unrecog_df = pd.DataFrame(
#     ((breed, {breed}) for breed in unrecognized_breeds),
#     columns=["breed_en", "alt_names"],
# )
# fci_akc_breeds = pd.concat([fci_akc_breeds, unrecog_df]).reset_index(drop=True)

In [None]:
# Hz website breeds. The matching cell below reads the breed_de and breed_en
# columns of this frame.
hz_breeds_df = pd.read_json("../data/hz_breeds.json", orient="records")

In [None]:
# all_breeds = translate_list_to_dict(breeds)
# hz_breeds_df = hz_size_weight_df[["breed_de", "breed_en"]]
# Remove the accents from the German breed names
hz_breeds_df["no_accents"] = hz_breeds_df["breed_de"].apply(remove_accents)
# Every spelling of the breed known on the HZ side, as a set per row.
hz_breeds_df["combined_breeds"] = hz_breeds_df.apply(
    lambda row: {row["breed_de"], row["breed_en"], row["no_accents"]}, axis=1
)

hz_breeds_df["match_breed"] = None

columns_to_match = ["breed_de", "breed_en", "no_accents"]

# Try each name column in turn; only rows that are still unmatched are passed
# to the matcher, so an earlier column's match is never overwritten.
for column in columns_to_match:
    nan_mask = hz_breeds_df["match_breed"].isna()
    print(f"Nan count:{nan_mask.sum()} |Column: {column} ")

    # apply_fuzzy_matching_to_breed_column is defined elsewhere; presumably it
    # returns the matched fci_akc_breeds name (or a missing value) per row — confirm.
    hz_breeds_df.loc[nan_mask, "match_breed"] = apply_fuzzy_matching_to_breed_column(
        hz_breeds_df.loc[nan_mask],
        column,
        fci_akc_breeds,
        fuzz_funcs,
    )[nan_mask]

    # Count of rows still unmatched after trying this column.
    nan_mask = hz_breeds_df["match_breed"].isna()
    print(f"Nan count:{nan_mask.sum()} |Column: {column} ")

# Several HZ rows may map to the same matched breed; union their name sets.
hz_breeds_grouped = (
    hz_breeds_df.groupby("match_breed")["combined_breeds"]
    .apply(lambda x: set.union(*x))
    .reset_index()
)

# Left-merge so every breed of fci_akc_breeds is kept, matched or not.
merged_df = pd.merge(
    fci_akc_breeds,
    hz_breeds_grouped,
    how="left",
    left_on="breed_en",
    right_on="match_breed",
)
# Flag the breeds that received names from the HZ list.
merged_df["was_merged"] = ~merged_df["match_breed"].isna()

# Add the HZ spellings to alt_names for the matched breeds only.
merged_df["alt_names"] = merged_df.apply(
    lambda row: row["alt_names"].union(row["combined_breeds"])
    if pd.notnull(row["match_breed"])
    else row["alt_names"],
    axis=1,
)

# Rebind fci_akc_breeds to the enriched frame without the helper columns
# (was_merged is kept), then report how many HZ breeds stayed unmatched.
fci_akc_breeds = merged_df.drop(columns=["combined_breeds", "match_breed"])
nan_mask = hz_breeds_df["match_breed"].isna()
print(f"{nan_mask.sum()} NaN values remaining.")

In [None]:
# Save the combined breed list to a JSON file, one record per breed.
fci_akc_breeds.to_json("../data/fci_akc_breeds.json", orient="records")

Matching the AKC breeds to the FCI breeds

In [None]:
# Names copied from the commented-out cell earlier in the notebook. They are
# defined here because that is the only other definition, so on a fresh kernel
# the lookup below raised NameError.
unrecognized_breeds = [
    "waller",
    "tamaskan",
    "ratonero bodeguero",
    "bobtail",
    "elo",
    "bardino majorero",
]

# HZ breeds whose English name contains "elo". Shown with display() (a builtin
# inside IPython/Jupyter kernels) because only a cell's last expression is
# rendered automatically.
display(hz_breeds_df.query('breed_en.str.contains("elo")').head(50))

# HZ rows for which any of the names above appears among the row's spellings.
hz_breeds_df.loc[
    hz_breeds_df["combined_breeds"].apply(
        lambda names: any(breed in names for breed in unrecognized_breeds)
    )
]


# new_akc_df

#### Wikipedia list of breeds of dogs

In [None]:
# Wikipedia page passed to get_breeds below.
dog_breeds_list_url = "https://en.wikipedia.org/wiki/List_of_dog_breeds"

In [None]:
def get_breeds(driver, link):
    """Collect the breed names listed on the Wikipedia page at ``link``.

    Loads the page, waits up to 10 seconds for the elements with class
    ``div-col`` and gathers the text of every ``li`` inside each of them
    except the last one.

    Args:
        driver: Selenium WebDriver used to load the page. It is always quit
            before this function returns or raises.
        link: URL of the page to scrape.

    Returns:
        A list with the text of each breed entry. The list is empty (or
        partial) when the wait timed out or an element was not found; a
        message is printed in that case.
    """
    breeds = []
    try:
        driver.get(link)
        # get all the elements with the dog breeds
        div_cols = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "div-col"))
        )
        # get the breeds in each div_col except the last
        for div_col in div_cols[:-1]:
            breeds.extend(
                breed_element.text
                for breed_element in div_col.find_elements(By.TAG_NAME, "li")
            )
    except TimeoutException:
        # WebDriverWait.until signals a missing element with TimeoutException,
        # not NoSuchElementException.
        print(f"Timeout while loading page: {link}")
    except NoSuchElementException as e:
        print("No such element", e)
    finally:
        # Release the browser even when an unexpected exception propagates.
        driver.quit()

    return breeds

In [None]:
# start_driver is decorated with @contextmanager, so calling it returns a
# context manager rather than a driver; the driver only exists inside `with`.
# NOTE(review): presumably start_driver yields the WebDriver and quits it on
# exit — get_breeds also quits the driver itself, so confirm the second quit
# is harmless.
with start_driver() as my_d:
    breed_driver = partial(get_breeds, my_d)

    breeds_list = breed_driver(dog_breeds_list_url)

In [None]:
# Remove bracketed numeric markers such as "[12]" from the breed names and
# upper-case them.
new_breed_list = [
    re.sub(r"\[\d+\]", "", breed_name).upper() for breed_name in breeds_list
]
# Show the number of breeds.
len(new_breed_list)