In [5]:
import itertools as it
import pickle
import re
import string
import unicodedata
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from functools import partial
from itertools import combinations
from urllib.request import urlopen

import lxml
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
from matplotlib import pyplot as plt
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from thefuzz import fuzz
from thefuzz import process
from tqdm import tqdm

In [48]:
def start_driver():
    """Create and return a Chrome WebDriver configured to run headless."""
    headless_opts = Options()
    headless_opts.add_argument("--headless")
    return webdriver.Chrome(options=headless_opts)

In [4]:
def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is NFKD-normalised, which splits an accented letter into its
    base letter plus combining marks. Encoding to ASCII with ``"ignore"``
    then discards those marks together with every other non-ASCII character.
    """
    decomposed = unicodedata.normalize("NFKD", input_str)
    ascii_bytes = decomposed.encode("ASCII", "ignore")
    return ascii_bytes.decode()


remove_accents("résuméö")

'resumeo'

#### Info about Zurich districts

In [5]:
# save the url of the website
zurich_districts_url = "https://www.zuerich.com/en/visit/about-zurich/zurichs-districts"

# get the html content of the website; the context manager closes the HTTP
# response as soon as the body has been read
with urlopen(zurich_districts_url) as zurich_response:
    zurich_html_content = zurich_response.read()

In [6]:
# parse the downloaded html with BeautifulSoup, using the "lxml" parser
zurich_soup = BeautifulSoup(zurich_html_content, "lxml")

In [7]:
# select all elements whose id is exactly 's-1' ... 's-12'.
# BeautifulSoup applies a compiled pattern with search(), so the pattern is
# anchored on both ends; otherwise ids such as 's-13' or 'news-1' would match too.
pattern = re.compile(r"^s-(?:[1-9]|1[0-2])$")
elements = zurich_soup.find_all(id=pattern)

In [8]:
# create a dataframe with the information of the districts:
# heading text (h2) -> first paragraph text (p) of every selected element
districts = {element.find("h2").text: element.find("p").text for element in elements}
districts_df = pd.DataFrame.from_dict(districts, orient="index", columns=["desc"])

# make the index into a column and split it into district number and district name
districts_df = districts_df.reset_index()
districts_df = (
    districts_df["index"]
    # n=1: only the first dash separates number from name, so a name that itself
    # contains a dash stays in one column instead of spilling into a third one
    .str.split("–", n=1, expand=True)
    .rename({0: "district_number", 1: "district_name"}, axis=1)
    .join(districts_df)
    .drop("index", axis=1)
)

# strip the whitespace from the columns
districts_df["district_number"] = districts_df["district_number"].str.strip()

# regex capturing the digits inside the district_number column
regex_pattern = re.compile(r"([\d]+)")

# create a new column with the district number (expand=False -> a Series)
districts_df["district"] = (
    districts_df["district_number"]
    .str.extract(regex_pattern, expand=False)
    .astype("category")
)
districts_df = districts_df.drop(columns="district_number")

districts_df["district_name"] = districts_df["district_name"].str.strip()
districts_df["desc"] = districts_df["desc"].str.strip()

# Add column for the length of the desc
districts_df["desc_length"] = districts_df["desc"].str.len()

print(districts_df)

                      district_name  \
0             Old Town, City Center   
1            Left Shore of the Lake   
2      Creative Quarter of Wiedikon   
3   Creative Quarter of Langstrasse   
4   Creative Quarter of Zürich-West   
5                University Quarter   
6                 On the Zürichberg   
7           Right Shore of the Lake   
8      At the Foot of the Uetliberg   
9               Right of the Limmat   
10                      Zürich Nord   
11                   Schwamendingen   

                                                 desc district  desc_length  
0   The most central district encompasses the hist...        1          355  
1   The architecturally attractive Enge Train Stat...        2          206  
2   Once mainly a working-class neighborhood, the ...        3          215  
3   The district around Langstrasse was long regar...        4          199  
4   In the quarter where huge machines once clatte...        5          287  
5   University buildings

In [9]:
# summarise the column dtypes and non-null counts of the districts table
districts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   district_name  12 non-null     object  
 1   desc           12 non-null     object  
 2   district       12 non-null     category
 3   desc_length    12 non-null     int64   
dtypes: category(1), int64(1), object(2)
memory usage: 824.0+ bytes


In [10]:
# create a Styler whose cells wrap long text ("white-space: pre-wrap")
styler = districts_df.style.set_properties(**{"white-space": "pre-wrap"})

# the description text lives in the "desc" column (there is no "description"
# column); render it unmodified
formatted_df = styler.format({"desc": lambda x: x})
formatted_df

Unnamed: 0,district_name,desc,district,desc_length
0,"Old Town, City Center","The most central district encompasses the historical Old Town on both banks of the River Limmat, as well as the area to the south bordering on the lake basin. It is home to the prestigious Bahnhofstrasse, magnificent guild houses, imposing churches, and the famous Opera House. The best view of District 1 is to be had from the top of the Karlsturm tower.",1,355
1,Left Shore of the Lake,"The architecturally attractive Enge Train Station built out of Ticino granite, the Museum Rietberg with its beautiful park, the Seebad Enge lido, and the Rote Fabrik give this district a Mediterranean feel.",2,206
2,Creative Quarter of Wiedikon,"Once mainly a working-class neighborhood, the district below the Uetliberg is now a popular residential area with cozy cafés, boutiques, and vintage stores. The Houdini movie theater also lies on Wiedikon territory.",3,215
3,Creative Quarter of Langstrasse,"The district around Langstrasse was long regarded as a den of iniquity. Nowadays, it is known for its diverse restaurants and never-sleeping nightlife, with bars such as the Olé and the Club Zukunft.",4,199
4,Creative Quarter of Zürich-West,"In the quarter where huge machines once clattered away, now nightclubs, cultural institutions, and universities cluster around the Prime Tower. Converted structures such as the Viadukt and designer stores like the Freitag Tower give the neighborhood its characteristic trendy atmosphere.",5,287
5,University Quarter,"University buildings, Jugendstil villas, green areas, and the Dynamo cultural center characterize Zurich’s District 6. Thanks to its tranquility and closeness to the city center, it is a much sought-after residential area.",6,222
6,On the Zürichberg,"The hillside location, little traffic, and fabulous views mean than living here does not come cheap. Neighbors include Zurich Zoo and the luxury hotel, The Dolder Grand.",7,169
7,Right Shore of the Lake,"This district starts behind the Opera House and stretches as far as the open-air bathing facility, Seebad Tiefenbrunnen. It features boutiques, villas, the Chinawiese recreational area, and the Seebad Utoquai outdoor swimming bath.",8,231
8,At the Foot of the Uetliberg,"For a long time, this district was solely known for the Letzigrund Stadium. In recent years, however, the quarter around Altstetten has become THE place to be, with creative club and gastronomy concepts.",9,203
9,Right of the Limmat,"This district boasts a fantastic location: in summer, everyone meets at the Unterer Letten and Oberer Letten riverside lidos to swim and bask in the sun. In addition, the view over Zurich from the Waid quarter is unbeatable.",10,224


In [11]:
# persist the districts table without the row index
districts_df.to_csv("../data/zurich_districts.csv", index=False)

#### Info for Dog breeds from FCI

In [12]:
fci_url = "https://www.fci.be/en/Nomenclature/educationGroupe.aspx"

# download the page; the context manager closes the HTTP response afterwards
with urlopen(fci_url) as fci_response:
    fci_html_content = fci_response.read()

# parse the raw html into an lxml element tree for the xpath queries below
fci_parsed_html = etree.HTML(fci_html_content)

In [13]:
# map the text of every element with class "nom" to its href attribute
elements = fci_parsed_html.xpath("//*[@class='nom']")
breed_groups = {node.text: node.get("href") for node in elements}

In [14]:
# one row per scraped entry: the label goes to "breed", the href to "link"
fci_breeds_df = pd.DataFrame(
    {
        "breed": list(breed_groups.keys()),
        "link": list(breed_groups.values()),
    }
)

# regex for the content of the right-most pair of brackets
# (defined here but not applied in this cell)
regex_pattern = re.compile(r"\((?=[^()]*\))([^()]+)\)$")

fci_breeds_df.sample(3)

Unnamed: 0,breed,link
1,BOLOGNESE (196),/en/nomenclature/BOLOGNESE-196.html
28,SUOMENPYSTYKORVA (49) (FINNISH SPITZ),/en/nomenclature/FINNISH-SPITZ-49.html
10,DOGUE DE BORDEAUX (116),/en/nomenclature/DOGUE-DE-BORDEAUX-116.html


In [15]:
# split each label at its first "(" only (n=1): the text before it becomes
# breed_orig, everything after it (e.g. "218) (CHIHUAHUA)") becomes breed_en
fci_breeds_df[["breed_orig", "breed_en"]] = fci_breeds_df["breed"].str.split(
    "(", n=1, expand=True
)
fci_breeds_df.sample(3)

Unnamed: 0,breed,link,breed_orig,breed_en
5,CIMARRÓN URUGUAYO (353),/en/nomenclature/CIMARRON-URUGUAYO-353.html,CIMARRÓN URUGUAYO,353)
4,CHIHUAHUEÑO (218) (CHIHUAHUA),/en/nomenclature/CHIHUAHUA-218.html,CHIHUAHUEÑO,218) (CHIHUAHUA)
14,ISTARSKI OSTRODLAKI GONIC (152) (ISTRIAN WIR...,/en/nomenclature/ISTRIAN-WIRE-HAIRED-HOUND-152...,ISTARSKI OSTRODLAKI GONIC,152) (ISTRIAN WIRE-HAIRED HOUND)


It turned out that this page lists only 33 breeds, even though it was reached through the "all breeds" link.

In [16]:
# capture a run of letters, hyphens, dots and whitespace that ends at ")"
only_letters_pattern = r"\(?([A-Za-z-\.\s]+)\)"
english_name = fci_breeds_df["breed_en"].str.extract(
    only_letters_pattern, expand=False
)
# rows without such a bracketed name fall back to the original-language name
english_name = english_name.fillna(fci_breeds_df["breed_orig"])

# normalise both name columns: trimmed and lower-cased
fci_breeds_df["breed_orig"] = fci_breeds_df["breed_orig"].str.strip().str.lower()
fci_breeds_df["breed_en"] = english_name.str.strip().str.lower()

In [17]:
# prefix every relative link with the site host
fci_breeds_df["weblink"] = ["www.fci.be" + href for href in fci_breeds_df["link"]]

In [18]:
# persist the breed table scraped from the FCI page (row index omitted)
fci_breeds_df.to_csv("../data/fci_dog_breeds.csv", index=False)

#### Get all the breeds from the FCI individually

In [19]:
def _read_breed_translations(driver):
    """Return the first ``span`` text of every row after the first in the breed table."""
    # wait for that breed's page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "racesgridview"))
    )
    table = driver.find_element(By.CLASS_NAME, "racesgridview")
    rows = table.find_elements(By.TAG_NAME, "tr")
    # rows[0] is skipped; only the first span of each remaining row is read
    return [row.find_elements(By.TAG_NAME, "span")[0].text for row in rows[1:]]


def get_fci_breeds(driver, link):
    """Scrape every breed reachable from the FCI nomenclature page.

    Opens ``link``, clicks each letter link inside the "initiales" element and,
    for each letter, every breed link inside the "listeraces" element. On each
    breed page the names in the "racesgridview" table are collected.

    Args:
        driver: Selenium WebDriver used for navigation. It is always quit
            before this function returns or raises.
        link: URL of the page that holds the letter index.

    Returns:
        A list of ``(breed_text, breed_href, translations)`` tuples, where
        ``translations`` is a list of strings.

    Raises:
        selenium.common.exceptions.TimeoutException: if an expected element
            does not appear within 10 seconds.
    """
    name_link_list = []

    try:
        driver.get(link)

        # Wait for the letters to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "initiales"))
        )
        letters = driver.find_element(By.CLASS_NAME, "initiales")

        for letter_pos, letter in tqdm(
            enumerate(letters.find_elements(By.TAG_NAME, "a"))
        ):
            try:
                letter.click()
            except StaleElementReferenceException:
                # the element is no longer attached to the DOM so find it again
                letters = driver.find_element(By.CLASS_NAME, "initiales")
                letter = letters.find_elements(By.TAG_NAME, "a")[letter_pos]
                letter.click()

            # Wait for the breeds to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "listeraces"))
            )
            breeds = driver.find_element(By.CLASS_NAME, "listeraces")

            for breed_pos, breed in tqdm(
                enumerate(breeds.find_elements(By.TAG_NAME, "a"))
            ):
                try:
                    breed_text = breed.text
                    breed_ref = breed.get_attribute("href")
                    breed.click()
                except StaleElementReferenceException:
                    # driver.back() rebuilt the list, so look the link up again
                    breeds = driver.find_element(By.CLASS_NAME, "listeraces")
                    breed = breeds.find_elements(By.TAG_NAME, "a")[breed_pos]
                    breed_text = breed.text
                    breed_ref = breed.get_attribute("href")
                    breed.click()

                breed_translations = _read_breed_translations(driver)
                name_link_list.append((breed_text, breed_ref, breed_translations))

                # go back to the previous page with the breeds
                driver.back()
    finally:
        # release the browser even when a wait times out or a click fails
        driver.quit()

    return name_link_list

In [20]:
# driver = webdriver.Chrome()

# start page that is handed to get_fci_breeds in the next cell
fci_nonmenclature_url = "https://fci.be/en/Nomenclature/Default.aspx"

In [21]:
# This cell gets the FCI list of breeds, letter by letter.
# It can take up to 10 minutes to run.
# get_fci_breeds calls driver.quit() itself, so my_d cannot be reused afterwards.
my_d = start_driver()
fci_list = get_fci_breeds(my_d, fci_nonmenclature_url)

23it [00:20,  1.11it/s]
41it [01:00,  1.46s/it]
43it [00:57,  1.33s/it]
25it [00:28,  1.12s/it]
16it [00:19,  1.20s/it]
8it [00:10,  1.33s/it]
24it [01:02,  2.61s/it]
11it [00:17,  1.55s/it]
9it [00:20,  2.31s/it]
1it [00:01,  1.28s/it]
3it [00:05,  1.91s/it]]
13it [00:25,  1.99s/it]
8it [00:07,  1.10it/s]]
10it [00:08,  1.12it/s]
13it [00:12,  1.01it/s]
3it [00:02,  1.06it/s]]
1it [00:00,  1.26it/s]]
27it [00:30,  1.14s/it]
10it [00:09,  1.03it/s]
39it [00:43,  1.12s/it]
9it [00:09,  1.04s/it]]
3it [00:03,  1.10s/it]]
9it [00:08,  1.01it/s]]
1it [00:00,  1.19it/s]]
3it [00:02,  1.09it/s]]
3it [00:03,  1.08s/it]]
26it [08:16, 19.10s/it]


In [22]:
fci_breeds_trans_df = pd.DataFrame(fci_list, columns=["breed", "link", "translations"])

# every known spelling of a breed: its translations plus the listed name.
# Names are lower-cased *before* de-duplicating so that case variants collapse,
# and sorted so the column content is the same on every run.
fci_breeds_trans_df["alt_names"] = fci_breeds_trans_df.apply(
    lambda row: sorted(
        {name.lower() for name in [*row["translations"], row["breed"]]}
    ),
    axis=1,
)

In [82]:
# accent-free variant of every alternative name
fci_breeds_trans_df["no_accent"] = fci_breeds_trans_df["alt_names"].apply(
    lambda names: [remove_accents(name) for name in names]
)

# the first translation is used as the English name; an empty list gives None
# instead of raising IndexError
fci_breeds_trans_df["breed_en"] = fci_breeds_trans_df["translations"].apply(
    lambda names: names[0] if names else None
)

# union of the original and accent-free names, stored as a set. Building the
# union explicitly keeps this cell re-runnable: once alt_names already holds
# sets, the previous `alt_names += no_accent` (set + list) raises TypeError.
fci_breeds_trans_df["alt_names"] = [
    set(names) | set(plain)
    for names, plain in zip(
        fci_breeds_trans_df["alt_names"], fci_breeds_trans_df["no_accent"]
    )
]

In [83]:
# NOTE: CSV stores the list/set columns (translations, alt_names, no_accent) as
# their string representation, so they come back as plain strings from read_csv
fci_breeds_trans_df.to_csv("../data/fci_breeds_trans.csv", index=False)

In [24]:
# spot check: rows whose listed breed name starts with "E"
fci_breeds_trans_df[fci_breeds_trans_df["breed"].str.startswith("E")]

Unnamed: 0,breed,link,translations,alt_names,no_accent,breed_en
132,EESTI HAGIJAS,https://fci.be/en/nomenclature/ESTONIAN-HOUND-...,"[ESTONIAN HOUND, CHIEN COURANT D'ESTONIE, ESTN...","[chien courant d'estonie, eesti hagijas, sabue...","[chien courant d'estonie, eesti hagijas, sabue...",ESTONIAN HOUND
133,ENGLISH COCKER SPANIEL,https://fci.be/en/nomenclature/ENGLISH-COCKER-...,"[ENGLISH COCKER SPANIEL, COCKER SPANIEL ANGLAI...","[english cocker spaniel, cocker spaniel inglés...","[english cocker spaniel, cocker spaniel ingles...",ENGLISH COCKER SPANIEL
134,ENGLISH FOXHOUND,https://fci.be/en/nomenclature/ENGLISH-FOXHOUN...,"[ENGLISH FOXHOUND, ENGLISH FOXHOUND, ENGLISH F...","[english foxhound, foxhound inglés]","[english foxhound, foxhound ingles]",ENGLISH FOXHOUND
135,ENGLISH POINTER,https://fci.be/en/nomenclature/ENGLISH-POINTER...,"[ENGLISH POINTER, POINTER ANGLAIS, ENGLISCHER ...","[pointer anglais, englischer pointer, pointer ...","[pointer anglais, englischer pointer, pointer ...",ENGLISH POINTER
136,ENGLISH SETTER,https://fci.be/en/nomenclature/ENGLISH-SETTER-...,"[ENGLISH SETTER, SETTER ANGLAIS, ENGLISCHER SE...","[setter anglais, setter inglés, english setter...","[setter anglais, setter ingles, english setter...",ENGLISH SETTER
137,ENGLISH SPRINGER SPANIEL,https://fci.be/en/nomenclature/ENGLISH-SPRINGE...,"[ENGLISH SPRINGER SPANIEL, ENGLISH SPRINGER SP...","[english springer spaniel, springer spaniel in...","[english springer spaniel, springer spaniel in...",ENGLISH SPRINGER SPANIEL
138,ENGLISH TOY TERRIER,https://fci.be/en/nomenclature/ENGLISH-TOY-TER...,"[ENGLISH TOY TERRIER (BLACK &TAN), TERRIER ANG...","[terrier anglais d'agrement (noir et feu), eng...","[terrier anglais d'agrement (noir et feu), eng...",ENGLISH TOY TERRIER (BLACK &TAN)
139,ENTLEBUCHER SENNENHUND,https://fci.be/en/nomenclature/ENTLEBUCH-CATTL...,"[ENTLEBUCH CATTLE DOG, BOUVIER DE L'ENTLEBUCH,...","[bouvier de l'entlebuch, entlebuch cattle dog,...","[bouvier de l'entlebuch, entlebuch cattle dog,...",ENTLEBUCH CATTLE DOG
140,EPAGNEUL BLEU DE PICARDIE,https://fci.be/en/nomenclature/BLUE-PICARDY-SP...,"[BLUE PICARDY SPANIEL, EPAGNEUL BLEU DE PICARD...","[epagneul bleu de picardie, blue picardy spani...","[epagneul bleu de picardie, blue picardy spani...",BLUE PICARDY SPANIEL
141,EPAGNEUL BRETON,https://fci.be/en/nomenclature/BRITTANY-SPANIE...,"[BRITTANY SPANIEL, EPAGNEUL BRETON, BRETONISCH...","[spaniel bretón, bretonischer spaniel, brittan...","[spaniel breton, bretonischer spaniel, brittan...",BRITTANY SPANIEL


#### Info about Dog breeds from AKC


In [27]:
akc_dog_breed_groups_url = "https://www.akc.org/public-education/resources/general-tips-information/dog-breeds-sorted-groups/"

# get the html content of the website; the context manager closes the HTTP
# response as soon as the body has been read
with urlopen(akc_dog_breed_groups_url) as akc_response:
    akc_html_content = akc_response.read()

In [28]:
# Parse the downloaded html with BeautifulSoup, using the "lxml" parser
akc_soup = BeautifulSoup(akc_html_content, "lxml")

In [29]:
# keep only the anchors whose href mentions "dog-breeds": the stripped anchor
# text is the label we want and the href is the matching link
elements = akc_soup.find_all("a", href=True)
breed_anchors = [anchor for anchor in elements if "dog-breeds" in anchor.get("href")]
breed_list = [anchor.text.strip() for anchor in breed_anchors]
link_list = [anchor.get("href") for anchor in breed_anchors]

In [30]:
# pair every label with its link; a repeated label keeps the last link seen
breed_link_dict = dict(zip(breed_list, link_list))

# the same mapping, keyed by the case-folded label
breed_link_dict_casefolded = {}
for label, href in breed_link_dict.items():
    breed_link_dict_casefolded[label.casefold()] = href
breed_link_dict_casefolded

{'view all breeds': 'https://www.akc.org/dog-breeds/',
 'dog breeds': '/expert-advice/dog-breeds/',
 'list of breeds by group': 'https://www.akc.org/public-education/resources/general-tips-information/dog-breeds-sorted-groups/',
 'herding group': 'https://www.akc.org/dog-breeds/groups/herding/',
 'australian cattle dog': 'https://www.akc.org/dog-breeds/australian-cattle-dog/',
 'australian shepherd': 'https://www.akc.org/dog-breeds/australian-shepherd/',
 'bearded collie': 'https://www.akc.org/dog-breeds/bearded-collie/',
 'beauceron': 'https://www.akc.org/dog-breeds/beauceron/',
 'belgian laekenois': 'https://www.akc.org/dog-breeds/belgian-laekenois/',
 'belgian malinois': 'https://www.akc.org/dog-breeds/belgian-malinois/',
 'belgian sheepdog': 'https://www.akc.org/dog-breeds/belgian-sheepdog/',
 'belgian tervuren': 'https://www.akc.org/dog-breeds/belgian-tervuren/',
 'bergamasco': 'https://www.akc.org/dog-breeds/bergamasco-sheepdog/',
 'berger picard': 'https://www.akc.org/dog-breeds

In [31]:
# walk the labels in insertion order: a label containing "group", "stock" or
# "class" opens a new group, every other label is filed under the open group
# (labels seen before the first group are ignored)
group_marker = re.compile(r"group|stock|class")
group_breeds = {}
current_group = None
for breed in breed_link_dict_casefolded.keys():
    if group_marker.search(breed):
        current_group = breed
        group_breeds[current_group] = []
        continue
    if current_group is None:
        continue
    group_breeds[current_group].append(breed)

In [32]:
# create a dataframe with only 2 columns, one for the breed group and the other
# for the breeds in that group; explode() turns it into long format
# (one row per group/breed pair)
group_breeds_df = pd.DataFrame(group_breeds.items(), columns=["breed_group", "breed"])
akc_breeds_df = group_breeds_df.explode("breed").reset_index(drop=True)

# NOTE(review): the group names come from case-folded keys, so the upper-case
# " GROUP" never matches and the " group" suffix stays in the data. Left as is
# because later cells and the saved CSV already carry e.g. "herding group".
akc_breeds_df["breed_group"] = (
    akc_breeds_df["breed_group"].str.replace(" GROUP", "").str.lower()
)
akc_breeds_df["breed"] = akc_breeds_df["breed"].str.lower()

# drop the first and the last row. .copy() makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
akc_breeds_df = akc_breeds_df.iloc[1:-1, :].copy()

In [33]:
# look up each breed's link in the case-folded label -> link mapping
akc_breeds_df["links"] = akc_breeds_df["breed"].map(breed_link_dict_casefolded)

In [34]:
# first write of the AKC table; later cells write to this same path again
akc_breeds_df.to_csv("../data/akc_dog_breeds.csv", index=False)

In [33]:
def find_fuzzy_match(breed, threshold=90):
    """Find a fuzzy match for the breed in the fci breeds dataframe.

    Args:
        breed: breed name to look up; compared case-insensitively.
        threshold: a name matches when ``fuzz.ratio`` is strictly greater
            than this value.

    Returns:
        The index label of the first row of ``fci_breeds_trans_df`` that has an
        ``alt_names`` entry scoring above ``threshold``, or ``np.nan`` when no
        row qualifies.
    """
    target = breed.lower()  # loop-invariant, so computed once
    for index, alt_names in fci_breeds_trans_df["alt_names"].items():
        for alt_name in alt_names:
            if fuzz.ratio(target, alt_name.lower()) > threshold:
                return index
    return np.nan


def find_fci_index(breed):
    """Find the index of the breed in the fci breeds dataframe.

    An exact, case-insensitive hit in ``alt_names`` wins. Otherwise the fuzzy
    matcher is consulted and, on success, ``breed`` is recorded as an extra
    alternative name of the matched row.

    Args:
        breed: breed name to look up.

    Returns:
        The index label of the matching row, or ``np.nan`` if neither the exact
        nor the fuzzy lookup finds one.
    """
    target = breed.lower()
    exact_hits = fci_breeds_trans_df[
        fci_breeds_trans_df["alt_names"].apply(
            lambda names: target in (name.lower() for name in names)
        )
    ].index
    if len(exact_hits) > 0:
        return exact_hits[0]

    fuzzy_index = find_fuzzy_match(breed)
    # find_fuzzy_match signals "no match" with NaN, which `is not None` never
    # catches; without this guard .at[nan, ...] below raises KeyError
    if fuzzy_index is None or pd.isna(fuzzy_index):
        return np.nan

    known_names = fci_breeds_trans_df.at[fuzzy_index, "alt_names"]
    # alt_names holds sets after the accent cell, lists before it
    if isinstance(known_names, set):
        known_names.add(breed)
    else:
        known_names.append(breed)
    return fuzzy_index

In [33]:
breed_type_match = {}

# FCI group names; a group's number is its position in this list, counted from 1
fci_breed_groups = [
    "Sheepdogs and Cattledogs (except Swiss Cattledogs)",
    "Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs",
    "Terriers",
    "Dachshunds",
    "Spitz and primitive types",
    "Scent hounds and related breeds",
    "Pointing Dogs",
    "Retrievers - Flushing Dogs Water Dogs",
    "Companion and Toy Dogs",
    "Sighthounds",
]
fci_breeds = dict(enumerate(fci_breed_groups, start=1))

# distinct AKC group labels, in order of first appearance
akc_breed_groups = akc_breeds_df.breed_group.unique().tolist()
fci_breeds

{1: 'Sheepdogs and Cattledogs (except Swiss Cattledogs)',
 2: 'Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs',
 3: 'Terriers',
 4: 'Dachshunds',
 5: 'Spitz and primitive types',
 6: 'Scent hounds and related breeds',
 7: 'Pointing Dogs',
 8: 'Retrievers - Flushing Dogs Water Dogs',
 9: 'Companion and Toy Dogs',
 10: 'Sighthounds'}

#### Spitz Breed Group

Get the list of spitz breeds from this AKC article.

In [34]:
spitz_story_url = "https://www.akc.org/expert-advice/dog-breeds/spitz-dog-breeds/"

# download the article; the context manager closes the HTTP response afterwards
with urlopen(spitz_story_url) as spitz_response:
    spitz_html_content = spitz_response.read()

In [35]:
spitz_soup = BeautifulSoup(spitz_html_content, "lxml")

# collect the text of every link that carries the 'tag-set__item-link' class
elements = spitz_soup.find_all("a", class_="tag-set__item-link")
spitz_breeds = []
for tag_link in elements:
    spitz_breeds.append(tag_link.text)

display(spitz_breeds)

# rows of akc_breeds_df whose breed is one of those tags
is_spitz = akc_breeds_df.breed.isin(spitz_breeds)
akc_breeds_df[is_spitz]

['chow chow',
 'norwegian elkhound',
 'siberian husky',
 'samoyed',
 'pomeranian',
 'norwegian buhund',
 'norwich terrier',
 'alaskan malamute',
 'icelandic sheepdog',
 'swedish vallhund',
 'keeshond',
 'finnish lapphund',
 'akita',
 'shiba inu',
 'spitz breeds',
 'american eskimo dog']

Unnamed: 0,breed_group,breed,links
18,herding group,finnish lapphund,https://www.akc.org/dog-breeds/finnish-lapphund/
20,herding group,icelandic sheepdog,https://www.akc.org/dog-breeds/icelandic-sheep...
24,herding group,norwegian buhund,https://www.akc.org/dog-breeds/norwegian-buhund/
33,herding group,swedish vallhund,https://www.akc.org/dog-breeds/swedish-vallhund/
53,hound group,norwegian elkhound,https://www.akc.org/dog-breeds/norwegian-elkho...
81,toy group,pomeranian,https://www.akc.org/dog-breeds/pomeranian/
89,non-sporting group,american eskimo dog,https://www.akc.org/dog-breeds/american-eskimo...
94,non-sporting group,chow chow,https://www.akc.org/dog-breeds/chow-chow/
99,non-sporting group,keeshond,https://www.akc.org/dog-breeds/keeshond/
105,non-sporting group,shiba inu,https://www.akc.org/dog-breeds/shiba-inu/


#### Year breed was recognized by AKC

In [35]:
akc_breed_year_url = "https://www.akc.org/press-center/articles-resources/facts-and-stats/breeds-year-recognized/"

# download the page; the context manager closes the HTTP response afterwards
with urlopen(akc_breed_year_url) as akc_breed_year_response:
    akc_breed_year_html_content = akc_breed_year_response.read()

In [36]:
# parse the page with BeautifulSoup, using the "html.parser" backend
akc_breed_year_soup = BeautifulSoup(akc_breed_year_html_content, "html.parser")

In [37]:
# convert to etree
akc_breed_year_parsed_html = etree.HTML(akc_breed_year_html_content)

# elements with a text node that contains a non-breaking space.
# Exploratory only: the list below is built from the soup, not from these.
akc_breed_year_elements = akc_breed_year_parsed_html.xpath(
    "//*[contains(text(), '\u00A0')]"
)

# container with the class 'content-body__text-long'
akc_breed_year_element = akc_breed_year_soup.find(
    "div", class_="content-body__text-long"
)
if akc_breed_year_element is None:
    # fail with a readable message instead of "'NoneType' object is not iterable"
    raise ValueError(
        "no <div class='content-body__text-long'> found on the AKC breed-year page"
    )

# stripped text of every direct child of the container, skipping children with
# no text, removing non-breaking spaces and dropping the first two entries
year_breed_list = [
    child_text.replace("\xa0", "")
    for child_text in (child.text.strip() for child in akc_breed_year_element)
    if child_text
][2:]

In [38]:
# every entry reads "<year> – <breed>"; split on the first dash only so a breed
# name that itself contains a dash still yields exactly two fields
year_breed_df = pd.DataFrame(
    [entry.split("–", 1) for entry in year_breed_list], columns=["year", "breed"]
)

# trim both fields, make the year numeric and the breed lower-case,
# then put breed first
year_breed_df = year_breed_df.assign(
    year=year_breed_df["year"].str.strip().astype(int),
    breed=year_breed_df["breed"].str.strip().str.lower(),
)[["breed", "year"]]
year_breed_df

Unnamed: 0,breed,year
0,pointer,1878
1,retriever (chesapeake bay),1878
2,spaniel (clumber),1878
3,spaniel (cocker),1878
4,setter (english),1878
...,...,...
195,belgian laekenois,2020
196,biewer terrier,2021
197,bracco italiano,2022
198,mudi,2022


In [41]:
# akc_breeds_df

In [39]:
# helper that fuzzy-matches one breed name against a collection of candidates
def match_breed_name(name, choices, scorer=fuzz.token_sort_ratio):
    """Return ``(best_choice, score)`` for ``name`` among ``choices``.

    ``process.extractOne`` does the search with the given ``scorer``; any
    fields it returns after the first two are ignored.
    """
    best = process.extractOne(name, choices, scorer=scorer)
    return best[0], best[1]


# work on a copy so the match columns added below do not end up in akc_breeds_df
new_akc_df = akc_breeds_df.copy()
new_akc_df.sample(5)

Unnamed: 0,breed_group,breed,links
223,foundation stock service,bolognese,https://www.akc.org/dog-breeds/bolognese/
270,foundation stock service,swedish lapphund,https://www.akc.org/dog-breeds/swedish-lapphund/
103,non-sporting group,poodle,https://www.akc.org/dog-breeds/poodle/
210,miscellaneous class,portuguese podengo,https://www.akc.org/dog-breeds/portuguese-pode...
7,herding group,belgian sheepdog,https://www.akc.org/dog-breeds/belgian-sheepdog/


In [40]:
# fuzzy-match every AKC breed against the year table and store the best
# candidate and its score in two new columns
match_pairs = [
    match_breed_name(breed_name, year_breed_df["breed"])
    for breed_name in new_akc_df["breed"]
]
new_akc_df["closest_match"], new_akc_df["match_score"] = zip(*match_pairs)

# year-table breeds that were not picked as the closest match of any AKC breed
matched_names = set(new_akc_df["closest_match"].unique().tolist())
missed_breeds = set(year_breed_df["breed"].unique().tolist()) - matched_names

# fix the one that was missed
new_akc_df.loc[new_akc_df["breed"].str.contains("plott"), "closest_match"] = "plott"

In [41]:
pd.set_option("display.max_rows", 300)

# breeds in the miscellaneous class or the foundation stock service get an
# empty closest_match
in_misc_or_fss = new_akc_df["breed_group"].str.contains(
    r"miscellaneous class|foundation stock service"
)
new_akc_df.loc[in_misc_or_fss, "closest_match"] = ""

In [42]:
# attach the recognition year: left-join the AKC rows to the year table via
# closest_match, then drop the helper columns and restore the "breed" name
with_year = new_akc_df.merge(
    year_breed_df, left_on="closest_match", right_on="breed", how="left"
)
with_year = with_year.drop(["breed_y", "match_score", "closest_match"], axis=1)
merged_df = with_year.rename(columns={"breed_x": "breed"})

In [43]:
# show the merged table (display.max_rows was raised to 300 above)
merged_df

Unnamed: 0,breed_group,breed,links,year
0,herding group,australian cattle dog,https://www.akc.org/dog-breeds/australian-catt...,1980.0
1,herding group,australian shepherd,https://www.akc.org/dog-breeds/australian-shep...,1991.0
2,herding group,bearded collie,https://www.akc.org/dog-breeds/bearded-collie/,1976.0
3,herding group,beauceron,https://www.akc.org/dog-breeds/beauceron/,2007.0
4,herding group,belgian laekenois,https://www.akc.org/dog-breeds/belgian-laekenois/,2020.0
5,herding group,belgian malinois,https://www.akc.org/dog-breeds/belgian-malinois/,1959.0
6,herding group,belgian sheepdog,https://www.akc.org/dog-breeds/belgian-sheepdog/,1912.0
7,herding group,belgian tervuren,https://www.akc.org/dog-breeds/belgian-tervuren/,1959.0
8,herding group,bergamasco,https://www.akc.org/dog-breeds/bergamasco-shee...,2015.0
9,herding group,berger picard,https://www.akc.org/dog-breeds/berger-picard/,2015.0


In [44]:
# overwrite the AKC csv with the table that now carries the "year" column
merged_df.to_csv("../data/akc_dog_breeds.csv", index=False)
# merged_df.sort_values("breed")

#### AKC physical traits

In [45]:
# breed page links, in row order, for the scraping loop further down
akc_links = akc_breeds_df["links"].tolist()

In [46]:
def get_breed_info(driver, link):
    """Scrape name, temperament and size figures from one AKC breed page.

    Args:
        driver: Selenium WebDriver used to load the page.
        link: URL of the breed page.

    Returns:
        A ``defaultdict(str)`` that may contain the keys ``"breed"``,
        ``"temperment"``, ``"height"``, ``"weight"`` and ``"life_expectancy"``.
        A key is absent when its element is missing or its text does not match;
        every failure is printed rather than raised, so one bad page does not
        stop a scraping loop.
    """
    breed_metadata = defaultdict(str)
    driver.get(link)

    try:
        breed = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.page-header__title"))
        )
    except TimeoutException as e:
        # WebDriverWait.until raises TimeoutException, not NoSuchElementException
        print("Timed out waiting for the breed title", link, e)
        return breed_metadata
    breed_metadata["breed"] = breed.text

    # temperment; looked up on its own so a page without it still yields sizes
    try:
        temperment = driver.find_element(
            By.CSS_SELECTOR, "p.breed-page__intro__temperment"
        )
        breed_metadata["temperment"] = temperment.text
    except NoSuchElementException as e:
        print("No such element", e)

    # height, weight, life expectancy: each icon block is checked against the
    # patterns in this order and stored under the first one that matches
    unit_patterns = (
        ("height", r"(\d+)(?:\s*-\s*(\d+))?\s*inches?"),
        ("weight", r"(\d+)(?:\s*-\s*(\d+))?\s*pounds?"),
        ("life_expectancy", r"(\d+)(?:\s*-\s*(\d+))?\s*years?"),
    )
    elements = driver.find_elements(
        By.CSS_SELECTOR, "div.breed-page__hero__overview__icon-block"
    )
    for ele in elements:
        try:
            text = ele.find_element(By.TAG_NAME, "p").text
        except NoSuchElementException as e:
            print("No such element", e)
            continue
        for key, unit_pattern in unit_patterns:
            found = re.search(unit_pattern, text)
            if found:
                breed_metadata[key] = found.group(0)
                break

    return breed_metadata

In [49]:
my_d = start_driver()
breed_data_driver = partial(get_breed_info, my_d)

# scrape every breed page with the same browser session
list_of_dicts = []
try:
    for link in tqdm(akc_links):
        list_of_dicts.append(breed_data_driver(link))
finally:
    # close the browser even if one of the pages raises
    my_d.quit()

100%|██████████| 280/280 [10:15<00:00,  2.20s/it]


In [54]:
akc_physical_traits = pd.DataFrame(list_of_dicts)
akc_physical_traits["breed"] = akc_physical_traits["breed"].str.lower()

# left-join the scraped traits onto the AKC table by breed name.
# - trait columns left over from an earlier run of this cell are dropped first,
#   so re-running does not create *_x / *_y duplicates
# - only one trait row per breed is kept, so the join cannot multiply AKC rows
trait_columns = [col for col in akc_physical_traits.columns if col != "breed"]
merged_df = merged_df.drop(columns=trait_columns, errors="ignore").merge(
    akc_physical_traits.drop_duplicates(subset="breed"), on="breed", how="left"
)

In [55]:
# overwrite the AKC csv with the table that now includes the scraped traits
merged_df.to_csv("../data/akc_dog_breeds.csv", index=False)
# merged_df.merge(akc_physical_traits, on="breed", how="left")

In [144]:
df = pd.DataFrame()
df = pd.read_csv("../data/akc_dog_breeds.csv")

df["temperment"].fillna("").str.lower()
# .str.split(" / |, | and | but ", expand=True).stack().value_counts()

from gensim import corpora
from gensim.models import LdaModel

# Step 1: Preprocess the text
df["temperment_words"] = (
    df["temperment"]
    .fillna("")
    .str.lower()
    .str.split(" / |, | and | but ", regex=True)
    # .str.replace("[^\w\s]", "", regex=True)
    # .apply(lambda x: re.split("/|,|and|but", x))
    # .apply(lambda x: x[0].split())
)

# Step 2: Create a dictionary and a corpus
dictionary = corpora.Dictionary(df["temperment_words"])
corpus = [dictionary.doc2bow(text) for text in df["temperment_words"]]

# Step 3: Train an LDA model
lda_model = LdaModel(
    corpus, num_topics=6, id2word=dictionary, passes=10
)  # choose the number of topics


# Step 4: Calculate the dominant topic for each dog
def dominant_topic(words):
    """Return the id of the most probable LDA topic for one list of tokens.

    The tokens are turned into a bag-of-words with the module-level
    ``dictionary`` and scored with the module-level ``lda_model``.
    """
    doc_topics = lda_model.get_document_topics(dictionary.doc2bow(words))
    # Each entry is indexed as (topic_id, probability); keep the likeliest one.
    best_entry = max(doc_topics, key=lambda entry: entry[1])
    return best_entry[0]


# Label every breed with the topic its temperament words most likely belong to.
df["dominant_topic"] = df["temperment_words"].map(dominant_topic)

# Step 5: Group the dogs based on their dominant topics
df_grouped = df.groupby(by="dominant_topic")

In [146]:
# Write the table, now including the "temperment_words" and "dominant_topic"
# columns, back to the same CSV path it was read from (the file is overwritten).
df.to_csv("../data/akc_dog_breeds.csv", index=False)

# To inspect the topic assignment, sort df by "dominant_topic" or count the
# rows of each group in df_grouped.

#### Match AKC breeds against FCI breeds


In [3]:
# Preview the saved FCI breed / translation table.
# The frame is only displayed here; it is not bound to a variable.
pd.read_csv("../data/fci_breeds_trans.csv")

Unnamed: 0,breed,link,translations,alt_names,no_accent,breed_en
0,AFFENPINSCHER,https://fci.be/en/nomenclature/AFFENPINSCHER-1...,"['AFFENPINSCHER', 'AFFENPINSCHER', 'AFFENPINSC...",{'affenpinscher'},"['affenpinscher', 'affenpinscher', 'affenpinsc...",AFFENPINSCHER
1,AFGHAN HOUND,https://fci.be/en/nomenclature/AFGHAN-HOUND-22...,"['AFGHAN HOUND', 'LEVRIER AFGHAN', 'AFGHANISCH...","{'afghanischer windhund', 'levrier afghan', 'a...","['afghanischer windhund', 'afghan hound', 'lev...",AFGHAN HOUND
2,AÏDI (CHIEN DE MONTAGNE DE L'ATLAS),https://fci.be/en/nomenclature/ATLAS-MOUNTAIN-...,"['ATLAS MOUNTAIN DOG (AIDI)', ""CHIEN DE MONTAG...","{'atlas-berghund (aïdi)', ""aidi (chien de mont...","['perro de montana del atlas (aidi)', 'atlas m...",ATLAS MOUNTAIN DOG (AIDI)
3,AIREDALE TERRIER,https://fci.be/en/nomenclature/AIREDALE-TERRIE...,"['AIREDALE TERRIER', 'AIREDALE TERRIER', 'AIRE...",{'airedale terrier'},"['airedale terrier', 'airedale terrier', 'aire...",AIREDALE TERRIER
4,AKITA,https://fci.be/en/nomenclature/AKITA-255.html,"['AKITA', 'AKITA', 'AKITA', 'AKITA']",{'akita'},"['akita', 'akita', 'akita', 'akita', 'akita', ...",AKITA
...,...,...,...,...,...,...
351,YORKSHIRE TERRIER,https://fci.be/en/nomenclature/YORKSHIRE-TERRI...,"['YORKSHIRE TERRIER', 'TERRIER DU YORKSHIRE', ...","{'terrier du yorkshire', 'yorkshire terrier'}","['terrier du yorkshire', 'yorkshire terrier', ...",YORKSHIRE TERRIER
352,YUZHNORUSSKAYA OVCHARKA,https://fci.be/en/nomenclature/SOUTH-RUSSIAN-S...,"['SOUTH RUSSIAN SHEPHERD DOG', 'BERGER DE RUSS...","{'perro de pastor de rusia meridional', 'sudru...","['south russian shepherd dog', 'berger de russ...",SOUTH RUSSIAN SHEPHERD DOG
353,ZAPADNO-SIBIRSKAÏA LAÏKA,https://fci.be/en/nomenclature/WEST-SIBERIAN-L...,"['WEST SIBERIAN LAIKA', 'LAIKA DE SIBERIE OCCI...","{'westsibirischer laika', 'zapadno-sibirskaïa ...","['laika de siberie occidentale', 'zapadno-sibi...",WEST SIBERIAN LAIKA
354,ZWERGPINSCHER,https://fci.be/en/nomenclature/MINIATURE-PINSC...,"['MINIATURE PINSCHER', 'PINSCHER NAIN', 'ZWERG...","{'zwergpinscher', 'miniature pinscher', 'pinsc...","['pinscher miniatura', 'pinscher nain', 'zwerg...",MINIATURE PINSCHER


In [46]:
# Minimum fuzzy-match score (exclusive) for an AKC breed to be attached to an
# FCI standard. The code has always used 80; the earlier comment said 90.
MATCH_THRESHOLD = 80

# English FCI breed name -> list of that breed's alternative names.
# Each value is copied into a fresh list so that appending AKC names further
# down does not modify the objects stored inside fci_breeds_trans_df
# (which would make re-running this cell add the same names again).
standard_dict = {
    row["breed_en"]: list(row["alt_names"])
    for _, row in fci_breeds_trans_df.iterrows()
}

# Flat pool of every FCI alternative name: the candidates for fuzzy matching.
all_fci_names = list(it.chain.from_iterable(fci_breeds_trans_df["alt_names"]))

# Reverse lookup: alternative name -> standard name. setdefault keeps the
# first standard that lists a name, matching the previous "first hit" scan.
name_to_standard = {}
for standard_name, alt_names in standard_dict.items():
    for alt_name in alt_names:
        name_to_standard.setdefault(alt_name, standard_name)

# find the closest match for each breed in the akc breeds dataframe
# (match_breed_name is unpacked as a (closest_match, match_score) pair)
akc_breeds_df["closest_match"], akc_breeds_df["match_score"] = zip(
    *akc_breeds_df["breed"].apply(
        lambda x: match_breed_name(x, all_fci_names, scorer=fuzz.token_set_ratio)
    )
)

# Rows scoring above MATCH_THRESHOLD get the standard that owns their closest
# match; all other rows (and names owned by no standard) are left as NaN.
is_confident_match = akc_breeds_df["match_score"] > MATCH_THRESHOLD
akc_breeds_df["standard"] = akc_breeds_df.loc[
    is_confident_match, "closest_match"
].map(name_to_standard)

akc_matches = akc_breeds_df[akc_breeds_df["standard"].notnull()]

# Register each matched AKC breed name as another alternative name of its standard.
for _, row in akc_matches.iterrows():
    standard_dict[row["standard"]].append(row["breed"])

In [72]:
# AKC breed names that were not assigned any standard.
unmatched_akc = akc_breeds_df.loc[akc_breeds_df["standard"].isna(), "breed"].tolist()

In [73]:
# Convert each standard's alternative names to a set, dropping duplicates.
breed_standards = {k: set(v) for k, v in standard_dict.items()}
# Every unmatched AKC breed becomes its own standard with itself as the only name.
for breed in unmatched_akc:
    breed_standards[breed] = {breed}

# save the breed_standards dictionary to disk as pickle file
with open("../data/breed_standards.pkl", "wb") as f:
    pickle.dump(breed_standards, f)

In [6]:
# Reload the breed-standards mapping from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code, so only load a file
# that this project produced itself.
with open("../data/breed_standards.pkl", "rb") as f:
    breed_standards = pickle.load(f)

# Display the loaded mapping.
breed_standards

{'AFFENPINSCHER': {'affenpinscher'},
 'AFGHAN HOUND': {'afghan hound',
  'afghanischer windhund',
  'lebrel afgano',
  'levrier afghan'},
 'ATLAS MOUNTAIN DOG (AIDI)': {'atlas mountain dog (aidi)',
  'atlas-berghund (aïdi)',
  "aïdi (chien de montagne de l'atlas)",
  "chien de montagne de l'atlas (aïdi)",
  'perro de montaña del atlas (aïdi)'},
 'AIREDALE TERRIER': {'airedale terrier'},
 'AKITA': {'akita'},
 'ALASKAN MALAMUTE': {'alaskan malamute',
  'malamute de alaska',
  "malamute de l'alaska"},
 'ALPINE DACHSBRACKE': {'alpenländische dachsbracke',
  'alpine dachsbracke',
  'basset des alpes',
  'dachsbracke de los alpes'},
 'AMERICAN AKITA': {'akita americain',
  'akita americano',
  'american akita',
  'amerikanischer akita'},
 'AMERICAN COCKER SPANIEL': {'american cocker spaniel',
  'amerikanischer cocker spaniel',
  'cocker américain',
  'cocker spaniel',
  'cocker spaniel americano'},
 'AMERICAN FOXHOUND': {'american foxhound'},
 'AMERICAN STAFFORDSHIRE TERRIER': {'american sta

In [7]:
# Number of keys (standards) in the mapping.
len(breed_standards)

398

In [25]:
def find_query(query, breed_standards=breed_standards):
    """Print every standard that has at least one name matching ``query``.

    ``query`` is compiled as a case-insensitive regular expression and searched
    for in each name of each entry of ``breed_standards``. For every hit the
    key is printed, followed by its collection of names; if nothing matches,
    "No results found." is printed. Nothing is returned.
    """
    matcher = re.compile(query, re.IGNORECASE)
    hits = []
    for standard, names in breed_standards.items():
        if any(matcher.search(name) for name in names):
            hits.append(standard)

    if not hits:
        print("No results found.")
        return

    for standard in hits:
        print(standard)
        print(breed_standards[standard])


find_query("bulldog")

FRENCH BULLDOG
{'french bulldog', 'bulldog', 'bulldog francés', 'französische bulldogge', 'bouledogue français'}
BULLDOG
{'bulldog', 'american bulldog'}
CONTINENTAL BULLDOG
{'bulldog continental', 'continental bulldog'}


In [64]:
# Display the full mapping again (the pasted output below is cut off).
breed_standards

{'AFFENPINSCHER': {'affenpinscher'},
 'AFGHAN HOUND': {'afghan hound',
  'afghanischer windhund',
  'lebrel afgano',
  'levrier afghan'},
 'ATLAS MOUNTAIN DOG (AIDI)': {'atlas mountain dog (aidi)',
  'atlas-berghund (aïdi)',
  "aïdi (chien de montagne de l'atlas)",
  "chien de montagne de l'atlas (aïdi)",
  'perro de montaña del atlas (aïdi)'},
 'AIREDALE TERRIER': {'airedale terrier'},
 'AKITA': {'akita'},
 'ALASKAN MALAMUTE': {'alaskan malamute',
  'malamute de alaska',
  "malamute de l'alaska"},
 'ALPINE DACHSBRACKE': {'alpenländische dachsbracke',
  'alpine dachsbracke',
  'basset des alpes',
  'dachsbracke de los alpes'},
 'AMERICAN AKITA': {'akita americain',
  'akita americano',
  'american akita',
  'amerikanischer akita'},
 'AMERICAN COCKER SPANIEL': {'american cocker spaniel',
  'amerikanischer cocker spaniel',
  'cocker américain',
  'cocker spaniel',
  'cocker spaniel americano'},
 'AMERICAN FOXHOUND': {'american foxhound'},
 'AMERICAN STAFFORDSHIRE TERRIER': {'american sta