In [1]:
from itertools import combinations
import itertools as it
from bs4 import BeautifulSoup
from urllib.request import urlopen
import lxml
from lxml import etree
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from collections import defaultdict
import string
import unicodedata
import pickle

import json
from translate_app import translate_list_to_dict


from contextlib import contextmanager

import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from thefuzz import process
from thefuzz import fuzz

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

In [119]:
from gensim import corpora
from gensim import models

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [128]:
def start_driver():
    """Create and return a Chrome WebDriver that runs in headless mode."""
    headless_options = Options()
    headless_options.add_argument("--headless")
    return webdriver.Chrome(options=headless_options)

In [2]:
def remove_accents(input_str):
    """Return ``input_str`` without accents.

    The text is decomposed with NFKD normalization and then encoded to ASCII
    with errors ignored, so combining marks (and any other non-ASCII
    character) are dropped.
    """
    decomposed = unicodedata.normalize("NFKD", input_str)
    ascii_bytes = decomposed.encode("ASCII", "ignore")
    return ascii_bytes.decode()


remove_accents("résuméö")

'resumeo'

In [3]:
# match a breed name against a collection of candidate names using thefuzz
def match_breed_name(name, choices, scorer=fuzz.token_sort_ratio):
    """Return the entry of ``choices`` closest to ``name`` and its score.

    Args:
        name: Breed name to look up.
        choices: Candidate names: a list-like, a dict, or a pandas Series.
        scorer: thefuzz scoring function handed to ``process.extractOne``.

    Returns:
        ``(best_match, score)``. An exact hit short-circuits to ``(name, 100)``.
    """
    # `in` on a dict or a pandas Series tests the keys / index labels, not the
    # stored names, so look at the values for those containers.
    candidates = getattr(choices, "values", choices)
    if callable(candidates):  # dict-like: .values is a method
        candidates = candidates()
    if name in candidates:
        return name, 100
    # extractOne returns a third item (the key) for dict-like choices
    best_match, score, *_ = process.extractOne(name, choices, scorer=scorer)
    return best_match, score

#### Info about Zurich districts

In [None]:
# URL of the page describing Zurich's districts
zurich_districts_url = "https://www.zuerich.com/en/visit/about-zurich/zurichs-districts"

# download the raw HTML; the context manager closes the connection afterwards
with urlopen(zurich_districts_url) as zurich_response:
    zurich_html_content = zurich_response.read()

In [None]:
# build a BeautifulSoup tree from the downloaded page using the lxml parser
zurich_soup = BeautifulSoup(zurich_html_content, features="lxml")

In [None]:
# select all elements whose id is exactly 's-1' ... 's-12'.
# BeautifulSoup applies a compiled pattern with search(), so the pattern is
# anchored; otherwise ids such as 's-13' or 's-100' would match as well.
pattern = re.compile(r"^s-(?:[1-9]|1[0-2])$")
elements = zurich_soup.find_all(id=pattern)

In [None]:
# Map each district heading (<h2>) to its description paragraph (<p>).
districts = {element.find("h2").text: element.find("p").text for element in elements}
districts_df = pd.DataFrame.from_dict(
    districts, orient="index", columns=["desc"]
).reset_index()

# The heading is expected to look like "<district number> – <district name>".
# Split only on the first en dash so a dash inside the name cannot create
# extra columns.
heading_parts = (
    districts_df["index"]
    .str.split("–", n=1, expand=True)
    .rename({0: "district_number", 1: "district_name"}, axis=1)
)
districts_df = heading_parts.join(districts_df).drop("index", axis=1)

# regex capturing the digits of the district number
regex_pattern = re.compile(r"([\d]+)")

# keep only the numeric part of the heading, as a categorical column
districts_df["district"] = (
    districts_df["district_number"]
    .str.strip()
    .str.extract(regex_pattern, expand=False)
    .astype("category")
)
districts_df = districts_df.drop("district_number", axis=1)

# strip the whitespace from the text columns
districts_df["district_name"] = districts_df["district_name"].str.strip()
districts_df["desc"] = districts_df["desc"].str.strip()

# Add column for the length of the desc
districts_df["desc_length"] = districts_df["desc"].str.len()

# rich display instead of print()
districts_df

In [None]:
# summarise the columns, dtypes and non-null counts of the districts dataframe
districts_df.info()

In [None]:
# create a styler object that wraps long cell text when rendered
styler = districts_df.style.set_properties(**{"white-space": "pre-wrap"})

# the description column is called "desc"; the previous key "description"
# matched no column, so the formatter was never applied
formatted_df = styler.format({"desc": lambda x: x})
formatted_df

In [None]:
# persist the districts table without the index column
districts_df.to_csv("../data/zurich_districts.csv", index=False)

Info on the population of the districts of Zurich from Wikipedia; note that the figures are from 2005.

In [109]:
# Wikipedia page listing the subdivisions of Zurich
subdivisions_of_zurich_url = "https://en.wikipedia.org/wiki/Subdivisions_of_Z%C3%BCrich"

# read_html returns a list of dataframes, one per table it finds on the page
table = pd.read_html(subdivisions_of_zurich_url)
type(table)

list

In [110]:
# get the html content; the context manager closes the connection afterwards
with urlopen(subdivisions_of_zurich_url) as sub_zurich_response:
    sub_zurich_html_content = sub_zurich_response.read()

sub_zurich_soup = BeautifulSoup(sub_zurich_html_content, "lxml")

# select the second table on the page
table = sub_zurich_soup.find_all("table")[1]

# find() without arguments returns the first tag nested inside the table
# (presumably its <tbody> — confirm against the page markup)
table_body = table.find()

In [112]:
# Collect, per table row, the district label, its neighborhoods and their
# populations, keyed by the row position.
sub_zurich_dict = defaultdict(dict)

# keep only children that carry more than one character of text
rows = [child for child in table_body.children if len(child.text.strip()) > 1]
for i, row in enumerate(rows[1:], 1):  # rows[0] is skipped
    cells = [cell.text.strip() for cell in row.children if len(cell.text.strip()) > 1]
    if len(cells) <= 5:
        continue
    sub_zurich_dict[i].update(
        district=cells[0],
        neighborhoods=cells[1].split("\n\n\n"),
        population=cells[5].replace(",", "").split("\n\n\n"),
    )

neighborhood_df = pd.DataFrame.from_dict(sub_zurich_dict).T

# zero-width split point between a digit and the capital letter that follows it
regex_pattern = re.compile(r"(?<=\d)(?=[A-Z])")

neighborhood_df["district_number"] = (
    neighborhood_df["district"]
    .str.split(regex_pattern)
    .apply(lambda parts: parts[0])
)

# one row per neighborhood and one row per population figure, aligned on the
# shared (repeated) row index
district_neighborhoods = neighborhood_df[["district", "neighborhoods"]].explode(
    "neighborhoods"
)
district_populations = neighborhood_df[["district_number", "population"]].explode(
    "population"
)
neighborhoods_df = pd.concat([district_neighborhoods, district_populations], axis=1)

neighborhoods_df["population"] = neighborhoods_df["population"].astype("int")
neighborhoods_df["population"].sum()

366809

In [113]:
# Population figures bracketing the growth period
start_value = 366_809  # taken from https://en.wikipedia.org/wiki/Subdivisions_of_Z%C3%BCrich
end_value = 391_400  # taken from https://worldpopulationreview.com/world-cities/zurich-population
years = 2014 - 2004

# compound annual growth rate: (end / start) ** (1 / years) - 1
growth_ratio = end_value / start_value
cagr = growth_ratio ** (1 / years) - 1

print(f"Compounded annual growth rate: {cagr:.2%}")

Compounded annual growth rate: 0.65%


In [115]:
# project each neighborhood's population forward with the compound growth
# rate, over the same number of years that was used to derive the rate
adjusted_population = (
    neighborhoods_df["population"] * (1 + cagr) ** years
).astype(int)
neighborhoods_df["adj_population"] = adjusted_population
neighborhoods_df.to_csv("../data/zurich_neighborhoods.csv", index=False)

#### Info for Dog breeds from hunde-zauber.de

In [4]:
# page listing dog breeds from A to Z
hz_url = "https://hunde-zauber.de/liste-aller-hunderassen-von-a-bis-z/"

# download the page; the context manager closes the connection afterwards
with urlopen(hz_url) as hz_response:
    hz_html_content = hz_response.read()
hz_soup = BeautifulSoup(hz_html_content, "lxml")

In [5]:
# body of the first table on the page
breed_table = hz_soup.find("table").find("tbody")

# text of every cell, row by row
table_cells = (
    cell for row in breed_table.find_all("tr") for cell in row.find_all("td")
)
breed_names = [cell.text.strip() for cell in table_cells]

# keep whatever follows the first "<digit>." in each cell, stripped of
# whitespace; cells without such a prefix are skipped
pattern = re.compile(r"(?<=\d\.)(.*)")
name_matches = (pattern.search(cell_text) for cell_text in breed_names)
breeds = [found.group().strip() for found in name_matches if found]
sorted(breeds)

['Affenpinscher',
 'Afghanischer Windhund',
 'Aidi',
 'Airedale Terrier',
 'Akita Inu',
 'Alano Español',
 'Alaskan Husky',
 'Alpenländische Dachsbracke',
 'Altdeutscher Schäferhund',
 'Altdänischer Vorstehhund',
 'American Akita',
 'American Bulldog',
 'American Eskimo Dog',
 'American Foxhound',
 'American Hairless Terrier',
 'American Leopard Hound',
 'American Pit Bull Terrier',
 'American Staffordshire Terrier',
 'American Staghound',
 'American Toy Terrier',
 'American Water Spaniel',
 'Amerikanischer Cocker Spaniel',
 'Anatolischer Hirtenhund',
 'Anglo-Français de petite vénerie',
 'Appenzeller Sennenhund',
 'Ariégeois',
 'Armant',
 'Australian Cattle Dog',
 'Australian Kelpie',
 'Australian Shepherd',
 'Australian Silky Terrier',
 'Australian Terrier',
 'Azawakh',
 'Barbet',
 'Barsoi',
 'Basenji',
 'Basset Hound',
 'Basset artésien normand',
 'Basset bleu de Gascogne',
 'Basset fauve de Bretagne',
 'Bayerischer Gebirgsschweißhund',
 'Beagle',
 'Beagle-Harrier',
 'Bearded Collie

In [6]:
# translate_list_to_dict is a project helper; judging by the output below it
# maps each scraped breed name to an English name — confirm in translate_app
translated_breeds = translate_list_to_dict(breeds)
translated_breeds

{'Affenpinscher': 'Affenpinscher',
 'English Pointer': 'English Pointer',
 'Moskauer Wachhund': 'Moscow watchdog',
 'Afghanischer Windhund': 'Afghan Hound',
 'English Setter': 'English Setters',
 'Mucuchies': 'Mucuchies',
 'Aidi': 'Aidi',
 'English Shepherd': 'English Shepherd',
 'Mudhol Hound': 'Mudhol Hound',
 'Airedale Terrier': 'Airedale Terriers',
 'English Springer Spaniel': 'English Springer Spaniel',
 'Mudi': 'Mudi',
 'Akita Inu': 'Akita Inu',
 'English Toy Terrier': 'English Toy Terrier',
 'Neufundländer': 'Newfoundland',
 'Alano Español': 'Alano Spanish',
 'Entlebucher Sennenhund': 'Entlebuch Mountain Dog',
 'Norfolk Terrier': 'Norfolk Terriers',
 'Alaskan Husky': 'Alaskan Husky',
 'Epagneul Bleu de Picardie': 'Epagneul Bleu de Picardie',
 'Norrbottenspitz': 'Norrbottenspitz',
 'Alpenländische Dachsbracke': 'Alpine Dachsbracke',
 'Epagneul Breton': 'Epagneul Breton',
 'Norwegischer Buhund': 'Norwegian Buhund',
 'Altdänischer Vorstehhund': 'Old Danish pointer',
 'Epagneul de P

In [7]:
# same mapping as translated_breeds, ordered alphabetically by the original name
breeds_422 = dict(sorted(translated_breeds.items()))

# breeds_422 is the master list of breeds; turn it into a two-column dataframe
breeds_df = pd.DataFrame(breeds_422.items(), columns=["breed_de", "breed_en"])
breeds_df

Unnamed: 0,breed_de,breed_en
0,Affenpinscher,Affenpinscher
1,Afghanischer Windhund,Afghan Hound
2,Aidi,Aidi
3,Airedale Terrier,Airedale Terriers
4,Akita Inu,Akita Inu
...,...,...
417,Zwergpinscher,Miniature Pinscher
418,Zwergschnauzer,Miniature Schnauzer
419,Zwergspitz,Pomeranian
420,Österreichischer Kurzhaarpinscher,Austrian Shorthaired Pinscher


#### Info for Dog breeds from FCI

In [None]:
fci_url = "https://www.fci.be/en/Nomenclature/educationGroupe.aspx"

# download the page; the context manager closes the connection afterwards
with urlopen(fci_url) as fci_response:
    fci_html_content = fci_response.read()

# parse into an lxml element tree so that XPath queries can be used
fci_parsed_html = etree.HTML(fci_html_content)

In [None]:
# every element carrying class "nom": its text mapped to its href attribute
elements = fci_parsed_html.xpath("//*[@class='nom']")
breed_groups = {element.text: element.get("href") for element in elements}

In [None]:
# one row per scraped entry: the label in "breed", its href in "link"
fci_breeds_df = pd.DataFrame.from_dict(breed_groups, orient="index", columns=["link"])
fci_breeds_df = fci_breeds_df.rename_axis("breed").reset_index()

# regex for the content of the right-most pair of brackets
regex_pattern = re.compile(r"\((?=[^()]*\))([^()]+)\)$")

fci_breeds_df.sample(3)

In [None]:
# split each label once at the first "(": the text before it goes to
# breed_orig, the remainder (if any) to breed_en
fci_breeds_df[["breed_orig", "breed_en"]] = fci_breeds_df["breed"].str.split(
    "(", n=1, expand=True
)
fci_breeds_df.sample(3)

It turned out that this webpage only listed 33 breeds, although it was reached from the "all breeds" link.

In [None]:
# letters, dots, hyphens and whitespace that precede a closing bracket
only_letters_pattern = r"\(?([A-Za-z-\.\s]+)\)"
extracted_en = fci_breeds_df["breed_en"].str.extract(only_letters_pattern, expand=False)

# fall back to the original name where nothing could be extracted
fci_breeds_df["breed_en"] = extracted_en.fillna(fci_breeds_df["breed_orig"])

# normalise both name columns: trimmed and lower-case
for name_column in ("breed_orig", "breed_en"):
    fci_breeds_df[name_column] = fci_breeds_df[name_column].str.strip().str.lower()

In [None]:
# prefix every scraped href with the FCI host name
fci_breeds_df["weblink"] = fci_breeds_df["link"].map(lambda href: "www.fci.be" + href)

In [None]:
# persist the FCI breed table without the index column
fci_breeds_df.to_csv("../data/fci_dog_breeds.csv", index=False)

#### Get all the breeds from the FCI individually

In [None]:
def get_fci_breeds(driver, link):
    """Scrape the breeds listed on the FCI website, letter by letter.

    Opens ``link``, clicks each initial letter in turn, then each breed listed
    under that letter, and reads the first <span> of every data row of the
    breed page's table (the breed name in other languages).

    Args:
        driver: Selenium WebDriver to use. It is quit before this function
            returns or raises, so it cannot be reused afterwards.
        link: URL of the page that shows the initial letters.

    Returns:
        A list of ``(breed_text, breed_href, translations)`` tuples, where
        ``translations`` is the list of names read from the breed's table.
    """
    name_link_list = []

    try:
        driver.get(link)

        # Wait for the letters to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "initiales"))
        )

        letters = driver.find_element(By.CLASS_NAME, "initiales")

        for n, letter in tqdm(enumerate(letters.find_elements(By.TAG_NAME, "a"))):
            try:
                # click first on the letter
                letter.click()
            except StaleElementReferenceException:
                # the element is no longer attached to the DOM so find it again
                letters = driver.find_element(By.CLASS_NAME, "initiales")
                letter = letters.find_elements(By.TAG_NAME, "a")[n]
                letter.click()

            # Wait for the breeds to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "listeraces"))
            )

            breeds = driver.find_element(By.CLASS_NAME, "listeraces")

            breed_text = None
            breed_ref = None
            for n2, breed in tqdm(enumerate(breeds.find_elements(By.TAG_NAME, "a"))):
                try:
                    breed_text = breed.text
                    breed_ref = breed.get_attribute("href")
                    # click on the breed
                    breed.click()
                except StaleElementReferenceException:
                    # navigating back invalidates the old references, so
                    # look the n2-th breed link up again
                    breeds = driver.find_element(By.CLASS_NAME, "listeraces")
                    breed = breeds.find_elements(By.TAG_NAME, "a")[n2]
                    breed_text = breed.text
                    breed_ref = breed.get_attribute("href")
                    breed.click()

                # wait for that breed's page to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "racesgridview"))
                )
                table = driver.find_element(By.CLASS_NAME, "racesgridview")

                # get the other languages' translations of the breed name:
                # first <span> of every row after the header row
                rows = table.find_elements(By.TAG_NAME, "tr")
                breed_translations = [
                    row.find_elements(By.TAG_NAME, "span")[0].text for row in rows[1:]
                ]

                # add the breed, link, and translations to the list
                name_link_list.append((breed_text, breed_ref, breed_translations))
                # go back to the previous page with the breeds
                driver.back()
    finally:
        # always release the browser, even when a wait times out part-way
        driver.quit()

    return name_link_list

In [None]:
# driver = webdriver.Chrome()

# page passed to get_fci_breeds as the starting point of the crawl
fci_nonmenclature_url = "https://fci.be/en/Nomenclature/Default.aspx"

In [None]:
# This cell gets the fci list of breeds and does it letter by letter
# can take up to 10 minutes to run
# get_fci_breeds quits the driver itself when it is done
my_d = start_driver()
fci_list = get_fci_breeds(my_d, fci_nonmenclature_url)

In [None]:


# cache the scraped list on disk so the slow crawl does not have to be repeated
with open("../data/fci_list.json", "w") as f:
    json.dump(fci_list, f)

In [93]:
# reload the cached crawl result; JSON stores the tuples as lists
with open("../data/fci_list.json", "r") as f:
    loaded_list = json.load(f)

# create a dataframe with the information of the breeds
# pd.DataFrame(loaded_list, columns=["breed", "link", "translations"])

In [94]:
fci_breeds_trans_df = pd.DataFrame(
    loaded_list, columns=["breed", "link", "translations"]
)
fci_breeds_trans_df.sample(3)


def _lowercase_unique_names(record):
    """Translations plus the breed label, de-duplicated and then lower-cased."""
    unique_names = set(record["translations"] + [record["breed"]])
    return [name.lower() for name in unique_names]


# alt_names: every known spelling of the breed in lower case
fci_breeds_trans_df["alt_names"] = fci_breeds_trans_df.apply(
    _lowercase_unique_names, axis=1
)

In [95]:
# accent-free variant of every alternative name
fci_breeds_trans_df["no_accent"] = fci_breeds_trans_df["alt_names"].apply(
    lambda names: [remove_accents(name) for name in names]
)

# the first entry of the translations list is used as the English name
fci_breeds_trans_df["breed_en"] = fci_breeds_trans_df["translations"].apply(
    lambda translations: translations[0]
)

# merge both name lists and de-duplicate them into a set
fci_breeds_trans_df["alt_names"] = (
    fci_breeds_trans_df["alt_names"] + fci_breeds_trans_df["no_accent"]
).transform(set)

Added some alternative names for the breeds.

In [97]:
def add_pointing_names(row):
    """Add a "pointer" spelling for breeds named "... pointing dog".

    When ``row["breed_en"]`` contains "pointing dog" (case-insensitively), the
    lower-cased name with "pointing dog" replaced by "pointer" is added to the
    ``row["alt_names"]`` set in place. The row is returned either way.
    """
    english_name = row["breed_en"]
    if "pointing dog" not in english_name.casefold():
        return row

    pointer_name = english_name.lower().replace("pointing dog", "pointer")
    row["alt_names"].update([pointer_name])
    return row


# run add_pointing_names on every row; it updates each row's alt_names set
fci_breeds_trans_df = fci_breeds_trans_df.apply(add_pointing_names, axis=1)
# fci_breeds_trans_df

In [98]:
# Hand-curated extra names. Each mask selects rows by a substring of the
# "breed" column; the names are attached to the FIRST row the mask selects.
def _breed_mask(pattern, **contains_kwargs):
    """Boolean mask of the rows whose ``breed`` value contains ``pattern``."""
    return fci_breeds_trans_df["breed"].str.contains(pattern, **contains_kwargs)


# Add in the 4 varieties of Swiss hounds (this mask is case-sensitive)
swiss_mask = _breed_mask("SCHWEIZER LAUFHUND")
other_swiss = [
    "bernese laufhund",
    "jura laufhund",
    "lucerne laufhund",
    "schwyz laufhund",
    "bernese hound",
    "jura hound",
    "lucerne hound",
    "schwyz hound",
]

phalene_mask = _breed_mask("epagneul nain", case=False)
other_phalenes = {"papillon", "phalene"}

belge_mask = _breed_mask("berger belge", case=False)
other_belges = {
    "belgian shepherd",
    "belgian sheepdog",
    "belgian tervuren",
    "malinois",
    "groenendael",
    "laekenois",
    "tervuren",
}

bobtail_mask = _breed_mask("old english sheepdog", case=False)
other_bobtails = {"bobtail"}

kangal_mask = _breed_mask("kangal", case=False)
other_kangals = {"anatolian shepherd dog"}

vizsla_mask = _breed_mask("vizsla", case=False)
other_vizslas = {"wirehaired vizsla"}

sheltie_mask = _breed_mask("shetland", case=False)
other_shelties = {"sheltie"}

pinscher_mask = _breed_mask("zwergpinscher", case=False)
other_pinschers = {"rehpinscher"}

spitz_mask = _breed_mask("deutscher spitz", case=False)
other_spitz = {"keeshond", "pomeranian"}

deutscher_mask = _breed_mask("schäfer", case=False)
other_deutscher = {"schäfer"}

# defined but not attached to any row (no bolonka mask is built)
other_bolonkas = {"bolonka zwetna"}

mask_alt_names_list = [
    (swiss_mask, other_swiss),
    (phalene_mask, other_phalenes),
    (belge_mask, other_belges),
    (bobtail_mask, other_bobtails),
    (kangal_mask, other_kangals),
    (vizsla_mask, other_vizslas),
    (sheltie_mask, other_shelties),
    (pinscher_mask, other_pinschers),
    (spitz_mask, other_spitz),
    (deutscher_mask, other_deutscher),
]

for mask, alts in mask_alt_names_list:
    # index[0] raises IndexError when the mask selects no row
    row_index = fci_breeds_trans_df[mask].index[0]
    current_names = set(fci_breeds_trans_df.at[row_index, "alt_names"])
    fci_breeds_trans_df.at[row_index, "alt_names"] = list(current_names.union(alts))

# de-duplicate every row's names and store them as lists
fci_breeds_trans_df["alt_names"] = (
    fci_breeds_trans_df["alt_names"].transform(set).transform(list)
)

In [99]:
# write one JSON object per line (JSON Lines), one line per breed row
fci_breeds_trans_df.to_json(
    "../data/fci_breeds_trans.json", orient="records", lines=True
)

In [None]:
# fci_breeds_trans_df.to_csv("../data/fci_breeds_trans.csv", index=False)

#### Info about Dog breeds from AKC


In [146]:
akc_dog_breed_groups_url = "https://www.akc.org/public-education/resources/general-tips-information/dog-breeds-sorted-groups/"

# get the html content of the website; the context manager closes the
# connection afterwards
with urlopen(akc_dog_breed_groups_url) as akc_response:
    akc_html_content = akc_response.read()

In [147]:
# build a BeautifulSoup tree from the downloaded page using the lxml parser
akc_soup = BeautifulSoup(akc_html_content, features="lxml")

In [148]:
# every anchor with an href; the ones pointing at "dog-breeds" carry the
# names in their text
elements = akc_soup.find_all("a", href=True)
breed_anchors = [anchor for anchor in elements if "dog-breeds" in anchor.get("href")]

breed_list = [anchor.text.strip() for anchor in breed_anchors]
link_list = [anchor.get("href") for anchor in breed_anchors]

In [149]:
# link text -> href (a repeated link text keeps its last href)
breed_link_dict = dict(zip(breed_list, link_list))

# same mapping with casefolded keys
breed_link_dict_casefolded = dict(
    zip(map(str.casefold, breed_link_dict), breed_link_dict.values())
)
breed_link_dict_casefolded

{'view all breeds': 'https://www.akc.org/dog-breeds/',
 'dog breeds': '/expert-advice/dog-breeds/',
 'list of breeds by group': 'https://www.akc.org/public-education/resources/general-tips-information/dog-breeds-sorted-groups/',
 'herding group': 'https://www.akc.org/dog-breeds/groups/herding/',
 'australian cattle dog': 'https://www.akc.org/dog-breeds/australian-cattle-dog/',
 'australian shepherd': 'https://www.akc.org/dog-breeds/australian-shepherd/',
 'bearded collie': 'https://www.akc.org/dog-breeds/bearded-collie/',
 'beauceron': 'https://www.akc.org/dog-breeds/beauceron/',
 'belgian laekenois': 'https://www.akc.org/dog-breeds/belgian-laekenois/',
 'belgian malinois': 'https://www.akc.org/dog-breeds/belgian-malinois/',
 'belgian sheepdog': 'https://www.akc.org/dog-breeds/belgian-sheepdog/',
 'belgian tervuren': 'https://www.akc.org/dog-breeds/belgian-tervuren/',
 'bergamasco': 'https://www.akc.org/dog-breeds/bergamasco-sheepdog/',
 'berger picard': 'https://www.akc.org/dog-breeds

In [150]:
# Walk the link texts in page order: a text containing "group", "stock" or
# "class" opens a new group, every other text is filed under the group that
# was opened last. Texts seen before the first group are ignored.
group_heading = re.compile(r"group|stock|class")
group_breeds = {}
current_group = None
for breed in breed_link_dict_casefolded:
    if group_heading.search(breed):
        current_group = breed
        group_breeds[current_group] = []
        continue
    if current_group is not None:
        group_breeds[current_group].append(breed)

In [151]:
# long format: one row per (breed group, breed) pair
group_breeds_df = pd.DataFrame(group_breeds.items(), columns=["breed_group", "breed"])

# NOTE(review): the group names are already casefolded, so the upper-case
# " GROUP" below never matches and the suffix stays (e.g. "toy group").
# Left as is because later cells and the saved CSV carry these values.
akc_breeds_df = (
    group_breeds_df.explode("breed")
    .reset_index(drop=True)
    .assign(
        breed_group=lambda frame: frame["breed_group"]
        .str.replace(" GROUP", "")
        .str.lower(),
        breed=lambda frame: frame["breed"].str.lower(),
    )
    # drop the first and the last row
    .iloc[1:-1, :]
)

In [152]:
# look each breed name up in the casefolded name -> href mapping
akc_breeds_df["links"] = akc_breeds_df["breed"].map(breed_link_dict_casefolded)

In [153]:
# persist the AKC breed table without the index column
akc_breeds_df.to_csv("../data/akc_dog_breeds.csv", index=False)

In [None]:
def find_fuzzy_match(breed):
    """Return the index of the first fci row that fuzzily matches ``breed``.

    A row matches when any of its ``alt_names`` has a ``fuzz.ratio`` above 90
    with ``breed`` (both lower-cased). Returns ``np.nan`` when no row matches.
    """
    for row_label, record in fci_breeds_trans_df.iterrows():
        if any(
            fuzz.ratio(breed.lower(), candidate.lower()) > 90
            for candidate in record["alt_names"]
        ):
            return row_label
    return np.nan


def find_fci_index(breed):
    """Find the index of the breed in the fci breeds dataframe.

    An exact, case-insensitive hit in a row's ``alt_names`` wins. Otherwise
    the first fuzzy match from ``find_fuzzy_match`` is used, and ``breed`` is
    appended to that row's ``alt_names``.

    Returns:
        The index label of the matching row, or ``np.nan`` when there is
        neither an exact nor a fuzzy match.
    """
    index = fci_breeds_trans_df[
        fci_breeds_trans_df["alt_names"].apply(
            lambda x: breed.lower() in [name.lower() for name in x]
        )
    ].index
    if len(index) > 0:
        return index[0]

    fuzzy_index = find_fuzzy_match(breed)
    # find_fuzzy_match reports "no match" as np.nan, which is not None;
    # pd.isna covers both, so NaN is never used as a row label below
    if pd.isna(fuzzy_index):
        return np.nan

    # remember the new spelling so the next lookup is an exact hit
    fci_breeds_trans_df.at[fuzzy_index, "alt_names"].append(breed)
    return fuzzy_index

In [None]:
breed_type_match = {}

# group names in list order; the position (starting at 1) becomes the key below
fci_breed_groups = [
    "Sheepdogs and Cattledogs (except Swiss Cattledogs)",
    "Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs",
    "Terriers",
    "Dachshunds",
    "Spitz and primitive types",
    "Scent hounds and related breeds",
    "Pointing Dogs",
    "Retrievers - Flushing Dogs Water Dogs",
    "Companion and Toy Dogs",
    "Sighthounds",
]
fci_breeds = dict(enumerate(fci_breed_groups, start=1))

# distinct AKC group names, in order of first appearance
akc_breed_groups = akc_breeds_df.breed_group.unique().tolist()
fci_breeds

#### Spitz Breed Group

Get info from this story about the spitz breeds.

In [None]:
spitz_story_url = "https://www.akc.org/expert-advice/dog-breeds/spitz-dog-breeds/"

# download the article; the context manager closes the connection afterwards
with urlopen(spitz_story_url) as spitz_response:
    spitz_html_content = spitz_response.read()

In [None]:
spitz_soup = BeautifulSoup(spitz_html_content, "lxml")

# get the breeds listed in the 'tag-set__item-link' class
elements = spitz_soup.find_all("a", class_="tag-set__item-link")
spitz_breeds = [element.text for element in elements]

display(spitz_breeds)

# find these breeds in the akc_breeds_df; its breed column is lower-case,
# so the scraped names are trimmed and lower-cased before comparing
spitz_breed_keys = {name.strip().lower() for name in spitz_breeds}
akc_breeds_df[akc_breeds_df.breed.isin(spitz_breed_keys)]

#### Year breed was recognized by AKC

In [154]:
akc_breed_year_url = "https://www.akc.org/press-center/articles-resources/facts-and-stats/breeds-year-recognized/"

# download the page; the context manager closes the connection afterwards
with urlopen(akc_breed_year_url) as akc_breed_year_response:
    akc_breed_year_html_content = akc_breed_year_response.read()

In [155]:
# this page is parsed with Python's built-in "html.parser"
akc_breed_year_soup = BeautifulSoup(
    akc_breed_year_html_content, features="html.parser"
)

In [156]:
# lxml tree of the same page, for the XPath query below
akc_breed_year_parsed_html = etree.HTML(akc_breed_year_html_content)

# elements that have a text node containing a non-breaking space
akc_breed_year_elements = akc_breed_year_parsed_html.xpath(
    "//*[contains(text(), '\u00A0')]"
)
[node.text.strip() for node in akc_breed_year_elements]

# the div with class 'content-body__text-long'
akc_breed_year_element = akc_breed_year_soup.find(
    "div", class_="content-body__text-long"
)

# text of each direct child of that div: blank entries dropped, non-breaking
# spaces removed, and the first two remaining entries skipped
child_texts = (child.text.strip() for child in akc_breed_year_element)
year_breed_list = [text.replace("\xa0", "") for text in child_texts if text][2:]

In [157]:
# every entry looks like "<year> – <breed>"; split only at the first en dash
# so a dash inside a breed name cannot produce a third column
year_breed_df = pd.DataFrame(
    [yb.split("–", 1) for yb in year_breed_list], columns=["year", "breed"]
)
for col in year_breed_df.columns:
    year_breed_df[col] = year_breed_df[col].str.strip()
year_breed_df["year"] = year_breed_df["year"].astype(int)
year_breed_df["breed"] = year_breed_df["breed"].str.lower()

# make breed the first column
year_breed_df = year_breed_df[["breed", "year"]]
year_breed_df

Unnamed: 0,breed,year
0,pointer,1878
1,retriever (chesapeake bay),1878
2,spaniel (clumber),1878
3,spaniel (cocker),1878
4,setter (english),1878
...,...,...
195,belgian laekenois,2020
196,biewer terrier,2021
197,bracco italiano,2022
198,mudi,2022


In [None]:
# akc_breeds_df

In [158]:
# work on a copy so the matching columns added below do not end up in
# akc_breeds_df
new_akc_df = akc_breeds_df.copy()
new_akc_df.sample(3)

Unnamed: 0,breed_group,breed,links
82,toy group,poodle (toy),https://www.akc.org/dog-breeds/poodle/
261,foundation stock service,romanian mioritic shepherd dog,https://www.akc.org/dog-breeds/romanian-miorit...
1,herding group,australian cattle dog,https://www.akc.org/dog-breeds/australian-catt...


In [159]:
# for every AKC breed, find the closest breed name in the year table together
# with its score
match_results = new_akc_df["breed"].apply(
    lambda breed_name: match_breed_name(breed_name, year_breed_df["breed"])
)
new_akc_df["closest_match"], new_akc_df["match_score"] = zip(*match_results)

# breeds of the year table that no AKC breed was matched to
missed_breeds = set(year_breed_df["breed"].unique()) - set(
    new_akc_df["closest_match"].unique()
)

# fix the one that was missed
is_plott = new_akc_df["breed"].str.contains("plott")
new_akc_df.loc[is_plott, "closest_match"] = "plott"

In [160]:
pd.set_option("display.max_rows", 300)

# blank out the match for breeds of these two groups, so the merge on
# closest_match finds no year for them
excluded_groups_mask = new_akc_df["breed_group"].str.contains(
    r"miscellaneous class|foundation stock service"
)
new_akc_df.loc[excluded_groups_mask, "closest_match"] = ""

In [161]:
# left-join the recognition year onto the AKC breeds via closest_match, then
# drop the helper columns and restore the "breed" column name
with_year = new_akc_df.merge(
    year_breed_df, left_on="closest_match", right_on="breed", how="left"
)
merged_df = with_year.drop(
    columns=["breed_y", "match_score", "closest_match"]
).rename(columns={"breed_x": "breed"})

In [162]:
# spot check: show the merged rows whose breed contains "bernese"
merged_df[merged_df["breed"].str.contains("bernese", case=False)]

Unnamed: 0,breed_group,breed,links,year
174,working group,bernese mountain dog,https://www.akc.org/dog-breeds/bernese-mountai...,1937.0


In [163]:
# save the merged table (now with the year column) without the index column
merged_df.to_csv("../data/akc_dog_breeds.csv", index=False)
# merged_df.sort_values("breed")

#### AKC physical traits

In [139]:
# breed page links to visit, one per row of akc_breeds_df
akc_links = akc_breeds_df["links"].tolist()

In [140]:
def get_breed_info(driver, link):
    """Scrape name, temperament and size figures from one AKC breed page.

    Args:
        driver: Selenium WebDriver used to load the page.
        link: URL of the breed page.

    Returns:
        A ``defaultdict(str)`` that may hold the keys "breed", "temperment",
        "height", "weight" and "life_expectancy". A key is absent when the
        page did not provide the value; a missing element or a page that
        fails to load in time is reported with ``print`` instead of raising.
    """
    # local import: TimeoutException is not part of the notebook's import cell
    from selenium.common.exceptions import TimeoutException

    breed_metadata = defaultdict(str)
    driver.get(link)

    try:
        breed = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.page-header__title"))
        )
        breed_metadata["breed"] = breed.text

        # temperament (the page's CSS class spells it "temperment")
        temperment = driver.find_element(
            By.CSS_SELECTOR, "p.breed-page__intro__temperment"
        )
        breed_metadata["temperment"] = temperment.text

        # height, weight, life expectancy
        elements = driver.find_elements(
            By.CSS_SELECTOR, "div.breed-page__hero__overview__icon-block"
        )
        for ele in elements:
            text = ele.find_element(By.TAG_NAME, "p").text
            height_match = re.search(
                r"(\d*\.?\d+)(?:\s*-\s*(\d*\.?\d+))?\s*inches?", text
            )
            weight_match = re.search(r"(\d+)(?:\s*-\s*(\d+))?\s*pounds?", text)
            life_expectancy_match = re.search(r"(\d+)(?:\s*-\s*(\d+))?\s*years?", text)

            # each block contributes one value; height is checked first
            if height_match:
                breed_metadata["height"] = height_match.group(0)
            elif weight_match:
                breed_metadata["weight"] = weight_match.group(0)
            elif life_expectancy_match:
                breed_metadata["life_expectancy"] = life_expectancy_match.group(0)

    except NoSuchElementException as e:
        print("No such element", e)
    except TimeoutException as e:
        # WebDriverWait.until raises this when the header never appears; keep
        # going so one slow page does not abort the whole scrape
        print("Timed out waiting for", link, e)

    return breed_metadata

In [141]:
my_d = start_driver()
breed_data_driver = partial(get_breed_info, my_d)

# visit every breed page with the same browser; the finally clause closes the
# browser even if one of the pages raises
list_of_dicts = []
try:
    for link in tqdm(akc_links):
        list_of_dicts.append(breed_data_driver(link))
finally:
    my_d.quit()

100%|██████████| 281/281 [10:39<00:00,  2.28s/it]


In [164]:
# scraped traits, with the breed name lower-cased to line up with merged_df
akc_physical_traits = pd.DataFrame(list_of_dicts).assign(
    breed=lambda frame: frame["breed"].str.lower()
)

# merged_df is rebound to the merge result, so this cell is meant to run once
merged_df = merged_df.merge(akc_physical_traits, on="breed", how="left")

In [165]:
# spot check: the "bernese" rows now also carry the scraped trait columns
merged_df[merged_df["breed"].str.contains("bernese", case=False)]

Unnamed: 0,breed_group,breed,links,year,temperment,height,weight,life_expectancy
176,working group,bernese mountain dog,https://www.akc.org/dog-breeds/bernese-mountai...,1937.0,good-natured / calm / strong,25-27.5 inches,80-115 pounds,7-10 years


In [166]:
# save the table again, now including the physical traits
merged_df.to_csv("../data/akc_dog_breeds.csv", index=False)
# merged_df.merge(akc_physical_traits, on="breed", how="left")

#### AKC temperament grouping

In [120]:
df = pd.DataFrame()
df = pd.read_csv("../data/akc_dog_breeds.csv")

df["temperment"].fillna("").str.lower()
# .str.split(" / |, | and | but ", expand=True).stack().value_counts()


# Step 1: Preprocess the text
df["temperment_words"] = (
    df["temperment"].fillna("").str.lower().str.split(" / |, | and | but ", regex=True)
)

# Step 2: Create a dictionary and a corpus
dictionary = corpora.Dictionary(df["temperment_words"])
# save the dictionary
dictionary.save("../data/temperment_dict.dict")
bow_corpus = [dictionary.doc2bow(text) for text in df["temperment_words"]]

# Step 3: Train an LDA model
lda_model = models.LdaModel(
    bow_corpus, num_topics=6, id2word=dictionary, passes=10
)  # choose the number of topics


# print the top words for each topic
for topic in lda_model.print_topics():
    print(topic)


# Step 4: Calculate the dominant topic for each dog
def dominant_topic(words, model=lda_model, dictionary=dictionary):
    """Return the id of the most probable topic for one tokenised document.

    Parameters
    ----------
    words : list of str
        Tokens of a single document.
    model : gensim LDA model, optional
        Model used to infer the topic distribution. Defaults to the
        ``lda_model`` object that exists when this function is defined.
    dictionary : gensim Dictionary, optional
        Dictionary used to turn ``words`` into a bag of words. Defaults to the
        ``dictionary`` object that exists when this function is defined.

    Returns
    -------
    int
        Id of the topic with the highest probability; when several topics tie,
        the first one in the returned list wins.
    """
    bow = dictionary.doc2bow(words)
    # Ask for every topic instead of relying on the model's minimum-probability
    # filter: if all topics fell below that threshold the list would be empty
    # and max() would raise ValueError. The winning topic is unaffected because
    # only lower-probability entries are added.
    topics = model.get_document_topics(bow, minimum_probability=0.0)
    return max(topics, key=lambda x: x[1])[0]


# Label every breed with the id of its most probable topic.
df["dominant_topic"] = df["temperment_words"].map(dominant_topic)

# Step 5: Group the dogs based on their dominant topics
df_grouped = df.groupby(by="dominant_topic")

(0, '0.185*"intelligent" + 0.102*"loyal" + 0.085*"alert" + 0.035*"devoted" + 0.032*"affectionate" + 0.031*"friendly" + 0.028*"active" + 0.027*"independent" + 0.025*"courageous" + 0.024*"smart"')
(1, '0.146*"loyal" + 0.122*"affectionate" + 0.061*"dignified" + 0.045*"calm" + 0.039*"enthusiastic" + 0.038*"family-oriented" + 0.038*"brave" + 0.037*"smart" + 0.031*"good-natured" + 0.031*"independent"')
(2, '0.110*"smart" + 0.082*"confident" + 0.072*"alert" + 0.061*"fearless" + 0.043*"energetic" + 0.042*"independent" + 0.041*"affectionate" + 0.037*"sociable" + 0.031*"patient" + 0.031*"curious"')
(3, '0.125*"" + 0.120*"courageous" + 0.057*"alert" + 0.047*"friendly" + 0.042*"determined" + 0.040*"lively" + 0.034*"good-natured" + 0.034*"dignified" + 0.034*"athletic" + 0.033*"calm"')
(4, '0.129*"friendly" + 0.078*"charming" + 0.053*"playful" + 0.041*"gentle" + 0.036*"noble" + 0.035*"outgoing" + 0.030*"inquisitive" + 0.029*"bright" + 0.028*"affectionate" + 0.028*"bold"')
(5, '0.149*"friendly" + 0.0

In [None]:
# df.to_csv("../data/akc_dog_breeds.csv", index=False)

# df.sort_values("dominant_topic")
# df_grouped["dominant_topic"].value_counts()

In [None]:
# Prepare the visualization data
# vis_data = gensimvis.prepare(lda_model, bow_corpus, dictionary)

# Display the visualization
# pyLDAvis.display(vis_data)

Find the best num_topics for the LDA model based on coherence score

In [None]:
# Baseline: c_v coherence of the LDA model trained above.
n_topics = lda_model.num_topics
coherence_model_lda = models.CoherenceModel(
    coherence="c_v",
    dictionary=dictionary,
    texts=df["temperment_words"],
    model=lda_model,
)
coherence_lda = coherence_model_lda.get_coherence()

print("\nNum Topics =", n_topics)
print("\nCoherence Score: ", coherence_lda)


def compute_coherence_values(
    dictionary, corpus, texts, limit, start=2, step=3, passes=150, random_state=None
):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every value in ``range(start, limit, step)``
    and scored with a c_v coherence model.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive, as in ``range``)
    start : Smallest number of topics to try
    step : Increment between consecutive numbers of topics
    passes : Number of training passes for every model (default 150)
    random_state : Seed forwarded to every LDA model; the default ``None``
        leaves the models unseeded

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        model = models.LdaMulticore(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=passes,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = models.CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence="c_v"
        )
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=df["temperment_words"],
    limit=14,
    start=2,
    step=2,
)

# Keep the model whose coherence score is highest.
coherence_values = np.asarray(coherence_values)
max_coherence_idx = coherence_values.argmax()
best_model = model_list[max_coherence_idx]

print(f"Max coherence: {coherence_values[max_coherence_idx]:.3f}")
print(f"Num topic: {best_model.num_topics}")

# Re-label every breed with the dominant topic of the selected model.
df["dominant_topic"] = df["temperment_words"].apply(
    lambda words: dominant_topic(words, model=best_model, dictionary=dictionary)
)

In [None]:
# Prepare the visualization data for the coherence-selected model
vis_data = gensimvis.prepare(best_model, bow_corpus, dictionary)

# Display the visualization inline in the notebook
pyLDAvis.display(vis_data)


In this analysis, we are planning to explore the potential benefits of combining two powerful natural language processing techniques: Word2Vec and Latent Dirichlet Allocation (LDA). Word2Vec is a neural network-based technique for creating word embeddings, which are vector representations of words that capture their meanings. LDA, on the other hand, is a probabilistic model used for topic modeling, which allows us to discover abstract topics within our text data.

Our approach will involve first applying Word2Vec to our corpus to generate word embeddings. These embeddings capture the semantic relationships between words, including synonyms, antonyms, and more. After generating these embeddings, we will then feed them into an LDA model. The goal is to see if the additional semantic information provided by the Word2Vec embeddings can improve the quality of the topics discovered by the LDA model. This could potentially lead to more meaningful and interpretable topics, enhancing our understanding of the underlying themes in our text data. Note that the cells below implement a variation of this plan: the embeddings are trained with gensim's `FastText` (a subword-aware relative of Word2Vec) and are grouped with k-means clustering rather than being passed to the LDA model.

In [None]:
# Infer the topic mixture of a hand-written example document with the
# coherence-selected model.
# NOTE(review): "sophisicated" looks like a typo for "sophisticated" - confirm
# whether the misspelling is intentional before relying on this example.
my_doc = ["canny", "sophisicated", "friendly", "alert", "loyal", "independent"]
vec_bow = dictionary.doc2bow(my_doc)
best_model.get_document_topics(vec_bow)

In [None]:

from nltk.corpus import wordnet

In [None]:
# Normalise the temperament phrases into single tokens: hyphens become
# underscores, a few filler words are removed, and whatever remains is split
# on whitespace. Empty phrases disappear because split() yields nothing.
filler_words = re.compile(r"\b(very|also|to|in|please|with|of)\b")
temperment_words = {
    token
    for phrase in it.chain.from_iterable(df["temperment_words"])
    for token in filler_words.sub("", phrase.replace("-", "_")).split()
}

# Collect the WordNet lemma names of every token.
synonyms = {}
for word in temperment_words:
    lemma_names = {
        lemma.name() for synset in wordnet.synsets(word) for lemma in synset.lemmas()
    }
    # A token without any WordNet entry is kept as its own only synonym.
    synonyms[word] = lemma_names or {word}


# Inverted index: lemma name -> temperament tokens that list it as a synonym,
# so tokens sharing a lemma end up in the same group.
synonyms_groups = defaultdict(set)
for word, syms in synonyms.items():
    for sym in syms:
        synonyms_groups[sym].add(word)

# Spot-check one entry; .get() returns None instead of raising KeyError when
# "happy" is not among the temperament tokens.
synonyms.get("happy")

In [None]:
from sklearn.cluster import KMeans

# Train the FastText model on the corpus
# workers=1: gensim documents that a seeded model is only reproducible when
# training runs on a single worker thread, so the seed below would otherwise
# not pin the vectors.
ft_model = models.FastText(
    df["temperment_words"], min_count=1, workers=1, max_n=8, seed=628,
)

# vectorize the words in the corpus
word_vectors = ft_model.wv

# One vector per word occurrence (a word used by several breeds is repeated).
vectorized_words = [word_vectors[word] for word in df["temperment_words"].explode()]

# One vector per breed: the mean of the vectors of its words.
vectorized_lists = [
    np.mean([word_vectors[word] for word in words], axis=0)
    for words in df["temperment_words"]
]

# Both clusterings share the same settings and seed.
kmeans_params = dict(n_clusters=10, n_init="auto", init="k-means++", random_state=628)

# fit a kmeans clustering model on the vectorized words
kmeans_model = KMeans(**kmeans_params).fit(vectorized_words)

# fit a second kmeans model on the per-breed mean vectors
kmeans_list_model = KMeans(**kmeans_params).fit(vectorized_lists)

# "cluster": for each breed, the word-level cluster id of each of its words.
df["cluster"] = df["temperment_words"].apply(
    lambda words: kmeans_model.predict([word_vectors[word] for word in words])
)
# "list_cluster": one cluster id per breed, from the mean-vector model.
df["list_cluster"] = kmeans_list_model.labels_

In [None]:
def reduce_dimensions(model):
    """Reduce the dimensionality of the word vectors to 2D.

    Parameters
    ----------
    model : object exposing ``wv.vectors`` and ``wv.index_to_key``
        Trained embedding model whose vectors are projected with t-SNE.

    Returns
    -------
    x_vals : list
        First t-SNE coordinate of every vector.
    y_vals : list
        Second t-SNE coordinate of every vector.
    labels : numpy.ndarray
        Entries of ``wv.index_to_key``, in the same order as the coordinates.
    """
    from sklearn.manifold import TSNE

    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # Fixed random_state so the projection is the same on every run.
    tsne = TSNE(n_components=2, random_state=628)
    vectors_2d = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors_2d]
    y_vals = [v[1] for v in vectors_2d]
    return x_vals, y_vals, labels


x, y, labels = reduce_dimensions(ft_model)
import plotly.express as px

# Only the first half of the vocabulary is drawn with a text label; the
# remaining points get None and are shown as unlabelled markers.
half_length = len(labels) // 2
half_labels = labels[:half_length]
unlabelled_count = len(labels) - half_length
new_labels = [*half_labels, *([None] * unlabelled_count)]

fig = px.scatter(x=x, y=y, text=new_labels)
fig.update_traces(textposition="top center")
fig.update_layout(
    title_text="Temperment Words",
    title_x=0.5,
    height=800,
    font={"size": 12},
    hoverlabel={"font_size": 12},
)
fig.show()

In [None]:
# Number of breeds in each AKC breed group.
df["breed_group"].value_counts()
# df.head(20)

#### Match AKC breeds to FCI breeds


In [None]:
# FCI breeds (one JSON record per line) and the AKC table saved earlier.
fci_df = pd.read_json("../data/fci_breeds_trans.json", lines=True)
akc_df = pd.read_csv("../data/akc_dog_breeds.csv")


# Corpus of texts: every alternative name of every FCI breed, flattened into
# a single list.
texts = [name for names in fci_df["alt_names"].values for name in names]

Using `CountVectorizer` to match AKC breeds to FCI breeds

In [None]:
# Flatten every FCI alternative name into one list of candidate texts.
texts = [name for names in fci_df["alt_names"].values for name in names]

# Learn a unigram + bigram vocabulary from the FCI names and encode them.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(texts)

# Encode the AKC breed names with the same vocabulary.
breed_vector = vectorizer.transform(akc_df["breed"])

# Similarity of every FCI name (rows) to every AKC breed (columns).
cosine_sim_matrix = cosine_similarity(X, breed_vector)

# For each AKC breed keep the best-scoring FCI name, its row index and score.
akc_df["max_cos_idx"] = cosine_sim_matrix.argmax(axis=0)
akc_df["max_cos"] = cosine_sim_matrix.max(axis=0)
akc_df["matches"] = akc_df["max_cos_idx"].map(lambda idx: texts[idx])
akc_df[["breed", "matches"]].head(50)

In [None]:
# FCI standard name (``breed_en``) -> list of its alternative names.
standard_dict = dict(zip(fci_df["breed_en"], fci_df["alt_names"]))

# Reverse index built once: alternative name -> first standard that lists it.
# setdefault keeps the first standard when a name appears under several, which
# is what scanning standard_dict in order for every row would select.
name_to_standard = {}
for standard_name, alt_names in standard_dict.items():
    for alt_name in alt_names:
        name_to_standard.setdefault(alt_name, standard_name)

# A matched name missing from the index yields NaN instead of raising.
akc_df["standard"] = akc_df["matches"].map(name_to_standard)

# Show the weakest matches first so they can be reviewed by eye.
akc_df.sort_values("max_cos").head(20)[["breed", "matches", "standard", "max_cos"]]

Using `thefuzz` to match AKC breeds to FCI breeds

In [100]:
fci_breeds_trans_df = pd.read_json("../data/fci_breeds_trans.json", lines=True)
akc_breeds_df = pd.read_csv("../data/akc_dog_breeds.csv")

# FCI standard name (``breed_en``) -> list of its alternative names.
# NOTE(review): these lists look like the same objects stored in
# fci_breeds_trans_df["alt_names"], so the appends at the end of this cell are
# presumably visible in the DataFrame as well - confirm.
standard_dict = dict(
    zip(fci_breeds_trans_df["breed_en"], fci_breeds_trans_df["alt_names"])
)

# Every alternative name of every FCI breed, flattened: the candidate pool for
# fuzzy matching.
all_fci_names = list(it.chain.from_iterable(fci_breeds_trans_df["alt_names"]))

# Reverse index built once: alternative name -> first standard that lists it.
name_to_standard = {}
for standard_name, alt_names in standard_dict.items():
    for alt_name in alt_names:
        name_to_standard.setdefault(alt_name, standard_name)

# find the closest match for each breed in the akc breeds dataframe
akc_breeds_df["closest_match"], akc_breeds_df["match_score"] = zip(
    *akc_breeds_df["breed"].apply(
        lambda x: match_breed_name(x, all_fci_names, scorer=fuzz.token_set_ratio)
    )
)

# if the match score is greater than 80,
# then put the key from the standard_dict in the standard column for that row;
# all other rows are left as NaN.
MIN_MATCH_SCORE = 80
is_confident = akc_breeds_df["match_score"] > MIN_MATCH_SCORE
akc_breeds_df["standard"] = akc_breeds_df.loc[is_confident, "closest_match"].map(
    name_to_standard
)


akc_matches = akc_breeds_df[akc_breeds_df["standard"].notnull()]

# Register every matched AKC breed name as an alternative name of its standard.
for standard_name, breed_name in zip(akc_matches["standard"], akc_matches["breed"]):
    standard_dict[standard_name].append(breed_name)

In [101]:
# Second pass over the breeds that still have no standard: retry them with
# the fuzz.ratio scorer and accept only scores above 90.
unmatched_akc = akc_breeds_df.loc[akc_breeds_df["standard"].isna(), "breed"].tolist()
for breed in unmatched_akc:
    matched_name, score = match_breed_name(breed, all_fci_names, scorer=fuzz.ratio)
    if not score > 90:
        continue

    # First standard whose name list contains the matched FCI name.
    std_name = [
        key for key, value in standard_dict.items() if matched_name in value
    ][0]
    standard_dict[std_name].append(breed)

    # Record the standard and the matched name on the breed's row(s); the
    # match_score column is left as computed in the first pass.
    is_breed = akc_breeds_df["breed"] == breed
    akc_breeds_df.loc[is_breed, "standard"] = std_name
    akc_breeds_df.loc[is_breed, "closest_match"] = matched_name

In [102]:
unmatched_akc = akc_breeds_df.loc[akc_breeds_df["standard"].isna(), "breed"].tolist()

# Every FCI standard keeps its names as a de-duplicated set; each AKC breed
# that is still unmatched becomes a standard of its own.
breed_standards = {k: set(v) for k, v in standard_dict.items()}
for breed in unmatched_akc:
    breed_standards[breed] = {breed}

# setdefault creates the entry when it does not exist yet, so this line cannot
# raise KeyError on the plain dict.
# NOTE(review): `other_bolonkas` is not defined in this part of the notebook -
# confirm it is created by an earlier cell.
breed_standards.setdefault("russian tsvetnaya bolonka", set()).update(other_bolonkas)

In [103]:
# Inspect the final mapping of standard name -> set of known names.
breed_standards

{'AFFENPINSCHER': {'affenpinscher'},
 'AFGHAN HOUND': {'afghan hound',
  'afghanischer windhund',
  'lebrel afgano',
  'levrier afghan'},
 'ATLAS MOUNTAIN DOG (AIDI)': {"aidi (chien de montagne de l'atlas)",
  'atlas mountain dog (aidi)',
  'atlas-berghund (aidi)',
  'atlas-berghund (aïdi)',
  "aïdi (chien de montagne de l'atlas)",
  "chien de montagne de l'atlas (aidi)",
  "chien de montagne de l'atlas (aïdi)",
  'perro de montana del atlas (aidi)',
  'perro de montaña del atlas (aïdi)'},
 'AIREDALE TERRIER': {'airedale terrier'},
 'AKITA': {'akita'},
 'ALASKAN MALAMUTE': {'alaskan malamute',
  'malamute de alaska',
  "malamute de l'alaska"},
 'ALPINE DACHSBRACKE': {'alpenlandische dachsbracke',
  'alpenländische dachsbracke',
  'alpine dachsbracke',
  'basset des alpes',
  'dachsbracke de los alpes'},
 'AMERICAN AKITA': {'akita americain',
  'akita americano',
  'american akita',
  'amerikanischer akita'},
 'AMERICAN COCKER SPANIEL': {'american cocker spaniel',
  'amerikanischer cock

In [104]:
# save the breed_standards dictionary to disk as pickle file
# (binary mode is required by pickle; the file is closed by the with-block)
with open("../data/breed_standards.pkl", "wb") as f:
    pickle.dump(breed_standards, f)

In [105]:
# Read the mapping back from the pickle written by the previous cell.
# Only unpickle files produced by this notebook: loading a pickle from an
# untrusted source can execute arbitrary code.
with open("../data/breed_standards.pkl", "rb") as f:
    breed_standards = pickle.load(f)

breed_standards

{'AFFENPINSCHER': {'affenpinscher'},
 'AFGHAN HOUND': {'afghan hound',
  'afghanischer windhund',
  'lebrel afgano',
  'levrier afghan'},
 'ATLAS MOUNTAIN DOG (AIDI)': {"aidi (chien de montagne de l'atlas)",
  'atlas mountain dog (aidi)',
  'atlas-berghund (aidi)',
  'atlas-berghund (aïdi)',
  "aïdi (chien de montagne de l'atlas)",
  "chien de montagne de l'atlas (aidi)",
  "chien de montagne de l'atlas (aïdi)",
  'perro de montana del atlas (aidi)',
  'perro de montaña del atlas (aïdi)'},
 'AIREDALE TERRIER': {'airedale terrier'},
 'AKITA': {'akita'},
 'ALASKAN MALAMUTE': {'alaskan malamute',
  'malamute de alaska',
  "malamute de l'alaska"},
 'ALPINE DACHSBRACKE': {'alpenlandische dachsbracke',
  'alpenländische dachsbracke',
  'alpine dachsbracke',
  'basset des alpes',
  'dachsbracke de los alpes'},
 'AMERICAN AKITA': {'akita americain',
  'akita americano',
  'american akita',
  'amerikanischer akita'},
 'AMERICAN COCKER SPANIEL': {'american cocker spaniel',
  'amerikanischer cock

In [106]:
def find_query(query, breed_standards=breed_standards):
    """Print every standard that has at least one name matching ``query``.

    ``query`` is compiled as a case-insensitive regular expression and searched
    within each name of every standard. For each hit the standard's key is
    printed, followed by its full collection of names. When nothing matches,
    "No results found." is printed instead. Nothing is returned.

    ``breed_standards`` defaults to the mapping that exists when this function
    is defined.
    """
    matcher = re.compile(query, re.IGNORECASE)

    # Collect all hits first so that output only starts once the scan is done.
    hits = []
    for standard, names in breed_standards.items():
        if any(matcher.search(name) for name in names):
            hits.append(standard)

    if not hits:
        print("No results found.")
        return

    for standard in hits:
        print(standard)
        print(breed_standards[standard])


# find_query("swiss")

In [117]:
# Example lookup: list every standard with a name containing "bernese".
find_query("bernese")
# fci_df[fci_df["breed"].str.contains("pinscher", case=False)]

BERNESE MOUNTAIN DOG
{'bernese mountain dog', 'berner sennenhund', 'bouvier bernois', 'boyero de montana bernes'}
SWISS HOUND
{'chien courant suisse', 'schwyz hound', 'jura laufhund', 'bernese hound', 'swiss hound', 'lucerne hound', 'schwyz laufhund', 'schweizer laufhund', 'schweizer laufhund - chien courant suisse', 'jura hound', 'bernese laufhund', 'sabueso suizo', 'lucerne laufhund'}
