In [None]:
# Standard library imports
import itertools as it
import json
import re
from collections import defaultdict
from functools import partial
from pathlib import Path
from typing import Optional
from urllib.request import urlopen
import unicodedata

# Third-party imports
from bs4 import BeautifulSoup
from fiona.io import ZipMemoryFile
from lxml import etree
import lxml
from matplotlib import pyplot as plt
import geopandas as gpd
import geoviews as gv
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from thefuzz import fuzz
from thefuzz import process
from tqdm import tqdm

# Local application imports
from translate_app import translate_list_to_dict

In [101]:
# Disable pandas' chained-assignment warning (the default setting is 'warn').
# This is a global option, so the warning stays silenced for every later
# cell in the notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Display up to 50 columns when a DataFrame is shown.
pd.options.display.max_columns = 50

In [91]:
def convert_to_snake_case(item):
    """Convert a camelCase, spaced or hyphenated label to lower snake_case.

    The substitutions below are applied one after another, in order, and the
    final string is lower-cased. Underscores are never collapsed, so an input
    such as "Hello World" yields "hello__world".
    """
    substitutions = (
        # underscore before a capitalised word (upper-case letter + lower-case run)
        (r"(.)([A-Z][a-z]+)", r"\1_\2"),
        # underscore between a lower-case letter or digit and an upper-case letter
        (r"([a-z0-9])([A-Z])", r"\1_\2"),
        # underscore between a letter and a digit
        (r"([a-zA-Z])([0-9])", r"\1_\2"),
        # hyphens and whitespace become underscores
        (r"[-\s]", "_"),
    )
    result = item
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.lower()

In [3]:
def start_driver():
    """Create and return a Chrome WebDriver started with the --headless flag."""
    headless_options = Options()
    headless_options.add_argument("--headless")
    return webdriver.Chrome(options=headless_options)

In [4]:
def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is first decomposed with Unicode NFKD normalisation, so an
    accented letter becomes its base letter followed by a combining mark.
    Every character that cannot be encoded as ASCII is then dropped: the
    combining marks, but also any other non-ASCII character.
    """
    decomposed = unicodedata.normalize("NFKD", input_str)
    ascii_bytes = decomposed.encode("ASCII", "ignore")
    return ascii_bytes.decode()


remove_accents("résuméö")

'resumeo'

In [None]:
def sanitize_df_column_names(df):
    """Translate the column names of ``df`` and convert them to snake case.

    The frame is modified in place and is also returned, so the call can be
    used either as a statement or in an expression.
    """
    # Ask translate_list_to_dict for a mapping of the current column labels.
    translations = translate_list_to_dict(df.columns.tolist())
    # Apply the mapping to the columns of the frame itself.
    df.rename(columns=translations, inplace=True)
    # Normalise every resulting label with convert_to_snake_case.
    df.columns = list(map(convert_to_snake_case, df.columns))
    return df


def rename_keys(d, prefix="zurich_gdf_"):
    """Return a new dict keyed by ``prefix`` plus each item's position.

    The original keys are discarded; values keep the iteration order of ``d``
    and are numbered from 0.
    """
    renamed = {}
    for position, value in enumerate(d.values()):
        renamed[f"{prefix}{position}"] = value
    return renamed

In [5]:
# define a function to match breed names using FuzzyWuzzy
def match_breed_name(name, choices, scorer=fuzz.token_sort_ratio):
    """Return the entry of ``choices`` that best matches ``name`` and its score.

    A ``name`` that is already contained in ``choices`` is returned unchanged
    with a score of 100; otherwise the best candidate found by
    ``process.extractOne`` with the given ``scorer`` is returned.
    """
    if name not in choices:
        # extractOne may return more than two fields; only the first two are used.
        best_match, best_score, *_ = process.extractOne(name, choices, scorer=scorer)
        return best_match, best_score
    return name, 100

#### Info about Zurich districts

In [6]:
# URL of the page describing Zurich's districts
zurich_districts_url = "https://www.zuerich.com/en/visit/about-zurich/zurichs-districts"

# Download the raw HTML. The context manager closes the HTTP response once
# the body has been read (the response was previously left open).
with urlopen(zurich_districts_url) as zurich_response:
    zurich_html_content = zurich_response.read()

In [7]:
# Parse the downloaded HTML with BeautifulSoup, using the lxml parser
zurich_soup = BeautifulSoup(zurich_html_content, "lxml")

In [8]:
# Select the elements whose id looks like 's-<n>' with n between 1 and 12
# (one element per district).
# NOTE(review): the pattern is not anchored. If BeautifulSoup matches ids with
# regex-search semantics, ids such as 's-13' or 'news-1' would match as well,
# and the second alternative is redundant — confirm against the page markup.
pattern = re.compile(r"s-[1-9]|s-1[0-2]")
elements = zurich_soup.find_all(id=pattern)

In [9]:
# Build a {heading text: first paragraph text} mapping from the district
# elements and turn it into a one-column frame ("desc") indexed by heading.
districts = {element.find("h2").text: element.find(
    "p").text for element in elements}
districts_df = pd.DataFrame.from_dict(
    districts, orient="index", columns=["desc"])


# Move the heading from the index into a column and split it on the '–'
# separator (not an ASCII hyphen) into district number and district name.
districts_df = districts_df.reset_index()
districts_df = (
    districts_df["index"]
    .str.split("–", expand=True)
    .rename({0: "district_number", 1: "district_name"}, axis=1)
    .join(districts_df)
    .drop("index", axis=1)
)
# Strip surrounding whitespace from the district number text
districts_df["district_number"] = districts_df["district_number"].str.strip()
# Regex capturing the first run of digits in the district_number column.
# Note: this rebinds the module-level name `regex_pattern`, which later cells reuse.
regex_pattern = re.compile(r"([\d]+)")

# Create a categorical "district" column holding just the number
districts_df["district"] = (
    districts_df["district_number"]
    .str.extract(
        regex_pattern,
    )
    .astype("category")
)
# The textual district_number column is no longer needed
districts_df.drop("district_number", axis=1, inplace=True)

# Strip surrounding whitespace from the remaining text columns
districts_df["district_name"] = districts_df["district_name"].str.strip()
districts_df["desc"] = districts_df["desc"].str.strip()

# Add a column with the character length of the description
districts_df["desc_length"] = districts_df["desc"].str.len()

print(districts_df)

                      district_name  \
0             Old Town, City Center   
1            Left Shore of the Lake   
2      Creative Quarter of Wiedikon   
3   Creative Quarter of Langstrasse   
4   Creative Quarter of Zürich-West   
5                University Quarter   
6                 On the Zürichberg   
7           Right Shore of the Lake   
8      At the Foot of the Uetliberg   
9               Right of the Limmat   
10                      Zürich Nord   
11                   Schwamendingen   

                                                 desc district  desc_length  
0   The most central district encompasses the hist...        1          355  
1   The architecturally attractive Enge Train Stat...        2          206  
2   Once mainly a working-class neighborhood, the ...        3          215  
3   The district around Langstrasse was long regar...        4          199  
4   In the quarter where huge machines once clatte...        5          287  
5   University buildings

In [10]:
# Summarise the columns, non-null counts and dtypes of the districts frame
districts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   district_name  12 non-null     object  
 1   desc           12 non-null     object  
 2   district       12 non-null     category
 3   desc_length    12 non-null     int64   
dtypes: category(1), int64(1), object(2)
memory usage: 824.0+ bytes


In [11]:
# Create a Styler that applies the CSS property `white-space: pre-wrap` to
# the cells, so the long descriptions wrap when the frame is displayed.
styler = districts_df.style.set_properties(**{"white-space": "pre-wrap"})

# Show the description text unchanged. The column is called "desc"; the
# previous key "description" did not match any column of districts_df.
formatted_df = styler.format({"desc": lambda x: x})
formatted_df

Unnamed: 0,district_name,desc,district,desc_length
0,"Old Town, City Center","The most central district encompasses the historical Old Town on both banks of the River Limmat, as well as the area to the south bordering on the lake basin. It is home to the prestigious Bahnhofstrasse, magnificent guild houses, imposing churches, and the famous Opera House. The best view of District 1 is to be had from the top of the Karlsturm tower.",1,355
1,Left Shore of the Lake,"The architecturally attractive Enge Train Station built out of Ticino granite, the Museum Rietberg with its beautiful park, the Seebad Enge lido, and the Rote Fabrik give this district a Mediterranean feel.",2,206
2,Creative Quarter of Wiedikon,"Once mainly a working-class neighborhood, the district below the Uetliberg is now a popular residential area with cozy cafés, boutiques, and vintage stores. The Houdini movie theater also lies on Wiedikon territory.",3,215
3,Creative Quarter of Langstrasse,"The district around Langstrasse was long regarded as a den of iniquity. Nowadays, it is known for its diverse restaurants and never-sleeping nightlife, with bars such as the Olé and the Club Zukunft.",4,199
4,Creative Quarter of Zürich-West,"In the quarter where huge machines once clattered away, now nightclubs, cultural institutions, and universities cluster around the Prime Tower. Converted structures such as the Viadukt and designer stores like the Freitag Tower give the neighborhood its characteristic trendy atmosphere.",5,287
5,University Quarter,"University buildings, Jugendstil villas, green areas, and the Dynamo cultural center characterize Zurich’s District 6. Thanks to its tranquility and closeness to the city center, it is a much sought-after residential area.",6,222
6,On the Zürichberg,"The hillside location, little traffic, and fabulous views mean than living here does not come cheap. Neighbors include Zurich Zoo and the luxury hotel, The Dolder Grand.",7,169
7,Right Shore of the Lake,"This district starts behind the Opera House and stretches as far as the open-air bathing facility, Seebad Tiefenbrunnen. It features boutiques, villas, the Chinawiese recreational area, and the Seebad Utoquai outdoor swimming bath.",8,231
8,At the Foot of the Uetliberg,"For a long time, this district was solely known for the Letzigrund Stadium. In recent years, however, the quarter around Altstetten has become THE place to be, with creative club and gastronomy concepts.",9,203
9,Right of the Limmat,"This district boasts a fantastic location: in summer, everyone meets at the Unterer Letten and Oberer Letten riverside lidos to swim and bask in the sun. In addition, the view over Zurich from the Waid quarter is unbeatable.",10,224


In [12]:
# Save the districts table as CSV, without the row index
districts_df.to_csv("../data/zurich_districts.csv", index=False)

Population figures for the districts of Zurich, taken from Wikipedia. Note that these figures are from 2005.

In [13]:
# Wikipedia page listing the subdivisions (districts and neighbourhoods) of Zurich
subdivisions_of_zurich_url = "https://en.wikipedia.org/wiki/Subdivisions_of_Z%C3%BCrich"

# Read the tables on the page with pandas; the displayed type shows that
# read_html returns a list. Note that the name `table` is rebound to a
# BeautifulSoup element by the next cell.
table = pd.read_html(subdivisions_of_zurich_url)
type(table)

list

In [14]:
# Download the page. The context manager closes the HTTP response once the
# body has been read (the response was previously left open).
with urlopen(subdivisions_of_zurich_url) as sub_zurich_response:
    sub_zurich_html_content = sub_zurich_response.read()

sub_zurich_soup = BeautifulSoup(sub_zurich_html_content, "lxml")

# select the second table on the page
table = sub_zurich_soup.find_all("table")[1]

# find() without arguments returns the first tag inside the table
# (the <tbody>, as the output below shows)
table_body = table.find()
table_body

<tbody><tr>
<th align="left" bgcolor="#EFEFEF" width="100">District
</th>
<th align="center" bgcolor="#EFEFEF" width="100">Location
</th>
<th align="center" bgcolor="#EFEFEF" width="70">Coat of Arms
</th>
<th align="left" bgcolor="#EFEFEF" width="70">Neighborhood
</th>
<th align="center" bgcolor="#EFEFEF" width="70">BFS-Code
</th>
<th align="center" bgcolor="#EFEFEF" width="70">Incorporation
</th>
<th align="center" bgcolor="#EFEFEF" width="70">Area<br/><small>in km²</small>
</th>
<th align="center" bgcolor="#EFEFEF" width="70">Population<br/><small>2005</small>
</th>
<th align="center" bgcolor="#EFEFEF" width="70">Non-Swiss Citizens
</th></tr>
<tr>
<td align="left"><a class="mw-redirect" href="/wiki/District_1_(Z%C3%BCrich)" title="District 1 (Zürich)"><b>District 1</b><br/>Altstadt</a>
</td>
<td align="center"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Karte_Z%C3%BCrcher_Stadtkreis_1.png" title="Zürich"><img alt="Zürich" class="mw-file-element" data-file-h

In [16]:
# Collect, per table row, the district label, its neighbourhoods and their
# populations.
sub_zurich_dict = defaultdict(dict)

# Keep only the children of the table body that carry text, then skip the
# header row. Within a row, cells without text are dropped as well
# (presumably the image-only "Location" and "Coat of Arms" cells), which is
# why the neighbourhoods are at index 1 and the populations at index 5.
rows = [ele for ele in table_body.children if len(ele.text.strip()) > 1]
for i, row in enumerate(rows[1:], 1):
    cells = [ele.text.strip()
             for ele in row.children if len(ele.text.strip()) > 1]
    if len(cells) > 5:
        sub_zurich_dict[i]["district"] = cells[0]

        # several values inside one cell are separated by three newlines
        sub_zurich_dict[i]["neighborhoods"] = cells[1].split("\n\n\n")
        sub_zurich_dict[i]["population"] = cells[5].replace(
            ",", "").split("\n\n\n")

neighborhood_df = pd.DataFrame.from_dict(sub_zurich_dict).T
# Zero-width split point where a digit is directly followed by an upper-case
# letter, e.g. "District 1Altstadt" -> ["District 1", "Altstadt"].
# Note: this rebinds the module-level name `regex_pattern`.
regex_pattern = re.compile(r"(?<=\d)(?=[A-Z])")

neighborhood_df["district_number"] = (
    neighborhood_df["district"].str.split(regex_pattern).apply(lambda x: x[0])
)

# Explode the two list columns separately and put them side by side, so each
# row is one neighbourhood with its population. (Two earlier explode() calls
# whose results were thrown away have been removed.)
neighborhoods_df = pd.concat(
    [
        neighborhood_df[["district", "neighborhoods"]
                        ].explode("neighborhoods"),
        neighborhood_df[["district_number", "population"]
                        ].explode("population"),
    ],
    axis=1,
)

neighborhoods_df["population"] = neighborhoods_df["population"].astype("int")
neighborhoods_df

Unnamed: 0,district,neighborhoods,district_number,population
1,District 1Altstadt,Rathaus,District 1,3081
1,District 1Altstadt,Hochschulen,District 1,695
1,District 1Altstadt,Lindenhof,District 1,950
1,District 1Altstadt,City,District 1,846
2,District 2,Wollishofen,District 2,15592
2,District 2,Leimbach,District 2,4867
2,District 2,Enge,District 2,8375
3,District 3Wiedikon,Alt-Wiedikon,District 3,14971
3,District 3Wiedikon,Friesenberg,District 3,10360
3,District 3Wiedikon,Sihlfeld,District 3,20554


In [45]:
# Population at the start of the period
# (taken from https://en.wikipedia.org/wiki/Subdivisions_of_Z%C3%BCrich)
start_value = 366_809
# Population at the end of the period
# (taken from https://worldpopulationreview.com/world-cities/zurich-population;
# 428700 was an alternative end value that is not used)
end_value = 391_400
# Length of the period in years.
# NOTE(review): the Wikipedia table header labels its population column
# "2005", while 2004 is used as the start year here — confirm which is right.
years = 2014 - 2004

# Compound annual growth rate: the years-th root of the total growth, minus one
total_growth = end_value / start_value
cagr = total_growth ** (1 / years) - 1

# The ".2%" format spec renders the fraction as a percentage
print(f"Compounded annual growth rate: {cagr:.2%}")

Compounded annual growth rate: 0.65%


#### Info for Dog breeds from hunde-zauber.de

In [87]:
# hunde-zauber.de pages: the A-Z list of breeds and the size/weight table
hz_url = "https://hunde-zauber.de/liste-aller-hunderassen-von-a-bis-z/"
hz_size_weight_url = "https://hunde-zauber.de/hund-gewicht-groesse-tabelle/"

# Download and parse both pages. The context managers close each HTTP
# response once its body has been read (they were previously left open).
with urlopen(hz_url) as hz_response:
    hz_html_content = hz_response.read()
hz_soup = BeautifulSoup(hz_html_content, "lxml")

with urlopen(hz_size_weight_url) as hz_size_weight_response:
    hz_size_weight_html_content = hz_size_weight_response.read()
hz_size_weight_soup = BeautifulSoup(hz_size_weight_html_content, "lxml")

In [88]:
hz_size_weight_tree = etree.HTML(hz_size_weight_html_content)
# get the header row of the table
header = hz_size_weight_tree.xpath("//table/thead/tr")
# The leading "." makes the path relative to the header row. The previous
# "//th" searched the whole document, whatever element it was called on.
# th.text is the text before the first child element of each header cell.
column_headers = [th.text for th in header[0].xpath(".//th")]
# get the table body
body = hz_size_weight_tree.xpath("//table/tbody")
# Relative path again: only the rows of this <tbody>. The header row is no
# longer part of `rows`, so nothing has to be skipped below.
rows = body[0].xpath(".//tr")
row_data = [[td.text for td in row.xpath(".//td")] for row in rows]
# convert the nested list into a dataframe
hz_size_weight_df = pd.DataFrame(row_data, columns=column_headers)
hz_size_weight_df

Unnamed: 0,Hunderasse,Weibliche,Weibliches,Männliche,Männliches
0,Affenpinscher,23 – 30 cm,4 – 6 kg,23 – 30 cm,4 – 6 kg
1,Afghanischer Windhund,60 – 69 cm,26 – 34 kg,68 – 74 cm,26 – 34 kg
2,Aidi,52 – 62 cm,22 – 26 kg,52 – 62 cm,22 – 26 kg
3,Airedale Terrier,56 – 58 cm,18 – 20 kg,56 – 61 cm,23 – 29 kg
4,Akita Inu,58 – 64 cm,25 – 36 kg,64 – 69 cm,27 – 42 kg
...,...,...,...,...,...
442,Zentralasiatischer Owtscharka,60 – 69 cm,40 – 65 kg,65 – 68 cm,50 – 79 kg
443,Zwergpinscher,25 – 30 cm,4 – 5 kg,25 – 30 cm,3 – 5 kg
444,Zwergpudel,28 – 35 cm,12 – 14 kg,28 – 35 cm,12 – 14 kg
445,Zwergschnauzer,30 – 35 cm,5 – 8 kg,33 – 38 cm,5 – 9 kg


In [89]:
# Collect the values of the first column (the breed names) together with the
# column names, and translate them in a single call to translate_list_to_dict.
german_to_translate = hz_size_weight_df.iloc[:, 0].tolist(
) + hz_size_weight_df.columns.tolist()
translated_dict = translate_list_to_dict(german_to_translate)

# Apply the translations to the column names; a name without a translation
# is kept as it is.
hz_size_weight_df.columns = [translated_dict.get(
    col, col) for col in hz_size_weight_df.columns]
# Add a 'breed_en' column with the translated breed name, falling back to
# the original value when there is no translation.
hz_size_weight_df['breed_en'] = hz_size_weight_df.iloc[:, 0].map(
    lambda x: translated_dict.get(x, x))

# hz_size_weight_df

In [94]:
# Give the six columns fixed names, by position. This replaces whatever the
# columns were called before, including the translated names from the
# previous cell.
hz_size_weight_df.columns = [
    "breed_de",
    "f_height_cm",
    "f_weight_kg",
    "m_height_cm",
    "m_weight_kg",
    "breed_en",
]


def split_column(df, column):
    """Extract the lower and upper bound of a textual range into two columns.

    Args:
        df: DataFrame containing ``column``; it is not modified.
        column: name of a string column with values such as "23 – 30 cm".

    Returns:
        A new DataFrame with the same index as ``df`` and the two columns
        ``f"{column}_low"`` and ``f"{column}_high"``. The numbers are returned
        as strings. Values that do not contain two separate numbers give NaN
        in both columns.
    """
    # Two digit runs separated by at least one non-digit character. The
    # previous pattern r"(\d+).*?(\d+)" also matched a single number, cutting
    # "30 cm" into "3" and "0".
    bounds = df[column].str.extract(r"(\d+)\D+(\d+)")
    # Building a fresh frame avoids assigning onto a slice of ``df``.
    bounds.columns = [f"{column}_low", f"{column}_high"]
    return bounds


# Split every height/weight range column into its low and high bound
columns_to_split = ["f_height_cm", "f_weight_kg", "m_height_cm", "m_weight_kg"]
numbers_df = pd.concat(
    [split_column(hz_size_weight_df, column) for column in columns_to_split], axis=1
)
# Lower-case the breed names
hz_size_weight_df["breed_de"] = hz_size_weight_df["breed_de"].str.lower()
hz_size_weight_df["breed_en"] = hz_size_weight_df["breed_en"].str.lower()
# Breed names next to the extracted bounds. This join used to be computed and
# then thrown away; it is now kept under its own name. hz_size_weight_df
# itself, which is saved below, keeps its columns unchanged.
hz_size_weight_numbers_df = hz_size_weight_df[["breed_de", "breed_en"]].join(numbers_df)

In [96]:
# all_breeds = translate_list_to_dict(breeds)

In [97]:
# Save the size/weight table as a JSON file, written as a list of records
hz_size_weight_df.to_json("../data/all_breeds_size.json", orient="records")

#### Info for Dog breeds from FCI

In [15]:
# FCI page that is scraped for breed names and links
fci_url = "https://www.fci.be/en/Nomenclature/educationGroupe.aspx"
# The context manager closes the HTTP response once the body has been read
# (the response was previously left open).
with urlopen(fci_url) as fci_response:
    fci_html_content = fci_response.read()

fci_parsed_html = etree.HTML(fci_html_content)

In [16]:
# Map the text of every element with class 'nom' to its href attribute.
# Note: this rebinds the module-level names `elements` and `element`.
breed_groups = {}
elements = fci_parsed_html.xpath("//*[@class='nom']")
for element in elements:
    breed_groups[element.text] = element.get("href")

In [17]:
# Turn the {breed text: link} mapping into a frame with the columns
# "breed" and "link".
fci_breeds_df = (
    pd.DataFrame.from_dict(breed_groups, orient="index", columns=["link"])
    .reset_index()
    .rename(columns={"index": "breed"})
)

# Regex capturing the content of the right-most pair of brackets.
# It is not applied in this cell; the only use is in the commented-out
# experiment below.
regex_pattern = re.compile(r"\((?=[^()]*\))([^()]+)\)$")


# fci_breeds_df["breed"].str.extract(regex_pattern)
# fci_breeds_df["breed"].str.split("(", n=1, expand=True).rename(
#     columns={0: "breed_orig", 1: "breed_en"}
# )
#
# Show three random rows as a sanity check
fci_breeds_df.sample(3)

Unnamed: 0,breed,link
7,DALMATINSKI PAS (153) (DALMATIAN),/en/nomenclature/DALMATIAN-153.html
22,POSAVSKI GONIC (154) (POSAVATZ HOUND),/en/nomenclature/POSAVATZ-HOUND-154.html
24,ST.BERNHARDSHUND - BERNHARDINER (61) (ST. BERN...,/en/nomenclature/ST-BERNARD-61.html


In [18]:
# Split the breed text at the first "(": the part before it becomes
# breed_orig, the rest becomes breed_en. As the sample below shows, breed_en
# still contains the number and the brackets at this point,
# e.g. "246) (CESKY TERRIER)".
fci_breeds_df[["breed_orig", "breed_en"]] = fci_breeds_df["breed"].str.split(
    "(", n=1, expand=True
)
fci_breeds_df.sample(3)

Unnamed: 0,breed,link,breed_orig,breed_en
21,PERRO SIN PELO DEL PERÚ (310) (PERUVIAN HAIRLE...,/en/nomenclature/PERUVIAN-HAIRLESS-DOG-310.html,PERRO SIN PELO DEL PERÚ,310) (PERUVIAN HAIRLESS DOG)
3,CESKÝ TERIER (246) (CESKY TERRIER),/en/nomenclature/CESKY-TERRIER-246.html,CESKÝ TERIER,246) (CESKY TERRIER)
8,DANSK-SVENSK GÅRDSHUND (356) (DANISH-SWEDISH F...,/en/nomenclature/DANISH-SWEDISH-FARMDOG-356.html,DANSK-SVENSK GÅRDSHUND,356) (DANISH-SWEDISH FARMDOG)


It turned out that this web page lists only 33 breeds.

In [98]:
# Capture the bracketed run of letters, hyphens, dots and spaces, i.e. the
# English name in "246) (CESKY TERRIER)". Purely numeric brackets do not match.
only_letters_pattern = r"\(?([A-Za-z-\.\s]+)\)"
# Where no English name is found, fall back to the original name. The result
# is trimmed and lower-cased.
fci_breeds_df["breed_en"] = (
    fci_breeds_df["breed_en"]
    .str.extract(only_letters_pattern, expand=False)
    .fillna(fci_breeds_df["breed_orig"])
    .str.strip()
    .str.lower()
)
# Trim and lower-case the original name only after it was used as fallback
fci_breeds_df["breed_orig"] = fci_breeds_df["breed_orig"].str.strip().str.lower()

KeyError: 'breed_orig'

In [20]:
# Prefix every relative link with the FCI host name
fci_breeds_df["weblink"] = [
    "www.fci.be" + relative_link for relative_link in fci_breeds_df["link"]
]

In [23]:
# fci_breeds_df.to_csv("../data/fci_dog_breeds.csv", index=False)

In [21]:
# Display the cleaned FCI frame (breed, link, breed_orig, breed_en, weblink)
fci_breeds_df

Unnamed: 0,breed,link,breed_orig,breed_en,weblink
0,BERNER SENNENHUND (45) (BERNESE MOUNTAIN DOG),/en/nomenclature/BERNESE-MOUNTAIN-DOG-45.html,berner sennenhund,bernese mountain dog,www.fci.be/en/nomenclature/BERNESE-MOUNTAIN-DO...
1,BOLOGNESE (196),/en/nomenclature/BOLOGNESE-196.html,bolognese,bolognese,www.fci.be/en/nomenclature/BOLOGNESE-196.html
2,CANAAN DOG (273),/en/nomenclature/CANAAN-DOG-273.html,canaan dog,canaan dog,www.fci.be/en/nomenclature/CANAAN-DOG-273.html
3,CESKÝ TERIER (246) (CESKY TERRIER),/en/nomenclature/CESKY-TERRIER-246.html,ceský terier,cesky terrier,www.fci.be/en/nomenclature/CESKY-TERRIER-246.html
4,CHIHUAHUEÑO (218) (CHIHUAHUA),/en/nomenclature/CHIHUAHUA-218.html,chihuahueño,chihuahua,www.fci.be/en/nomenclature/CHIHUAHUA-218.html
5,CIMARRÓN URUGUAYO (353),/en/nomenclature/CIMARRON-URUGUAYO-353.html,cimarrón uruguayo,cimarrón uruguayo,www.fci.be/en/nomenclature/CIMARRON-URUGUAYO-3...
6,CIOBANESC ROMÂNESC DE BUCOVINA (357) (ROMANIAN...,/en/nomenclature/ROMANIAN-BUCOVINA-SHEPHERD-35...,ciobanesc românesc de bucovina,romanian bucovina shepherd,www.fci.be/en/nomenclature/ROMANIAN-BUCOVINA-S...
7,DALMATINSKI PAS (153) (DALMATIAN),/en/nomenclature/DALMATIAN-153.html,dalmatinski pas,dalmatian,www.fci.be/en/nomenclature/DALMATIAN-153.html
8,DANSK-SVENSK GÅRDSHUND (356) (DANISH-SWEDISH F...,/en/nomenclature/DANISH-SWEDISH-FARMDOG-356.html,dansk-svensk gårdshund,danish-swedish farmdog,www.fci.be/en/nomenclature/DANISH-SWEDISH-FARM...
9,DOGO ARGENTINO (292),/en/nomenclature/DOGO-ARGENTINO-292.html,dogo argentino,dogo argentino,www.fci.be/en/nomenclature/DOGO-ARGENTINO-292....


#### Get all the breeds from the FCI individually

In [22]:
def get_fci_breeds(driver, link):
    """Scrape the details of every breed listed on the FCI nomenclature page.

    The function opens ``link``, clicks each letter of the index (the links
    inside the element with class ``initiales``), then each breed link of the
    resulting list (class ``listeraces``), reads the breed page and navigates
    back to the list.

    Args:
        driver: Selenium WebDriver used for browsing. It is always closed
            with ``driver.quit()`` before the function returns or raises, so
            it cannot be reused afterwards.
        link: URL of the nomenclature page to start from.

    Returns:
        A list with one tuple per breed, in the order ``(breed_text,
        breed_ref, breed_group, breed_translations, breed_section,
        breed_subsection, breed_date_of_acceptance, breed_country_of_origin,
        breed_varieties)``. ``breed_translations`` and ``breed_varieties``
        are lists of strings; fields not found on the page stay ``None``.

    Raises:
        Any error raised while browsing or parsing propagates to the caller,
        after the driver has been closed.
    """
    name_link_list = []

    # try/finally guarantees that the browser is closed even when a wait
    # times out or an element is missing half-way through the scrape.
    try:
        driver.get(link)

        # Wait for the letters to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "initiales"))
        )

        letters = driver.find_element(By.CLASS_NAME, "initiales")

        for n, letter in tqdm(enumerate(letters.find_elements(By.TAG_NAME, "a"))):
            # adding lines to break the loop after the first 2 letters for testing
            # if n >= 2:
            #     break

            try:
                # click first on the letter
                letter.click()
            except StaleElementReferenceException:
                # the element is no longer attached to the DOM so find them again
                letters = driver.find_element(By.CLASS_NAME, "initiales")
                letter = letters.find_elements(By.TAG_NAME, "a")[n]
                letter.click()

            # Wait for the breeds to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "listeraces"))
            )

            breeds = driver.find_element(By.CLASS_NAME, "listeraces")

            for n2, breed in tqdm(enumerate(breeds.find_elements(By.TAG_NAME, "a"))):
                # reset the per-breed fields so nothing leaks from the previous breed
                breed_text = None
                breed_ref = None
                breed_group = None
                breed_translations = []
                breed_section = None
                breed_subsection = None
                breed_date_of_acceptance = None
                breed_country_of_origin = None
                try:
                    breed_text = breed.text
                    breed_ref = breed.get_attribute("href")
                    # click on the breed
                    breed.click()
                except StaleElementReferenceException:
                    # the element is no longer attached to the DOM so find them again
                    breeds = driver.find_element(By.CLASS_NAME, "listeraces")
                    breed = breeds.find_elements(By.TAG_NAME, "a")[n2]
                    breed_text = breed.text
                    breed_ref = breed.get_attribute("href")
                    breed.click()

                # wait for that breed's page to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.ID, "ContentPlaceHolder1_GroupeHyperLink")
                    )
                )
                group = driver.find_element(
                    By.ID, "ContentPlaceHolder1_GroupeHyperLink")
                breed_group = group.text
                table = driver.find_element(By.CLASS_NAME, "racesgridview")

                # translations: first <span> of every row after the header row
                rows = table.find_elements(By.TAG_NAME, "tr")
                for row in rows[1:]:
                    breed_translations.append(
                        row.find_elements(By.TAG_NAME, "span")[0].text
                    )

                # the page has two 'racetable' tables, read as label/value rows
                table2 = driver.find_elements(By.CLASS_NAME, "racetable")

                left_rows2 = table2[0].find_elements(By.TAG_NAME, "tr")
                right_rows2 = table2[1].find_elements(By.TAG_NAME, "tr")

                for row in left_rows2:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    if len(cells) >= 2:
                        # "subsection" must be tested first because the label
                        # also contains the word "section"
                        if "subsection" in cells[0].text.lower():
                            breed_subsection = cells[1].text
                        elif "section" in cells[0].text.lower():
                            breed_section = cells[1].text
                        elif "date of acceptance" in cells[0].text.lower():
                            breed_date_of_acceptance = cells[1].text

                for row in right_rows2:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    if len(cells) >= 2 and "country of origin" in cells[0].text.lower():
                        breed_country_of_origin = cells[1].text
                # Get the varieties of the breed if they exist
                try:
                    table3 = driver.find_element(By.CLASS_NAME, "varietes")
                    # take the first <span> of every element with class 'variete'
                    breed_varieties = []
                    varieties = table3.find_elements(By.CLASS_NAME, "variete")
                    for variety in varieties:
                        spans = variety.find_elements(By.TAG_NAME, "span")
                        if spans:
                            breed_varieties.append(spans[0].text)
                except NoSuchElementException:
                    # no 'varietes' element on this page: the breed has no varieties
                    breed_varieties = []

                # add the breed, link, and translations to the list
                name_link_list.append(
                    (
                        breed_text,
                        breed_ref,
                        breed_group,
                        breed_translations,
                        breed_section,
                        breed_subsection,
                        breed_date_of_acceptance,
                        breed_country_of_origin,
                        breed_varieties,
                    )
                )
                # go back to the previous page with the breeds
                driver.back()
    finally:
        driver.quit()

    return name_link_list

In [23]:
# driver = webdriver.Chrome()

# Start page of the FCI nomenclature, used by the scraper. The variable name
# is spelled "nonmenclature"; the cell that runs the scraper refers to it
# under exactly this name.
fci_nonmenclature_url = "https://fci.be/en/Nomenclature/Default.aspx"

In [24]:
# This cell gets the fci list of breeds and does it letter by letter
# can take up to 10 minutes to run
my_d = start_driver()
fci_list = get_fci_breeds(my_d, fci_nonmenclature_url)

23it [00:27,  1.21s/it]
41it [01:16,  1.87s/it]
2it [01:45, 52.91s/it]


In [25]:
# Build a frame from the scraped tuples. The column order matches the order
# of the fields in the tuples returned by get_fci_breeds.
fci_breeds_df = pd.DataFrame(
    fci_list,
    columns=[
        "breed",
        "link",
        "group",
        "translations",
        "section",
        "subsection",
        "date_of_acceptance",
        "country_of_origin",
        "varieties",
    ],
)
# Save the initial scraped data as a JSON file (a list of records)
fci_breeds_df.to_json("../data/fci_breeds_raw.json", orient="records")
# fci_breeds_trans_df["varieties"].value_counts()

Unnamed: 0,breed,link,group,translations,section,subsection,date_of_acceptance,country_of_origin,varieties
0,AFFENPINSCHER,https://fci.be/en/nomenclature/AFFENPINSCHER-1...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[AFFENPINSCHER, AFFENPINSCHER, AFFENPINSCHER, ...",Pinscher and Schnauzer type,Pinscher,7/15/1955,GERMANY,[]
1,AFGHAN HOUND,https://fci.be/en/nomenclature/AFGHAN-HOUND-22...,n°10 - Sighthounds,"[AFGHAN HOUND, LEVRIER AFGHAN, AFGHANISCHER WI...",Long-haired or fringed Sighthounds,,12/12/1961,AFGHANISTAN,[]
2,AÏDI (CHIEN DE MONTAGNE DE L'ATLAS),https://fci.be/en/nomenclature/ATLAS-MOUNTAIN-...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[ATLAS MOUNTAIN DOG (AIDI), CHIEN DE MONTAGNE ...",Molossian type,Mountain type,6/13/1963,MOROCCO,[]
3,AIREDALE TERRIER,https://fci.be/en/nomenclature/AIREDALE-TERRIE...,n°3 - Terriers,"[AIREDALE TERRIER, AIREDALE TERRIER, AIREDALE ...",Large and medium sized Terriers,,5/28/1963,GREAT BRITAIN,[]
4,AKITA,https://fci.be/en/nomenclature/AKITA-255.html,n°5 - Spitz and primitive types,"[AKITA, AKITA, AKITA, AKITA]",Asian Spitz and related breeds,,3/13/1964,JAPAN,[]
...,...,...,...,...,...,...,...,...,...
59,BRIQUET GRIFFON VENDEEN,https://fci.be/en/nomenclature/BRIQUET-GRIFFON...,n°6 - Scent hounds and related breeds,"[BRIQUET GRIFFON VENDEEN, BRIQUET GRIFFON VEND...",Scent hounds,Medium-sized Hounds,10/2/1954,FRANCE,[]
60,BROHOLMER,https://fci.be/en/nomenclature/BROHOLMER-315.html,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[BROHOLMER, BROHOLMER, BROHOLMER, BROHOLMER]",Molossian type,Mastiff type,5/26/1982,DENMARK,[]
61,BULL TERRIER,https://fci.be/en/nomenclature/BULL-TERRIER-11...,n°3 - Terriers,"[BULL TERRIER, BULL TERRIER, BULL TERRIER, BUL...",Bull type Terriers,,6/26/1993,GREAT BRITAIN,[]
62,BULLDOG,https://fci.be/en/nomenclature/BULLDOG-149.html,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[BULLDOG, BULLDOG, BULLDOG, BULLDOG]",Molossian type,Mastiff type,3/14/1955,GREAT BRITAIN,[]


In [27]:
# Reload the raw scrape from the JSON file written by the cell that builds
# fci_breeds_df from fci_list.
fci_breeds_df = pd.read_json("../data/fci_breeds_raw.json")

In [56]:
# NOTE(review): this cell repeats the DataFrame construction of the earlier
# cell that also saves the JSON file. Run in notebook order, it replaces the
# frame just loaded with read_json and needs `fci_list` to exist in the
# kernel. Confirm whether it is still needed.
fci_breeds_df = pd.DataFrame(
    fci_list,
    columns=[
        "breed",
        "link",
        "group",
        "translations",
        "section",
        "subsection",
        "date_of_acceptance",
        "country_of_origin",
        "varieties",
    ],
)

In [28]:
# alt_names = the list of translations plus the value of the breed column
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["translations"] + [x["breed"]], axis=1
)
# The English name is the first translation, lower-cased.
# (x[0] requires every translations list to have at least one entry.)
fci_breeds_df["breed_en"] = fci_breeds_df["translations"].apply(
    lambda x: x[0].lower())
# Rewrite "-haired" / "- haired" to "haired", removing the hyphen and the
# optional space after it
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"- ?haired", "haired", regex=True
)
# add the 'breed_en' name to the list in the alt_names column
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["alt_names"] + [x["breed_en"]], axis=1
)

# number of varieties, counted before the names are cleaned up
fci_breeds_df["n_varieties"] = fci_breeds_df["varieties"].transform(len)
# remove a leading letter-numbering such as 'a) ' from each variety and
# lower-case it
fci_breeds_df["varieties"] = fci_breeds_df["varieties"].apply(
    lambda x: [re.sub(r"^[a-z]\) ", "", i).lower() for i in x]
)

In [29]:
# Extract the group number: the first run of digits in the group column,
# e.g. "n°10 - Sighthounds" -> "10" (kept as text)
fci_breeds_df["group_num"] = fci_breeds_df["group"].str.extract(r"(\d+)")
# The group name is everything after the first "-", trimmed. Splitting only
# once keeps names that contain further hyphens intact.
fci_breeds_df["group_name"] = (
    fci_breeds_df["group"].str.split("-", n=1, expand=True)[1].str.strip()
)
fci_breeds_df

Unnamed: 0,breed,link,group,translations,section,subsection,date_of_acceptance,country_of_origin,varieties,alt_names,breed_en,n_varieties,group_num,group_name
0,AFFENPINSCHER,https://fci.be/en/nomenclature/AFFENPINSCHER-1...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[AFFENPINSCHER, AFFENPINSCHER, AFFENPINSCHER, ...",Pinscher and Schnauzer type,Pinscher,7/15/1955,GERMANY,[],"[AFFENPINSCHER, AFFENPINSCHER, AFFENPINSCHER, ...",affenpinscher,0,2,Pinscher and Schnauzer - Molossoid and Swiss M...
1,AFGHAN HOUND,https://fci.be/en/nomenclature/AFGHAN-HOUND-22...,n°10 - Sighthounds,"[AFGHAN HOUND, LEVRIER AFGHAN, AFGHANISCHER WI...",Long-haired or fringed Sighthounds,,12/12/1961,AFGHANISTAN,[],"[AFGHAN HOUND, LEVRIER AFGHAN, AFGHANISCHER WI...",afghan hound,0,10,Sighthounds
2,AÏDI (CHIEN DE MONTAGNE DE L'ATLAS),https://fci.be/en/nomenclature/ATLAS-MOUNTAIN-...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[ATLAS MOUNTAIN DOG (AIDI), CHIEN DE MONTAGNE ...",Molossian type,Mountain type,6/13/1963,MOROCCO,[],"[ATLAS MOUNTAIN DOG (AIDI), CHIEN DE MONTAGNE ...",atlas mountain dog (aidi),0,2,Pinscher and Schnauzer - Molossoid and Swiss M...
3,AIREDALE TERRIER,https://fci.be/en/nomenclature/AIREDALE-TERRIE...,n°3 - Terriers,"[AIREDALE TERRIER, AIREDALE TERRIER, AIREDALE ...",Large and medium sized Terriers,,5/28/1963,GREAT BRITAIN,[],"[AIREDALE TERRIER, AIREDALE TERRIER, AIREDALE ...",airedale terrier,0,3,Terriers
4,AKITA,https://fci.be/en/nomenclature/AKITA-255.html,n°5 - Spitz and primitive types,"[AKITA, AKITA, AKITA, AKITA]",Asian Spitz and related breeds,,3/13/1964,JAPAN,[],"[AKITA, AKITA, AKITA, AKITA, AKITA, akita]",akita,0,5,Spitz and primitive types
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,YORKSHIRE TERRIER,https://fci.be/en/nomenclature/YORKSHIRE-TERRI...,n°3 - Terriers,"[YORKSHIRE TERRIER, TERRIER DU YORKSHIRE, YORK...",Toy Terriers,,10/30/1954,GREAT BRITAIN,[],"[YORKSHIRE TERRIER, TERRIER DU YORKSHIRE, YORK...",yorkshire terrier,0,3,Terriers
352,YUZHNORUSSKAYA OVCHARKA,https://fci.be/en/nomenclature/SOUTH-RUSSIAN-S...,n°1 - Sheepdogs and Cattledogs (except Swiss C...,"[SOUTH RUSSIAN SHEPHERD DOG, BERGER DE RUSSIE ...",Sheepdogs,,9/30/1983,RUSSIAN FEDERATION,[],"[SOUTH RUSSIAN SHEPHERD DOG, BERGER DE RUSSIE ...",south russian shepherd dog,0,1,Sheepdogs and Cattledogs (except Swiss Cattled...
353,ZAPADNO-SIBIRSKAÏA LAÏKA,https://fci.be/en/nomenclature/WEST-SIBERIAN-L...,n°5 - Spitz and primitive types,"[WEST SIBERIAN LAIKA, LAIKA DE SIBERIE OCCIDEN...",Nordic Hunting Dogs,,6/3/1980,RUSSIAN FEDERATION,[],"[WEST SIBERIAN LAIKA, LAIKA DE SIBERIE OCCIDEN...",west siberian laika,0,5,Spitz and primitive types
354,ZWERGPINSCHER,https://fci.be/en/nomenclature/MINIATURE-PINSC...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[MINIATURE PINSCHER, PINSCHER NAIN, ZWERGPINSC...",Pinscher and Schnauzer type,Pinscher,7/14/1955,GERMANY,"[self coloured: deer red, reddish-brown to dar...","[MINIATURE PINSCHER, PINSCHER NAIN, ZWERGPINSC...",miniature pinscher,2,2,Pinscher and Schnauzer - Molossoid and Swiss M...


In [30]:
# Rename "pointing dog" to "pointer" inside the English breed name (breed_en),
# then append the resulting breed_en to every breed's alt_names list.
# Note: the append happens for all rows, not only the renamed ones; duplicates
# this creates are left in the list at this point.
fci_breeds_df["breed_en"] = fci_breeds_df["breed_en"].str.replace(
    r"pointing dog", "pointer", regex=True
)
fci_breeds_df["alt_names"] = fci_breeds_df.apply(
    lambda x: x["alt_names"] + [x["breed_en"]], axis=1
)

In [59]:
# show the name columns for the breeds that list at least one variety
fci_breeds_df.loc[
    fci_breeds_df["n_varieties"] > 0,
    ["breed", "varieties", "alt_names", "breed_en"],
]

Unnamed: 0,breed,varieties,alt_names,breed_en
8,AMERICAN COCKER SPANIEL,"[black, any solid colour other than black (asc...","[AMERICAN COCKER SPANIEL, COCKER AMÉRICAIN, AM...",american cocker spaniel
48,BOULEDOGUE FRANÇAIS,"[uniformly fawn, brindled or not, or with limi...","[FRENCH BULLDOG, BOULEDOGUE FRANÇAIS, FRANZÖSI...",french bulldog
51,BRACCO ITALIANO,"[white-orange, chestnut roan]","[ITALIAN POINTING DOG, BRAQUE ITALIEN, ITALIEN...",italian pointer
64,CA DE BESTIAR,"[short-haired, long-haired]","[MAJORCA SHEPHERD DOG, CHIEN DE BERGER DE MAJO...",majorca shepherd dog
71,CANICHE,"[standard, medium size, miniature, toy]","[POODLE, CANICHE, PUDEL, CANICHE, CANICHE, poo...",poodle
72,CÃO DA SERRA DA ESTRELA,"[short-haired, long-haired]","[ESTRELA MOUNTAIN DOG, CHIEN DE LA SERRA DA ES...",estrela mountain dog
74,CÃO DE AGUA PORTUGUÊS,"[long and wavy, shorter and curly]","[PORTUGUESE WATER DOG, CHIEN D'EAU PORTUGAIS, ...",portuguese water dog
78,CAVALIER KING CHARLES SPANIEL,"[black and tan, ruby, blenheim, tricolour]","[CAVALIER KING CHARLES SPANIEL, CAVALIER KING ...",cavalier king charles spaniel
85,CHIEN DE BERGER BELGE,"[groenendael, laekenois, malinois, tervueren]","[BELGIAN SHEPHERD DOG, CHIEN DE BERGER BELGE, ...",belgian shepherd dog
90,CHIHUAHUEÑO,"[long-haired, smooth-haired]","[CHIHUAHUA, CHIHUAHUA, CHIHUAHUA, CHIHUAHUEÑO,...",chihuahua


Some of the `varieties` are just variations in size, coat colour, or hair length of the same breed. Still, some varieties are so popular that dogs are commonly referred to by the variety name instead of the breed name. For the breeds below we add the variety names to their `alt_names` list:
- swiss hound
- small swiss hound
- german spitz
- belgian shepherd dog
- continental toy spaniel
- chinese crested dog

In [31]:
# add the varieties to the alt_names column of some popular breeds
# (breed_en values whose variety names are used like breed names)
popular_breeds = [
    "swiss hound",
    "small swiss hound",
    "german spitz",
    "belgian shepherd dog",
    "continental toy spaniel",
    "chinese crested dog",
]

# boolean mask of the rows whose English breed name is in the list above
popular_names_mask = fci_breeds_df["breed_en"].isin(popular_breeds)
# "+" on two list-valued columns concatenates the lists row by row, so each
# selected breed's alt_names gets its varieties appended
fci_breeds_df.loc[popular_names_mask, "alt_names"] = (
    fci_breeds_df.loc[popular_names_mask, "alt_names"]
    + fci_breeds_df.loc[popular_names_mask, "varieties"]
)

In [32]:
# accent-free copy of every alternative name (uses the notebook's remove_accents helper)
fci_breeds_df["no_accent"] = fci_breeds_df["alt_names"].apply(
    lambda x: [remove_accents(i) for i in x]
)
# append the accent-free names to alt_names (row-wise list concatenation);
# duplicates are removed further down
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"] + \
    fci_breeds_df["no_accent"]
# lower-case every name so that names differing only in case collapse together
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].apply(
    lambda x: [i.lower() for i in x]
)
# drop the duplicates within each row by converting the list to a set
# NOTE(review): alt_names holds sets from here on, so list concatenation on this
# column (as in the cells above) will no longer work if those cells are re-run
fci_breeds_df["alt_names"] = fci_breeds_df["alt_names"].transform(set)

Some dog breeds are also known by a nickname, so to speak.

In [40]:
# NOTE: only the last expression of a cell is displayed, so the first two
# expressions below produce no visible output
fci_breeds_df["alt_names"]
# get the rows of the breeds which use a ' - ' in any one of their names
fci_breeds_df[fci_breeds_df["alt_names"].apply(lambda x: any(" - " in i for i in x))]


# breeds whose country of origin contains "swit" (case-insensitive); rows with a
# missing country are treated as non-matching (na=False)
fci_breeds_df[
    fci_breeds_df["country_of_origin"].str.contains(
        r"swit", na=False, regex=True, case=False
    )
]

Unnamed: 0,breed,link,group,translations,section,subsection,date_of_acceptance,country_of_origin,varieties,alt_names,breed_en,n_varieties,group_num,group_name,no_accent
14,APPENZELLER SENNENHUND,https://fci.be/en/nomenclature/APPENZELL-CATTL...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[APPENZELL CATTLE DOG, BOUVIER APPENZELLOIS, A...",Swiss Mountain- and Cattledogs,,7/27/1954,SWITZERLAND,[],"{appenzeller sennenhund, perro boyero de appen...",appenzell cattle dog,0,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[APPENZELL CATTLE DOG, BOUVIER APPENZELLOIS, A..."
34,BERGER BLANC SUISSE,https://fci.be/en/nomenclature/WHITE-SWISS-SHE...,n°1 - Sheepdogs and Cattledogs (except Swiss C...,"[WHITE SWISS SHEPHERD DOG, BERGER BLANC SUISSE...",Sheepdogs,,11/26/2002,SWITZERLAND,[],"{berger blanc suisse, white swiss shepherd dog...",white swiss shepherd dog,0,1,Sheepdogs and Cattledogs (except Swiss Cattled...,"[WHITE SWISS SHEPHERD DOG, BERGER BLANC SUISSE..."
38,BERNER SENNENHUND,https://fci.be/en/nomenclature/BERNESE-MOUNTAI...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[BERNESE MOUNTAIN DOG, BOUVIER BERNOIS, BERNER...",Swiss Mountain- and Cattledogs,,7/26/1954,SWITZERLAND,[],"{berner sennenhund, bouvier bernois, boyero de...",bernese mountain dog,0,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[BERNESE MOUNTAIN DOG, BOUVIER BERNOIS, BERNER..."
103,CONTINENTAL BULLDOG,https://fci.be/en/nomenclature/CONTINENTAL-BUL...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[CONTINENTAL BULLDOG, BULLDOG CONTINENTAL, CON...",Molossian type,Mastiff type,3/30/2022,SWITZERLAND,[],"{bulldog continental, continental bulldog}",continental bulldog,0,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[CONTINENTAL BULLDOG, BULLDOG CONTINENTAL, CON..."
139,ENTLEBUCHER SENNENHUND,https://fci.be/en/nomenclature/ENTLEBUCH-CATTL...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[ENTLEBUCH CATTLE DOG, BOUVIER DE L'ENTLEBUCH,...",Swiss Mountain- and Cattledogs,,7/28/1954,SWITZERLAND,[],"{bouvier de l'entlebuch, perro boyero de entle...",entlebuch cattle dog,0,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[ENTLEBUCH CATTLE DOG, BOUVIER DE L'ENTLEBUCH,..."
179,GROSSER SCHWEIZER SENNENHUND,https://fci.be/en/nomenclature/GREAT-SWISS-MOU...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[GREAT SWISS MOUNTAIN DOG, GRAND BOUVIER SUISS...",Swiss Mountain- and Cattledogs,,8/13/1954,SWITZERLAND,[],"{grosser schweizer sennenhund, great swiss mou...",great swiss mountain dog,0,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[GREAT SWISS MOUNTAIN DOG, GRAND BOUVIER SUISS..."
221,LANDSEER (EUROPÄISCH-KONTINENTALER TYP),https://fci.be/en/nomenclature/LANDSEER-EUROPE...,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[LANDSEER (EUROPEAN CONTINENTAL TYPE), LANDSEE...",Molossian type,Mountain type,8/24/1960,"GERMANY, SWITZERLAND",[],"{landseer (european continental type), landsee...",landseer (european continental type),0,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[LANDSEER (EUROPEAN CONTINENTAL TYPE), LANDSEE..."
296,SCHWEIZER NIEDERLAUFHUND,https://fci.be/en/nomenclature/SMALL-SWISS-HOU...,n°6 - Scent hounds and related breeds,"[SMALL SWISS HOUND, PETIT CHIEN COURANT SUISSE...",Scent hounds,Small-sized Hounds,8/27/1954,SWITZERLAND,"[small jura hound, small schwyz hound, small b...","{small lucerne hound, sabueso suizo pequeno, p...",small swiss hound,4,6,Scent hounds and related breeds,"[SMALL SWISS HOUND, PETIT CHIEN COURANT SUISSE..."
297,SCHWEIZER LAUFHUND - CHIEN COURANT SUISSE,https://fci.be/en/nomenclature/SWISS-HOUND-59....,n°6 - Scent hounds and related breeds,"[SWISS HOUND, CHIEN COURANT SUISSE, SCHWEIZER ...",Scent hounds,Medium-sized Hounds,8/25/1954,SWITZERLAND,"[bernese hound, jura hound, lucerne hound, sch...","{schweizer laufhund - chien courant suisse, be...",swiss hound,4,6,Scent hounds and related breeds,"[SWISS HOUND, CHIEN COURANT SUISSE, SCHWEIZER ..."
319,ST.BERNHARDSHUND - BERNHARDINER,https://fci.be/en/nomenclature/ST-BERNARD-61.html,n°2 - Pinscher and Schnauzer - Molossoid and S...,"[ST. BERNARD, CHIEN DU MONT SAINT-BERNARD - SA...",Molossian type,Mountain type,8/28/1954,SWITZERLAND,"[short-haired, long-haired]","{san bernardo, st. bernhardshund (bernhardiner...",st. bernard,2,2,Pinscher and Schnauzer - Molossoid and Swiss M...,"[ST. BERNARD, CHIEN DU MONT SAINT-BERNARD - SA..."


In [63]:
# save to json: one JSON object per breed row (orient="records");
# the path is relative to the notebook's working directory
fci_breeds_df.to_json("../data/fci_breeds.json", orient="records")

In [206]:
# fci_breeds_trans_df[fci_breeds_trans_df["breed"].str.startswith("E")]
# fci_breeds_df = pd.read_json("../data/fci_breeds.json", orient="records")
# fci_breeds_df[fci_breeds_df["breed"].str.contains(r"spitz", case=False, regex=True)][
#     "alt_names"
# ].values

#### Info about Dog breeds from AKC


In [12]:
akc_dog_breed_groups_url = "https://www.akc.org/public-education/resources/general-tips-information/dog-breeds-sorted-groups/"

# get the html content of the website; the context manager closes the HTTP
# response once the body has been read instead of leaving the connection open
with urlopen(akc_dog_breed_groups_url) as akc_response:
    akc_html_content = akc_response.read()

In [13]:
# Parse the html content with BeautifulSoup, using the lxml parser backend
akc_soup = BeautifulSoup(akc_html_content, "lxml")

In [14]:
# Collect every anchor whose href contains "dog-breeds": the stripped anchor
# text goes to breed_list and the href to link_list, kept in page order.
breed_list = []
link_list = []
elements = akc_soup.find_all("a", href=True)
for element in elements:
    href = element.get("href")
    if "dog-breeds" not in href:
        continue
    breed_list.append(element.text.strip())
    link_list.append(href)

In [15]:
# create a dictionary with the breed as the key and the link as the value
# (a name that appears more than once keeps its first position and its last link)
breed_link_dict = dict(zip(breed_list, link_list))

# same mapping with case-folded keys, so lookups do not depend on letter case
breed_link_dict_casefolded = {
    key.casefold(): value for key, value in breed_link_dict.items()
}
# breed_link_dict_casefolded

In [16]:
# Walk the scraped names in their stored order and build {group: [breeds]}.
# A name containing "group", "stock" or "class" starts a new group; every other
# name is appended to the most recently started group. This relies on the dict
# preserving the order in which the names were scraped.
# NOTE(review): a breed whose own name contained one of those words would be
# treated as a group header - confirm none does.
group_breeds = {}
current_group = None
for breed in breed_link_dict_casefolded:
    if re.search(r"group|stock|class", breed):
        current_group = breed
        group_breeds[current_group] = []
    elif current_group is not None:
        # names seen before the first group header are skipped
        group_breeds[current_group].append(breed)

# print the resulting dictionary of breed groups and their breeds
# print(group_breeds)

In [17]:
# Build a long-format frame with two columns: the breed group and one breed of
# that group per row.
group_breeds_df = pd.DataFrame(group_breeds.items(), columns=["breed_group", "breed"])
akc_breeds_df = (
    group_breeds_df.explode("breed")
    .reset_index(drop=True)
    # The former `.str.replace(" GROUP", "")` is dropped: the names are already
    # case-folded, so the upper-case pattern never matched and the values keep
    # their " group" suffix exactly as before.
    .assign(
        breed_group=lambda df: df["breed_group"].str.lower(),
        breed=lambda df: df["breed"].str.lower(),
    )
    # drop the first and the last row of the exploded frame
    .iloc[1:-1, :]
    # own the data: later cells add columns to this frame, which would otherwise
    # be a chained assignment on a slice
    .copy()
)

In [15]:
# akc_breeds_df

In [18]:
# map dict with the links to the breed column: look each lower-cased breed name
# up in the case-folded name -> link dictionary (names not found become NaN)
akc_breeds_df["links"] = akc_breeds_df["breed"].map(breed_link_dict_casefolded)
akc_breeds_df

Unnamed: 0,breed_group,breed,links
1,herding group,australian cattle dog,https://www.akc.org/dog-breeds/australian-catt...
2,herding group,australian shepherd,https://www.akc.org/dog-breeds/australian-shep...
3,herding group,bearded collie,https://www.akc.org/dog-breeds/bearded-collie/
4,herding group,beauceron,https://www.akc.org/dog-breeds/beauceron/
5,herding group,belgian laekenois,https://www.akc.org/dog-breeds/belgian-laekenois/
...,...,...,...
276,foundation stock service,treeing tennessee brindle,https://www.akc.org/dog-breeds/treeing-tenness...
277,foundation stock service,volpino italiano,https://www.akc.org/dog-breeds/volpino-italiano/
278,foundation stock service,wetterhoun,https://www.akc.org/dog-breeds/wetterhoun/
279,foundation stock service,working kelpie,https://www.akc.org/dog-breeds/working-kelpie/


#### Year breed was recognized by AKC

In [19]:
akc_breed_year_url = "https://www.akc.org/press-center/articles-resources/facts-and-stats/breeds-year-recognized/"

# read the page body and close the HTTP response as soon as it has been read
with urlopen(akc_breed_year_url) as akc_breed_year_response:
    akc_breed_year_html_content = akc_breed_year_response.read()

In [20]:
# parse the "year recognized" page with Python's built-in html.parser backend
akc_breed_year_soup = BeautifulSoup(akc_breed_year_html_content, "html.parser")

In [21]:
# convert to etree
akc_breed_year_parsed_html = etree.HTML(akc_breed_year_html_content)

# get the elements (any tag) matched by the XPath test for a non-breaking space
# ('&nbsp;') in their text. The list comprehension that used to follow was
# removed: its result was discarded, and it raised AttributeError for any
# matched element whose `.text` is None.
akc_breed_year_elements = akc_breed_year_parsed_html.xpath(
    "//*[contains(text(), '\u00A0')]"
)

# get the first div with the class 'content-body__text-long'
akc_breed_year_element = akc_breed_year_soup.find(
    "div", class_="content-body__text-long"
)

# take the text of every direct child of that div, drop the non-breaking
# spaces, skip children that are empty after stripping, and discard the first
# two remaining entries
# NOTE(review): the first two entries are presumably introductory text rather
# than "year – breed" lines - confirm against the page
year_breed_list = [
    element.text.strip().replace("\xa0", "")
    for element in akc_breed_year_element
    if element.text.strip()
][2:]

In [101]:
# Each entry is "<year> – <breed>". Split on the first en dash only, so that a
# breed name which itself contains an en dash still yields exactly two fields.
year_breed_df = pd.DataFrame(
    [yb.split("–", 1) for yb in year_breed_list], columns=["year", "breed"]
)
# strip the surrounding whitespace, make the year an integer and lower-case the
# breed; breed becomes the first column
year_breed_df = year_breed_df.assign(
    year=year_breed_df["year"].str.strip().astype(int),
    breed=year_breed_df["breed"].str.strip().str.lower(),
)[["breed", "year"]]

In [90]:
def match_breed(breed):
    """Return the best fuzzy match for `breed` among year_breed_df['breed'].

    Candidates are scored with `fuzz.token_set_ratio`. The best candidate is
    returned only when its score is strictly above 90; otherwise NaN is
    returned.
    """
    # Find the best match in year_breed_df['breed'] for breed
    match = process.extractOne(
        breed, year_breed_df["breed"], scorer=fuzz.token_set_ratio
    )
    # extractOne yields None when it has no candidate to return; otherwise
    # match[0] is the matched name and match[1] its score
    if match is not None and match[1] > 90:
        return match[0]
    # If no sufficiently good match is found, return NaN
    return np.nan


# Correct the year for "Löwchen" in year_breed_df. This has to happen BEFORE the
# merge below, otherwise merged_df keeps the uncorrected year.
# NOTE(review): the pattern is spelled without the umlaut - confirm it matches
# the way the scraped breed name is written.
year_breed_df.loc[year_breed_df["breed"].str.contains(r"lowchen"), "year"] = 1996

# Apply match_breed to the 'breed' column in akc_breeds_df
akc_breeds_df["matched_breed"] = akc_breeds_df["breed"].apply(match_breed)

# Merge akc_breeds_df with year_breed_df on 'matched_breed' and 'breed',
# respectively. A left merge keeps AKC breeds without a match (year is NaN).
merged_df = (
    pd.merge(
        akc_breeds_df,
        year_breed_df,
        left_on="matched_breed",
        right_on="breed",
        how="left",
    )
    # both frames have a 'breed' column: keep the AKC one under its plain name
    .drop("breed_y", axis=1)
    .rename(columns={"breed_x": "breed"})
)

In [46]:
# akc_breeds_merged_df.to_csv("../data/akc_dog_breeds.csv", index=False)
# merged_df.sort_values("breed")

#### AKC physical traits

In [91]:
# breed page URLs to visit, in the row order of akc_breeds_df
akc_links = akc_breeds_df["links"].tolist()

In [92]:
def get_breed_info(driver, link):
    """Function to get the breed info from the AKC website.

    Loads `link` with the given selenium `driver` and collects, when present:
    "breed" (page title), "temperment", "height", "weight" and
    "life_expectancy" (the matched text, e.g. "9.5-11.5 inches").

    Returns a `defaultdict(str)`; fields that could not be scraped are simply
    absent. A missing element or a page that does not show its title within
    10 seconds is reported with `print` and whatever was collected so far is
    returned, so one bad page does not abort a long scraping run.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from selenium.common.exceptions import TimeoutException

    # "<number> <unit>" or "<number> - <number> <unit>". Decimals are allowed, so
    # "9.5-11.5 inches" is captured whole instead of being cut down to "5 inches".
    number = r"\d+(?:\.\d+)?"
    measurement = rf"({number})(?:\s*-\s*({number}))?\s*"
    # checked in this order; the first pattern that matches a text block wins
    measurement_patterns = {
        "height": re.compile(measurement + r"inches?"),
        "weight": re.compile(measurement + r"pounds?"),
        "life_expectancy": re.compile(measurement + r"years?"),
    }

    breed_metadata = defaultdict(str)

    driver.get(link)

    try:
        # wait (up to 10 s) for the page title, which holds the breed name
        breed = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "h1.page-header__title"))
        )
        breed_metadata["breed"] = breed.text

        # temperment
        temperment = driver.find_element(
            By.CSS_SELECTOR, "p.breed-page__intro__temperment"
        )
        breed_metadata["temperment"] = temperment.text

        # height, weight, life expectancy: one icon block each
        elements = driver.find_elements(
            By.CSS_SELECTOR, "div.breed-page__hero__overview__icon-block"
        )
        for ele in elements:
            text = ele.find_element(By.TAG_NAME, "p").text
            for field, pattern in measurement_patterns.items():
                found = pattern.search(text)
                if found:
                    breed_metadata[field] = found.group(0)
                    break

    except NoSuchElementException as e:
        print("No such element", e)
    except TimeoutException as e:
        # WebDriverWait.until raises this when the title never appears
        print("Timed out waiting for", link, e)

    return breed_metadata

In [93]:
# start the driver and get the breed info for each link
my_d = start_driver()
breed_data_driver = partial(get_breed_info, my_d)
list_of_dicts = []
try:
    for link in tqdm(akc_links):
        list_of_dicts.append(breed_data_driver(link))
finally:
    # always shut the browser down, even if a page raises part-way through
    my_d.quit()

100%|██████████| 280/280 [13:59<00:00,  3.00s/it]


In [121]:
# one row per scraped page; the breed is lower-cased so it can be joined to
# merged_df["breed"]
akc_physical_traits = pd.DataFrame(list_of_dicts).assign(
    breed=lambda traits: traits["breed"].str.lower()
)
# left join: keep every row of merged_df, adding the traits where a page matched
akc_breeds_merged_df = pd.merge(
    merged_df, akc_physical_traits, how="left", on="breed"
)

In [122]:
# Collect both names known for each AKC breed: its own name and, when there is
# one, the name it was matched to in the year table.
# - pd.notna drops a missing match whether it is None or NaN (the previous
#   `is not None` test let NaN through).
# - dict.fromkeys removes duplicates while keeping a stable order, so the saved
#   file does not change from run to run.
akc_breeds_merged_df["alt_names"] = akc_breeds_merged_df.apply(
    lambda row: list(
        dict.fromkeys(
            name for name in (row["breed"], row["matched_breed"]) if pd.notna(name)
        )
    ),
    axis=1,
)
#  save to json
akc_breeds_merged_df.to_json("../data/akc_breeds.json", orient="records")

#### Spitz Breed Group

Get info from this story about the spitz breeds.

In [None]:
spitz_story_url = "https://www.akc.org/expert-advice/dog-breeds/spitz-dog-breeds/"

# read the article and close the HTTP response as soon as the body is read
with urlopen(spitz_story_url) as spitz_response:
    spitz_html_content = spitz_response.read()

In [None]:
spitz_soup = BeautifulSoup(spitz_html_content, "lxml")

# get the breeds listed in the 'tag-set__item-link' class
elements = spitz_soup.find_all("a", class_="tag-set__item-link")
spitz_breeds = [element.text for element in elements]

display(spitz_breeds)

# find these breeds in the akc_breeds_df. The breed column there is lower-case,
# so the link texts are stripped and lower-cased before comparing; without this
# the lookup only matches texts that are already lower-case.
spitz_breeds_normalized = {name.strip().lower() for name in spitz_breeds}
akc_breeds_df[akc_breeds_df["breed"].isin(spitz_breeds_normalized)]

In [75]:
# flag every row of the FCI table as FCI-recognized
fci_breeds_df["fci_recognized"] = True

In [79]:
# flag every row of akc_matches as AKC-recognized
# NOTE(review): akc_matches is not created anywhere above this cell in the part
# of the notebook visible here - confirm it exists on a fresh Run All
akc_matches["akc_recognized"] = True

#### match both AKC and FCI breeds


In [217]:
def find_fci_breed_match(
    input_breed, fci_df, scoring_function=fuzz.token_set_ratio, min_score=85
):
    """Find the match for the breed in the FCI breeds dataframe.

    Args:
        input_breed: breed name to look up.
        fci_df: frame with an "alt_names" column (iterable of names per row)
            and a "breed_en" column.
        scoring_function: callable `(input_breed, alt_name) -> score`.
        min_score: a row only counts as a match when its best score is
            strictly greater than this value (default 85).

    Returns:
        The "breed_en" of the best-scoring row (the earliest row wins ties), or
        NaN when no row scores above `min_score`.
    """
    best_score = min_score
    best_match = np.nan
    for alternative_names, breed_en in zip(fci_df["alt_names"], fci_df["breed_en"]):
        # default=0 keeps a row with no alternative names from raising
        current_score = max(
            (scoring_function(input_breed, alt_name) for alt_name in alternative_names),
            default=0,
        )
        if current_score > best_score:
            best_score = current_score
            best_match = breed_en
        # stop once the score reaches 100: the strict ">" test above can no
        # longer be beaten by the scorers used in this notebook
        if best_score >= 100:
            break

    return best_match


def apply_fuzzy_matching_to_breed_column(
    dataframe, breed_column, fci_df, fuzzy_matching_function
):
    """Match every value of `dataframe[breed_column]` against the FCI breeds.

    Each value is passed to `find_fci_breed_match` together with `fci_df` and
    `fuzzy_matching_function`; the per-row results are returned as a Series
    with the same index as `dataframe`.
    """

    def _match_one(breed):
        return find_fci_breed_match(breed, fci_df, fuzzy_matching_function)

    breed_names = dataframe[breed_column]
    return breed_names.apply(_match_one)

In [218]:
# read in the akc breeds dataframe
new_akc_df = pd.read_json("../data/akc_breeds.json", orient="records")
# read in the fci breeds dataframe
new_fci_df = pd.read_json("../data/fci_breeds.json", orient="records")
# create an empty (all-NaN) column for the fci breed; object dtype because it
# will hold breed names
new_akc_df["fci_breed"] = pd.Series(np.nan, index=new_akc_df.index, dtype="object")
# remove the None values from the list in the alt_names column
new_akc_df["alt_names"] = new_akc_df["alt_names"].apply(
    lambda x: [i for i in x if i is not None]
)


# scorers are tried in this order; a breed keeps the first match it gets
fuzz_funcs = [
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.WRatio,
    fuzz.token_sort_ratio,
    fuzz.token_set_ratio,
]
# Fill the still-missing matches with each scorer in turn. Only the rows without
# a match are scored again: rows matched by an earlier scorer would be left
# untouched anyway, so re-scoring them is wasted work.
for func in fuzz_funcs:
    unmatched = new_akc_df["fci_breed"].isna()
    if unmatched.any():
        new_akc_df.loc[unmatched, "fci_breed"] = apply_fuzzy_matching_to_breed_column(
            new_akc_df.loc[unmatched], "breed", new_fci_df, func
        )
    # running count of matched breeds after this scorer
    print(new_akc_df["fci_breed"].notnull().sum())

# columns of interest: show the breeds that are still unmatched
coi = ["breed", "alt_names", "fci_breed"]
new_akc_df[new_akc_df["fci_breed"].isna()][coi]

207
254
274
274
274


Unnamed: 0,breed,alt_names,fci_breed
42,bluetick coonhound,[bluetick coonhound],
58,redbone coonhound,[redbone coonhound],
63,treeing walker coonhound,[treeing walker coonhound],
209,peruvian inca orchid,[peruvian inca orchid],
269,stabyhoun,[stabyhoun],
276,treeing tennessee brindle,[treeing tennessee brindle],
279,working kelpie,[working kelpie],


In [219]:
# spot-check a slice (positions 100-149) of the matched breeds
new_akc_df[new_akc_df["fci_breed"].notna()][coi].iloc[100:150]

Unnamed: 0,breed,alt_names,fci_breed
103,schipperke,[schipperke],schipperke
104,shiba inu,[shiba inu],shiba
105,tibetan spaniel,[tibetan spaniel],tibetan spaniel
106,tibetan terrier,[tibetan terrier],tibetan terrier
107,xoloitzcuintli,[xoloitzcuintli],xoloitzcuintle
108,american water spaniel,"[american water spaniel, spaniel (american wat...",american water spaniel
109,barbet,[barbet],french water dog
110,boykin spaniel,[boykin spaniel],american cocker spaniel
111,bracco italiano,[bracco italiano],italian pointer
112,brittany,[brittany],brittany spaniel


#### Wikipedia list of breeds of dogs

In [87]:
# Wikipedia page that lists dog breeds
dog_breeds_list_url = "https://en.wikipedia.org/wiki/List_of_dog_breeds"

In [101]:
def get_breeds(driver, link):
    """Function to get the breeds from the wikipedia page.

    Loads `link`, waits up to 10 seconds for the elements with class
    "div-col", and returns the text of every <li> found in all of them except
    the last one.

    The driver is ALWAYS quit before returning (also when loading or waiting
    fails), so the caller must not reuse it afterwards. A missing element or a
    timeout is reported with `print`; the breeds collected so far (possibly an
    empty list) are returned.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from selenium.common.exceptions import TimeoutException

    breeds = []
    try:
        driver.get(link)
        # get all the elements with the dog breeds
        div_cols = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "div-col"))
        )
        # get the breeds in each div_col except the last
        for div_col in div_cols[:-1]:
            breed_elements = div_col.find_elements(By.TAG_NAME, "li")
            breeds.extend(breed_element.text for breed_element in breed_elements)

    except NoSuchElementException as e:
        print("No such element", e)
    except TimeoutException as e:
        # WebDriverWait.until raises this when no "div-col" element appears
        print("Timed out waiting for", link, e)
    finally:
        # release the browser on every path, not only on success
        driver.quit()

    return breeds

In [102]:
# start a fresh browser for the Wikipedia page; get_breeds quits the driver
# itself, so my_d must not be reused after this call
my_d = start_driver()
breed_driver = partial(get_breeds, my_d)

breeds_list = breed_driver(dog_breeds_list_url)

In [107]:
# remove the "[<number>]" footnote markers from the breed names and upper-case them
new_breed_list = [
    re.sub(r"\[\d+\]", "", breed).upper() for breed in breeds_list
]
# show the number of breeds
len(new_breed_list)

544

In this dataset you will find information on dogs and their owners from the holdings of the municipal dog register since 2015. In the case of dog owners, information on the age group, gender and statistical neighbourhood of the place of residence is provided. For each dog, the breed, the breed type, the sex, the year of birth, the age and the color is recorded. The dog register is maintained by the Dog Control Department of the Zurich City Police.

According to the law on the keeping of dogs, the city police are obliged to keep a register of dogs kept in the city of Zurich. Every dog over the age of three months must be registered at the dog control by the owner in person or by means of the registration form.

Household
Number of private households as well as the economic population of the city of Zurich in private households by household size, urban district, statistical urban district and year, since 2013.
A household includes all persons who live together in the same apartment. For this purpose, the term "economic residence" is used for persons. Collective households (homes, hospitals, penal institutions, communal accommodation for asylum seekers, etc.) are not taken into account.

In [165]:
# CSV download links on the City of Zurich open-data portal: dog register,
# population, median income and household size
zurich_dog_data_link = "https://data.stadt-zuerich.ch/dataset/sid_stapo_hundebestand_od1001/download/KUL100OD1001.csv"
zurich_pop_link = "https://data.stadt-zuerich.ch/dataset/bev_bestand_jahr_quartier_alter_herkunft_geschlecht_od3903/download/BEV390OD3903.csv"
zurich_income_link = "https://data.stadt-zuerich.ch/dataset/fd_median_einkommen_quartier_od1003/download/WIR100OD1003.csv"
zurich_household_data_link = "https://data.stadt-zuerich.ch/dataset/bev_hh_haushaltsgroesse_quartier_seit2013_od3806/download/BEV380OD3806.csv"
# read each CSV straight from its URL (requires network access)
zurich_dog_data = pd.read_csv(zurich_dog_data_link)
zurich_pop = pd.read_csv(zurich_pop_link)
zurich_income = pd.read_csv(zurich_income_link)
zurich_household = pd.read_csv(zurich_household_data_link)
# print column names, non-null counts and dtypes of each frame
zurich_dog_data.info()
zurich_pop.info()
zurich_income.info()
zurich_household.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70967 entries, 0 to 70966
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   StichtagDatJahr     70967 non-null  int64 
 1   DatenstandCd        70967 non-null  object
 2   HalterId            70967 non-null  int64 
 3   AlterV10Cd          70967 non-null  int64 
 4   AlterV10Lang        70967 non-null  object
 5   AlterV10Sort        70967 non-null  int64 
 6   SexCd               70967 non-null  int64 
 7   SexLang             70967 non-null  object
 8   SexSort             70967 non-null  int64 
 9   KreisCd             70967 non-null  int64 
 10  KreisLang           70967 non-null  object
 11  KreisSort           70967 non-null  int64 
 12  QuarCd              70967 non-null  int64 
 13  QuarLang            70967 non-null  object
 14  QuarSort            70967 non-null  int64 
 15  Rasse1Text          70967 non-null  object
 16  Rasse2Text          70

In [166]:


# clean the column names of all four frames with sanitize_df_column_names
# NOTE(review): that helper is not defined in the part of the notebook visible
# here; later cells use English snake_case column names, so it presumably
# translates and renames the columns - confirm
zurich_dog_data = sanitize_df_column_names(zurich_dog_data)
zurich_pop = sanitize_df_column_names(zurich_pop)
zurich_income = sanitize_df_column_names(zurich_income)
zurich_household = sanitize_df_column_names(zurich_household)

In [158]:
# distinct tax tariff labels as they appear in the data
tax_tariff_long_de = zurich_income.tax_tariff_long.unique().tolist()
# build a {original label: translated label} mapping with the local
# translate_app helper
tax_tariff_long_transated = translate_list_to_dict(tax_tariff_long_de)
display(tax_tariff_long_transated)
# add the translated label as a new column next to the original one
zurich_income['tax_tariff_long_en'] = zurich_income.tax_tariff_long.map(
    tax_tariff_long_transated)

{'Grundtarif': 'Basic tariff',
 'Verheiratetentarif': 'Married rate',
 'Einelternfamilientarif': 'Single-parent family tariff'}

In [None]:

# .info() prints its summary; of the two describe() calls only the last one
# (object columns) is displayed, because a cell shows just its final expression
zurich_dog_data.info()
zurich_dog_data.describe().T
zurich_dog_data.describe(include="O").T

In [116]:
# normalise the column names to snake_case
zurich_dog_data.columns = [
    convert_to_snake_case(col) for col in zurich_dog_data.columns
]
# (a `.query("deadline_date_year < 2018")` call used to sit here; its result
# was never stored or displayed, so it has been removed)

# unique values of every text (object-dtype) column, keyed by column name;
# a column that is not present yields an empty list
column_choices = defaultdict(list)
for col in zurich_dog_data.columns:
    if zurich_dog_data[col].dtype == "object":
        column_choices[col] = zurich_dog_data[col].unique().tolist()

# {original label: translated label} for the pedigree / mixed-breed categories
breed_mongrel_long = translate_list_to_dict(
    column_choices["breed_mongrel_long"])
breed_mongrel_long

{'Rassehund': 'Pedigree dog',
 'Mischling, beide Rassen bekannt': 'Mixed breed, both breeds known',
 'Mischling, sekundäre Rasse unbekannt': 'Mixed breed, secondary breed unknown',
 'Mischling, beide Rassen unbekannt': 'Mixed breed, both breeds unknown'}

In [None]:


# render geoviews plots with the bokeh backend
gv.extension("bokeh")

# local copy of the statistical-quarters archive (path relative to the notebook)
zurich_zip_path = "../data/zurich_statistical_quarters.zip"

# "zip://<archive>!<member>" path that points at one JSON file inside the archive
zurich_geo_data = (
    "zip://" + zurich_zip_path + "!data/stzh.adm_statistische_quartiere_v.json"
)

# draw the polygons of that file with a hover tool, 500 x 500
gv.Polygons(gpd.read_file(zurich_geo_data)).opts(
    tools=["hover"], height=500, width=500)

In [186]:



# Download a zip archive and load every file of its "data" folder as a
# GeoDataFrame.
def get_gdf_from_zip_url(zip_url: str) -> Optional[dict[str, gpd.GeoDataFrame]]:
    """Load the geo files stored in the "data" folder of a remote zip archive.

    The archive at `zip_url` is downloaded into memory and each file listed in
    its "data" folder is read into a GeoDataFrame that carries the file's CRS.

    Returns:
        A dict mapping each file's stem (name without the final extension) to
        its GeoDataFrame, or None when the folder contains no files.
    """
    with urlopen(zip_url) as response:
        archive_bytes = response.read()

    frames_by_stem = {}
    with ZipMemoryFile(archive_bytes) as archive:
        for member in archive.listdir("data"):
            with archive.open("data/" + member) as collection:
                frame = gpd.GeoDataFrame.from_features(
                    collection, crs=collection.crs
                )
            frames_by_stem[Path(member).stem] = frame

    # an empty result is reported as None rather than an empty dict
    return frames_by_stem or None


# hosted copy of the statistical-quarters archive; load all of its layers
# (the result is None if the archive's "data" folder is empty)
zip_gdf_url = "https://storage.googleapis.com/mrprime_dataset/zurich/zurich_statistical_quarters.zip"
zurich_gdfs = get_gdf_from_zip_url(zip_gdf_url)

In [187]:
# names of the loaded layers (the file stems inside the archive)
zurich_gdfs.keys()

dict_keys(['stzh.adm_statistische_quartiere_b_p', 'stzh.adm_statistische_quartiere_map', 'stzh.adm_statistische_quartiere_v'])

In [190]:



# NOTE(review): rename_keys is not defined in the part of the notebook visible
# here; the next cell reads the key "zurich_gdf_2", so it presumably replaces
# the long file-stem keys with short numbered ones - confirm
z_gdfs = rename_keys(zurich_gdfs)

In [None]:
# display the layer stored under the renamed key "zurich_gdf_2"
z_gdfs["zurich_gdf_2"]