In [13]:
# import some of the libraries that we will use
import urllib.request
import io
import itertools as it
import os
import zipfile
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re
import unicodedata
import string

from thefuzz import process
from thefuzz import fuzz

import json

import colorcet as cc
import panel as pn
import panel.widgets as pnw

from translate_app import translate_list_to_dict
import recordlinkage as rl
import missingno as msno
import holoviews as hv
import hvplot
import hvplot.pandas

In [14]:
# get the zip file with the data from the link

data_url = (
    "https://storage.googleapis.com/mrprime_dataset/dogs_of_zurich/dogs_of_zurich.zip"
)

# create function which takes the url
# retrieve zip and unzip it and return the csv files as a list


def get_data(url):
    """Download a zip archive and load every CSV file inside it.

    Parameters
    ----------
    url : str
        Location of the zip archive (anything ``urllib.request.urlopen``
        accepts).

    Returns
    -------
    list of pandas.DataFrame
        One frame per ``.csv`` member, in archive order. Each frame gets an
        extra ``roster`` column holding the member's file name.
    """
    # Read the archive into memory instead of going through ``urlretrieve``,
    # which leaves a temporary file on disk that nothing ever removes.
    with urllib.request.urlopen(url) as response:
        archive_bytes = io.BytesIO(response.read())

    dfs = []
    with zipfile.ZipFile(archive_bytes) as zip_ref:
        for file in zip_ref.namelist():
            if not file.endswith(".csv"):
                continue
            csv_file = io.StringIO(zip_ref.read(file).decode("utf-8"))
            # read the csv as a pandas dataframe and remember its source file
            df = pd.read_csv(csv_file)
            df["roster"] = file
            dfs.append(df)

    return dfs

In [15]:
# call the function and assign the csv files to a variable
dogs_of_zurich_dfs = get_data(data_url)

In [16]:
# Put all the column headers in one list
list_of_headings = []
for df in dogs_of_zurich_dfs:
    list_of_headings.extend(df.columns)

# The third column of the last table also holds German labels to translate.
# ``dropna`` removes missing values reliably; an identity check against
# ``np.nan`` only filters a missing value that is that exact object.
more_german_words = dogs_of_zurich_dfs[3].iloc[:, 2].dropna().unique().tolist()
list_of_headings += more_german_words

In [17]:
# Only keep unique column headers and replace underscores with spaces

words_set = {word.replace("_", " ") for word in list_of_headings}
words_set

{'ALTER',
 'GEBURTSJAHR HUND',
 'GESCHLECHT',
 'GESCHLECHT HUND',
 'HALTER ID',
 'HUNDEFARBE',
 'HUNDERASSE',
 'HUNDERASSENTYP',
 'HUNDERASSENTYP KURZ',
 'Kleinwüchsig',
 'RASSE1',
 'RASSE1 MISCHLING',
 'RASSE2',
 'RASSE2 MISCHLING',
 'RASSENTYP',
 'Rassentypenliste I',
 'Rassentypenliste II',
 'STADTKREIS',
 'STADTQUARTIER',
 'roster'}

#### Run translate app for columns

In [18]:
translated_words = translate_list_to_dict(words_set, project_id="mrprimetranslator")
translated_words

PermissionDenied: 403 Cloud Translation API has not been used in project mrprime-349614 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/translate.googleapis.com/overview?project=mrprime-349614 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. [links {
  description: "Google developers console API activation"
  url: "https://console.developers.google.com/apis/api/translate.googleapis.com/overview?project=mrprime-349614"
}
, reason: "SERVICE_DISABLED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "translate.googleapis.com"
}
metadata {
  key: "consumer"
  value: "projects/mrprime-349614"
}
]

In [None]:
# translate headers using the translate app
# translated_words = translate_app.translate_list(
#     words_set, project_id="mrprimetranslator"
# )
# translated_words

In [None]:
# put the underscores back in the original headings
# Keys: the German heading with spaces turned back into underscores.
# Values: the translation lower-cased, spaces -> underscores, any "'s" removed.
# NOTE(review): relies on `translated_words` from the translation cell above,
# whose recorded run ended in a PermissionDenied error - confirm it is defined.
translated_headings_underscores = {
    key.replace(" ", "_"): value.lower().replace(" ", "_").replace("'s", "")
    for key, value in translated_words.items()
}
translated_headings_underscores

In [None]:
# This is actually translated to districts as in the 12 districts of Zurich
translated_headings_underscores["STADTKREIS"] = "district"

In [None]:
# put the translated headings as the new column names
for df in dogs_of_zurich_dfs:
    df.rename(columns=translated_headings_underscores, inplace=True)

#### Get the 2 dataframes

In [None]:
# The first three tables are stacked row-wise into the dog owner frame
dog_owner_df = pd.concat(dogs_of_zurich_dfs[:3], axis=0)
dog_owner_df.info()

# The fourth table is the dog breed frame
dog_df = dogs_of_zurich_dfs[3]
dog_df.info()

In [None]:
display(dog_owner_df.sample(3))
dog_df.sample(3)

In [None]:
msno.matrix(dog_owner_df)

In [None]:
# only null values in breed2_mixed so drop column
dog_owner_df = dog_owner_df.drop(columns=["breed2_mixed_breed"])

In [None]:
msno.matrix(dog_owner_df.sort_values(by=["breed2"]))

In [None]:
# reset index (the concatenated frames carry repeated row labels)
dog_owner_df = dog_owner_df.reset_index(drop=True)
# make the district column a category
dog_owner_df["district"] = dog_owner_df["district"].astype("category")

# keep only the first 4 characters of roster and make it an ordered category column
# NOTE(review): presumably the file names start with the roster year - later cells
# compare this column to "2015"/"2016"/"2017"; confirm against the archive.
dog_owner_df["roster"] = dog_owner_df["roster"].str[:4]
dog_owner_df["roster"] = pd.Categorical(dog_owner_df["roster"], ordered=True)


# add a column with the first year the owner appeared in the roster
dog_owner_df["first_appearance"] = dog_owner_df.groupby("holder_id")[
    "roster"
].transform("min")

# add a column for the number of distinct rosters the owner appears in
dog_owner_df["appearances"] = dog_owner_df.groupby("holder_id")["roster"].transform(
    "nunique"
)

# add a column with the number of dogs per owner within each roster
dog_owner_df["dog_count"] = dog_owner_df.groupby(["holder_id", "roster"])[
    "holder_id"
].transform("size")

dog_owner_df.sample(3)

Dogs with a year of birth after the roster year

In [None]:
# get the holder_id of the bad entries and observe other entries with the same holder_id
bad_entry_holder_id = dog_owner_df[
    dog_owner_df["dog_year_of_birth"] > dog_owner_df["roster"].astype(int)
]["holder_id"]


dog_owner_df[dog_owner_df["holder_id"].isin(bad_entry_holder_id)].sort_values(
    by="holder_id"
)

# dog_owner_df[dog_owner_df["holder_id"].isin(bad_entry_holder_id)].index

Since we have rosters for 3 separate years, we can check whether an owner corrected a wrong entry in a later year. Here the bad entries are consistent across the years and give no clue as to the correct value, so we drop them.

In [None]:
display(dog_owner_df.shape)

bad_entry_index = dog_owner_df[
    dog_owner_df["holder_id"].isin(bad_entry_holder_id)
].index

dog_owner_df.drop(bad_entry_index, inplace=True)

dog_owner_df.info()

Dogs with a year of birth too far before the roster year (before 1990) which is plausible, but not probable

In [None]:
# get the holder_id of the bad entries
bad_entry_holder_id = dog_owner_df[dog_owner_df["dog_year_of_birth"] < 1990][
    "holder_id"
]

# isolate entries from these holder_ids and group them by holder_id
dog_owner_group = (
    dog_owner_df[dog_owner_df["holder_id"].isin(bad_entry_holder_id)]
    .sort_values(by="holder_id")
    .groupby("holder_id")
)

dog_owner_df[dog_owner_df["holder_id"].isin(bad_entry_holder_id)].sort_values(
    by="holder_id"
)

We can replace the bad entries with the matching entries from the later roster years, as the owners corrected the value in the later rosters. Luckily, these owners only have one dog each.

The one bad entry of `1980` with only 1 appearance we cannot safely replace so we drop it.

In [None]:
# these entries only have 1 dog so we can replace the year of birth with the mode making some assumptions
# The right-hand side is computed per holder_id on `dog_owner_group` (built in the
# previous cell from the same holder_ids); `x.mode().iloc[0]` takes the first of the
# most frequent years. The assignment aligns the result to the selected rows by index.
dog_owner_df.loc[
    dog_owner_df["holder_id"].isin(bad_entry_holder_id), "dog_year_of_birth"
] = dog_owner_group["dog_year_of_birth"].transform(lambda x: x.mode().iloc[0])

# show the affected owners again to check the replacement
dog_owner_df[dog_owner_df["holder_id"].isin(bad_entry_holder_id)].sort_values(
    by="holder_id"
)

In [None]:
dog_owner_df.city_quarter

In [None]:
# the one with only 1 appearance we cannot safely replace so we drop it
dog_owner_df = dog_owner_df.drop(
    dog_owner_df[dog_owner_df["holder_id"] == 129251].index
)

# No more 20/30something years-old dogs
dog_owner_df[dog_owner_df["dog_year_of_birth"] < 1990]["holder_id"]

Add 1 to the dog age so that no dog has an age of 0. Consider it the dog's year of living.

In [None]:
# dog's age is calculated by subtracting the year of birth from the year of the roster
# added 1 in case i wanted to do something with log down the line
dog_owner_df["dog_age"] = (
    dog_owner_df["roster"].astype(int) - dog_owner_df["dog_year_of_birth"] + 1
)
dog_owner_df.sample(3)

In [None]:
dog_owner_df["dog_age"].describe()

In [None]:
display(dog_owner_df[dog_owner_df.age.isnull()])


# Drop these 5 rows with unknown
dog_owner_df.dropna(subset=["age"], inplace=True)
dog_owner_df.shape

In [None]:
# owners each year
owner_2015 = set(dog_owner_df[dog_owner_df["roster"] == "2015"]["holder_id"])
owner_2016 = set(dog_owner_df[dog_owner_df["roster"] == "2016"]["holder_id"])
owner_2017 = set(dog_owner_df[dog_owner_df["roster"] == "2017"]["holder_id"])

print(f"{len(owner_2015)} initial owners in 2015")
# 2016: owners not seen in 2015 vs. owners already present in 2015
new_2016 = owner_2016.difference(owner_2015)
returning_2016 = owner_2016.intersection(owner_2015)
print(f"{len(new_2016)} new owners in 2016 and {len(returning_2016)} returning owners")

# 2017: "returning" means seen in 2015 or 2016 (or both)
new_2017 = owner_2017.difference(owner_2015.union(owner_2016))
returning_2017 = owner_2017.intersection(owner_2015.union(owner_2016))
print(f"{len(new_2017)} new owners in 2017 and {len(returning_2017)} returning owners")

# constant owners: present in all 3 rosters
constistent_owners = owner_2015.intersection(owner_2016).intersection(owner_2017)
print(f"{len(constistent_owners)} constant owners")

In [None]:
dog_owner_df["age"] = pd.Categorical(
    dog_owner_df["age"],
    ordered=True,
    categories=[
        "11-20",
        "21-30",
        "31-40",
        "41-50",
        "51-60",
        "61-70",
        "71-80",
        "81-90",
        "91-100",
    ],
)


# dog_owner_df.plot(
#     kind="bar",
#     column="age",
#     by="roster",
#     bins=9,
#     figsize=(10, 5),
#     histtype="step",
# )
dog_owner_df.groupby(
    [
        "age",
        "roster",
    ]
).size().unstack().hvplot.bar(
    xlabel="",
    rot=90,
    legend=True,
    tools=["hover", "box_select"],
    title="Owners age distribution each roster",
)

In [None]:
def age_group(age):
    """Collapse the youngest and oldest owner age brackets into wider groups.

    "11-20" and "21-30" become "11-30"; "71-80", "81-90" and "91-100" become
    "71+". Any other value is returned unchanged.
    """
    oldest_brackets = ("71-80", "81-90", "91-100")
    youngest_brackets = ("11-20", "21-30")

    if age in oldest_brackets:
        return "71+"
    if age in youngest_brackets:
        return "11-30"
    return age


dog_owner_df["age_group"] = dog_owner_df["age"].apply(age_group).dropna()
dog_owner_df

In [None]:
dog_owner_df["age_range"] = dog_owner_df["age"].str[:1] + "0s"

In [None]:
# rows where no breed_type was recorded
missing_breed_type = dog_owner_df["breed_type"].isnull()
print("Number of null values in column 'breed_type': ", missing_breed_type.sum())

# the distinct breed1 values of those rows
breed_missing_breed_type = dog_owner_df.loc[missing_breed_type, "breed1"].unique()

breed_missing_breed_type

In [None]:
dog_owner_df.groupby(["breed_type", "roster"]).size().unstack().hvplot.bar(
    xlabel="",
    rot=90,
    legend=True,
    tools=["hover", "box_select"],
    title="Breed type distribution each roster",
)

In [None]:
pd.set_option("display.max_rows", 100)
bully_breed = dog_owner_df[dog_owner_df.breed_type == "II"]["breed1"].unique().tolist()
dog_owner_df[dog_owner_df.breed1.isin(bully_breed)]
dog_owner_df[dog_owner_df.breed_type == "II"].sort_values(
    by="dog_count", ascending=False
)["holder_id"].nunique()

# bully_pattern = re.compile(r"[P|p]it\s?[B|b]ull|Staffordshire")

# dog_owner_df[
#     dog_owner_df["breed1"].str.contains(bully_pattern, na=False)
#     | dog_owner_df["breed2"].str.contains(bully_pattern, na=False)
# ]

In [None]:
# Find these breeds in the dog_df and get the breed_type from there
# errors="ignore" keeps the cell re-runnable: without it a second run raises
# KeyError because "roster" has already been dropped in place.
dog_df.drop(columns="roster", inplace=True, errors="ignore")
dog_df[dog_df["dog_breed"].isin(breed_missing_breed_type)]

In [None]:
# see what other dogs of these breeds have as breed_type
has_missing_type_breed = dog_owner_df["breed1"].isin(breed_missing_breed_type)
dog_breed_group = dog_owner_df[has_missing_type_breed].groupby("breed1")

# most breeds have a unanimous breed_type so we can just fillna with the mode
display(dog_breed_group["breed_type"].value_counts())


def _first_mode(values):
    """Return the most frequent non-null value, or NaN when there is none."""
    modes = values.mode()
    # mode() is empty for an all-null group; indexing it would raise IndexError
    return modes.iloc[0] if not modes.empty else np.nan


# Fill in the missing breed_type with the mode of the breed1.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column may act on a temporary copy and leave the frame unchanged.
breed_type_mode = dog_owner_df.groupby("breed1")["breed_type"].transform(_first_mode)
dog_owner_df["breed_type"] = dog_owner_df["breed_type"].fillna(breed_type_mode)

dog_owner_df.info()

In [None]:
missing_district_holder_id = dog_owner_df[dog_owner_df["district"].isna()][
    "holder_id"
].unique()

dog_owner_df[dog_owner_df["holder_id"].isin(missing_district_holder_id)]

# drop these missing rows with no district info
dog_owner_df.dropna(subset=["district"], inplace=True)

dog_owner_df.info()

In [None]:
# Convert the string columns to lower case
breed_columns = ["breed1", "breed2", "breed1_mixed_breed", "breed_type"]
# for col in breed_columns:
#     dog_owner_df[col] = dog_owner_df[col].str.lower()

dog_owner_df.sample(3)

In [None]:
dog_owner_df["breed1_mixed_breed"].unique()
dog_owner_df["breed1_mixed_breed"].nunique()
dog_owner_df[breed_columns].describe()
# dog_df["dog_breed"].unique()
dog_owner_df["breed1"].dropna().unique().tolist()

In [None]:
# get all of the dog breeds to translate and put in a list
# dropna() is used throughout: an `is not np.nan` identity check only removes a
# missing value that is that exact object, and any NaN left in the set would
# break the later `breed.lower()` calls.
breed_list1 = dog_owner_df[["breed1", "breed2"]].stack().dropna().unique().tolist()
breed_list2 = dog_df["dog_breed"].dropna().unique().tolist()
breed_list3 = dog_owner_df["breed1_mixed_breed"].dropna().unique().tolist()
breed_set = set(breed_list1 + breed_list2 + breed_list3)

# Sorted so the row order is the same on every run; the iteration order of a
# set of strings changes between interpreter sessions.
unmatched_breeds_df = pd.DataFrame({"breed": sorted(breed_set)})

In [None]:
# breed_list1 = ["Pitbull", "German Shepherd", "Golden Retriever"]
# breed_list2 = ["Labrador Retriever", "Poodle", "Pit Bull"]

# Inside a character class "|" is a literal, so "[P|p]" also matched "|it bull".
# "[Pp]" is the intended either-case match.
bully_pattern = re.compile(r"[Pp]it\s?[Bb]ull")
russel_pattern = re.compile(r"(?:[Pp]arson|[Jj]ack|[Rr]ussell?)+")

known_breeds = sorted(set(breed_list1 + breed_list2))

# Only the last expression of a cell is rendered, so the first list is shown
# explicitly. `search` is the predicate; `findall` built throwaway lists.
display([breed for breed in known_breeds if bully_pattern.search(breed)])
[breed for breed in known_breeds if russel_pattern.search(breed)]

#### Get the AKC breeds saved to disk

In [None]:
# get the dog_breeds dataframe from the data folder
akc_breeds_df = pd.read_csv("../data/akc_dog_breeds.csv")
akc_breeds_df = akc_breeds_df.rename(columns={"breed": "akc_breed"})
# akc_dog_breeds.sample(3)

fci_breeds_df = pd.read_csv("../data/fci_breeds_trans.csv")
# fci_dog_breeds

# breed_choices = (
#     akc_breeds_df["akc_breed"].tolist() + fci_breeds_df["breed_en"].tolist()
# )
# breed_choices = list(set(breed_choices))
# breed_choices_df = pd.DataFrame(breed_choices, columns=["breed"])
# # breed_choices_df

In [None]:
pd.set_option("display.max_rows", 200)


def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is NFKD-normalised and then encoded to ASCII with unencodable
    characters ignored, which drops the accents along with any other
    non-ASCII character.
    """
    decomposed = unicodedata.normalize("NFKD", input_str)
    ascii_bytes = decomposed.encode("ASCII", "ignore")
    return ascii_bytes.decode()


remove_accents("résuméö")

In [None]:
# Punctuation to strip from the translations. "," and "-" are left out of the
# set; the split on "," below relies on the commas surviving.
punc = string.punctuation.replace(",", "").replace("-", "")  # + "’"
# Escape before building the character class: unescaped, the "\]" inside
# string.punctuation reads as an escaped "]", so the backslash itself was
# never part of the class.
punc_pattern = f"[{re.escape(punc)}]"

pd.options.display.max_colwidth = 100

# "<breed>, <translations without punctuation>" as one comma-separated string
cleaned_translations = (
    fci_breeds_df["translations"].str.replace(punc_pattern, "", regex=True).str.strip()
)
fci_breeds_df["alt_names"] = fci_breeds_df["breed"] + ", " + cleaned_translations

# ... then lower-cased and split into a list of stripped names per row
fci_breeds_df["alt_names"] = (
    fci_breeds_df["alt_names"]
    .str.lower()
    .str.split(",")
    .apply(lambda names: [name.strip() for name in names])
)


fci_breeds_df

In [None]:
breed_set = {breed.lower() for breed in breed_set}
breed_to_trans = {breed: [] for breed in breed_set}

# NOTE(review): no visible cell creates the "no_accent" column - presumably it
# comes from the CSV or an earlier session; confirm before a fresh run.
# `index` is the row position in fci_breeds_df, not its index label.
for index, row in enumerate(fci_breeds_df["no_accent"]):
    for item in set(row).intersection(breed_set):
        breed_to_trans[item].append(index)

# Build the two columns explicitly. DataFrame.from_dict(orient="index") yields
# one column per list position, so assigning exactly two column names failed
# whenever a breed had several hits (or no breed had any).
matches_df = pd.DataFrame(
    {
        "breed": list(breed_to_trans),
        "index_match": [
            hits[0] if hits else np.nan for hits in breed_to_trans.values()
        ],
    }
)
matches_df[matches_df["index_match"].notnull()].sort_values(by="index_match")
# breed_to_trans
# fci_breeds_df
# punc


def find_standard_breed_index(breed, remove_accents=True):
    """Return the index label of the first FCI row that lists ``breed``.

    Parameters
    ----------
    breed : str
        Breed name to look up; compared after ``casefold``.
    remove_accents : bool, default True
        Fold ``breed`` to accent-free ASCII before the lookup.

    Returns
    -------
    The index label of the first row of ``fci_breeds_df`` whose ``no_accent``
    entry contains the name, or ``np.nan`` when nothing matches or ``breed``
    is not a string (e.g. a missing value).
    """
    if not isinstance(breed, str):
        return np.nan
    if remove_accents:
        # The flag shadows the module-level ``remove_accents`` helper, so the
        # folding is spelled out here. NFKD alone only decomposes characters;
        # the combining marks must also be dropped for accents to disappear.
        breed = (
            unicodedata.normalize("NFKD", breed).encode("ASCII", "ignore").decode()
        )
    needle = breed.casefold()  # computed once rather than once per row
    is_match = fci_breeds_df["no_accent"].apply(lambda names: needle in names)
    index = fci_breeds_df[is_match].index
    if index.empty:
        return np.nan
    return index[0]


find_standard_breed_index("german shepherd dog")
dog_df["standard_index"] = dog_df["dog_breed"].apply(find_standard_breed_index)
dog_df[dog_df["standard_index"].notnull()].shape

In [None]:
# English FCI breed name -> its list of alternative names
standard_dict = dict(zip(fci_breeds_df["breed_en"], fci_breeds_df["alt_names"]))


# every alternative name of every breed, flattened into one list
all_fci_names = [name for names in fci_breeds_df["alt_names"] for name in names]
# all_fci_names
# standard_dict

# print(row[1]["breed_en"])
# print(row[1]["no_accent"])

In [None]:
def match_breed_name(name, choices, scorer=fuzz.token_sort_ratio):
    """Return the entry of ``choices`` closest to ``name`` and its score.

    Parameters
    ----------
    name : str
        The name to look up.
    choices : collection of str
        Candidate names handed to ``process.extractOne``.
    scorer : callable, default ``fuzz.token_sort_ratio``
        Similarity function passed through to ``process.extractOne``.

    Returns
    -------
    tuple
        ``(match, score)``; ``(None, 0)`` when ``process.extractOne`` finds no
        candidate (it returns ``None``, e.g. for empty ``choices``).
    """
    best = process.extractOne(name, choices, scorer=scorer)
    if best is None:
        return None, 0
    # extractOne may return extra elements (a key for mapping choices)
    mismo, score, *_ = best
    return mismo, score


# For every breed, store the closest FCI alternative name and its similarity
# score; the (match, score) pairs are unzipped into the two new columns.
unmatched_breeds_df["closest_match"], unmatched_breeds_df["score"] = zip(
    *unmatched_breeds_df["breed"].apply(
        lambda x: match_breed_name(x, all_fci_names, scorer=fuzz.token_set_ratio)
    )
)

# Only for scores above 80: look up the first standard_dict key whose
# alternative names contain the closest match. Other rows stay NaN.
unmatched_breeds_df["standard"] = unmatched_breeds_df[
    unmatched_breeds_df["score"] > 80
]["closest_match"].apply(
    lambda x: [key for key, value in standard_dict.items() if x in value][0]
)
# NOTE: this rebinds `matches_df`, replacing the frame built in an earlier cell.
matches_df = unmatched_breeds_df[unmatched_breeds_df["standard"].notnull()]
matches_df
# only this last expression is rendered: the breeds still without a standard name
unmatched_breeds_df[unmatched_breeds_df["standard"].isnull()].reset_index()

In [None]:


match_list = []

for breed in breed_set:
    # Look the breed up by its accent-free form. This value was previously
    # computed and then ignored, so accented names were compared unchanged.
    no_accent_breed = remove_accents(breed)

    matches = fci_breeds_df[
        fci_breeds_df["no_accent"].apply(lambda breeds: no_accent_breed in breeds)
    ]
    if len(matches):
        # keep the first matching FCI row
        fci_index = matches.index[0]
        fci_translate = matches.iloc[0].translations
    else:
        fci_index = None
        fci_translate = None

    match_list.append(
        {
            "from_breed_set": breed,
            "fci_index": fci_index,
            "fci_translate": fci_translate,
        }
    )

match_df = pd.DataFrame(match_list)
# breeds without any FCI match
match_df[match_df["fci_index"].isna()]

In [None]:
pn.state.kill_all_servers()

In [None]:
def filter_df(breed):
    """Return the FCI rows whose ``translations`` text matches ``breed``.

    Parameters
    ----------
    breed : str
        Search text, treated as a case-insensitive regular expression. Text
        that is not a valid pattern (e.g. a lone "(" while typing) is matched
        literally instead of raising.

    Returns
    -------
    pandas.DataFrame
        The ``breed`` and ``translations`` columns of the matching rows of
        ``fci_breeds_df``. Rows with missing translations never match a
        non-empty search.
    """
    # normalise the query the same way as the column it is compared against
    query = unicodedata.normalize("NFKD", breed)
    try:
        re.compile(query)
    except re.error:
        query = re.escape(query)

    translations = (
        fci_breeds_df["translations"]
        .fillna("")
        .map(lambda text: unicodedata.normalize("NFKD", text))
    )
    is_match = translations.str.contains(query, case=False, regex=True)
    return fci_breeds_df.loc[is_match, ["breed", "translations"]]


breed_filter = pnw.TextInput(placeholder="Enter breed here")

filtered_view = pn.Column(
    pn.Column(breed_filter),
    pn.panel(pn.bind(filter_df, breed=breed_filter)),
)
filtered_view.show()

In [None]:
def search_akc_breed(breed):
    """Return the rows of ``akc_breeds_df`` whose ``akc_breed`` contains ``breed``.

    The comparison ignores case.
    """
    breed_names = akc_breeds_df["akc_breed"]
    is_match = breed_names.str.contains(breed, case=False)
    return akc_breeds_df[is_match]


search_akc_breed("swiss")

In [None]:
# matched_breeds = {}
# matched_breeds = {
#     breed: breed.lower()
#     for breed in breed_set
#     if breed.lower() in akc_dog_breeds["akc_breed"].tolist()
# }
# print(f"{len(matched_breeds)} breed entries found in AKC list.")

# sorted(breed_set)
# unmatched_breeds = sorted(breed_set.difference(matched_breeds))
# print(f"{len(unmatched_breeds)} breed entries not yet found.")

#### Run translate app for breeds

In [None]:
# translate the dog breeds
# The module `translate_app` is never imported as a name, so
# `translate_app.translate_list(...)` raised NameError. The notebook imports
# `translate_list_to_dict` from it and calls it with `project_id`, as here.
# NOTE(review): no cell above this one assigns `unmatched_breeds` - confirm
# where it is meant to come from on a fresh kernel.
unmatched_translations = translate_list_to_dict(
    unmatched_breeds, project_id="mrprimetranslator"
)
# unmatched_translations

In [None]:
# get the items in unmatched_translations which are in unmatched_breeds
# unmatched_breeds = get_updated_unmatched_breeds(matches)
unmatched_dict = {
    breed: unmatched_translations.get(breed) for breed in unmatched_breeds
}

Manual inputs:

Manually change some of the breeds which may not have been translated correctly or at all

In [None]:
unmatched_translations["västgötaspets"] = "swedish vallhund"
unmatched_translations["jack russel terrier"] = "parson russell terrier"
unmatched_translations["berger blanc suisse"] = "white swiss shepherd dog"
unmatched_translations["trüffelhund"] = "lagotto romagnolo"
unmatched_translations["Polski Owczarek Nizinny"] = "polish lowland sheepdog"
unmatched_translations["Do-Khyi"] = "tibetan mastiff"

# unmatched_translations["zwergspitz"] = "pomeranian"

In [None]:
pd.set_option("display.max_rows", 400)

In [None]:
# NOTE(review): `get_translated_unmatched_df`, `apply_match_breed_name`,
# `get_updated_unmatched_breeds` and `breed_choices` are not defined in any
# visible cell (`breed_choices` only appears commented out), and `matches_dict`
# is first assigned in a later cell - confirm these exist on a fresh kernel.
unmatched_df = pd.DataFrame()
unmatched_df = get_translated_unmatched_df(unmatched_dict)
fuzzy_matches_df = apply_match_breed_name(
    unmatched_df, "breed_en", breed_choices, scorer=fuzz.token_sort_ratio
)

# keep only confident matches (score above 90) as German name -> closest match
matches = fuzzy_matches_df[fuzzy_matches_df["score"] > 90][
    ["breed_de", "closest_match"]
]
update_matches = dict(zip(matches["breed_de"], matches["closest_match"]))

# merge the new matches into the running dictionary
matches_dict |= update_matches

unmatched_breeds = get_updated_unmatched_breeds(matches_dict)
len(unmatched_breeds)

In [None]:
# Second pass: match the still-unmatched German names directly against the AKC names
unmatched_df = pd.DataFrame()
unmatched_df["breed_de"] = unmatched_breeds

unmatched_df = apply_match_breed_name(
    unmatched_df, "breed_de", akc_breeds_df["akc_breed"].tolist()
)
unmatched_df.set_index("breed_de", inplace=True)

# NOTE(review): `matches` is the DataFrame built in the previous cell, so this is
# DataFrame.update with a dict argument; possibly `matches_dict.update(...)` was
# intended - confirm.
matches.update(unmatched_df[unmatched_df["score"] > 90]["closest_match"].to_dict())

In [None]:
len(matches)

In [None]:
# NOTE(review): the "*_en" columns read here are only created by the next cell
# (the `.map(unmatched_translations)` assignments) - on a fresh kernel that
# cell has to run first.
# A dog counts as mixed when it has a mixed-breed entry, a second breed, or a
# main breed whose English name contains "mixed". na=False makes rows without
# a translation count as "no match" instead of putting NaN into the mask.
is_mixed = (
    dog_owner_df["breed1_mixed_breed_en"].notna()
    | dog_owner_df["breed2_en"].notna()
    | dog_owner_df["breed1_en"].str.contains(r"mixed.*", regex=True, na=False)
)
# Assign real boolean columns. Filling a partially-set column in place can act
# on a copy, and "~" on an object-dtype column of Python bools yields -2 / -1
# rather than False / True.
dog_owner_df["mixed_breed"] = is_mixed
dog_owner_df["pure_breed"] = ~is_mixed
only_child_dogs = dog_owner_df[dog_owner_df["dog_count"] == 1]

In [None]:
# Get the translation from the breed_translations_dict and add it to the dataframes

dog_owner_df["breed1_en"] = dog_owner_df["breed1"].map(unmatched_translations)
dog_owner_df["breed2_en"] = dog_owner_df["breed2"].map(unmatched_translations)
dog_owner_df["breed1_mixed_breed_en"] = dog_owner_df["breed1_mixed_breed"].map(
    unmatched_translations
)

dog_df["dog_breed"] = dog_df["dog_breed"].str.lower()
dog_df["breed_en"] = dog_df["dog_breed"].map(unmatched_translations)
dog_df["breed_en"] = dog_df["breed_en"].str.lower()
dog_df["dog_breed_type_en"] = dog_df["dog_breed_type"].map(translated_words)
# dog_df.drop("roster", axis=1, inplace=True)
dog_df.sample(3)

In [None]:
def match_breed(breed, choices, scorer=fuzz.token_set_ratio, limit=2):
    """Find the entry of ``choices`` that best matches ``breed``.

    Delegates to ``process.extractOne`` with the given ``scorer`` and returns
    its result unchanged. ``limit`` is accepted but not used.
    """
    best_match = process.extractOne(breed, choices, scorer=scorer)
    return best_match


akc_dog_breed_list = akc_breeds_df["akc_breed"].to_list()


# match the dog breeds in the dog_owner_df to the akc_dog_breeds
# find possible matches for the dog breeds in the unmatched_breeds list
# and put them in a dictionary with the dog breed as the key and the possible matches as the value
matches_dict = {
    breed: match_breed(breed, akc_dog_breed_list, scorer=fuzz.partial_ratio)
    for breed in unmatched_breeds
}

matches_dict

In [None]:
breed_translations_df[breed_translations_df["breed_group"].isnull()].head(50)
breed_translations_df

In [None]:
def filter_dog_breeds(breed):
    """Return the rows of ``akc_breeds_df`` whose breed name contains ``breed``.

    The name column is ``breeds`` when the frame has one, otherwise
    ``akc_breed``: the loading cell renames the CSV's ``breed`` column to
    ``akc_breed``, so a hard-coded ``breeds`` lookup raises KeyError unless the
    CSV also carries such a column. The match is case-sensitive; rows with a
    missing name never match.
    """
    name_column = "breeds" if "breeds" in akc_breeds_df.columns else "akc_breed"
    is_match = akc_breeds_df[name_column].str.contains(breed, na=False)
    return akc_breeds_df[is_match]


dog_breed_filter = pnw.TextInput(placeholder="Enter breed here")

filtered_dog_breeds = pn.Row(
    pn.Column(dog_breed_filter),
    pn.panel(pn.bind(filter_dog_breeds, breed=dog_breed_filter)),
)
# filtered_dog_breeds.show()

In [None]:
breed_translations_df[
    breed_translations_df["breed_de"] == breed_translations_df["breed_en"]
].tail(50)

# breed_translation_df['breed_count'] =
breed_translations_df["breed_en_count"] = breed_translations_df.groupby("breed_en")[
    "breed_en"
].transform("count")

breed_translations_df[:50]

In [None]:
# number of rows sharing the same English main breed
dog_owner_df["breed1_en_count"] = dog_owner_df.groupby("breed1_en")[
    "breed1_en"
].transform("count")

# Find all the breeds with "mixed" in the English name
pattern = re.compile(r"mixed.*")

# na=False: rows without a translation are treated as non-matching; a mask
# containing NaN cannot be used for boolean indexing.
dog_owner_df.loc[dog_owner_df["breed1_en"].str.contains(pattern, regex=True, na=False)]

In [None]:
# Give the columns their final, reader-friendly names.
# NOTE(review): an earlier cell already creates a boolean "mixed_breed" column, so
# renaming "breed1_mixed_breed_en" to "mixed_breed" leaves the frame with two
# columns of that name - confirm which one downstream code expects.
dog_owner_df.rename(
    columns={
        "holder_id": "owner_id",
        "age_range": "owner_age",
        "gender": "owner_gender",
        "breed1_en": "main_breed",
        "breed1_en_count": "main_breed_count",
        "breed2_en": "second_breed",
        "breed1_mixed_breed_en": "mixed_breed",
    },
    inplace=True,
)
# dog_owner_df["city"] = "Zurich"

dog_owner_df.info(verbose=True)

In [None]:
# NOTE(review): these compare breed_type to lower-case "i" / "ii" / "k", but the
# lower-casing loop for the breed columns is commented out and an earlier cell
# filters on "II" - confirm the casing. Only the last expression of a cell is
# rendered, so the two counts below are computed and discarded.
dog_owner_df[dog_owner_df["breed_type"] == "i"]["main_breed"].count()
dog_owner_df[dog_owner_df["breed_type"] == "ii"]["main_breed"].count()
dog_owner_df[dog_owner_df["breed_type"] == "k"]["main_breed"].unique()

# hand-written list of Swiss breed names, lower-cased
swiss_breeds = list(
    map(
        lambda x: x.lower(),
        [
            "The Greater Swiss",
            "Bernese Mountain Dog",
            "Appenzeller Mountain Dog",
            "Entlebucher",
            "Bernese",
            "Bruno Jura",
            "Saint Hubert Jura",
            "Lucerne Hound",
            "Schwyz",
            "White Swiss Shepherd",
            "St. Bernard",
        ],
    )
)

# every individual word that occurs in any of the Swiss breed names
swiss_keywords = set(it.chain.from_iterable(breed.split() for breed in swiss_breeds))


# words too generic to identify a Swiss breed on their own
common_words = {"the", "hound", "dog", "white"}

words_to_look_for = swiss_keywords.difference(common_words)
words_to_look_for
# dog_owner_df.loc[dog_owner_df['main_breed'].contains(swiss_breeds), 'breed_type'] = 'swiss'
# dog_owner_df.loc[dog_owner_df['main_breed'].contains(swiss_breeds), 'breed_type'] = 'swiss'

In [None]:
# get the rows which contain one of the words_to_look_for in the main_breed column

# Escape each keyword so regex metacharacters are matched literally: the "."
# of "st." otherwise stands for any character. sorted() keeps the pattern text
# identical between runs.
swiss_pattern = r"\b(?:{})\b".format(
    "|".join(re.escape(word) for word in sorted(words_to_look_for))
)


# na=False: rows with a missing main_breed do not match, instead of leaving
# NaN in the boolean mask.
swiss_dogs = dog_owner_df[
    dog_owner_df["main_breed"].str.contains(swiss_pattern, regex=True, na=False)
]
swiss_dogs[swiss_dogs["main_breed"] == "swiss low running dog"]
# dog_owner_df[dog_owner_df["main_breed"].isin(swiss_breeds)]["main_breed"].value_counts()
# dog_owner_df["main_breed"].value_counts().head(50)

In [None]:
dog_owner_df.to_csv("../data/dog_owner_df.csv", index=False)
dog_df.to_csv("../data/dog_df.csv", index=False)

In [None]:
# Flag rows whose main_breed contains, as a whole word, one of the entries of
# swiss_breeds. Missing breeds (NaN) are flagged False instead of crashing on
# `.split()`.
# NOTE(review): the breed is split into single words, so the multi-word entries
# of swiss_breeds ("bernese mountain dog", ...) can never match here; only the
# one-word entries can. If word-level matching was intended, compare against
# `words_to_look_for` instead - confirm.
dog_owner_df["swiss_breed"] = dog_owner_df["main_breed"].apply(
    lambda x: isinstance(x, str) and not set(x.split()).isdisjoint(swiss_breeds)
)

dog_owner_df["swiss_breed"].value_counts()

In [None]:
{
    breed: unmatched_translations[breed]
    for breed in sorted(list(unmatched_translations.keys()))
}
dog_df

#### Color

In [None]:
# Distinct colours with "/" replaced by a space; missing values are dropped so
# no NaN is sent to the translator.
color_set = set(dog_owner_df["dog_color"].dropna().str.replace("/", " ").to_list())
# The module `translate_app` is never imported as a name, so
# `translate_app.translate_list(...)` raised NameError. The notebook imports
# `translate_list_to_dict` from it and calls it with `project_id`, as here.
color_translated_dict = translate_list_to_dict(
    color_set, project_id="mrprimetranslator"
)

In [None]:
# Map every original colour string to its translation. Rebuilding the original
# spelling from the translated keys with key.replace(" ", "/") is lossy: a
# colour that contained a space to begin with would turn into a "/" spelling
# that does not occur in the data and so never receive a translation.
color_translated_underscores = {}
for color in dog_owner_df["dog_color"].dropna().unique():
    lookup_key = color.replace("/", " ")
    if lookup_key in color_translated_dict:
        color_translated_underscores[color] = color_translated_dict[lookup_key]

dog_owner_df["dog_color_en"] = dog_owner_df["dog_color"].map(
    color_translated_underscores
)
# dog_owner_df

In [None]:
cc.b_glasbey_category10[:3]
roster_colors = {
    "2015": cc.b_glasbey_category10[0],
    "2016": cc.b_glasbey_category10[1],
    "2017": cc.b_glasbey_category10[2],
}


def filter_df(breed):
    """Return the rows of ``dog_owner_df`` whose English main breed contains ``breed``.

    The breed column is ``breed1_en`` when the frame still has it, otherwise
    ``main_breed`` (the name an earlier cell renames it to); a hard-coded
    ``breed1_en`` lookup raises KeyError after that rename. Rows with a
    missing breed never match. Note that this rebinds the name ``filter_df``,
    which an earlier cell also defines.
    """
    if "breed1_en" in dog_owner_df.columns:
        breed_column = "breed1_en"
    else:
        breed_column = "main_breed"
    is_match = dog_owner_df[breed_column].str.contains(breed, na=False)
    return dog_owner_df[is_match]


breed_filter = pnw.TextInput(placeholder="Enter breed here")

filtered_view = pn.Row(
    pn.Column(breed_filter),
    pn.panel(pn.bind(filter_df, breed=breed_filter)),
)
# filtered_view.show()

In [None]:
dog_owner_df.hvplot.hist(
    y="dog_age",
    by="roster",
    color=hv.dim("roster").categorize(roster_colors),
    alpha=0.6,
    muted_alpha=0.05,
    legend="top_right",
    title="Dog age distribution each roster",
)

#### checkpoint