In [1]:
# import some of the libaries that we will use
import urllib.request
import io
import itertools as it
import os
import zipfile
import pandas as pd
import numpy as np

import json

import translate_app

In [2]:
# get the zip file with the data from the link

data_url = (
    "https://storage.googleapis.com/mrprime_dataset/dogs_of_zurich/dogs_of_zurich.zip"
)

# create function which takes the url
# retrieve zip and unzip it and return the csv files as a list


def get_data(url):
    """Function which takes in a url, retrieves the zip file,
    unzips it and returns the csv files as a list"""
    # get the zip file
    filename, headers = urllib.request.urlretrieve(url)
    with zipfile.ZipFile(filename) as zip_ref:
        # get the csv files
        dfs = []
        for file in zip_ref.namelist():
            if file.endswith(".csv"):
                csv_file = io.StringIO(zip_ref.read(file).decode("utf-8"))
                # readin csv as a pandas dataframe and append to list
                df = pd.DataFrame()
                df = pd.read_csv(csv_file)
                df["roster"] = file
                dfs.append(df)

    return dfs

In [15]:
# call the function and assign the csv files to a variable
dogs_of_zurich_dfs = get_data(data_url)

In [4]:
list_of_headings1 = dogs_of_zurich_dfs[0].columns.tolist()
list_of_headings2 = dogs_of_zurich_dfs[3].columns.to_list()
list_of_headings = list_of_headings1 + list_of_headings2

words_set = {word.replace("_", " ") for word in list_of_headings}
# words_set


{'HUNDEFARBE': 'DOG COLOR',
 'RASSENTYP': 'BREED TYPE',
 'HUNDERASSENTYP': 'DOG BREED TYPE',
 'GESCHLECHT': 'GENDER',
 'GEBURTSJAHR HUND': "DOG'S YEAR OF BIRTH",
 'roster': 'roster',
 'HUNDERASSENTYP KURZ': 'DOG BREED TYPE SHORT',
 'GESCHLECHT HUND': 'GENDER DOG',
 'RASSE2 MISCHLING': 'BREED2 MIXED BREED',
 'HUNDERASSE': 'DOG BREED',
 'RASSE1 MISCHLING': 'BREED1 MIXED BREED',
 'HALTER ID': 'HOLDER ID',
 'RASSE2': 'BREED2',
 'STADTKREIS': 'CITY CIRCLE',
 'RASSE1': 'BREED1',
 'ALTER': 'AGE',
 'STADTQUARTIER': 'CITY QUARTER'}

In [None]:

# translate using the translate app
translated_headings = translate_app.translate_list(words_set)
translated_headings

In [5]:
# put the underscores back in the original headings
translated_headings_underscores = {
    key.replace(" ", "_"): value.lower().replace(" ", "_").replace("'", "")
    for key, value in translated_headings.items()
}
translated_headings_underscores

{'HUNDEFARBE': 'dog_color',
 'RASSENTYP': 'breed_type',
 'HUNDERASSENTYP': 'dog_breed_type',
 'GESCHLECHT': 'gender',
 'GEBURTSJAHR_HUND': 'dogs_year_of_birth',
 'roster': 'roster',
 'HUNDERASSENTYP_KURZ': 'dog_breed_type_short',
 'GESCHLECHT_HUND': 'gender_dog',
 'RASSE2_MISCHLING': 'breed2_mixed_breed',
 'HUNDERASSE': 'dog_breed',
 'RASSE1_MISCHLING': 'breed1_mixed_breed',
 'HALTER_ID': 'holder_id',
 'RASSE2': 'breed2',
 'STADTKREIS': 'city_circle',
 'RASSE1': 'breed1',
 'ALTER': 'age',
 'STADTQUARTIER': 'city_quarter'}

In [27]:
# This is actually translated to districts as in the 12 districts of Zurich
translated_headings_underscores["STADTKREIS"] = "district"

In [16]:
# put the translated headings as the new column names
for df in dogs_of_zurich_dfs:
    df.rename(columns=translated_headings_underscores, inplace=True)

In [5]:
# dog_owners_columns = {
#     "HALTER_ID": "owner_id",
#     "ALTER": "age",
#     "GESCHLECHT": "gender",
#     "STADTKREIS": "district",
#     "RASSE1": "breed1",
#     "RASSE2": "breed2",
#     "HUNDEFARBE": "color",
#     "GEBURTSJAHR_HUND": "year_of_birth",
#     "GESCHLECHT_HUND": "dog_gender",
#     "RASSENTYP": "breed_type",
#     "RASSE1_MISCHLING": "breed1_mixed",
#     "RASSE2_MISCHLING": "breed2_mixed",
#     "STADTQUARTIER": "city_quarter",
# }

# dog_columns = {
#     "HUNDERASSE": "breed",
#     "HUNDERASSENTYP_KURZ": "short_breed_type",
#     "HUNDERASSENTYP": "breed_type",
# }

In [None]:
# for i in range(3):
#     dogs_of_zurich_dfs[i].rename(columns=dog_owners_columns, inplace=True)

# dogs_of_zurich_dfs[3].rename(columns=dog_columns, inplace=True)

In [17]:
# Combine 3 of 4 dataframes into one with dog owner info
dog_owner_df = pd.DataFrame()
dog_owner_df = pd.concat(
    [dogs_of_zurich_dfs[0], dogs_of_zurich_dfs[1], dogs_of_zurich_dfs[2]], axis=0
)

# name last dataframe with dog breeds info
dog_df = pd.DataFrame()
dog_df = dogs_of_zurich_dfs[3]

In [18]:
display(dog_owner_df.sample(3))
dog_df.sample(3)

Unnamed: 0,holder_id,age,gender,city_circle,city_quarter,breed1,breed1_mixed_breed,breed2,breed2_mixed_breed,breed_type,dogs_year_of_birth,gender_dog,dog_color,roster
5165,123248,31-40,w,10.0,101.0,Labrador Retriever,,,,I,2001,m,blondfarben,20170308hundehalter.csv
4680,119881,31-40,w,12.0,123.0,Labrador Retriever,,,,I,2013,m,braun,20170308hundehalter.csv
4288,115664,41-50,m,4.0,44.0,Pinscher,Mischling,,,K,2012,w,braun/schwarz,20170308hundehalter.csv


Unnamed: 0,dog_breed,dog_breed_type_short,dog_breed_type,roster
289,Schipperke,I,Rassentypenliste I,zuordnungstabellehunderassehundetyp.csv
78,Bullterrier,II,Rassentypenliste II,zuordnungstabellehunderassehundetyp.csv
113,Deutscher Kurzhaar,I,Rassentypenliste I,zuordnungstabellehunderassehundetyp.csv


In [22]:
dog_owner_df["roster"]

0       20151001hundehalter.csv
1       20151001hundehalter.csv
2       20151001hundehalter.csv
3       20151001hundehalter.csv
4       20151001hundehalter.csv
                 ...           
7150    20170308hundehalter.csv
7151    20170308hundehalter.csv
7152    20170308hundehalter.csv
7153    20170308hundehalter.csv
7154    20170308hundehalter.csv
Name: roster, Length: 21065, dtype: object

In [None]:
# only null values in breed2_mixed so drop column
dog_owner_df = dog_owner_df.drop(columns=["breed2_mixed_breed"])


In [26]:
dog_owner_df["roster"] = dog_owner_df["roster"].str[:4]

dog_owner_df["city_circle"] = dog_owner_df["city_circle"].astype("category")

# make category ordered to get the first appearance of the owner
dog_owner_df["roster"] = pd.Categorical(dog_owner_df["roster"], ordered=True)


owner_2015 = set(dog_owner_df[dog_owner_df["roster"] == "2015"]["holder_id"])
owner_2016 = set(dog_owner_df[dog_owner_df["roster"] == "2016"]["holder_id"])
owner_2017 = set(dog_owner_df[dog_owner_df["roster"] == "2017"]["holder_id"])


# dog_owner_df["roster"] = dog_owner_df["roster"].cat.as_ordered()

dog_owner_df["first_appearance"] = dog_owner_df.groupby("holder_id")[
    "roster"
].transform("min")
dog_owner_df["dog_count"] = dog_owner_df.groupby(["holder_id", "roster"])[
    "holder_id"
].transform("size")
# dog_owner_df[dog_owner_df['holder_id']==88250]
dog_owner_df.sample(3)

Unnamed: 0,holder_id,age,gender,city_circle,city_quarter,breed1,breed1_mixed_breed,breed2,breed_type,dogs_year_of_birth,gender_dog,dog_color,roster,first_appearance,dog_count
1107,85067,61-70,m,2.0,24.0,Cairn Terrier,,,K,2003,m,beige,2016,2015,1
2778,93168,51-60,m,5.0,52.0,Schnauzer,,,I,2005,m,schwarz,2015,2015,1
1369,86088,41-50,w,4.0,44.0,Bergamasker,Mischling,,I,2001,m,grau/schwarz,2015,2015,1


In [57]:
def age_group(age):
    """Function which widen the age groups of the oldest and youngest dog owners"""
    if age == "71-80" or age == "81-90" or age == "91-100":
        return "71+"
    elif age == "11-20" or age == "21-30":
        return "11-30"
    else:
        return age


dog_owner_df["age_group"] = dog_owner_df["age"].apply(age_group)

In [None]:
# dog_owner_df[dog_owner_df.age.isnull()]

In [None]:
# dog_owner_df[dog_owner_df["breed_type"].isnull()]

In [None]:
# dog_df[dog_df["breed_type"].isnull()]
# dog_df.info()

In [59]:
# Create a dictionary of German dog breeds
german_dog_breeds = {
    "Schäferhund": "German Shepherd",
    "Dackel": "Dachshund",
    "Rottweiler": "Rottweiler",
}

In [61]:
breed_list1 = dog_owner_df[["breed1", "breed2"]].stack().dropna().unique().tolist()
breed_list2 = dog_df["dog_breed"].unique().tolist()
breed_set = set(breed_list1 + breed_list2)
breed_set

{'Affenpinscher',
 'Afghane',
 'Afghanischer Windhund',
 'Airedale Terrier',
 'Akita Inu',
 'Alano',
 'Alaskan Malamute',
 'Alpenländische Dachsbracke',
 'Altdeutscher Hütehund',
 'Altdeutscher Schäfer',
 'American Akita',
 'American Bulldog',
 'American Cocker Spaniel',
 'American Pit Bull Terrier',
 'American Pitbull Terrier',
 'American Staffordshire Terrier',
 'Anatolian Kangal',
 'Anatolischer Hirtenhund',
 'Appenzeller',
 'Appenzeller Sennenhund',
 'Australian Cattle Dog',
 'Australian Shepherd',
 'Australian Silky Terrier',
 'Australian Terrier',
 'Australian working Kelpie',
 'Australien Kelpie',
 'Azawakh',
 'Barbet',
 'Bardino',
 'Barzoi',
 'Basenji',
 'Basset',
 'Basset Fauve de Bretagne',
 'Basset Griffon vendéen',
 'Basset Hound',
 'Bayerischer Gebirgsschweisshund',
 'Beagle',
 'Bearded Collie',
 'Beauceron',
 'Beauceron Berge de Beauce',
 'Bedlington Terrier',
 'Belgischer Schäfer',
 'Bergamasker',
 'Berger Picard',
 'Berger blanc Suisse',
 'Berger de Beauce',
 'Berger de

In [None]:
# breed_list = list(dog_df["breed"])

breed_translations_dict = translate_app.translate_list(breed_set)

In [None]:
# async def translate_breed(breed, german_dogs_dict=german_dog_breeds):
#     # If the breed is in the dictionary, return the English name
#     if breed in german_dog_breeds:
#         return german_dog_breeds[breed]

#     # Otherwise, use the translator to translate the breed name
#     else:
#         translation = await translate_app.translate_text_async(
#             text=breed, project_id="mrprimetranslator"
#         )
#         german_dogs_dict[breed] = translation

#         return translation

In [None]:
# Get the translation form the breed_translations_dict

dog_owner_df["breed1_en"] = dog_owner_df["breed1"].map(breed_translations_dict)
dog_owner_df["breed2_en"] = dog_owner_df["breed2"].map(breed_translations_dict)

dog_df["breed_en"] = dog_df["breed"].map(breed_translations_dict)


# save the dictionary as a json file
# with open ('../data/german_dog_breeds.json', 'w') as f:
# json.dump(german_dog_breeds, f)
dog_df.isna().sum()

In [None]:
# dog_df[dog_df['breed'].str.contains('hund')]
# dog_df['breed'].tolist()

with open("../data/german_dog_breeds.txt", "w", encoding="utf-8") as f:
    f.write(str(dog_df["breed"].tolist()))

In [None]:
dog_owner_df.reset_index()

In [None]:
dog_owner_df.to_csv("../data/dog_owner_df.csv", index=False)
dog_df.to_csv("../data/dog_df.csv", index=False)

In [None]:
dog_owner_breed = (
    dog_owner_df[["breed1", "breed1_mixed", "breed2", "breed2_mixed"]]
    .fillna("")
    .apply(lambda x: x.str.lower())
).reset_index(drop=True)


dog_breed = dog_df[["breed", "breed_en", "breed_type", "short_breed_type"]].apply(
    lambda x: x.str.lower()
)
merged_breed = pd.DataFrame()

merged_breed = dog_owner_breed.merge(
    dog_breed[["breed", "breed_en", "breed_type"]],
    left_on="breed1",
    right_on="breed",
    how="left",
).reset_index()

filler = (
    merged_breed[["index", "breed1"]]
    .loc[merged_breed["breed"].isna()]
    .merge(
        dog_breed,
        left_on="breed1",
        right_on="breed_en",
        how="left",
    )
)

breeds_en = pd.concat(
    [
        merged_breed.dropna(),
        filler,
    ]
).fillna("")
# len(breeds_en)
# breeds_en.set_index("index", inplace=True)
merged_breed
filler
breeds_en

In [None]:
# dog_df.drop_duplicates(inplace=True)
# breeds_dict = dict(zip(dog_df["breed"], dog_df["breed_en"]))

# # find the key in the dictionary and return the value
# dog_owner_df["breed1_en"] = dog_owner_df["breed1"].apply(
#     lambda x: breeds_dict.get(x, "")
# )
# # select the rows where breed1_en is ''
# mask = dog_owner_df["breed1_en"] == ""

# # fill in the missing values with breed1 values
# dog_owner_df.loc[mask, "breed1_en"] = dog_owner_df.loc[mask, "breed1"]

dog_owner_df

In [None]:
def convert_to_lowercase(data):
    # Convert text to lowercase in columns: 'breed1', 'breed1_mixed' and 3 other columns
    df = data.copy()
    df["breed1"] = df["breed1"].str.lower()
    df["breed1_mixed"] = df["breed1_mixed"].str.lower()
    df["breed2"] = df["breed2"].str.lower()
    df["breed_type"] = df["breed_type"].str.lower()
    df["breed1_en"] = df["breed1_en"].str.lower()

    return df


def one_hot_encode(data):
    # One-hot encode columns: 'gender', 'dog_gender', 'breed_type'
    df = data.copy()
    df = pd.get_dummies(df, columns=["gender", "dog_gender", "breed_type"])
    return df


dog_owner_df = convert_to_lowercase(dog_owner_df)
# one_hot_encode(dog_owner_df)

In [None]:
dog_owner_df[dog_owner_df["breed1_mixed"].notna()]

# mixed_dogs_owners = dog_owner_df[
#     dog_owner_df["breed1_en"].str.contains("mixed")
# ]

# dog_owner_df["breed1_mixed"].value_counts()

dog_owner_df["mixed_breed"] = (dog_owner_df["breed1_en"].str.contains("mixed")) | (
    dog_owner_df["breed1_mixed"].notna()
)

dog_owner_df["mixed_twice"] = (dog_owner_df["breed1_en"].str.contains("mixed")) & (
    dog_owner_df["breed1_mixed"].notna()
)

# dog_owner_df[dog_owner_df["breed2"].notna()]["breed1_mixed"]
# dog_owner_df[dog_owner_df["breed1_en"].notna()]
mixed_twice_df = dog_owner_df[dog_owner_df["mixed_twice"]]


In [None]:
color_df = dog_owner_df["color"].str.lower().str.split("/", expand=True)
color_list = []

color_list = color_df.stack().dropna().tolist()
color_list = set(sorted(color_list))


len(color_list)

In [None]:
color_dict = {}

translations = await translate_app.translate_text_parallel(
    color_list, "mrprimetranslator"
)
for text, translation in zip(color_list, translations):
    print(f"{text} -> {translation}")
    color_dict[text] = translation