In [1]:

# import the libraries that we will use
import urllib.request
import io
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import json

import translate_app



In [2]:
# URL of the zip archive that is downloaded and unpacked by get_data() below

data_url = 'https://storage.googleapis.com/mrprime_dataset/dogs_of_zurich/dogs_of_zurich.zip'

# create function which takes the url
# retrieve zip and unzip it and return the csv files as a list

def get_data(url):
    """Download a zip archive and load every CSV file it contains.

    The archive is read fully into memory, so no temporary file is left
    behind on disk after the call.

    Parameters
    ----------
    url : str
        Location of the zip archive; anything accepted by
        ``urllib.request.urlopen``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per archive member whose name ends in ``.csv``, in the
        order the members are listed in the archive. Every frame gets an
        extra ``roster`` column holding the member's file name.
    """
    # urlretrieve() without a target path writes a temporary file that is
    # never removed; fetching into a BytesIO buffer avoids that leak.
    with urllib.request.urlopen(url) as response:
        archive_buffer = io.BytesIO(response.read())

    dfs = []
    with zipfile.ZipFile(archive_buffer) as zip_ref:
        for member_name in zip_ref.namelist():
            if not member_name.endswith('.csv'):
                continue
            # Members are decoded as UTF-8 text before being parsed
            csv_file = io.StringIO(zip_ref.read(member_name).decode('utf-8'))
            df = pd.read_csv(csv_file)
            # Remember which file each row came from
            df['roster'] = member_name
            dfs.append(df)

    return dfs


 

In [3]:
# Download the archive and keep the list of DataFrames (one per CSV file) it yields
dogs_of_zurich_dfs = get_data(data_url)


In [4]:
# German -> English header names applied to the dog-owner roster frames
dog_owners_columns = {
    'HALTER_ID': 'owner_id',
    'ALTER': 'age',
    'GESCHLECHT': 'gender',
    'STADTKREIS': 'district',
    'RASSE1': 'breed1',
    'RASSE2': 'breed2',
    'HUNDEFARBE': 'color',
    'GEBURTSJAHR_HUND': 'year_of_birth',
    'GESCHLECHT_HUND': 'dog_gender',
    'RASSENTYP': 'breed_type',
    'RASSE1_MISCHLING': 'breed1_mixed',
    'RASSE2_MISCHLING': 'breed2_mixed',
    'STADTQUARTIER': 'city_quarter',
}

# German -> English header names applied to the breed table
dog_columns = {
    'HUNDERASSE': 'breed',
    'HUNDERASSENTYP_KURZ': 'short_breed_type',
    'HUNDERASSENTYP': 'breed_type',
}

In [5]:
# Positions 0-2 of the list get the owner-roster header mapping, in place
for roster_frame in dogs_of_zurich_dfs[:3]:
    roster_frame.rename(columns=dog_owners_columns, inplace=True)

# Position 3 gets the breed-table header mapping, in place
dogs_of_zurich_dfs[3].rename(columns=dog_columns, inplace=True)

In [6]:
# Stack the frames at list positions 0-2 row-wise into a single owner frame
dog_owner_df = pd.concat(
    [dogs_of_zurich_dfs[position] for position in range(3)],
    axis=0,
)
# The frame at position 3 is kept on its own
dog_df = dogs_of_zurich_dfs[3]
# dog_df[dog_df['breed_type']=='Rassentypenliste II']

In [7]:
# dog_owner_df['owner_id'].value_counts()

In [8]:
dog_owner_df['district'] = dog_owner_df['district'].astype('category')

# Turn the roster file names into a categorical and relabel its categories
dog_owner_df['roster'] = (
    dog_owner_df['roster']
    .astype('category')
    .cat.rename_categories([2015, 2016, 2017])
)
# dog_owner_df[['roster','owner_id']].groupby('roster').nunique()

# Distinct owner ids found under each roster label
owner_2015, owner_2016, owner_2017 = (
    set(dog_owner_df.loc[dog_owner_df['roster'] == year, 'owner_id'])
    for year in (2015, 2016, 2017)
)



# make the category ordered so that 'min' yields the owner's first appearance
dog_owner_df['roster'] = dog_owner_df['roster'].cat.as_ordered()

# Smallest roster label per owner, broadcast back onto every row of that owner
dog_owner_df['first_appearance'] = dog_owner_df.groupby('owner_id')['roster'].transform('min')
# Number of rows an owner has within a single roster
dog_owner_df['dog_count'] = dog_owner_df.groupby(['owner_id','roster'])['owner_id'].transform('size')
# dog_owner_df[dog_owner_df['owner_id']==88250]

In [9]:
def age_group(age):
    """Collapse the youngest and oldest owner age brackets into wider groups.

    '11-20' and '21-30' become '11-30'; '71-80', '81-90' and '91-100'
    become '71+'. Every other value is returned unchanged.
    """
    oldest_brackets = ('71-80', '81-90', '91-100')
    youngest_brackets = ('11-20', '21-30')

    if age in oldest_brackets:
        return '71+'
    if age in youngest_brackets:
        return '11-30'
    return age

# Derive the widened age bracket for every row
dog_owner_df['age_group'] = dog_owner_df['age'].map(age_group)

In [10]:
# Rows whose owner age bracket is missing
dog_owner_df[dog_owner_df['age'].isna()]

Unnamed: 0,owner_id,age,gender,district,city_quarter,breed1,breed1_mixed,breed2,breed2_mixed,breed_type,year_of_birth,dog_gender,color,roster,first_appearance,dog_count,age_group
1348,85988,,m,2.0,2.0,Zwergpudel,,,,K,2001,m,beige/weiss,2015,2015,1,
2556,90344,,m,,,Shih Tzu,,,,K,1998,w,schwarz,2015,2015,1,
1273,85988,,m,2.0,2.0,Zwergpudel,,,,K,2001,m,beige/weiss,2016,2015,1,
2407,90344,,m,,,Shih Tzu,,,,K,1998,w,schwarz,2016,2015,1,
2234,90344,,m,,,Shih Tzu,,,,K,1998,w,schwarz,2017,2015,1,


In [11]:
# Owner rows that carry no breed type
dog_owner_df[dog_owner_df['breed_type'].isna()]

Unnamed: 0,owner_id,age,gender,district,city_quarter,breed1,breed1_mixed,breed2,breed2_mixed,breed_type,year_of_birth,dog_gender,color,roster,first_appearance,dog_count,age_group
60,20483,61-70,m,12.0,121.0,Terrier,Mischling,,,,2002,m,black/tan,2015,2015,1,61-70
78,23889,61-70,w,4.0,42.0,Pudel,,,,,2012,m,braun,2015,2015,1,61-70
210,80272,41-50,m,9.0,91.0,Terrier,Mischling,,,,2007,m,black/tan,2015,2015,1,41-50
233,80434,61-70,w,7.0,71.0,Pudel,Mischling,,,,2006,m,schwarz,2015,2015,1,61-70
248,80530,81-90,w,2.0,21.0,Pudel,,,,,2005,w,schwarz,2015,2015,1,71+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6785,127555,41-50,w,3.0,34.0,Pudel,,,,,2009,w,apricot,2015,2015,1,41-50
6811,127698,41-50,m,9.0,91.0,Pudel,,,,,2009,w,beige,2015,2015,1,41-50
6833,127766,51-60,w,8.0,82.0,Pudel,,,,,2009,m,goldbraun,2015,2015,1,51-60
6867,127902,41-50,w,2.0,24.0,Pudel,,,,,2015,m,black/tan,2015,2015,1,41-50


In [12]:
# Breed-table rows that carry no breed type
dog_df[dog_df['breed_type'].isna()]
# dog_df.info()

Unnamed: 0,breed,short_breed_type,breed_type,roster
55,Biewer Yorkshire Terrier,,,zuordnungstabellehunderassehundetyp.csv
72,Broholmer,,,zuordnungstabellehunderassehundetyp.csv
98,Corgie,,,zuordnungstabellehunderassehundetyp.csv
104,Daisy-Dog,,,zuordnungstabellehunderassehundetyp.csv
112,Deutscher Jagdterrier,,,zuordnungstabellehunderassehundetyp.csv
124,Dogo Canario,,,zuordnungstabellehunderassehundetyp.csv
125,Dürbächler,,,zuordnungstabellehunderassehundetyp.csv
126,Elo,,,zuordnungstabellehunderassehundetyp.csv
130,Englischer,,,zuordnungstabellehunderassehundetyp.csv
221,Miniature Australien Shepard,,,zuordnungstabellehunderassehundetyp.csv


In [13]:
# Write both frames to ../data as CSV files; index=False leaves the row index out
dog_owner_df.to_csv('../data/dog_owner_df.csv', index=False)
dog_df.to_csv('../data/dog_df.csv', index=False)

In [15]:

# Seed dictionary mapping German breed names to their English names
german_dog_breeds = {
    "Schäferhund": "German Shepherd",
    "Dackel": "Dachshund",
    "Rottweiler": "Rottweiler",

}


In [16]:
def translate_breed(breed, german_dogs_dict=None):
    """Return the English name for a German breed name, caching new results.

    Parameters
    ----------
    breed : str
        Breed name to translate.
    german_dogs_dict : dict, optional
        Cache mapping German names to English names. It is used for both
        the lookup and the storing of new translations. When omitted, the
        module-level ``german_dog_breeds`` dict is used, resolved at call
        time so a rebound or reloaded dict is picked up.

    Returns
    -------
    The cached value when ``breed`` is already a key of the cache,
    otherwise the result of ``translate_app.translate_text`` (which is
    also stored in the cache under ``breed``).
    """
    # Resolve the default here rather than in the signature: a default bound
    # at definition time keeps pointing at the old dict after the global name
    # is reassigned, so reads and writes would hit different objects.
    if german_dogs_dict is None:
        german_dogs_dict = german_dog_breeds

    # Known breed: answer from the cache without calling the translator
    if breed in german_dogs_dict:
        return german_dogs_dict[breed]

    # Unknown breed: translate it and remember the answer in the same dict
    translation = translate_app.translate_text(text=breed, project_id='mrprimetranslator')
    german_dogs_dict[breed] = translation
    return translation


In [27]:
# Read the translations saved by an earlier run (JSON text is read as UTF-8)
with open('../data/german_dog_breeds.json', 'r', encoding='utf-8') as f:
    this_dict = json.load(f)

# Merge the saved translations in place. Reassigning the name
# (``d = d | other``) would create a new dict and leave every reference
# captured earlier (for example a default argument bound when a function was
# defined) pointing at the old object. ``update`` also runs on Python < 3.9.
german_dog_breeds.update(this_dict)

In [28]:


# Smoke-test translate_breed with the three seed entries
print(translate_breed("Schäferhund"))  # Outputs: German Shepherd
print(translate_breed("Dackel"))  # Outputs: Dachshund
print(translate_breed("Rottweiler"))  # Outputs: Rottweiler


print(translate_breed('Afghanischer Windhund'))  # Outputs: Afghan Hound (per the recorded run)


German Shepherd
Dachshund
Rottweiler
Afghan Hound


In [30]:
# Display the current German -> English breed mapping
german_dog_breeds

{'Schäferhund': 'German Shepherd',
 'Dackel': 'Dachshund',
 'Rottweiler': 'Rottweiler',
 'Afghanischer Windhund': 'Afghan Hound',
 'Affenpinscher': 'Affenpinscher',
 'Afghane': 'Afghan',
 'Airedale Terrier': 'Airedale Terriers',
 'Akita Inu': 'Akita Inu',
 'Alano': 'Alano',
 'Alaskan Malamute': 'Alaskan malamute',
 'Alpenländische Dachsbracke': 'Alpine Dachsbracke',
 'Altdeutscher Hütehund': 'Old German herding dog',
 'Altdeutscher Schäfer': 'Old German Shepherd',
 'American Akita': 'American Akita',
 'American Bulldog': 'American Bulldog',
 'American Cocker Spaniel': 'American cocker spaniel',
 'American Pit Bull Terrier': 'American Pit Bull Terrier',
 'American Pitbull Terrier': 'American Pit Bull Terrier',
 'American Staffordshire Terrier': 'American Staffordshire Terrier',
 'Anatolian Kangal': 'Anatolian Kangal',
 'Anatolischer Hirtenhund': 'Anatolian Shepherd Dog',
 'Appenzeller': 'Appenzeller',
 'Appenzeller Sennenhund': 'Appenzell Mountain dog',
 'Australian Cattle Dog': 'Austra

In [25]:
# Add an English breed name next to every German one
dog_df['breed_en'] = dog_df['breed'].map(translate_breed)

# (disabled) save the dictionary as a json file
# with open ('../data/german_dog_breeds.json', 'w') as f:
    # json.dump(german_dog_breeds, f)

In [43]:
# dog_df[dog_df['breed'].str.contains('hund')]
# dog_df['breed'].tolist()

# Write the Python list representation of all breed names to a text file
with open('../data/german_dog_breeds.txt', 'w', encoding='utf-8') as f:
    f.write(repr(dog_df['breed'].tolist()))