### This notebook's purpose is to clean up AKC dataset to make sure that the breeds are compatible with those of the image labels in the other dataset. 

### This information will be used for our final deliverable of giving breed information and traits after our models have made a prediction

# Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
# read in akc breed traits data
akc_breeds = pd.read_csv('data/akc_breeds.csv')

In [3]:
akc_breeds.head()

Unnamed: 0.1,Unnamed: 0,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,...,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,...,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,...,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [4]:
akc_breeds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   277 non-null    object 
 1   description                  277 non-null    object 
 2   temperament                  276 non-null    object 
 3   popularity                   198 non-null    object 
 4   min_height                   277 non-null    float64
 5   max_height                   277 non-null    float64
 6   min_weight                   275 non-null    float64
 7   max_weight                   275 non-null    float64
 8   min_expectancy               274 non-null    float64
 9   max_expectancy               274 non-null    float64
 10  group                        277 non-null    object 
 11  grooming_frequency_value     270 non-null    float64
 12  grooming_frequency_category  270 non-null    object 
 13  shedding_value      

In [5]:
# drop description (might end up leaving in...)
akc_breeds.drop('description', axis=1, inplace=True)

In [6]:
# change first column to breed
akc_breeds.rename(columns={'Unnamed: 0': 'Breed'}, inplace=True)

In [7]:
akc_breeds.head()

Unnamed: 0,Breed,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,group,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,Toy Group,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,Hound Group,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,Terrier Group,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,Working Group,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,Working Group,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [8]:
akc_breeds.shape

(277, 20)

In [9]:
# change breed names so that they are comparable to the breed names in our image dataset
akc_breeds['Breed'] = akc_breeds['Breed'].map(lambda x: x.lower().replace(' ', '_'))

In [10]:
akc_breeds.head()

Unnamed: 0,Breed,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,group,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,affenpinscher,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,Toy Group,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,afghan_hound,"Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,Hound Group,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,airedale_terrier,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,Terrier Group,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,akita,"Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,Working Group,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,alaskan_malamute,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,Working Group,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [11]:
# read in breed info from images
breed_labels = pd.read_csv('data/dog-breed-identification/labels.csv')

In [12]:
breed_list = breed_labels['breed'].unique()

In [13]:
akc_list = akc_breeds['Breed'].unique()

In [14]:
breed_list

array(['boston_bull', 'dingo', 'pekinese', 'bluetick', 'golden_retriever',
       'bedlington_terrier', 'borzoi', 'basenji', 'scottish_deerhound',
       'shetland_sheepdog', 'walker_hound', 'maltese_dog',
       'norfolk_terrier', 'african_hunting_dog',
       'wire-haired_fox_terrier', 'redbone', 'lakeland_terrier', 'boxer',
       'doberman', 'otterhound', 'standard_schnauzer',
       'irish_water_spaniel', 'black-and-tan_coonhound', 'cairn',
       'affenpinscher', 'labrador_retriever', 'ibizan_hound',
       'english_setter', 'weimaraner', 'giant_schnauzer', 'groenendael',
       'dhole', 'toy_poodle', 'border_terrier', 'tibetan_terrier',
       'norwegian_elkhound', 'shih-tzu', 'irish_terrier', 'kuvasz',
       'german_shepherd', 'greater_swiss_mountain_dog', 'basset',
       'australian_terrier', 'schipperke', 'rhodesian_ridgeback',
       'irish_setter', 'appenzeller', 'bloodhound', 'samoyed',
       'miniature_schnauzer', 'brittany_spaniel', 'kelpie', 'papillon',
       'borde

In [15]:
akc_list

array(['affenpinscher', 'afghan_hound', 'airedale_terrier', 'akita',
       'alaskan_malamute', 'american_bulldog',
       'american_english_coonhound', 'american_eskimo_dog',
       'american_foxhound', 'american_hairless_terrier',
       'american_leopard_hound', 'american_staffordshire_terrier',
       'american_water_spaniel', 'anatolian_shepherd_dog',
       'appenzeller_sennenhund', 'australian_cattle_dog',
       'australian_kelpie', 'australian_shepherd',
       'australian_stumpy_tail_cattle_dog', 'australian_terrier',
       'azawakh', 'barbet', 'basenji', 'basset_fauve_de_bretagne',
       'basset_hound', 'bavarian_mountain_scent_hound', 'beagle',
       'bearded_collie', 'beauceron', 'bedlington_terrier',
       'belgian_laekenois', 'belgian_malinois', 'belgian_sheepdog',
       'belgian_tervuren', 'bergamasco_sheepdog', 'berger_picard',
       'bernese_mountain_dog', 'bichon_frise', 'biewer_terrier',
       'black_and_tan_coonhound', 'black_russian_terrier', 'bloodhound',


In [16]:
# find all of the breeds that do not match up between the two lists
print([x for x in breed_list if x not in akc_list])

['boston_bull', 'dingo', 'pekinese', 'bluetick', 'walker_hound', 'maltese_dog', 'african_hunting_dog', 'wire-haired_fox_terrier', 'redbone', 'doberman', 'black-and-tan_coonhound', 'cairn', 'groenendael', 'dhole', 'toy_poodle', 'shih-tzu', 'german_shepherd', 'basset', 'appenzeller', 'brittany_spaniel', 'kelpie', 'entlebucher', 'malamute', 'malinois', 'airedale', 'leonberg', 'mexican_hairless', 'bull_mastiff', 'lhasa', 'cardigan', 'clumber', 'scotch_terrier', 'eskimo_dog', 'brabancon_griffon', 'toy_terrier', 'chow', 'soft-coated_wheaten_terrier', 'staffordshire_bullterrier', 'dandie_dinmont', 'standard_poodle', 'japanese_spaniel', 'miniature_poodle', 'pembroke', 'blenheim_spaniel', 'german_short-haired_pointer', 'english_springer']


In [17]:
# create a dictionary to adjust any name discrepancies within the two datasets

breed_map = {'manchester_terrier_(standard)': 'standard_manchester_terrier', 'manchester_terrier_(toy)':'toy_manchester_terrier',
 'poodle_(miniature)': 'miniature_poodle', 'poodle_(toy)': 'toy_poodle', 'poodle_(standard)':'standard_poodle', 
 'chow_chow': 'chow', 'maltese': 'maltese_dog', 'basset_hound': 'basset', 'staffordshire_bull_terrier': 'staffordshire_bullterrier',
 'soft_coated_wheaten_terrier':'soft-coated_wheaten_terrier', 'german_shorthaired_pointer': 'german_short-haired_pointer',
 'german_shepherd_dog': 'german_shepherd', 'alaskan_malamute': 'malamute', 'american_eskimo_dog': 'eskimo_dog',
 'pembroke_welsh_corgi': 'pembroke', 'english_springer_spaniel': 'english_springer', 'black_and_tan_coonhound': 'black-and-tan_coonhound',
 'australian_kelpie': 'kelpie', 'shih_tzu': 'shih-tzu', 'dandie_dinmont_terrier': 'dandie_dinmont', 'bullmastiff': 'bull_mastiff',
 'doberman_pinscher': 'doberman', 'brittany': 'brittany_spaniel', 'redbone_coonhound': 'redbone', 'bluetick_coonhound': 'bluetick',
 'cardigan_welsh_corgi': 'cardigan', 'appenzeller_sennenhund': 'appenzeller', 'belgian_malinois': 'malinois', 'lhasa_apso':'lhasa',
 'clumber_spaniel': 'clumber', 'airedale_terrier': 'airedale', 'leonberger': 'leonberg', 'cairn_terrier': 'cairn',
 'entlebucher_mountain_dog':'entlebucher', 'treeing_walker_coonhound': 'walker_hound', 'wire_fox_terrier': 'wire-haired_fox_terrier',
 'toy_fox_terrier': 'toy_terrier', 'american_bulldog': 'boston_bull', 'brussels_griffon': 'brabancon_griffon', 
 'japanese_chin': 'japanese_spaniel', 'cavalier_king_charles_spaniel': 'blenheim_spaniel', 'carolina_dog': 'dingo',
 'pekingese': 'pekinese', 'azawakh': 'african_hunting_dog', 'belgian_sheepdog': 'groenendael', 
 'xoloitzcuintli': 'mexican_hairless', 'scottish_terrier': 'scotch_terrier'}
 
# akc: breeds
#akc_breeds['Breed'] = akc_breeds.where(akc_breeds['Breed'] in breed_map.keys(), breed_map[x], akc_breeds['Breed'])
akc_breeds['Breed'] = akc_breeds['Breed'].replace(breed_map)


In [18]:
akc_list = akc_breeds['Breed'].unique()

In [19]:
akc_list

array(['affenpinscher', 'afghan_hound', 'airedale', 'akita', 'malamute',
       'boston_bull', 'american_english_coonhound', 'eskimo_dog',
       'american_foxhound', 'american_hairless_terrier',
       'american_leopard_hound', 'american_staffordshire_terrier',
       'american_water_spaniel', 'anatolian_shepherd_dog', 'appenzeller',
       'australian_cattle_dog', 'kelpie', 'australian_shepherd',
       'australian_stumpy_tail_cattle_dog', 'australian_terrier',
       'african_hunting_dog', 'barbet', 'basenji',
       'basset_fauve_de_bretagne', 'basset',
       'bavarian_mountain_scent_hound', 'beagle', 'bearded_collie',
       'beauceron', 'bedlington_terrier', 'belgian_laekenois', 'malinois',
       'groenendael', 'belgian_tervuren', 'bergamasco_sheepdog',
       'berger_picard', 'bernese_mountain_dog', 'bichon_frise',
       'biewer_terrier', 'black-and-tan_coonhound',
       'black_russian_terrier', 'bloodhound', 'bluetick', 'boerboel',
       'bohemian_shepherd', 'bolognese', '

In [20]:
# find breeds that are in our image dataset that are not in akc list
print([x for x in breed_list if x not in akc_list])

['dhole']


In [21]:
akc_breeds.head(1)

Unnamed: 0,Breed,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,group,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,affenpinscher,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,Toy Group,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing


In [22]:
akc_breeds.columns

Index(['Breed', 'temperament', 'popularity', 'min_height', 'max_height',
       'min_weight', 'max_weight', 'min_expectancy', 'max_expectancy', 'group',
       'grooming_frequency_value', 'grooming_frequency_category',
       'shedding_value', 'shedding_category', 'energy_level_value',
       'energy_level_category', 'trainability_value', 'trainability_category',
       'demeanor_value', 'demeanor_category'],
      dtype='object')

In [23]:
# add 'dhole' to the akc list and give it basic info. leaving mostly blank because known to be wild dog 
akc_breeds = akc_breeds.append({'Breed': 'dhole', 'temperament': 'Athletic, Fearless, Social, Confident'}, ignore_index=True)

In [24]:
akc_list = akc_breeds['Breed'].unique()

In [25]:
akc_list

array(['affenpinscher', 'afghan_hound', 'airedale', 'akita', 'malamute',
       'boston_bull', 'american_english_coonhound', 'eskimo_dog',
       'american_foxhound', 'american_hairless_terrier',
       'american_leopard_hound', 'american_staffordshire_terrier',
       'american_water_spaniel', 'anatolian_shepherd_dog', 'appenzeller',
       'australian_cattle_dog', 'kelpie', 'australian_shepherd',
       'australian_stumpy_tail_cattle_dog', 'australian_terrier',
       'african_hunting_dog', 'barbet', 'basenji',
       'basset_fauve_de_bretagne', 'basset',
       'bavarian_mountain_scent_hound', 'beagle', 'bearded_collie',
       'beauceron', 'bedlington_terrier', 'belgian_laekenois', 'malinois',
       'groenendael', 'belgian_tervuren', 'bergamasco_sheepdog',
       'berger_picard', 'bernese_mountain_dog', 'bichon_frise',
       'biewer_terrier', 'black-and-tan_coonhound',
       'black_russian_terrier', 'bloodhound', 'bluetick', 'boerboel',
       'bohemian_shepherd', 'bolognese', '

In [26]:
# check one last time to make sure there's no more missing breeds
print([x for x in breed_list if x not in akc_list])

[]


In [31]:
# only keep AKC breeds that are in our image dataset
final_akc = akc_breeds[akc_breeds['Breed'].isin(breed_list)]

In [32]:
final_akc

Unnamed: 0,Breed,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,group,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,affenpinscher,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,Toy Group,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,afghan_hound,"Dignified, Profoundly Loyal, Aristocratic",113,63.50,68.58,22.679619,27.215542,12.0,15.0,Hound Group,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,airedale,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,Terrier Group,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
4,malamute,"Affectionate, Loyal, Playful",58,58.42,63.50,34.019428,38.555351,10.0,14.0,Working Group,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly
5,boston_bull,"Loyal, Self-Confident",,50.80,63.50,27.215542,45.359237,10.0,12.0,Foundation Stock Service,0.2,Occasional Bath/Brush,0.6,Seasonal,0.8,Energetic,0.6,Agreeable,0.6,Alert/Responsive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,whippet,"Affectionate, Playful, Calm",61,45.72,55.88,11.339809,18.143695,12.0,15.0,Hound Group,0.2,Occasional Bath/Brush,0.4,Occasional,0.6,Regular Exercise,0.4,Independent,0.4,Reserved with Strangers
270,wire-haired_fox_terrier,"Confident, Alert, Gregarious",101,38.10,38.10,6.803886,8.164663,12.0,15.0,Terrier Group,0.4,Weekly Brushing,0.2,Infrequent,0.6,Regular Exercise,0.6,Agreeable,0.6,Alert/Responsive
274,mexican_hairless,"Loyal, Alert, Calm",140,25.40,58.42,4.535924,24.947580,13.0,18.0,Non-Sporting Group,0.2,Occasional Bath/Brush,0.2,Infrequent,0.8,Energetic,0.6,Agreeable,0.6,Alert/Responsive
276,yorkshire_terrier,"Affectionate, Sprightly, Tomboyish",10,17.78,20.32,3.175147,3.175147,11.0,15.0,Toy Group,1.0,Specialty/Professional,0.2,Infrequent,0.6,Regular Exercise,0.2,May be Stubborn,0.8,Friendly


In [33]:
# convert to final csv for use with predictions 
final_akc.to_csv('data/akc_breeds_final.csv', index=False)