# Dog breeds

Goal is to use the American Kennel Club (AKC) breed taxonomy to classify the breeds in the dataset into AKC breeds.

In [100]:
from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import re
import wikipedia

The Wikipedia page *List of dog breeds recognized by the American Kennel Club* has a list of dog breeds along with their AKC classification. We can grab this data, and then use fuzzy text matching to match these to the breeds in our data.

In [108]:
akc_breeds = wikipedia.page("List of dog breeds recognized by the American Kennel Club")

# Manually check for links that are NOT breeds
exclude = [u'Dog breed', u'American Kennel Club', u'List of dog breeds']
breeds = [page for page in akc_breeds.links if page not in exclude]
page_content = akc_breeds.content

def identify_classification(breed, page_content):
    start = page_content.index(breed) + len(breed) + 2
    for i in range(1, 25):
        classification = page_content[start:start + i]
        if page_content[start + i - 1:start + i] == '\n':
            return classification[:-1]
    raise Exception('No class found')

classifications = {}
for breed in breeds:
    
    # Manual fixes
    breed = (breed
             .replace(' (dog)', '')
             .replace(' (dog breed)', '')
             .replace('American Cocker Spaniel', 'Cocker Spaniel')
             .replace('American Eskimo Dog', 'American Eskimo Dog (Miniature)')
             .replace('Australian Silky Terrier', 'Silky Terrier')
             .replace('Bergamasco Shepherd', 'Bergamasco')
             .replace('English Mastiff', 'Mastiff')
             .replace('Griffon Bruxellois', 'Brussels Griffon')
             .replace('Hungarian Vizsla', 'Vizsla')
             .replace('Rough Collie', 'Collie'))
    try:
        classifications[breed] = identify_classification(breed, page_content)
    except:
        print 'No luck with', breed

No luck with Jackchi


In [132]:
set(classifications.values())

{u'',
 u'Herding',
 u'Hound',
 u'Non-Sporting',
 u'Non-Sporting & Toy',
 u'Sporting',
 u'Terrier',
 u'Terrier & Toy',
 u'Toy',
 u'Working',
 u'equeno, Hound',
 u'errier'}

Match the classifications to our data. Since so many dogs are mixed-breed, let's make it so that a dog can be part of multiple classes.

In [129]:
train = pd.read_csv('data/train.csv')
train_breeds = list(train['Breed'].unique())
train_breed_classifications = {}
for train_breed in train_breeds:
    classes = []
    train_breed_clean = train_breed.replace(' Mix', '')
    train_breed_split = train_breed_clean.split('/')
    for partial_breed in train_breed_split:
        high_score, current_class = 0, None
        for classified_breed in classifications.keys():
            score = fuzz.ratio(partial_breed, classified_breed)
            if score > high_score:
                high_score = score
                current_class = classifications[classified_breed]
        classes.append(current_class)
    train_breed_classifications[train_breed] = set(classes)

In [128]:
train_breed_classifications

{'German Shepherd/Australian Cattle Dog': [u'Herding', u'Herding'],
 'Boxer/American Pit Bull Terrier': [u'Working', u'errier'],
 'Dachshund/Pug': [u'Hound', u'Toy'],
 'Carolina Dog/Chihuahua Shorthair': [u'Working', u'Toy'],
 'Harrier': [u'Hound'],
 'Pointer/Staffordshire': [u'Sporting', u'Terrier'],
 'Labrador Retriever/Australian Shepherd': [u'Sporting', u'Herding'],
 'Chihuahua Longhair/Cairn Terrier': [u'Toy', u'Terrier'],
 'Cairn Terrier/Dachshund': [u'Terrier', u'Hound'],
 'Belgian Tervuren/German Shepherd': [u'Herding', u'Herding'],
 'Pug Mix': [u'Toy'],
 'Anatol Shepherd/Catahoula': [u'Working', u'Toy'],
 'Yorkshire Terrier/Maltese': [u'Toy', u'Toy'],
 'Great Dane/German Shepherd': [u'Working', u'Herding'],
 'Australian Kelpie/Pit Bull': [u'Terrier', u'Non-Sporting'],
 'Golden Retriever/Pit Bull': [u'Sporting', u'Non-Sporting'],
 'Basenji/Jack Russell Terrier': [u'Hound', u'Terrier'],
 'Australian Kelpie Mix': [u'Terrier'],
 'Black/Tan Hound Mix': [u'Herding', u'Hound'],
 'Dac