Dog bite data retrieved from: https://catalog.data.gov/dataset/dohmh-dog-bite-data
AKC breed data: https://github.com/tmfilho/akcdata


In [127]:
import pandas as pd
import altair as alt 

breed_url = 'https://raw.githubusercontent.com/jendoodle/akcdata/master/data/akc-data-latest.csv'
breed_data = pd.read_csv(breed_url)

dog_bite_url = 'https://raw.githubusercontent.com/jendoodle/DTSA-5304_lewis/main/DOHMH_Dog_Bite_Data.csv'
dog_bite_data = pd.read_csv(dog_bite_url, index_col=0)

dog_bite_df = pd.DataFrame(dog_bite_data)

# clean up column types
dog_bite_df[['Species', 'Breed', 'Gender', 'Borough']] = dog_bite_df[['Species', 'Breed', 'Gender', 'Borough']].astype('string')
dog_bite_df['DateOfBite'] = pd.to_datetime(dog_bite_data['DateOfBite'])
dog_bite_df.Breed = dog_bite_df.Breed.fillna('unknown')
dog_bite_df = dog_bite_df.map(lambda x: x.lower() if type(x) == str else x)

In [128]:
# clean up the breed column
dog_bite_df['Breed'].replace(to_replace='(.*\s&\s.*)|(great\sdane\sand\scocker\sspaniel)|(.*[&].*)|(.*[\d].*)|(.*multiple dogs.*)|(.*,){2}.*|(.*\/){2}.*', value='multiple dogs', regex=True, inplace=True)

dog_bite_df['Breed'].replace(to_replace='(.*unsure.*)|(.*\sor\s.*)|(.*unknown.*)|(.*uknown.*)|(.*unnkown.*)|(.*unkown.*)|(declined to provide)|(^un$)|(.*ukown.*)|(.*unc.*)|(.*unsure.*)|(.*not\s.*)|(.*large.*)|(.*medium.*)|(.*small.*)|(.*\sor\s.*)|(big size dog)', value='unknown', regex=True, inplace=True)
#dog_bite_df['Breed'].replace(to_replace='(.*american\sbull[y]?.*)', value='pit bull', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='\sbreed', value='', regex=True, inplace=True)

# removing comas makes the mapping to breed group work better
dog_bite_df['Breed'].replace(to_replace=',', value='', regex=True, inplace=True)

# common spelling mistakes
dog_bite_df['Breed'].replace(to_replace='(shepard)|(shpherd)|(sheperd)', value='shepherd', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='(terrior)|(terrieer)|(terr$)|(russellterr)', value='terrier', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='(pionter)', value='pointer', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='(bull\sdog)|(^bull$)|(bullbog)', value='bulldog', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='sheep\sdog', value='sheepdog', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='doddle', value='doodle', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='(mini\s)|(minature\s)', value='miniature ', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='pincher', value='pinscher', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='retrever', value='retriever', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='high\s', value='highland ', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='teacup ', value='', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='^st\s', value='saint ', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='mxied', value='mixed ', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='englih', value='english ', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='(austrailian\s)|(aust\s)', value='australian ', regex=True, inplace=True)

# mixed replacement before modifying specific breeds
dog_bite_df['Breed'].replace(to_replace='(.*cross$)|(.*mix.*)|(.*crossbreed$)|(.*[\s-]x[\s]*[-]*.*)|(.*hybrid.*)|(mutt)|(.*doodle.*)|(.*poo$)', value='mixed', regex=True, inplace=True)

# Pit bull clean up
# micro/pocket bully
dog_bite_df['Breed'].replace(to_replace='(.*pocket.*)|(.*micro.*)', value='micro bully', regex=True, inplace=True)
# american bully
dog_bite_df['Breed'].replace(to_replace='(.*bully.*)|(.*am.*bull[y]?.*)', value='american bully', regex=True, inplace=True)
# american bull dog
dog_bite_df['Breed'].replace(to_replace='(.*amer.*bulldog.*)|(.*bulldog.*amer.*)', value='american bulldog', regex=True, inplace=True)
# american pit bull terrier
dog_bite_df['Breed'].replace(to_replace='(.*amer.*pit.*)|(.*amer.*terr.*)', value='american pit bull terrier', regex=True, inplace=True)
# american staffordshire terrier
dog_bite_df['Breed'].replace(to_replace='(.*amer.*staff.*)|(^staff.*amer.*)', value='american staffordshire terrier', regex=True, inplace=True)
# staffordshire terrier
dog_bite_df['Breed'].replace(to_replace='(.*eng.*terr.*)|(^staff.*)', value='staffordshire bull terrier', regex=True, inplace=True)
# unknown pit bull
dog_bite_df['Breed'].replace(to_replace='^(?!(american\s)).*pit.*(?!(\sbull\sterrier))|((.*bull([\s]?(terrier))?){2}(?!.*mix.*))', value='unknown pit bull', regex=True, inplace=True)


# specific breed cleanup
dog_bite_df['Breed'].replace(to_replace='(.*chihuahua.*)|(.*chi\shua\shua.*)', value='chihuahua', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='(.*lab.*)', value='labrador retriever', regex=True, inplace=True)
# african boerboel (boerboel)
dog_bite_df['Breed'].replace(to_replace='african\sboerboel', value='boerboel', regex=True, inplace=True)
# airedale (airedale terrier)
dog_bite_df['Breed'].replace(to_replace='airedale$', value='airedale terrier', regex=True, inplace=True)
# alaskan husky, alaska husky, husky, alaskan husky, siberian (siberian husky)
dog_bite_df['Breed'].replace(to_replace='(.*husk.*)|(.*siberian.*)', value='siberian husky', regex=True, inplace=True)
# alaskan malmute, alskan malamute, malamute (alaskan malamute)
dog_bite_df['Breed'].replace(to_replace='(.*malmute.*)|(.*alskan.*)|(.*malamute.*)', value='alaskan malamute', regex=True, inplace=True)
# australian cattledog, blue healer, blue heeler
dog_bite_df['Breed'].replace(to_replace='(.*cattledog.*)|(.*blue\shealer.*)|(.*blue\sheeler.*)', value='australian cattle dog', regex=True, inplace=True)
# australian sheep dog (sheepdog)
dog_bite_df['Breed'].replace(to_replace='(australian\ssheepdog)|(australian\sherd\sdog)', value='australian shepherd', regex=True, inplace=True)
# beagle hound
dog_bite_df['Breed'].replace(to_replace='(beagle\shound)|(pocket\sbeagle)', value='beagle', regex=True, inplace=True)
# belgian malinois
dog_bite_df['Breed'].replace(to_replace='(.*malinois.*)', value='belgian malinois', regex=True, inplace=True)
# belgian sheperd, belgian shepherd, belgium shepard, (replace with belgian sheepdog)
dog_bite_df['Breed'].replace(to_replace='belgium', value='belgian', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='belgian\sshepherd', value='belgian sheepdog', regex=True, inplace=True)
# blue tick hound (bluetick coonhound)
dog_bite_df['Breed'].replace(to_replace='.*blue\stic.*', value='bluetick coonhound', regex=True, inplace=True)
# brittany spaniel (just brittany)
dog_bite_df['Breed'].replace(to_replace='.*brittany.*', value='brittany', regex=True, inplace=True)
# english bulldog (bulldog)
dog_bite_df['Breed'].replace(to_replace='(english.*bulldog)|(bulldog.*english)|(british.*bulldog)', value='bulldog', regex=True, inplace=True)
# caine corso, corso, italian king corso, italian mastiff, king corso (cane corso)
dog_bite_df['Breed'].replace(to_replace='(.*corso.*)|(.*caine.*)|(.*king\scorso.*)|(italian\smastiff)|(.*king\skorso.*)', value='cane corso', regex=True, inplace=True)
# catahoula, catahoula cur (catahoula leopard dog)
dog_bite_df['Breed'].replace(to_replace='(catahoula)|(catahoula\scur)', value='catahoula leopard dog', regex=True, inplace=True)
# collie rought coat, collie smooth coat (just collie)
dog_bite_df['Breed'].replace(to_replace='(.*rough\scoat.*)|(.*smooth\scoat.*)', value='collie', regex=True, inplace=True)
# corgie (corgi)
dog_bite_df['Breed'].replace(to_replace='.*corgi.*', value='corgi', regex=True, inplace=True)
# cotton de tulear (coton de tulear)
dog_bite_df['Breed'].replace(to_replace='cotton', value='coton', regex=True, inplace=True)
# daschound, daschund, doxxin (dachshund)
dog_bite_df['Breed'].replace(to_replace='(.*dachsund.*)|(.*dachshund.*)|(.*dacshound.*)|(.*daschound.*)|(.*daschund.*)|(.*doxxin.*)|(.*dotson.*)', value='dachshund', regex=True, inplace=True)
# pinscher (doberman pinscher)
dog_bite_df['Breed'].replace(to_replace='^pinscher$', value='doberman pinscher', regex=True, inplace=True)
# dogo (dogo argentino)
dog_bite_df['Breed'].replace(to_replace='^dogo$', value='dogo argentino', regex=True, inplace=True)
# dogue de bordx (dogue de bordeaux)
dog_bite_df['Breed'].replace(to_replace='bordx', value='bordeaux', regex=True, inplace=True)
# english mastiff, french mastiff, mastiff old english (mastiff)
dog_bite_df['Breed'].replace(to_replace='(masif.*)|(.*mastiff.*)', value='mastiff', regex=True, inplace=True)
# english pointer (pointer)
dog_bite_df['Breed'].replace(to_replace='english\spointer', value='pointer', regex=True, inplace=True)
# fox hound (american foxhound)
dog_bite_df['Breed'].replace(to_replace='fox\shound', value='american foxhound', regex=True, inplace=True)
# french poodle (standard poodle)
dog_bite_df['Breed'].replace(to_replace='french\spoodle', value='standard poodle', regex=True, inplace=True)
# frenchie (french bulldog)
dog_bite_df['Breed'].replace(to_replace='.*frenchie.*', value='french bulldog', regex=True, inplace=True)
# german pionter (german wirehaired pointer)
dog_bite_df['Breed'].replace(to_replace='german\spointer', value='german wirehaired pointer', regex=True, inplace=True)
# griffon bruzellois (brussels griffon)
dog_bite_df['Breed'].replace(to_replace='bruzellois', value='brussels', regex=True, inplace=True)
# harrier hound (harrier)
dog_bite_df['Breed'].replace(to_replace='.*harrier.*', value='harrier', regex=True, inplace=True)
# hungarian sheep dog (komondor)
dog_bite_df['Breed'].replace(to_replace='.*hungarian.*', value='komondor', regex=True, inplace=True)
# jack russ, terrier jack russ (jack russell terrier)
dog_bite_df['Breed'].replace(to_replace='jack.*russ.*', value='jack russell terrier', regex=True, inplace=True)
# japanese shiba, japanese shibainu (shiba inu)
dog_bite_df['Breed'].replace(to_replace='(shiba.*)|(shibainu)', value='shiba inu', regex=True, inplace=True)
# king charles cavlier (king charles)
dog_bite_df['Breed'].replace(to_replace='.*king\sch.*', value='king charles', regex=True, inplace=True)
# lagotte (lagotto romagnolo)
dog_bite_df['Breed'].replace(to_replace='lagotte', value='lagotto romagnolo', regex=True, inplace=True)
# lapso (lhasa apso)
dog_bite_df['Breed'].replace(to_replace='lapso', value='lhasa apso', regex=True, inplace=True)
# maltese terrier (maltese)
dog_bite_df['Breed'].replace(to_replace='maltese\sterrier', value='maltese', regex=True, inplace=True)
# mexican hairless (xoloitzcuintli)
dog_bite_df['Breed'].replace(to_replace='.*mexican.*', value='xoloitzcuintli', regex=True, inplace=True)
# miniature doberman
dog_bite_df['Breed'].replace(to_replace='miniature\sdoberman', value='miniature pinscher', regex=True, inplace=True)
# minature shepherd (miniature american shepherd)
dog_bite_df['Breed'].replace(to_replace='(mini.*shepherd)', value='miniature american shepherd', regex=True, inplace=True)
# russian wolfhound (borzoi)
dog_bite_df['Breed'].replace(to_replace='russian\swolfhound', value='borzoi', regex=True, inplace=True)
# poodle
dog_bite_df['Breed'].replace(to_replace='^poodle$', value='poodle standard', regex=True, inplace=True)
# schitzu (shih tzu)
dog_bite_df['Breed'].replace(to_replace='(shitzu)|(shihtzu)|(schitzu)', value='shih tzu', regex=True, inplace=True)
# sharpei (chinese shar pei)
dog_bite_df['Breed'].replace(to_replace='sharpei', value='shar pei', regex=True, inplace=True)
# sherlyn shepherd (shetland sheepdog)
dog_bite_df['Breed'].replace(to_replace='sherlyn', value='shetland', regex=True, inplace=True)
# walkerhound (treeing walker coonhound)
dog_bite_df['Breed'].replace(to_replace='.*walkerhound.*', value='treeing walker coonhound', regex=True, inplace=True)
# springer
dog_bite_df['Breed'].replace(to_replace='^springer$', value='springer spaniel', regex=True, inplace=True)
# westies, westy (west highland white terrier)
dog_bite_df['Breed'].replace(to_replace='(westies)|(westy)', value='west highland white terrier', regex=True, inplace=True)
# white point griffon (wirehaird pointing griffon)
dog_bite_df['Breed'].replace(to_replace='white\spoint\sgriffon', value='wirehaired pointing griffon', regex=True, inplace=True)
# yorki (yorkshire terrier)
dog_bite_df['Breed'].replace(to_replace='(.*yorki.*)|(.*york.*)', value='yorkshire terrier', regex=True, inplace=True)

# pit bull cleanup
#dog_bite_df['Breed'].replace(to_replace='(bull\sterrier)|(miniature\sbull\sterrier)|(.*[\s]pit.*)|(amer.*terrier)|(.*pit\sbull.*)|(.*pocket\sb.*)|(.*micro\s.*)|(^bully$)|(^am.*staff.*)|(.*pitbull.*)|(.*pitbll.*)|((.*bull([\s]?(terrier))?){2}(?!.*mix.*))', value='pit bull', regex=True, inplace=True)

# final mixed replacement
dog_bite_df['Breed'].replace(to_replace='(maltipoo.*)|(malti\s.*)|(puggle)|(poodle\sshih\stzu)|(poodle\sterrier)|(pomeranian\senglish\sspaniel)|(shorky)|(shih\stzu\spug)|(schichon)|(schichi)|(schnoodle)|(morkie)|(maltese\spoodle)|(malshi)|(latese)|(golden\spoodle)|(dingo.*)|(cockanese)|(boxer\sbeagle)|(chiweenie)|(chorkie)|(.*\/.*)|(.*-.*)', value='mixed', regex=True, inplace=True)
dog_bite_df['Breed'].replace(to_replace='.*mixed.*', value='mixed', regex=True, inplace=True)

# remove multiple dogs and weird entries
dog_bite_df = dog_bite_df[dog_bite_df.Breed != 'hot dog']
dog_bite_df = dog_bite_df[dog_bite_df.Breed != 'goat']
dog_bite_df = dog_bite_df[dog_bite_df.Breed != 'alpaca']
dog_bite_df = dog_bite_df[dog_bite_df.Breed != 'wild dog']
dog_bite_df = dog_bite_df[dog_bite_df.Breed != 'buddhist']
dog_bite_df = dog_bite_df[dog_bite_df.Breed != 'multiple dogs']

In [129]:
# set up the initial breed information data frame
akc_breed_df = pd.DataFrame(breed_data)
breed_groups_df = akc_breed_df[['breed', 'group']].astype('string')

basic_breed_names_df = breed_groups_df.map(lambda x: x.lower())
basic_breed_names_df['breed'].replace(to_replace='[(),]', value='', regex=True, inplace=True)
basic_breed_names_df['breed'].replace(to_replace='[-]', value=' ', regex=True, inplace=True)

# handle pit bull breeds
pit_bull_breeds = {'breed': ['unknown pit bull', 'micro bully', 'american bully', 'american pit bull terrier', 'staffordshire bull terrier'], 'group': ['pit bull group']*5 }
pit_bull_breeds_df = pd.DataFrame(pit_bull_breeds)
basic_breed_names_df = pd.concat([basic_breed_names_df, pit_bull_breeds_df]).set_index(['breed'])
basic_breed_names_df.at['american bulldog', 'group'] = 'pit bull group'
basic_breed_names_df.at['american staffordshire terrier', 'group'] = 'pit bull group'
basic_breed_names_df.reset_index(inplace=True)

In [130]:
# map the breeds to their groups and actual breed names
def breedMatches(source, target):
    letter_tolerance = 3
    word_match_limit = 2
    curr_letter_mismatch = 0
    word_match_amount = 0
    target_words = target.split(' ')
    source_words = source.split(' ')
    target_words.sort()
    source_words.sort()
    
    (max_length, most_words) = max((len(source_words), source_words), (len(target_words), target_words))
    (min_length, least_words) = min((len(source_words), source_words), (len(target_words), target_words))
    i = 0
    j = 0
    
    if max_length < 2:
        word_match_amount = 1
    
    # continue to check the current source word until we determine if it's a mismatch
    while i < min_length and j < max_length:
        word_m = ''
        word_l = least_words[i]
        
        # check if there is a next target word
        if j < len(most_words):
            word_m = most_words[j]
        
        max_word_length = max(len(word_m), len(word_l))
        curr_letter_mismatch = 0
        for k in range(max_word_length):
            m_letter = ''
            l_letter = ''
            
            if not word_m or not word_l:
                curr_letter_mismatch = letter_tolerance
                break
            
            if k < len(word_m):
                m_letter = word_m[k]
            if k < len(word_l):
                l_letter = word_l[k]
            
            # check if we've hit the end of a word and adjust the mismatch accordingly
            if k >= len(word_m) or k >= len(word_l):
                curr_letter_mismatch += max_word_length - k
            else:    
                curr_letter_mismatch += m_letter != l_letter
            
            # if we're past the tolerance move on to the next word to consider
            if curr_letter_mismatch >= letter_tolerance:
                j += 1
                break
            
        if curr_letter_mismatch < letter_tolerance:
            word_match_amount += 1
            i += 1
            j += 1
        if word_match_amount >= word_match_limit:
            return True
    return False
    

def mapBreedNameToGroup(breedToCheck, breedGroups):
    for index, row in breedGroups.iterrows():
        breed = row[0]
        group = row[1]
        if breedMatches(breed, breedToCheck):
            return (breed, group)
    if breedToCheck == 'mixed':
        return (breedToCheck, 'mixed breeds')
    return ('unknown', 'unregistered or unknown')

unique_breed_keys = dog_bite_df['Breed'].unique()
unique_breed_keys.sort()
messed_up_breed_map = dict.fromkeys(unique_breed_keys)

for breed in messed_up_breed_map:
    (actual_breed, group) = mapBreedNameToGroup(breed, basic_breed_names_df)
    messed_up_breed_map[breed] = (actual_breed, group)
    
# set corgi to herding group and terrier to unknown
messed_up_breed_map['corgi'] = ('corgi', 'herding group')
messed_up_breed_map['terrier'] = ('unknown', 'unregistered or unknown')
messed_up_breed_map['american pit bull terrier'] = ('american pit bull terrier', 'pit bull group')

  breed = row[0]
  group = row[1]


In [131]:
# start grouping things together for visualization
group_by_breed = dog_bite_df.groupby(['Breed']).size().reset_index(name='Bite Count')

group_by_group = dog_bite_df[['Breed']].copy()
group_by_group['Breed'] = group_by_group.Breed.map(lambda x: messed_up_breed_map[x][0] if x in messed_up_breed_map.keys() else x)
group_by_group['Breed Group'] = dog_bite_df.Breed.map(lambda x: messed_up_breed_map[x][1] if x in messed_up_breed_map.keys() else 'unregistered or unknown')
count_group_bites = group_by_group.groupby(['Breed Group']).size().reset_index(name='Bite Count')
count_breed_bites = group_by_group.groupby(['Breed']).size().reset_index(name='Bite Count')

In [85]:
alt.data_transformers.disable_max_rows()
breed_groups_sorted = count_breed_bites.sort_values(by="Bite Count")
alt.Chart(breed_groups_sorted).mark_bar().encode(
    y='Bite Count',
    x=alt.X('Breed').sort('-y')
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [167]:
# creating interactive scatter plot with layered bar chart
selector = alt.selection_point(fields=['Breed Group'], on='click')

group_color_domain = group_by_group['Breed Group'].unique()

color_scale = alt.Scale(domain=group_color_domain,
                        range=[
                            '#71E5EF', 
                            '#d065e1', 
                            '#f4d403', 
                            '#fb899b', 
                            '#5f9b99',
                            '#b1e632',
                            '#e2874e',
                            '#52ef99',
                            '#fb5a3b',
                            '#2f4b4e',
                            '#3295e9',
                            '#957206'
                        ])

base = alt.Chart(group_by_group).properties(
    width=250,
).add_params(selector)

points = base.mark_bar(size=20).properties(
        title='Dog Bite Records In NYC From 2018-2022'
).encode(
        alt.Y('Breed Group').sort('-x'),
        alt.X('count()').scale(domain=[0,7000]),
        color=alt.condition(
        selector,
        'Breed Group:N',
        alt.value('lightgray'),
        scale=color_scale),
        tooltip=['Breed Group', alt.Tooltip('count()', title='Bite Count')]
)

hists = base.properties(
    width = 500
    ).mark_bar(thickness=100, size=15).encode(
        alt.X('count()', title='Breed Bite Count')
            .stack(None)
            .scale(domain=[0,7000]),
        alt.Y('Breed').sort('-x'),
        alt.Color('Breed Group:N').scale(color_scale),
        tooltip=['Breed', alt.Tooltip('count()', title='Bite Count')]
).transform_filter(
    selector
)

points | hists


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [149]:
import altair as alt

source = {
    "values": [
        {"a": "A", "b": 28},
        {"a": "B", "b": 55},
        {"a": "C", "b": 43},
        {"a": "D", "b": 91},
        {"a": "E", "b": 81},
        {"a": "F", "b": 53},
        {"a": "G", "b": 19},
        {"a": "H", "b": 87},
        {"a": "I", "b": 52},
    ]
}

select = alt.selection_point(name="select", on="click")
highlight = alt.selection_point(name="highlight", on="pointerover", empty=False)

stroke_width = (
    alt.when(select).then(alt.value(2, empty=False))
    .when(highlight).then(alt.value(1))
    .otherwise(alt.value(0))
)


alt.Chart(source, height=200).mark_bar(
    fill="#4C78A8", stroke="black", cursor="pointer"
).encode(
    x="a:O",
    y="b:Q",
    fillOpacity=alt.when(select).then(alt.value(1)).otherwise(alt.value(0.3)),
    strokeWidth=stroke_width,
).configure_scale(bandPaddingInner=0.2).add_params(select, highlight)

AttributeError: module 'altair' has no attribute 'when'