In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

---

## Intake

In [2]:
# Load the raw intake records and preview the first rows.
intake_csv_path = '../data/austin_animal_center_intakes_20241017.csv'
intake = pd.read_csv(intake_csv_path)
intake.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,*Brock,01/03/2019 04:19:00 PM,January 2019,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor
1,A706918,Belle,07/05/2015 12:59:00 PM,July 2015,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver
2,A724273,Runster,04/14/2016 06:43:00 PM,April 2016,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White
3,A665644,,10/21/2013 07:59:00 AM,October 2013,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
4,A857105,Johnny Ringo,05/12/2022 12:23:00 AM,May 2022,4404 Sarasota Drive in Austin (TX),Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby


In [3]:
# Normalise column names to snake_case: lower-case, spaces -> underscores.
intake.columns = intake.columns.str.lower().str.replace(' ', '_')

In [4]:
# Inspect column dtypes and non-null counts to spot columns that need cleaning.
intake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168040 entries, 0 to 168039
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   animal_id         168040 non-null  object
 1   name              119647 non-null  object
 2   datetime          168040 non-null  object
 3   monthyear         168040 non-null  object
 4   found_location    168040 non-null  object
 5   intake_type       168040 non-null  object
 6   intake_condition  168040 non-null  object
 7   animal_type       168040 non-null  object
 8   sex_upon_intake   168038 non-null  object
 9   age_upon_intake   168039 non-null  object
 10  breed             168040 non-null  object
 11  color             168040 non-null  object
dtypes: object(12)
memory usage: 15.4+ MB


In [5]:
# Parse the intake timestamps with a single vectorised call.
# `.apply(pd.to_datetime)` parsed each row separately, which is very slow on a
# frame of this size. The explicit format matches the values shown by the
# earlier `intake.head()` (e.g. "01/03/2019 04:19:00 PM"); a row in any other
# layout now raises instead of being silently guessed.
intake['datetime'] = pd.to_datetime(intake['datetime'], format='%m/%d/%Y %I:%M:%S %p')
# intake['monthyear'] = intake['monthyear'].apply(pd.to_datetime)
intake.head()

Unnamed: 0,animal_id,name,datetime,monthyear,found_location,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,color
0,A786884,*Brock,2019-01-03 16:19:00,January 2019,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor
1,A706918,Belle,2015-07-05 12:59:00,July 2015,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver
2,A724273,Runster,2016-04-14 18:43:00,April 2016,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White
3,A665644,,2013-10-21 07:59:00,October 2013,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
4,A857105,Johnny Ringo,2022-05-12 00:23:00,May 2022,4404 Sarasota Drive in Austin (TX),Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby


In [6]:
# Show only the columns that contain at least one missing value.
intake_null_counts = intake.isna().sum()
intake_null_counts[intake_null_counts > 0]

name               48393
sex_upon_intake        2
age_upon_intake        1
dtype: int64

In [7]:
# Drop `name`: it is the column with by far the most missing values (see the
# null counts above). `errors='ignore'` keeps the cell re-runnable: without it
# a second execution raises KeyError because the column is already gone.
intake.drop(columns='name', inplace=True, errors='ignore')

In [8]:
# Inspect the rows that still contain a missing value in any column.
intake.loc[intake.isna().any(axis='columns')]

Unnamed: 0,animal_id,datetime,monthyear,found_location,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,color
28866,A667395,2013-11-17 13:15:00,November 2013,Pflugerville (TX),Owner Surrender,Normal,Dog,,7 years,Dachshund,Brown Merle
85481,A830333,2021-03-03 18:30:00,March 2021,1234 Test Street in Austin (TX),Stray,Normal,Dog,,,Kuvasz,Unknown


In [9]:
# Remove, in place, every row that has a missing value in any column.
intake.dropna(how='any', inplace=True)

In [10]:
# intake['intake_Type'].value_counts(normalize=True).plot(kind='bar');

In [11]:
# intake['intake_Condition'].value_counts(normalize=True).plot(kind='bar');

In [12]:
# intake['animal_Type'].value_counts(normalize=True).plot(kind='bar');

In [13]:
def convert_age(age):
    """
    Convert an age string such as '2 years' or '4 weeks' to whole months.

    Parameters:
    age (str): Text of the form '<integer> <unit>', where the unit contains
        'year', 'month', 'week' or 'day' (singular or plural).

    Returns:
    int: The age in months. Years are multiplied by 12, months are returned
        as-is, weeks and days are scaled by 0.23 and 0.033 respectively and
        truncated toward zero. Unrecognised units yield 0.

    Raises:
    ValueError: If `age` does not split into exactly two whitespace-separated
        parts, or the first part is not an integer.
    """
    amount_text, unit = age.split()
    # A negative age is assumed to be a typo, so only the magnitude is used.
    amount = abs(int(amount_text))

    # (keyword, converter) pairs, checked in order against the unit text.
    converters = (
        ('year', lambda months: months * 12),
        ('month', lambda months: months),
        ('week', lambda weeks: int(weeks * 0.23)),
        ('day', lambda days: int(days * 0.033)),
    )
    for keyword, to_months in converters:
        if keyword in unit:
            return to_months(amount)
    return 0

In [14]:
# Derive a numeric age (in months) from the free-text intake age.
intake['age_in_month_upon_intake'] = intake['age_upon_intake'].map(convert_age)

In [15]:
# Remove fully duplicated rows (keeping the first occurrence) and renumber the index.
intake.drop_duplicates(inplace=True, keep='first', ignore_index=True)

In [16]:
# Order each animal's intakes chronologically; the visit numbering below relies on this order.
intake.sort_values(['animal_id', 'datetime'], inplace=True)

In [17]:
# Code written by Paddy
# Number each animal's intakes 1, 2, 3, ... in the current row order, then
# build an '<animal_id>-<visit number>' key for joining with the outcomes.
intake_visit_number = intake.groupby('animal_id').cumcount() + 1
intake['stay'] = intake_visit_number.astype('str')
intake['animal_stay'] = intake['animal_id'] + '-' + intake['stay']

In [18]:
# Display the prepared intake table, now sorted by animal and carrying the visit key.
intake

Unnamed: 0,animal_id,datetime,monthyear,found_location,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,color,age_in_month_upon_intake,stay,animal_stay
113897,A006100,2014-03-07 14:26:00,March 2014,8700 Research in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White,72,1,A006100-1
5412,A006100,2014-12-19 10:21:00,December 2014,8700 Research Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White,84,2,A006100-2
25233,A006100,2017-12-07 14:07:00,December 2017,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White,120,3,A006100-3
88523,A047759,2014-04-02 15:55:00,April 2014,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,10 years,Dachshund,Tricolor,120,1,A047759-1
120421,A134067,2013-11-16 09:02:00,November 2013,12034 Research Blvd in Austin (TX),Public Assist,Injured,Dog,Neutered Male,16 years,Shetland Sheepdog,Brown/White,192,1,A134067-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167974,A915609,2024-10-17 13:21:00,October 2024,1006 Banister Ln in Austin (TX),Stray,Normal,Bird,Intact Male,1 year,Chicken,White/Blue,12,1,A915609-1
168002,A915610,2024-10-17 14:10:00,October 2024,15321 Wells Lane in Travis (TX),Stray,Normal,Dog,Unknown,7 months,Doberman Pinsch Mix,Brown,7,1,A915610-1
167990,A915614,2024-10-17 13:44:00,October 2024,8305 Garcreek Cv in Austin (TX),Stray,Injured,Cat,Unknown,6 months,Domestic Shorthair,Black/Black,6,1,A915614-1
168004,A915617,2024-10-17 13:44:00,October 2024,9501 Fm 969 in Austin (TX),Stray,Normal,Dog,Intact Male,1 year,Pit Bull Mix,Red/White,12,1,A915617-1


---

## Outcome

In [19]:
# Load the raw outcome records and preview the first rows.
outcome_csv_path = '../data/austin_animal_center_outcomes_20241017.csv'
outcome = pd.read_csv(outcome_csv_path)
outcome.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A882831,*Hamilton,07/01/2023 06:12:00 PM,Jul 2023,03/25/2023,Adoption,,Cat,Neutered Male,3 months,Domestic Shorthair Mix,Black/White
1,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
2,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
3,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
4,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff


In [20]:
# Normalise column names to snake_case (lower-case, spaces -> underscores),
# then inspect dtypes and non-null counts.
outcome.columns = outcome.columns.str.lower().str.replace(' ', '_')
outcome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167942 entries, 0 to 167941
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   animal_id         167942 non-null  object
 1   name              119733 non-null  object
 2   datetime          167942 non-null  object
 3   monthyear         167942 non-null  object
 4   date_of_birth     167942 non-null  object
 5   outcome_type      167896 non-null  object
 6   outcome_subtype   77144 non-null   object
 7   animal_type       167942 non-null  object
 8   sex_upon_outcome  167940 non-null  object
 9   age_upon_outcome  167926 non-null  object
 10  breed             167942 non-null  object
 11  color             167942 non-null  object
dtypes: object(12)
memory usage: 15.4+ MB


In [21]:
# Parse the outcome timestamps with a single vectorised call.
# `.apply(pd.to_datetime)` parsed each row separately, which is very slow on a
# frame of this size. The explicit format matches the values shown by the
# earlier `outcome.head()` (e.g. "07/01/2023 06:12:00 PM"); a row in any other
# layout now raises instead of being silently guessed.
outcome['datetime'] = pd.to_datetime(outcome['datetime'], format='%m/%d/%Y %I:%M:%S %p')
# outcome['monthyear'] = outcome['monthyear'].apply(pd.to_datetime)
outcome.head()

Unnamed: 0,animal_id,name,datetime,monthyear,date_of_birth,outcome_type,outcome_subtype,animal_type,sex_upon_outcome,age_upon_outcome,breed,color
0,A882831,*Hamilton,2023-07-01 18:12:00,Jul 2023,03/25/2023,Adoption,,Cat,Neutered Male,3 months,Domestic Shorthair Mix,Black/White
1,A794011,Chunk,2019-05-08 18:20:00,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
2,A776359,Gizmo,2018-07-18 16:02:00,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
3,A821648,,2020-08-16 11:38:00,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
4,A720371,Moose,2016-02-13 17:59:00,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff


In [22]:
# Show only the columns that contain at least one missing value.
outcome_null_counts = outcome.isna().sum()
outcome_null_counts[outcome_null_counts > 0]

name                48209
outcome_type           46
outcome_subtype     90798
sex_upon_outcome        2
age_upon_outcome       16
dtype: int64

In [23]:
# Share of each outcome type, with missing values counted as their own category.
outcome['outcome_type'].value_counts(normalize=True, dropna=False)

outcome_type
Adoption           0.481934
Transfer           0.282764
Return to Owner    0.150224
Euthanasia         0.062444
Died               0.009527
Rto-Adopt          0.007104
Disposal           0.004990
Missing            0.000530
NaN                0.000274
Relocate           0.000161
Stolen             0.000030
Lost               0.000018
Name: proportion, dtype: float64

In [24]:
# Share of each outcome subtype, with missing values counted as their own category.
outcome['outcome_subtype'].value_counts(normalize=True, dropna=False)

outcome_subtype
NaN                    0.540651
Partner                0.234700
Foster                 0.101732
Rabies Risk            0.028355
Suffering              0.023836
Snr                    0.023198
SCRP                   0.019120
Out State              0.005544
In Kennel              0.004984
Aggressive             0.003614
Offsite                0.003025
In Foster              0.002418
At Vet                 0.002060
Medical                0.002036
Field                  0.001375
Behavior               0.001018
Enroute                0.000655
Court/Investigation    0.000512
Underage               0.000220
Emergency              0.000202
In Surgery             0.000196
Prc                    0.000119
Customer S             0.000107
Possible Theft         0.000095
Barn                   0.000095
In State               0.000071
Emer                   0.000060
Name: proportion, dtype: float64

In [25]:
# Drop the two columns with the most missing values (see the null counts above).
# `errors='ignore'` keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
outcome.drop(columns=['name', 'outcome_subtype'], inplace=True, errors='ignore')

In [26]:
# Re-check missing values per column after dropping `name` and `outcome_subtype`.
outcome.isnull().sum()

animal_id            0
datetime             0
monthyear            0
date_of_birth        0
outcome_type        46
animal_type          0
sex_upon_outcome     2
age_upon_outcome    16
breed                0
color                0
dtype: int64

In [27]:
# Remove, in place, every row that has a missing value in any column.
outcome.dropna(how='any', inplace=True)

In [28]:
# Remove fully duplicated rows (keeping the first occurrence) and renumber the index.
outcome.drop_duplicates(inplace=True, keep='first', ignore_index=True)

In [29]:
# Derive a numeric age (in months) from the free-text outcome age.
outcome['age_in_month_upon_outcome'] = outcome['age_upon_outcome'].map(convert_age)

In [30]:
# Order each animal's outcomes chronologically; the visit numbering below relies on this order.
outcome.sort_values(['animal_id', 'datetime'], inplace=True)

In [31]:
# Code written by Paddy
# Number each animal's outcomes 1, 2, 3, ... in the current row order, then
# build an '<animal_id>-<visit number>' key for joining with the intakes.
outcome_visit_number = outcome.groupby('animal_id').cumcount() + 1
outcome['stay'] = outcome_visit_number.astype('str')
outcome['animal_stay'] = outcome['animal_id'] + '-' + outcome['stay']
outcome

Unnamed: 0,animal_id,datetime,monthyear,date_of_birth,outcome_type,animal_type,sex_upon_outcome,age_upon_outcome,breed,color,age_in_month_upon_outcome,stay,animal_stay
145574,A006100,2014-03-08 17:10:00,Mar 2014,07/09/2007,Return to Owner,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White,72,1,A006100-1
71733,A006100,2014-12-20 16:35:00,Dec 2014,07/09/2007,Return to Owner,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White,84,2,A006100-2
128212,A006100,2017-12-07 00:00:00,Dec 2017,07/09/2007,Return to Owner,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White,120,3,A006100-3
49520,A047759,2014-04-07 15:12:00,Apr 2014,04/02/2004,Transfer,Dog,Neutered Male,10 years,Dachshund,Tricolor,120,1,A047759-1
102940,A134067,2013-11-16 11:54:00,Nov 2013,10/16/1997,Return to Owner,Dog,Neutered Male,16 years,Shetland Sheepdog,Brown/White,192,1,A134067-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
167810,A915507,2024-10-16 13:35:00,Oct 2024,05/16/2024,Return to Owner,Cat,Intact Male,5 months,Domestic Shorthair,Orange,5,1,A915507-1
167830,A915518,2024-10-16 17:01:00,Oct 2024,10/04/2024,Transfer,Cat,Intact Female,1 weeks,Domestic Shorthair,Tortie,0,1,A915518-1
167832,A915519,2024-10-16 17:01:00,Oct 2024,10/04/2024,Transfer,Cat,Intact Male,1 weeks,Domestic Shorthair,Blue,0,1,A915519-1
167824,A915520,2024-10-16 17:02:00,Oct 2024,10/04/2024,Transfer,Cat,Intact Female,1 weeks,Domestic Shorthair,Blue,0,1,A915520-1


In [32]:
# outcome['Outcome_Type'].value_counts(normalize=True, dropna=False).plot(kind='bar');

In [33]:
# outcome['Age_upon_Outcome'].value_counts()

In [34]:
# outcome['Age_in_month_upon_outcome'] = outcome['Age_upon_Outcome'].apply(convert_age)

In [35]:
# outcome['Age_in_month_upon_outcome'].value_counts().plot(kind='hist');

In [36]:
# adoption = outcome[outcome['Outcome_Type']=='Adoption'].copy()
# adoption.shape

In [37]:
# adoption.describe()

In [38]:
# adoption['Animal_Type'].value_counts()

In [39]:
# print(adoption['Sex_upon_Outcome'].nunique())
# adoption['Sex_upon_Outcome'].value_counts()

In [40]:
# adoption['Age_in_month_upon_outcome'].value_counts().plot(kind='hist');

In [41]:
# print(adoption.Breed.nunique())
# adoption.Breed.value_counts().head(20)

In [42]:
# print(adoption.Color.nunique())
# adoption.Color.value_counts().head(20)

---

## Merge

In [43]:
# How many intake rows belong to an animal that also appears in the outcome table?
intake_has_outcome = intake['animal_id'].isin(outcome['animal_id'])
intake[intake_has_outcome].shape

(167047, 14)

In [44]:
# Pair every intake with the outcome that has the same '<animal_id>-<visit number>' key.
# `validate='one_to_one'` makes the merge fail loudly if the key is ever
# duplicated on either side, instead of silently multiplying rows.
# NOTE(review): the pairing is purely by visit number. Rows were dropped
# independently from each table, so the n-th intake is not guaranteed to be the
# same visit as the n-th outcome. The duration check further down reports 789
# non-positive durations, so this needs confirming.
df = pd.merge(
    intake,
    outcome,
    on='animal_stay',
    how='inner',
    suffixes=('_income', '_outcome'),
    validate='one_to_one',
)
df.head()

Unnamed: 0,animal_id_income,datetime_income,monthyear_income,found_location,intake_type,intake_condition,animal_type_income,sex_upon_intake,age_upon_intake,breed_income,...,monthyear_outcome,date_of_birth,outcome_type,animal_type_outcome,sex_upon_outcome,age_upon_outcome,breed_outcome,color_outcome,age_in_month_upon_outcome,stay_outcome
0,A006100,2014-03-07 14:26:00,March 2014,8700 Research in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Spinone Italiano Mix,...,Mar 2014,07/09/2007,Return to Owner,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White,72,1
1,A006100,2014-12-19 10:21:00,December 2014,8700 Research Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,7 years,Spinone Italiano Mix,...,Dec 2014,07/09/2007,Return to Owner,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White,84,2
2,A006100,2017-12-07 14:07:00,December 2017,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Dog,Neutered Male,10 years,Spinone Italiano Mix,...,Dec 2017,07/09/2007,Return to Owner,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White,120,3
3,A047759,2014-04-02 15:55:00,April 2014,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,10 years,Dachshund,...,Apr 2014,04/02/2004,Transfer,Dog,Neutered Male,10 years,Dachshund,Tricolor,120,1
4,A134067,2013-11-16 09:02:00,November 2013,12034 Research Blvd in Austin (TX),Public Assist,Injured,Dog,Neutered Male,16 years,Shetland Sheepdog,...,Nov 2013,10/16/1997,Return to Owner,Dog,Neutered Male,16 years,Shetland Sheepdog,Brown/White,192,1


In [45]:
# Inspect dtypes and non-null counts of the merged table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166920 entries, 0 to 166919
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   animal_id_income           166920 non-null  object        
 1   datetime_income            166920 non-null  datetime64[ns]
 2   monthyear_income           166920 non-null  object        
 3   found_location             166920 non-null  object        
 4   intake_type                166920 non-null  object        
 5   intake_condition           166920 non-null  object        
 6   animal_type_income         166920 non-null  object        
 7   sex_upon_intake            166920 non-null  object        
 8   age_upon_intake            166920 non-null  object        
 9   breed_income               166920 non-null  object        
 10  color_income               166920 non-null  object        
 11  age_in_month_upon_intake   166920 non-null  int64   

In [46]:
# Confirm the merged table has no missing values (an empty result means none).
df_null_counts = df.isna().sum()
df_null_counts[df_null_counts > 0]

Series([], dtype: int64)

In [47]:
# List the merged column names to see which ones received the _income/_outcome suffixes.
df.columns

Index(['animal_id_income', 'datetime_income', 'monthyear_income',
       'found_location', 'intake_type', 'intake_condition',
       'animal_type_income', 'sex_upon_intake', 'age_upon_intake',
       'breed_income', 'color_income', 'age_in_month_upon_intake',
       'stay_income', 'animal_stay', 'animal_id_outcome', 'datetime_outcome',
       'monthyear_outcome', 'date_of_birth', 'outcome_type',
       'animal_type_outcome', 'sex_upon_outcome', 'age_upon_outcome',
       'breed_outcome', 'color_outcome', 'age_in_month_upon_outcome',
       'stay_outcome'],
      dtype='object')

In [48]:
# For each attribute present on both sides of the merge, print whether the
# intake and outcome copies agree on every row.
for shared_col in ('animal_id', 'animal_type', 'breed', 'color', 'stay'):
    print((df[f'{shared_col}_income'] == df[f'{shared_col}_outcome']).all())

True
True
True
True
True


In [49]:
# The equality checks in the preceding cell showed that both copies of these
# columns are identical, so one copy of each is dropped and the surviving
# id/stay columns get suffix-free names.
# `errors='ignore'` keeps the cell re-runnable: without it a second execution
# raises KeyError on the already-removed columns. The rename ignores missing
# keys by default.
df.drop(
    columns=['animal_id_outcome', 'animal_type_income', 'breed_income', 'color_income', 'stay_income'],
    inplace=True,
    errors='ignore',
)
df.rename(columns={'animal_id_income': 'animal_id', 'stay_outcome': 'stay'}, inplace=True)
df.head()

Unnamed: 0,animal_id,datetime_income,monthyear_income,found_location,intake_type,intake_condition,sex_upon_intake,age_upon_intake,age_in_month_upon_intake,animal_stay,...,monthyear_outcome,date_of_birth,outcome_type,animal_type_outcome,sex_upon_outcome,age_upon_outcome,breed_outcome,color_outcome,age_in_month_upon_outcome,stay
0,A006100,2014-03-07 14:26:00,March 2014,8700 Research in Austin (TX),Public Assist,Normal,Neutered Male,6 years,72,A006100-1,...,Mar 2014,07/09/2007,Return to Owner,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White,72,1
1,A006100,2014-12-19 10:21:00,December 2014,8700 Research Blvd in Austin (TX),Public Assist,Normal,Neutered Male,7 years,84,A006100-2,...,Dec 2014,07/09/2007,Return to Owner,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White,84,2
2,A006100,2017-12-07 14:07:00,December 2017,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Neutered Male,10 years,120,A006100-3,...,Dec 2017,07/09/2007,Return to Owner,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White,120,3
3,A047759,2014-04-02 15:55:00,April 2014,Austin (TX),Owner Surrender,Normal,Neutered Male,10 years,120,A047759-1,...,Apr 2014,04/02/2004,Transfer,Dog,Neutered Male,10 years,Dachshund,Tricolor,120,1
4,A134067,2013-11-16 09:02:00,November 2013,12034 Research Blvd in Austin (TX),Public Assist,Injured,Neutered Male,16 years,192,A134067-1,...,Nov 2013,10/16/1997,Return to Owner,Dog,Neutered Male,16 years,Shetland Sheepdog,Brown/White,192,1


In [50]:
# Re-inspect the merged table after removing the redundant columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166920 entries, 0 to 166919
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   animal_id                  166920 non-null  object        
 1   datetime_income            166920 non-null  datetime64[ns]
 2   monthyear_income           166920 non-null  object        
 3   found_location             166920 non-null  object        
 4   intake_type                166920 non-null  object        
 5   intake_condition           166920 non-null  object        
 6   sex_upon_intake            166920 non-null  object        
 7   age_upon_intake            166920 non-null  object        
 8   age_in_month_upon_intake   166920 non-null  int64         
 9   animal_stay                166920 non-null  object        
 10  datetime_outcome           166920 non-null  datetime64[ns]
 11  monthyear_outcome          166920 non-null  object  

In [51]:
# The ten animal ids with the most rows (i.e. the most recorded stays) in the merged table.
df['animal_id'].value_counts().head(10)

animal_id
A721033    33
A718223    14
A718877    12
A706536    11
A705625    11
A761266    10
A616444     9
A700407     9
A737814     9
A717053     9
Name: count, dtype: int64

In [52]:
# Length of each stay: outcome timestamp minus intake timestamp.
df['duration'] = df['datetime_outcome'].sub(df['datetime_income'])

In [53]:
# Count stays with a strictly positive duration versus the rest.
# TODO: the non-positive durations still need to be dealt with.
is_positive_duration = df['duration'] > pd.Timedelta(0)
is_positive_duration.value_counts()

duration
True     166131
False       789
Name: count, dtype: int64

In [54]:
# Number of distinct breed labels before any simplification.
df.breed_outcome.nunique()

2959

In [55]:
# The 20 most frequent breed labels; several differ only by suffixes such as "Mix".
df.breed_outcome.value_counts().head(20)

breed_outcome
Domestic Shorthair Mix       33598
Domestic Shorthair           21479
Pit Bull Mix                  9888
Labrador Retriever Mix        8509
Chihuahua Shorthair Mix       6832
German Shepherd Mix           3951
Domestic Medium Hair Mix      3330
Pit Bull                      3164
Bat                           2234
Australian Cattle Dog Mix     1936
Domestic Medium Hair          1923
Chihuahua Shorthair           1899
Labrador Retriever            1881
Bat Mix                       1753
German Shepherd               1703
Domestic Longhair Mix         1671
Siamese Mix                   1441
Dachshund Mix                 1171
Boxer Mix                     1158
Border Collie Mix             1132
Name: count, dtype: int64

In [56]:
# Substrings stripped from a breed label before it is shortened.
remove_list = ['Mix', 'Shorthair', 'Medium Hair', 'Longhair']

def base_breed(breed, str_num=2):
    '''
    Simplify a breed label to its leading words.

    Every entry of the module-level `remove_list` is removed from the label
    (plain substring removal, applied in list order), then the first
    `str_num` whitespace-separated words of what remains are kept.

    Parameters:
    breed (str): The original breed name.
    str_num (int): The number of leading words to keep.

    Returns:
    str: The simplified breed name (an empty string if nothing remains).
    '''
    cleaned = breed
    for unwanted in remove_list:
        cleaned = cleaned.replace(unwanted, '')
        cleaned = cleaned.strip()
    leading_words = cleaned.split()[:str_num]
    # Words produced by split() carry no surrounding whitespace, so the joined
    # result needs no further stripping.
    return ' '.join(leading_words)

In [57]:
# Two-word simplified breed: print how many distinct values remain, then show their frequencies.
df['base_2'] = df['breed_outcome'].apply(base_breed)
print(df['base_2'].nunique())
df['base_2'].value_counts()

2195


base_2
Domestic                        62722
Pit Bull                        13052
Labrador Retriever              10390
Chihuahua                        9455
German Shepherd                  5654
                                ...  
Tropical                            1
French Bulldog/English              1
Beagle/Mastiff                      1
Golden Retriever/Catahoula          1
Brittany/Australian Shepherd        1
Name: count, Length: 2195, dtype: int64

In [58]:
# One-word simplified breed: print how many distinct values remain, then show their frequencies.
df['base_1'] = df['breed_outcome'].apply(base_breed, str_num=1)
print(df['base_1'].nunique())
df['base_1'].value_counts()

814


base_1
Domestic                62757
Pit                     13942
Labrador                12982
Chihuahua               10722
German                   6902
                        ...  
Dutch/Angora-Satin          1
Staffordshire/Border        1
Tropical                    1
Beagle/Mastiff              1
Brittany/Australian         1
Name: count, Length: 814, dtype: int64