# Reading in data and MAJOR CLEANING


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt

In [2]:
# Read in csv
df = pd.read_csv("Resources/Animal_Shelter_Louisville.csv")
df.head(10)


Unnamed: 0,AnimalID,AnimalType,IntakeDate,IntakeType,IntakeSubtype,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,SecondaryColor,...,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType,OutcomeSubtype,OutcomeReason,OutcomeInternalStatus,OutcomeAsilomarStatus,ReproductiveStatusAtOutcome
0,A366370,CAT,7/11/2008 10:50,STRAY,OTC,WHITE,DOMESTIC SHORTHAIR,,NEUTERED MALE,BROWN,...,FEARFUL,HEALTHY,ALTERED,12/11/2008 15:46,EUTH,FERAL,,,UNHEALTHY/UNTREATABLE,ALTERED
1,A366531,CAT,10/11/2008 10:20,STRAY,OTC,BLACK,DOMESTIC SHORTHAIR,DOMESTIC SHORTHAIR,UNKNOWN,,...,NORMAL,HEALTHY,UNKNOWN,19/11/2008 20:10,EUTH,CONTAG DIS,,SICK,HEALTHY,UNKNOWN
2,A532367,BIRD,23/7/2014 23:21,CONFISCATE,CRUELTY,RED,CHICKEN,,MALE,BLACK,...,OTHER,HEALTHY,FERTILE,5/11/2014 15:49,TRANSFER,,,,HEALTHY,FERTILE
3,A532474,OTHER,24/7/2014 18:29,ET REQUEST,,BROWN,BAT,,UNKNOWN,,...,OTHER,HEALTHY,UNKNOWN,24/7/2014 23:59,EUTH,MEDICAL,,OTHER,HEALTHY,UNKNOWN
4,A281756,DOG,11/9/2006 18:10,OWNER SUR,OTC,WHITE,PIT BULL TERRIER,,MALE,BROWN,...,NORMAL,HEALTHY,FERTILE,12/9/2006 13:44,EUTH,TIME/SPACE,,,HEALTHY,FERTILE
5,A451184,BIRD,29/1/2012 15:25,OWNER SUR,FIELD,BLACK,CHICKEN,,UNKNOWN,WHITE,...,NORMAL,HEALTHY,UNKNOWN,22/2/2012 23:59,TRANSFER,RESCUE GRP,,NORMAL,HEALTHY,UNKNOWN
6,A256128,DOG,26/11/2005 12:35,STRAY,FIELD,BROWN,AMERICAN PIT BULL TERRIER,MIX,MALE,WHITE,...,NORMAL,HEALTHY,FERTILE,8/12/2005 23:59,EUTH,MEDICAL,,,HEALTHY,FERTILE
7,A314432,CAT,15/6/2007 17:13,OWNER SUR,OTC,BROWN TABBY,DOMESTIC SHORTHAIR,,UNKNOWN,,...,NORMAL,HEALTHY,UNKNOWN,15/6/2007 18:07,EUTH,TIME/SPACE,,,HEALTHY,UNKNOWN
8,A316619,DOG,29/6/2007 20:10,STRAY,FIELD,WHITE,LABRADOR RETRIEVER,MIX,MALE,TAN,...,FEARFUL,HEALTHY,FERTILE,4/7/2007 13:12,EUTH,TIME/SPACE,,,HEALTHY,FERTILE
9,A317335,CAT,5/7/2007 21:30,STRAY,OTC,GRAY,DOMESTIC SHORTHAIR,,UNKNOWN,,...,NORMAL,HEALTHY,UNKNOWN,11/7/2007 9:19,EUTH,TIME/SPACE,,,HEALTHY,UNKNOWN


In [3]:
# Check for nulls
df.isna().sum()

AnimalID                            0
AnimalType                          0
IntakeDate                          0
IntakeType                          0
IntakeSubtype                    4556
PrimaryColor                       17
PrimaryBreed                        0
SecondaryBreed                 110310
Gender                              0
SecondaryColor                  70263
DOB                             56224
IntakeReason                   116037
IntakeInternalStatus                0
IntakeAsilomarStatus                0
ReproductiveStatusAtIntake          0
OutcomeDate                       202
OutcomeType                       452
OutcomeSubtype                  26982
OutcomeReason                  150842
OutcomeInternalStatus          101121
OutcomeAsilomarStatus             202
ReproductiveStatusAtOutcome       202
dtype: int64

In [4]:
df.columns

Index(['AnimalID', 'AnimalType', 'IntakeDate', 'IntakeType', 'IntakeSubtype',
       'PrimaryColor', 'PrimaryBreed', 'SecondaryBreed', 'Gender',
       'SecondaryColor', 'DOB', 'IntakeReason', 'IntakeInternalStatus',
       'IntakeAsilomarStatus', 'ReproductiveStatusAtIntake', 'OutcomeDate',
       'OutcomeType', 'OutcomeSubtype', 'OutcomeReason',
       'OutcomeInternalStatus', 'OutcomeAsilomarStatus',
       'ReproductiveStatusAtOutcome'],
      dtype='object')

In [5]:
# Drop unnecessary columns
df2 = df.drop(columns=['AnimalID',
                       'IntakeType', 
                       'IntakeSubtype', 
                       'OutcomeSubtype',
                      'SecondaryColor',
                       'IntakeReason',
                      'ReproductiveStatusAtOutcome',
                       'OutcomeReason',
                      'OutcomeInternalStatus',
                      'OutcomeAsilomarStatus'])

In [6]:
df2.head()


Unnamed: 0,AnimalType,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType
0,CAT,7/11/2008 10:50,WHITE,DOMESTIC SHORTHAIR,,NEUTERED MALE,,FEARFUL,HEALTHY,ALTERED,12/11/2008 15:46,EUTH
1,CAT,10/11/2008 10:20,BLACK,DOMESTIC SHORTHAIR,DOMESTIC SHORTHAIR,UNKNOWN,,NORMAL,HEALTHY,UNKNOWN,19/11/2008 20:10,EUTH
2,BIRD,23/7/2014 23:21,RED,CHICKEN,,MALE,,OTHER,HEALTHY,FERTILE,5/11/2014 15:49,TRANSFER
3,OTHER,24/7/2014 18:29,BROWN,BAT,,UNKNOWN,,OTHER,HEALTHY,UNKNOWN,24/7/2014 23:59,EUTH
4,DOG,11/9/2006 18:10,WHITE,PIT BULL TERRIER,,MALE,11/9/2005 0:00,NORMAL,HEALTHY,FERTILE,12/9/2006 13:44,EUTH


In [7]:
# Change object to datetime so we can calculate AgeInMonth and DurationInShelter
df2["IntakeDate"] = pd.to_datetime(df2["IntakeDate"])
df2["DOB"] = pd.to_datetime(df2["DOB"])
df2["OutcomeDate"] = pd.to_datetime(df2["OutcomeDate"])

In [8]:
df2.dtypes

AnimalType                            object
IntakeDate                    datetime64[ns]
PrimaryColor                          object
PrimaryBreed                          object
SecondaryBreed                        object
Gender                                object
DOB                           datetime64[ns]
IntakeInternalStatus                  object
IntakeAsilomarStatus                  object
ReproductiveStatusAtIntake            object
OutcomeDate                   datetime64[ns]
OutcomeType                           object
dtype: object

In [9]:
df2.head()

Unnamed: 0,AnimalType,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType
0,CAT,2008-07-11 10:50:00,WHITE,DOMESTIC SHORTHAIR,,NEUTERED MALE,NaT,FEARFUL,HEALTHY,ALTERED,2008-12-11 15:46:00,EUTH
1,CAT,2008-10-11 10:20:00,BLACK,DOMESTIC SHORTHAIR,DOMESTIC SHORTHAIR,UNKNOWN,NaT,NORMAL,HEALTHY,UNKNOWN,2008-11-19 20:10:00,EUTH
2,BIRD,2014-07-23 23:21:00,RED,CHICKEN,,MALE,NaT,OTHER,HEALTHY,FERTILE,2014-05-11 15:49:00,TRANSFER
3,OTHER,2014-07-24 18:29:00,BROWN,BAT,,UNKNOWN,NaT,OTHER,HEALTHY,UNKNOWN,2014-07-24 23:59:00,EUTH
4,DOG,2006-11-09 18:10:00,WHITE,PIT BULL TERRIER,,MALE,2005-11-09,NORMAL,HEALTHY,FERTILE,2006-12-09 13:44:00,EUTH


In [10]:
df2.groupby("AnimalType").count()

Unnamed: 0_level_0,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType
AnimalType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BIRD,1185,1185,1185,69,1185,310,1185,1185,1185,1185,1176
CAT,68741,68738,68741,2658,68741,39386,68741,68741,68741,68680,68563
DOG,77145,77131,77145,37715,77145,53709,77145,77145,77145,77008,76905
FERRET,91,91,91,0,91,51,91,91,91,91,91
LIVESTOCK,256,256,256,28,256,109,256,256,256,254,253
OTHER,1341,1341,1341,2,1341,292,1341,1341,1341,1340,1322
RABBIT,1139,1139,1139,30,1139,436,1139,1139,1139,1138,1138
REPTILE,263,263,263,26,263,86,263,263,263,263,262
RODENT,681,681,681,4,681,239,681,681,681,681,680


In [11]:
# Drop all rows NOT dog!
df3 = df2.drop(df2[(df2['AnimalType'] == "BIRD") | 
                   (df2['AnimalType'] == "CAT") |
                   (df2['AnimalType'] == "FERRET") |
                   (df2['AnimalType'] == "LIVESTOCK") |
                   (df2['AnimalType'] == "OTHER") |
                   (df2['AnimalType'] == "RABBIT") |
                   (df2['AnimalType'] == "REPTILE") |
                   (df2['AnimalType'] == "RODENT")].index)

In [12]:
df3.groupby("AnimalType").count()

Unnamed: 0_level_0,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType
AnimalType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DOG,77145,77131,77145,37715,77145,53709,77145,77145,77145,77008,76905


# More cleaning! Cleaning WITHIN dog dataset
1. Drop unnessary columns
2. Drop rows where "Outcome_Type" is null
3. Drop rows where "Outcome_Type" is Return_To_Owner

In [13]:
# Found dog data, drop AnimalType column
df3 = df3.drop(columns=['AnimalType'])

In [14]:
# Quick check for missing values
# Solution - Going to drop NA in OutcomeType first, then work on others.  We will NOT drop Secondary breed until we
#            create Kelly's 3 BreedCategory column.
df3.isna().sum()

IntakeDate                        0
PrimaryColor                     14
PrimaryBreed                      0
SecondaryBreed                39430
Gender                            0
DOB                           23436
IntakeInternalStatus              0
IntakeAsilomarStatus              0
ReproductiveStatusAtIntake        0
OutcomeDate                     137
OutcomeType                     240
dtype: int64

In [15]:
df3.columns

Index(['IntakeDate', 'PrimaryColor', 'PrimaryBreed', 'SecondaryBreed',
       'Gender', 'DOB', 'IntakeInternalStatus', 'IntakeAsilomarStatus',
       'ReproductiveStatusAtIntake', 'OutcomeDate', 'OutcomeType'],
      dtype='object')

In [16]:
# Drop rows where OutcomeType is null

df3 = df3.loc[df3.OutcomeType.notna(), ["IntakeDate", 
                                        'PrimaryColor',
                                        "PrimaryBreed", 
                                        "SecondaryBreed", 
                                        "Gender",
                                        'DOB',
                                        "IntakeInternalStatus", 
                                        'IntakeAsilomarStatus', 
                                        'ReproductiveStatusAtIntake', 
                                        'OutcomeDate',
                                        'OutcomeType']]


In [17]:
df3.shape

(76905, 11)

In [18]:
# Check for more nulls

# df3.apply(lambda x: sum(x.isnull()/len(df3)))
df3.isna().sum()

IntakeDate                        0
PrimaryColor                     14
PrimaryBreed                      0
SecondaryBreed                39296
Gender                            0
DOB                           23414
IntakeInternalStatus              0
IntakeAsilomarStatus              0
ReproductiveStatusAtIntake        0
OutcomeDate                       6
OutcomeType                       0
dtype: int64

In [19]:
# # df2.groupby('OutcomeType').count()
# df2.groupby('Color').count().sort_values('OutcomeType', ascending=False)
df3.groupby('OutcomeType').count()

Unnamed: 0_level_0,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate
OutcomeType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ADOPTION,15368,15366,15368,9060,15368,13675,15368,15368,15368,15368
DIED,716,716,716,346,716,327,716,716,716,716
DISPOSAL,1127,1127,1127,360,1127,968,1127,1127,1127,1127
ET PROCESS,30,30,30,17,30,8,30,30,30,30
EUTH,30088,30079,30088,14527,30088,14610,30088,30088,30088,30087
FOSTER,1032,1032,1032,491,1032,998,1032,1032,1032,1030
INDEFINITE,1,1,1,1,1,0,1,1,1,1
MISSING,501,501,501,301,501,225,501,501,501,501
MISSING EX,2,2,2,1,2,2,2,2,2,2
NO SHOW,34,34,34,6,34,22,34,34,34,34


In [20]:
# Drop all rows 'RTO'. Assuming that chip is checked at intake

df3 = df3.drop(df3[df3.OutcomeType == "RTO"].index)

In [21]:
df3.shape

(63710, 11)

In [22]:
# More nulls?  Keeping secondary breed for now
# df3.apply(lambda x: sum(x.isnull()/len(df3)))
df3.isna().sum()

IntakeDate                        0
PrimaryColor                     12
PrimaryBreed                      0
SecondaryBreed                31546
Gender                            0
DOB                           20875
IntakeInternalStatus              0
IntakeAsilomarStatus              0
ReproductiveStatusAtIntake        0
OutcomeDate                       3
OutcomeType                       0
dtype: int64

In [23]:
df3.loc[df3['DOB'].isna()]

Unnamed: 0,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType
6,2005-11-26 12:35:00,BROWN,AMERICAN PIT BULL TERRIER,MIX,MALE,NaT,NORMAL,HEALTHY,FERTILE,2005-08-12 23:59:00,EUTH
8,2007-06-29 20:10:00,WHITE,LABRADOR RETRIEVER,MIX,MALE,NaT,FEARFUL,HEALTHY,FERTILE,2007-04-07 13:12:00,EUTH
15,2007-07-19 22:32:00,TRICOLOR,BEAGLE,MIX,NEUTERED MALE,NaT,NORMAL,HEALTHY,ALTERED,2007-07-08 12:13:00,EUTH
17,2005-12-21 14:30:00,WHITE,PIT BULL TERRIER,,MALE,NaT,NORMAL,HEALTHY,FERTILE,2005-12-29 11:05:00,EUTH
18,2005-12-22 12:23:00,BROWN BRINDLE,AMERICAN PIT BULL TERRIER,AMERICAN PIT BULL TERRIER,MALE,NaT,NORMAL,HEALTHY,FERTILE,2005-12-28 10:35:00,EUTH
...,...,...,...,...,...,...,...,...,...,...,...
150756,2009-05-12 12:58:00,TRICOLOR,POMERANIAN,,SPAYED FEMALE,NaT,NORMAL,HEALTHY,FERTILE,2009-12-18 10:05:00,ADOPTION
150759,2011-03-17 23:22:00,WHITE,JACK RUSS TER,MIX,NEUTERED MALE,NaT,NORMAL,HEALTHY,FERTILE,2011-07-05 23:59:00,TRANSFER
150760,2010-03-28 14:50:00,BROWN,POMERANIAN,PAPILLON,SPAYED FEMALE,NaT,NORMAL,HEALTHY,FERTILE,2010-12-04 17:09:00,ADOPTION
150797,2009-11-24 09:37:00,TRICOLOR,BEAGLE,,SPAYED FEMALE,NaT,EMACIATED,HEALTHY,FERTILE,2010-12-02 18:57:00,ADOPTION


#### ANGIE!!!  Testing keeping DOB

In [24]:
# ORIGINAL CODE BY DROPPING DOB!!
# Drop NA values in DOB (can't have nulls to calculate AgeInMonths)
# df3 = df3.dropna(axis=0, subset=["DOB", 'PrimaryColor', 'OutcomeDate'])
# df3.shape


df3 = df3.dropna(axis=0, subset=['PrimaryColor', 'OutcomeDate'])
df3.shape

(63695, 11)

In [25]:
df3.isna().sum()

IntakeDate                        0
PrimaryColor                      0
PrimaryBreed                      0
SecondaryBreed                31538
Gender                            0
DOB                           20873
IntakeInternalStatus              0
IntakeAsilomarStatus              0
ReproductiveStatusAtIntake        0
OutcomeDate                       0
OutcomeType                       0
dtype: int64

In [26]:
# Create AgeInMonth column since data is cleaner now
df3['AgeInMonths'] = round(((df3['IntakeDate'] - df3['DOB']).dt.days)/30, 2)
df3.head()

Unnamed: 0,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType,AgeInMonths
4,2006-11-09 18:10:00,WHITE,PIT BULL TERRIER,,MALE,2005-11-09,NORMAL,HEALTHY,FERTILE,2006-12-09 13:44:00,EUTH,12.17
6,2005-11-26 12:35:00,BROWN,AMERICAN PIT BULL TERRIER,MIX,MALE,NaT,NORMAL,HEALTHY,FERTILE,2005-08-12 23:59:00,EUTH,
8,2007-06-29 20:10:00,WHITE,LABRADOR RETRIEVER,MIX,MALE,NaT,FEARFUL,HEALTHY,FERTILE,2007-04-07 13:12:00,EUTH,
15,2007-07-19 22:32:00,TRICOLOR,BEAGLE,MIX,NEUTERED MALE,NaT,NORMAL,HEALTHY,ALTERED,2007-07-08 12:13:00,EUTH,
17,2005-12-21 14:30:00,WHITE,PIT BULL TERRIER,,MALE,NaT,NORMAL,HEALTHY,FERTILE,2005-12-29 11:05:00,EUTH,


In [27]:
# Create DurationInShelter column
df3['DurationInShelter'] = round(((df3['OutcomeDate'] - df3['IntakeDate']).dt.days), 2)
df3.head()

Unnamed: 0,IntakeDate,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,DOB,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeDate,OutcomeType,AgeInMonths,DurationInShelter
4,2006-11-09 18:10:00,WHITE,PIT BULL TERRIER,,MALE,2005-11-09,NORMAL,HEALTHY,FERTILE,2006-12-09 13:44:00,EUTH,12.17,29
6,2005-11-26 12:35:00,BROWN,AMERICAN PIT BULL TERRIER,MIX,MALE,NaT,NORMAL,HEALTHY,FERTILE,2005-08-12 23:59:00,EUTH,,-106
8,2007-06-29 20:10:00,WHITE,LABRADOR RETRIEVER,MIX,MALE,NaT,FEARFUL,HEALTHY,FERTILE,2007-04-07 13:12:00,EUTH,,-84
15,2007-07-19 22:32:00,TRICOLOR,BEAGLE,MIX,NEUTERED MALE,NaT,NORMAL,HEALTHY,ALTERED,2007-07-08 12:13:00,EUTH,,-12
17,2005-12-21 14:30:00,WHITE,PIT BULL TERRIER,,MALE,NaT,NORMAL,HEALTHY,FERTILE,2005-12-29 11:05:00,EUTH,,7


In [28]:
df4 = df3.drop(columns=['IntakeDate',
                       'DOB', 
                       'OutcomeDate'])

In [29]:
# More nulls?  Keeping secondary breed for now
df4.isna().sum()

PrimaryColor                      0
PrimaryBreed                      0
SecondaryBreed                31538
Gender                            0
IntakeInternalStatus              0
IntakeAsilomarStatus              0
ReproductiveStatusAtIntake        0
OutcomeType                       0
AgeInMonths                   20873
DurationInShelter                 0
dtype: int64

In [30]:
df4.shape

(63695, 10)

# Add new features? Feature Engineering
1. New feature BreedCategory column - Mixed/Two/Pure Breed  
2. New feature InternalStatus column - Grouping "IntakeInternalStatus" eg. grouping all aggressive into 1 main agg group
3. Creating PetAgeCategory column - Baby/Young/Adult/Senior
4. Creating Sex column - Male/Female (With no internal status)
5. Creating BinaryOutcome column - Take or Don't take

#### Feature Engineering 1 - Creating a BreedCategory column
* 0 = Breed has the word "Mix"
* 1 = If there are 2 breeds listed
* 2 = Only 1 breed in the primary (i.e. None in secondary OR Pure)

In [31]:
df4.loc[500:1000, :]

Unnamed: 0,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter
500,WHITE,AMERICAN STAFFORDSHIRE TERRIER,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140
502,RED,WELSH CORGI - PEMBROKE,MIX,FEMALE,AGED,TREATABLE/MANAGEABLE,FERTILE,EUTH,,5
504,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,5
505,BLACK,PARSON (JACK) RUSSELL TERRIER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7
507,BLACK,ROTTWEILER,MIX,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,183
...,...,...,...,...,...,...,...,...,...,...
986,BROWN,BLOODHOUND,DOBERMAN PINSCHER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140
988,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,MISSING,,-142
991,BLACK,YORKSHIRE TERRIER,MIX,SPAYED FEMALE,NORMAL,HEALTHY,ALTERED,ADOPTION,,91
996,BLACK,GERMAN SHEPHERD DOG,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,5


In [32]:
df4.isna().sum()

PrimaryColor                      0
PrimaryBreed                      0
SecondaryBreed                31538
Gender                            0
IntakeInternalStatus              0
IntakeAsilomarStatus              0
ReproductiveStatusAtIntake        0
OutcomeType                       0
AgeInMonths                   20873
DurationInShelter                 0
dtype: int64

In [33]:
# df3.SecondaryBreed.value_counts().sort_values(ascending=False).head(20)
# df3.SecondaryBreed.count()

In [34]:
# Add new column called BreedCategory, set all to Mix

df4["BreedCategory"]= "Two"
df4.loc[500:1000, :]

Unnamed: 0,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory
500,WHITE,AMERICAN STAFFORDSHIRE TERRIER,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140,Two
502,RED,WELSH CORGI - PEMBROKE,MIX,FEMALE,AGED,TREATABLE/MANAGEABLE,FERTILE,EUTH,,5,Two
504,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Two
505,BLACK,PARSON (JACK) RUSSELL TERRIER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Two
507,BLACK,ROTTWEILER,MIX,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,183,Two
...,...,...,...,...,...,...,...,...,...,...,...
986,BROWN,BLOODHOUND,DOBERMAN PINSCHER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140,Two
988,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,MISSING,,-142,Two
991,BLACK,YORKSHIRE TERRIER,MIX,SPAYED FEMALE,NORMAL,HEALTHY,ALTERED,ADOPTION,,91,Two
996,BLACK,GERMAN SHEPHERD DOG,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Two


In [35]:
# Find rows containing word "Nan" in SecondaryBreed and set to Pure

df4.loc[df4['SecondaryBreed'].isnull(),'BreedCategory'] = "Pure"
df4.loc[500:1000, :]



Unnamed: 0,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory
500,WHITE,AMERICAN STAFFORDSHIRE TERRIER,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140,Pure
502,RED,WELSH CORGI - PEMBROKE,MIX,FEMALE,AGED,TREATABLE/MANAGEABLE,FERTILE,EUTH,,5,Two
504,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Two
505,BLACK,PARSON (JACK) RUSSELL TERRIER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Two
507,BLACK,ROTTWEILER,MIX,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,183,Two
...,...,...,...,...,...,...,...,...,...,...,...
986,BROWN,BLOODHOUND,DOBERMAN PINSCHER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140,Two
988,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,MISSING,,-142,Two
991,BLACK,YORKSHIRE TERRIER,MIX,SPAYED FEMALE,NORMAL,HEALTHY,ALTERED,ADOPTION,,91,Two
996,BLACK,GERMAN SHEPHERD DOG,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Pure


In [36]:
# Find rows containing word "Mix"
df4.loc[df4["SecondaryBreed"].str.contains("MIX",  na=False), "BreedCategory"] = "Mix"
df4.loc[500:1000, :]

Unnamed: 0,PrimaryColor,PrimaryBreed,SecondaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory
500,WHITE,AMERICAN STAFFORDSHIRE TERRIER,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140,Pure
502,RED,WELSH CORGI - PEMBROKE,MIX,FEMALE,AGED,TREATABLE/MANAGEABLE,FERTILE,EUTH,,5,Mix
504,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Mix
505,BLACK,PARSON (JACK) RUSSELL TERRIER,MIX,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Mix
507,BLACK,ROTTWEILER,MIX,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,183,Mix
...,...,...,...,...,...,...,...,...,...,...,...
986,BROWN,BLOODHOUND,DOBERMAN PINSCHER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-140,Two
988,BLACK,LABRADOR RETRIEVER,MIX,MALE,NORMAL,HEALTHY,FERTILE,MISSING,,-142,Mix
991,BLACK,YORKSHIRE TERRIER,MIX,SPAYED FEMALE,NORMAL,HEALTHY,ALTERED,ADOPTION,,91,Mix
996,BLACK,GERMAN SHEPHERD DOG,,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Pure


In [37]:
# Drop SeondaryBreed column since BreedCategory is available
df4 = df4.drop(columns=['SecondaryBreed'])
df4.shape

(63695, 10)

In [38]:
df4.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix
8,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,-84,Mix
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure


In [39]:
df4.isna().sum()

PrimaryColor                      0
PrimaryBreed                      0
Gender                            0
IntakeInternalStatus              0
IntakeAsilomarStatus              0
ReproductiveStatusAtIntake        0
OutcomeType                       0
AgeInMonths                   20873
DurationInShelter                 0
BreedCategory                     0
dtype: int64

### Feature Engineering 2 - Trying to Group "IntakeInternalStatus" column
* From 25 subcategories to 5

In [40]:
df4.groupby('IntakeInternalStatus').count()

Unnamed: 0_level_0,PrimaryColor,PrimaryBreed,Gender,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory
IntakeInternalStatus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AGED,3246,3246,3246,3246,3246,3246,2637,3246,3246
AGG ANIMAL,67,67,67,67,67,67,67,67,67
AGG BARRIE,5,5,5,5,5,5,4,5,5
AGG FEAR,34,34,34,34,34,34,34,34,34
AGG FOOD,5,5,5,5,5,5,5,5,5
AGG PEOPLE,131,131,131,131,131,131,131,131,131
AGGRESSIVE,2420,2420,2420,2420,2420,2420,1533,2420,2420
DEAD,1530,1530,1530,1530,1530,1530,1097,1530,1530
DEHYDRA,9,9,9,9,9,9,9,9,9
DIARRHEA,2,2,2,2,2,2,2,2,2


In [41]:
# df5.groupby("IntakeInternalStatus").count()
# df5.IntakeInternalStatus.value_counts().to_dict()

df4['IntakeStatus'] = "Sick"

In [42]:
# Group all AGG together
df4.loc[df4["IntakeInternalStatus"].str.contains("AGG"), "IntakeStatus"] = "Aggressive"

In [43]:
# Group all Normal together
df4.loc[df4["IntakeInternalStatus"].str.contains("NORMAL"), "IntakeStatus"] = "Normal"

In [44]:
df4.loc[df4["IntakeInternalStatus"] == "NORMAL"]

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure,Normal
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix,Normal
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix,Normal
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure,Normal
18,BROWN BRINDLE,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Two,Normal
...,...,...,...,...,...,...,...,...,...,...,...
150835,APRICOT,DANDIE DINMONT TERRIER,NEUTERED MALE,NORMAL,HEALTHY,FERTILE,ADOPTION,36.53,109,Mix,Normal
150836,BLACK,DACHSHUND - WIREHAIRED,SPAYED FEMALE,NORMAL,HEALTHY,FERTILE,ADOPTION,36.53,71,Pure,Normal
150837,YELLOW BRINDLE,GREYHOUND,NEUTERED MALE,NORMAL,HEALTHY,FERTILE,ADOPTION,24.13,140,Mix,Normal
150838,CREAM,CAIRN TERRIER,SPAYED FEMALE,NORMAL,HEALTHY,FERTILE,ADOPTION,7.07,7,Pure,Normal


In [45]:
# Add these to Aggresive
df4.loc[df4["IntakeInternalStatus"].str.contains("TERITORIAL"), "IntakeStatus"] = "Aggressive"
df4.loc[df4["IntakeInternalStatus"].str.contains("FERAL"), "IntakeStatus"] = "Aggressive"
df4.loc[df4["IntakeInternalStatus"].str.contains("FEARFUL"), "IntakeStatus"] = "Aggressive"

# Group these to Pregnant
df4.loc[df4["IntakeInternalStatus"].str.contains("NURSING"), "IntakeStatus"] = "Pregnant"
df4.loc[df4["IntakeInternalStatus"].str.contains("PREGNANT"), "IntakeStatus"] = "Pregnant"

# Group these to Other
df4.loc[df4["IntakeInternalStatus"].str.contains("AGED"), "IntakeStatus"] = "Other"
df4.loc[df4["IntakeInternalStatus"].str.contains("DEAD"), "IntakeStatus"] = "Other"
df4.loc[df4["IntakeInternalStatus"].str.contains("OTHER"), "IntakeStatus"] = "Other"
df4.loc[df4["IntakeInternalStatus"].str.contains("OBESE"), "IntakeStatus"] = "Other"


In [46]:
df4.groupby('IntakeStatus').count()

Unnamed: 0_level_0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory
IntakeStatus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Aggressive,5635,5635,5635,5635,5635,5635,5635,3587,5635,5635
Normal,43425,43425,43425,43425,43425,43425,43425,28451,43425,43425
Other,6500,6500,6500,6500,6500,6500,6500,5412,6500,6500
Pregnant,1184,1184,1184,1184,1184,1184,1184,874,1184,1184
Sick,6951,6951,6951,6951,6951,6951,6951,4498,6951,6951


In [47]:
df4.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure,Normal
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix,Normal
8,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,-84,Mix,Aggressive
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix,Normal
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure,Normal


#### Handling Color - Option 1 (Grouping colors) - DID NOT CHANGE COLOR!!
* How does option1 effect RF/LogReg models?


In [48]:
# # How many unique color combinations?  - 333
# color_counts = df4['Color'].value_counts()
# # len(color_counts)
# print(df4['Color'].nunique())
# color_counts.head(20)

In [49]:
# # # Handling "Color" Option 1
# color_others = set(color_counts[color_counts < 1200].index)
# df4['Top_colors'] = df4['Color'].replace(list(color_others), 'Others')
# print(df4['Top_colors'].nunique())

In [50]:
# # Add new column called Color_new and set to Color
# df5 = df4.copy()
# df5["Color_new"]=df4["Color"]
# df5.head()
# df5.loc[320:330, :]

In [51]:
# Can't do this code here.  It takes COMBI tan/white and changes that to tan!  WRONG!!  I only want tan column


# Find rows containing colors: Gold/Yellow/Tan/Fawn/Buff/Apricot/Cream and set all to YELLOW
# df5.loc[df5["Color"].str.contains('Gold|Yellow|Tan|Fawn|Buff|Apricot|Cream'), "Color_new"] = "Tan"
# df5.loc[264:267, :]

### Feature Engineering 3 - Creating age categories
* Baby: <=6months
* Young: 7months-24 months
* Adult: 2 years-7years
* Senior: 7+

In [52]:
df4['PetAgeCategory'] = pd.cut(x=df4['AgeInMonths'], bins=[0, 6, 24, 84, 240], labels=['Baby', 'Young', 'Adult', 'Senior'])

In [53]:
df4.head(10)


Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus,PetAgeCategory
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure,Normal,Young
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix,Normal,
8,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,-84,Mix,Aggressive,
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix,Normal,
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure,Normal,
18,BROWN BRINDLE,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Two,Normal,
19,TAN,AIREDALE TERRIER,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Mix,Normal,
21,BLACK,PIT BULL TERRIER,MALE,INJURED,HEALTHY,FERTILE,EUTH,,6,Pure,Sick,
24,BLACK,BEAGLE,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,4,Mix,Normal,
25,BROWN,CHINESE SHARPEI,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,4,Mix,Aggressive,


### Feature Engineering 4 - Sex column


In [54]:
# Split gender because ReproductiveStatusAtIntake is similar to Gender
df4["Sex"] = "Male"

In [55]:
df4.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus,PetAgeCategory,Sex
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure,Normal,Young,Male
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix,Normal,,Male
8,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,-84,Mix,Aggressive,,Male
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix,Normal,,Male
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure,Normal,,Male


In [56]:
# Find rows containing word "FEMALE" and set to Female
df4.loc[df4["Gender"].str.contains("FEMALE"), "Sex"] = "Female"

In [57]:
df4.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus,PetAgeCategory,Sex
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure,Normal,Young,Male
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix,Normal,,Male
8,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,-84,Mix,Aggressive,,Male
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix,Normal,,Male
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure,Normal,,Male


## Changing to Binary Outcome - Take the dog or Don't take the dog?
* Positive Outcome is Adoption - Take in the dog
* Negative Outcome is a combination of Transfer, Euthanized and Death categories - Don't take in the dog

In [58]:
# Create new BINARY_OUTCOME column and set to 0 = Deny

df4["BinaryOutcome"]=0
df4.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus,PetAgeCategory,Sex,BinaryOutcome
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure,Normal,Young,Male,0
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix,Normal,,Male,0
8,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,-84,Mix,Aggressive,,Male,0
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix,Normal,,Male,0
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure,Normal,,Male,0


In [59]:
# Find rows containing word "Adoption" and set to 1 (for Positive Outcome)
# All other rows will be 0 (for Negative Outcome)

df4.loc[df4["OutcomeType"].str.contains("ADOPTION"), "BinaryOutcome"] = 1

In [65]:
df4.shape

(63695, 14)

## Uncomment to save CLEAN dataset to csv if needed

In [61]:
# df3.to_csv('LouisvilleClean.csv')

## Dealing with high cardinality in PrimaryBreed (250 unique values!)

* Option 1 - Changing PrimaryBreed to FreqEncoding
* Option 2 - Take top 25 breeds?


In [None]:
# Option 1 - Freq Encoding

# df_freq = df4.PrimaryBreed.value_counts().to_dict()
# df_freq

In [None]:
# df4.PrimaryBreed = df4.PrimaryBreed.map(df_freq)

In [None]:
# df4.head()

In [73]:
# Option 2 - Taking top breeds (Top 25 breed account about 80.9%)

breed_counts = df4['PrimaryBreed'].value_counts()
breed_counts.head(25)

PIT BULL TERRIER                  12617
LABRADOR RETRIEVER                 7768
GERMAN SHEPHERD DOG                4530
BEAGLE                             4476
BOXER                              2294
CHIHUAHUA - SMOOTH COATED          2244
AMERICAN PIT BULL TERRIER          2034
CHOW CHOW                          1928
ROTTWEILER                         1857
BORDER COLLIE                      1361
SHIH TZU                           1336
JACK RUSS TER                      1100
POODLE - MINIATURE                  817
SIBERIAN HUSKY                      745
YORKSHIRE TERRIER                   725
DACHSHUND                           677
AUSTRALIAN SHEPHERD                 676
COCKER SPANIEL                      642
GOLDEN RETRIEVER                    605
POMERANIAN                          603
PUG                                 555
MINIATURE PINSCHER                  539
AUSTRALIAN CATTLE DOG               523
AMERICAN STAFFORDSHIRE TERRIER      489
PARSON (JACK) RUSSELL TERRIER       442


In [74]:
df4.loc[200:210, :]

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus,PetAgeCategory,Sex,BinaryOutcome
200,BLACK,AMERICAN PIT BULL TERRIER,FEMALE,NORMAL,HEALTHY,FERTILE,EUTH,,-143,Mix,Normal,,Female,0
202,BLACK,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,6,Pure,Normal,,Male,0
207,BLACK,ROTTWEILER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-202,Mix,Normal,,Male,0
208,BLACK,BLOODHOUND,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,5,Mix,Normal,,Male,0
209,BROWN,PIT BULL TERRIER,FEMALE,AGED,TREATABLE/MANAGEABLE,FERTILE,EUTH,109.57,8,Pure,Other,Senior,Female,0
210,BROWN,GERMAN SHEPHERD DOG,FEMALE,INJURED,HEALTHY,FERTILE,EUTH,,0,Mix,Sick,,Female,0


In [75]:
breed_others = set(breed_counts[breed_counts < 441].index)
df4['TopBreed'] = df4['PrimaryBreed'].replace(list(breed_others), 'Other')

In [76]:
df4.head()

Unnamed: 0,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,IntakeAsilomarStatus,ReproductiveStatusAtIntake,OutcomeType,AgeInMonths,DurationInShelter,BreedCategory,IntakeStatus,PetAgeCategory,Sex,BinaryOutcome,TopBreed
4,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,12.17,29,Pure,Normal,Young,Male,0,PIT BULL TERRIER
6,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,-106,Mix,Normal,,Male,0,AMERICAN PIT BULL TERRIER
8,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,HEALTHY,FERTILE,EUTH,,-84,Mix,Aggressive,,Male,0,LABRADOR RETRIEVER
15,TRICOLOR,BEAGLE,NEUTERED MALE,NORMAL,HEALTHY,ALTERED,EUTH,,-12,Mix,Normal,,Male,0,BEAGLE
17,WHITE,PIT BULL TERRIER,MALE,NORMAL,HEALTHY,FERTILE,EUTH,,7,Pure,Normal,,Male,0,PIT BULL TERRIER


In [78]:
breed_counts = df4['TopBreed'].value_counts()
breed_counts.head(26)

PIT BULL TERRIER                  12617
Other                             12112
LABRADOR RETRIEVER                 7768
GERMAN SHEPHERD DOG                4530
BEAGLE                             4476
BOXER                              2294
CHIHUAHUA - SMOOTH COATED          2244
AMERICAN PIT BULL TERRIER          2034
CHOW CHOW                          1928
ROTTWEILER                         1857
BORDER COLLIE                      1361
SHIH TZU                           1336
JACK RUSS TER                      1100
POODLE - MINIATURE                  817
SIBERIAN HUSKY                      745
YORKSHIRE TERRIER                   725
DACHSHUND                           677
AUSTRALIAN SHEPHERD                 676
COCKER SPANIEL                      642
GOLDEN RETRIEVER                    605
POMERANIAN                          603
PUG                                 555
MINIATURE PINSCHER                  539
AUSTRALIAN CATTLE DOG               523
AMERICAN STAFFORDSHIRE TERRIER      489


# TESTING - Drop PetAgeCategory because of nulls for additional 20K of InternalStatus data

In [79]:
df4.columns

Index(['PrimaryColor', 'PrimaryBreed', 'Gender', 'IntakeInternalStatus',
       'IntakeAsilomarStatus', 'ReproductiveStatusAtIntake', 'OutcomeType',
       'AgeInMonths', 'DurationInShelter', 'BreedCategory', 'IntakeStatus',
       'PetAgeCategory', 'Sex', 'BinaryOutcome', 'TopBreed'],
      dtype='object')

In [80]:
# ORIGINAL CODE WITH PETAGECATEGORY
# Final dropping of duplicated columns
# df5 = df4.drop(columns=[ 'Gender', 'IntakeInternalStatus','IntakeAsilomarStatus','ReproductiveStatusAtIntake','OutcomeType','AgeInMonths','DurationInShelter'])


# Testing WITHOUT PetAgeCategory
df5 = df4.drop(columns=[ 'PrimaryBreed','Gender', 'IntakeInternalStatus','IntakeAsilomarStatus','ReproductiveStatusAtIntake','OutcomeType','AgeInMonths','DurationInShelter', 'PetAgeCategory'])

In [81]:
df5.head()
df5.isnull().sum()
df5.head()

Unnamed: 0,PrimaryColor,BreedCategory,IntakeStatus,Sex,BinaryOutcome,TopBreed
4,WHITE,Pure,Normal,Male,0,PIT BULL TERRIER
6,BROWN,Mix,Normal,Male,0,AMERICAN PIT BULL TERRIER
8,WHITE,Mix,Aggressive,Male,0,LABRADOR RETRIEVER
15,TRICOLOR,Mix,Normal,Male,0,BEAGLE
17,WHITE,Pure,Normal,Male,0,PIT BULL TERRIER


#### Documentation of Target Encoding
* https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b
* https://brendanhasz.github.io/2019/03/04/target-encoding.html

#### Installation
pip install category_encoders
* https://pypi.org/project/category-encoders/

## Back to pandas get_dummies

In [None]:
df5.columns

In [82]:
# ANGIE ANGIE took out petagecat - trying to see if additional 20L rows makes an improvement in the model
df6 = pd.get_dummies(df5[['PrimaryColor',
                          'TopBreed', 
                          'IntakeStatus', 
                          'BreedCategory',
                          'BinaryOutcome',
                          "Sex"]], drop_first=True)

In [83]:
df6.columns

Index(['BinaryOutcome', 'PrimaryColor_BEIGE', 'PrimaryColor_BLACK',
       'PrimaryColor_BLACK BRINDLE', 'PrimaryColor_BLACK SMOKE',
       'PrimaryColor_BLACK TIGER', 'PrimaryColor_BLONDE', 'PrimaryColor_BLUE',
       'PrimaryColor_BLUE CREAM', 'PrimaryColor_BLUE MERLE',
       'PrimaryColor_BLUE SMOKE', 'PrimaryColor_BLUE TICKED',
       'PrimaryColor_BLUE TIGER', 'PrimaryColor_BRINDLE', 'PrimaryColor_BROWN',
       'PrimaryColor_BROWN BRINDLE', 'PrimaryColor_BROWN MERLE',
       'PrimaryColor_BROWN TABBY', 'PrimaryColor_BROWN TIGER',
       'PrimaryColor_BUFF', 'PrimaryColor_CHAMPAIGN', 'PrimaryColor_CHOCOLATE',
       'PrimaryColor_CREAM', 'PrimaryColor_CREAM TIGER', 'PrimaryColor_FAWN',
       'PrimaryColor_GOLD', 'PrimaryColor_GRAY', 'PrimaryColor_GRAY TIGER',
       'PrimaryColor_LIVER', 'PrimaryColor_MERLE', 'PrimaryColor_ORANGE',
       'PrimaryColor_PINK', 'PrimaryColor_RED', 'PrimaryColor_RED MERLE',
       'PrimaryColor_RED TICKED', 'PrimaryColor_RUDDY', 'PrimaryColor_SABLE

In [84]:
# Define X and y
X = df6.drop(['BinaryOutcome'], axis="columns")
y = df6[['BinaryOutcome']]

In [85]:
X.shape


(63695, 77)

In [86]:
y.shape

(63695, 1)

## Random Forest model


In [87]:
# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

In [88]:
# Import, initialize, fit and predict
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 200, random_state = 20) 
rf_model.fit(X_train, y_train)

  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=20, verbose=0,
                       warm_start=False)

In [89]:
predict_y_test = rf_model.predict(X_test)

In [90]:
# Validate - run accuracy score
from sklearn import metrics
print("Accuracy score: ", metrics.accuracy_score(y_test, predict_y_test))

Accuracy score:  0.7544586787239387


In [91]:
print(y_test[0:10])

        BinaryOutcome
13790               0
102540              0
70574               0
15971               0
134260              0
36850               0
126222              0
123182              1
34416               1
101740              0


In [92]:
print(predict_y_test[0:10])

[0 0 0 0 0 0 0 0 0 0]


In [93]:
from rfpimp import *



In [94]:
# total number of nodes in all decision trees of the forest and the height (in nodes) of the typical tree.
# The tree height matters because that is the path taken by the RF prediction mechanism and so tree height effects prediction speed.
from rfpimp import *
print(f"{rfnnodes(rf_model):,d} tree nodes and {np.median(rfmaxdepths(rf_model))} median tree height")

1,145,876 tree nodes and 51.0 median tree height


In [95]:
# RF Feature importance?
feature_list = list(X.columns)
feature_importance = pd.Series(rf_model.feature_importances_, index=feature_list).sort_values(ascending=False) 
print(feature_importance[0:10])

IntakeStatus_Normal                   0.172355
TopBreed_PIT BULL TERRIER             0.071767
Sex_Male                              0.065411
BreedCategory_Two                     0.046587
IntakeStatus_Sick                     0.043780
BreedCategory_Pure                    0.043313
IntakeStatus_Other                    0.041212
TopBreed_Other                        0.036402
TopBreed_CHIHUAHUA - SMOOTH COATED    0.027596
TopBreed_CHOW CHOW                    0.021649
dtype: float64


In [96]:
# Validate with cross validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_model, X, y, cv=10, scoring="accuracy")
print(scores)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.76279435 0.76216641 0.76185243 0.77080063 0.76295133 0.75883184
 0.75883184 0.75993092 0.73920553 0.70073795]


In [97]:
scores.mean()

0.7538103224187606

In [98]:
from sklearn.metrics import mean_absolute_error, make_scorer
test_mae = mean_absolute_error(y_test, predict_y_test)
print(test_mae)

0.2455413212760613


## SVM


In [99]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X ,y, random_state=20)  

target = df6["BinaryOutcome"]

In [100]:
from sklearn.svm import SVC 
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [101]:
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
#   max_iter=-1, probability=False, random_state=None, shrinking=True,
#   tol=0.001, verbose=False)

In [102]:
# Model Accuracy
print('Test Acc: %.4f' % svm_model.score(X_test, y_test))

Test Acc: 0.7587


In [103]:
 # Calculate classification report
from sklearn.metrics import classification_report
predictions = svm_model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.77      0.97      0.86     12025
           1       0.53      0.12      0.19      3899

    accuracy                           0.76     15924
   macro avg       0.65      0.54      0.53     15924
weighted avg       0.71      0.76      0.70     15924

