In [60]:
# This file takes the merged dataset "intakes_outcomes.csv" from the SQL database and cleans it
# ------------------------------------------------------------------------------------------------
# Import dependencies
import pandas as pd
import numpy as np
import hvplot.pandas
from datetime import datetime
from path import Path
import numpy as np
import plotly.express as px
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [61]:
# Load the merged intakes/outcomes CSV into a DataFrame and preview it
filepath = 'Resources/intakes_outcomes.csv'
df = pd.read_csv(filepath_or_buffer=filepath)
df

Unnamed: 0,animal_id,animal_name,animal_type,breed,color,intake_type,date_of_birth,intake_date,found_location,intake_condition,sex_upon_intake,age_upon_intake,outcome_date,outcome_type,outcome_subtype,sex_upon_outcome,age_upon_outcome
0,A786884,*Brock,Dog,Beagle Mix,Tricolor,Stray,2017-01-03,2019-01-03,2501 Magin Meadow Dr in Austin (TX),Normal,Neutered Male,2 years,2019-01-08,Transfer,Partner,Neutered Male,2 years
1,A682524,Rio,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,2010-06-29,2014-06-29,800 Grove Blvd in Austin (TX),Normal,Neutered Male,4 years,2014-07-02,Return to Owner,,Neutered Male,4 years
2,A696408,*Pearl,Dog,Chihuahua Shorthair,Tricolor,Stray,2013-02-04,2015-02-04,9705 Thaxton in Austin (TX),Normal,Intact Female,2 years,2015-05-28,Adoption,Foster,Spayed Female,2 years
3,A736287,*Twilight,Cat,Domestic Shorthair Mix,Torbie,Stray,2016-08-08,2016-10-08,South First And Stassney in Austin (TX),Normal,Intact Female,1 month,2016-10-12,Adoption,,Spayed Female,2 months
4,A810994,,Other,Bat,Brown,Wildlife,2017-12-24,2019-12-25,7900 Rm 1826 Rd in Travis (TX),Normal,Unknown,2 years,2019-12-26,Disposal,,Unknown,2 years
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105362,A841042,185 Grams,Cat,Domestic Medium Hair,Black/White,Stray,2021-08-06,2021-08-19,15Th Street And San Jacinto in Austin (TX),Normal,Intact Male,1 weeks,2021-08-19,Transfer,Partner,Intact Male,1 weeks
105363,A840777,Sorla,Dog,Belgian Malinois Mix,Black/White,Stray,2020-12-13,2021-08-13,1217 August Drive in Austin (TX),Normal,Intact Female,7 months,2021-08-19,Adoption,,Spayed Female,8 months
105364,A841054,*Targaryen,Dog,German Shepherd,Brown/White,Owner Surrender,2021-07-04,2021-08-19,Austin (TX),Normal,Intact Male,1 month,2021-08-30,Adoption,,Neutered Male,1 month
105365,A841070,,Other,Bat,Brown,Wildlife,2020-08-19,2021-08-19,Austin (TX),Normal,Unknown,1 year,2021-08-23,Euthanasia,Rabies Risk,Unknown,1 year


In [44]:
# Inspect the dtype pandas assigned to each column of df
df.dtypes

animal_id           object
animal_name         object
animal_type         object
breed               object
color               object
intake_type         object
date_of_birth       object
intake_date         object
found_location      object
intake_condition    object
sex_upon_intake     object
age_upon_intake     object
outcome_date        object
outcome_type        object
outcome_subtype     object
sex_upon_outcome    object
age_upon_outcome    object
dtype: object

In [5]:
# Tally the missing (NaN) entries per column
df.isna().sum(axis=0)

animal_id               0
animal_name         39355
animal_type             0
breed                   0
color                   0
intake_type             0
date_of_birth           0
intake_date             0
found_location          0
intake_condition        0
sex_upon_intake         1
age_upon_intake         0
outcome_date            0
outcome_type           18
outcome_subtype     50550
sex_upon_outcome        1
age_upon_outcome       13
dtype: int64

In [50]:
# List the distinct values found in the outcome_type column
print(df["outcome_type"].unique())

['Transfer' 'Return to Owner' 'Adoption' 'Disposal' 'Died' 'Euthanasia'
 'Rto-Adopt' 'Missing' 'Relocate' nan]


In [7]:
# List the distinct values found in the animal_type column
print(df["animal_type"].unique())

['Dog' 'Cat' 'Other' 'Bird' 'Livestock']


In [8]:
# List the distinct values found in the intake_type column
print(df["intake_type"].unique())

['Stray' 'Wildlife' 'Owner Surrender' 'Public Assist' 'Abandoned'
 'Euthanasia Request']


In [9]:
# List the distinct values found in the intake_condition column
print(df["intake_condition"].unique())

['Normal' 'Injured' 'Nursing' 'Aged' 'Sick' 'Other' 'Feral' 'Medical'
 'Pregnant' 'Behavior' 'Neonatal' 'Space']


In [46]:
# TODO: tokenize (encode) the unique categorical values listed above — this cell is currently empty


In [55]:
# Parse the date_of_birth column into datetime values, then show the result
df["date_of_birth"] = df["date_of_birth"].pipe(pd.to_datetime)
df.date_of_birth

0        2017-01-03
1        2010-06-29
2        2013-02-04
3        2016-08-08
4        2017-12-24
            ...    
105362   2021-08-06
105363   2020-12-13
105364   2021-07-04
105365   2020-08-19
105366   2021-05-20
Name: date_of_birth, Length: 105367, dtype: datetime64[ns]

In [57]:
# Parse the intake_date column into datetime values, then show the result
df["intake_date"] = df["intake_date"].pipe(pd.to_datetime)
df.intake_date

0        2019-01-03
1        2014-06-29
2        2015-02-04
3        2016-10-08
4        2019-12-25
            ...    
105362   2021-08-19
105363   2021-08-13
105364   2021-08-19
105365   2021-08-19
105366   2021-08-20
Name: intake_date, Length: 105367, dtype: datetime64[ns]

In [58]:
# Parse the outcome_date column into datetime values, then show the result
df["outcome_date"] = df["outcome_date"].pipe(pd.to_datetime)
df.outcome_date

0        2019-01-08
1        2014-07-02
2        2015-05-28
3        2016-10-12
4        2019-12-26
            ...    
105362   2021-08-19
105363   2021-08-19
105364   2021-08-30
105365   2021-08-23
105366   2021-08-30
Name: outcome_date, Length: 105367, dtype: datetime64[ns]

In [59]:
# Add "length_of_stay": the time elapsed between intake_date and outcome_date.
# Both columns must already hold datetime values for the subtraction to work.
df['length_of_stay'] = df['outcome_date'].sub(df['intake_date'])
df

Unnamed: 0,animal_id,animal_name,animal_type,breed,color,intake_type,date_of_birth,intake_date,found_location,intake_condition,sex_upon_intake,age_upon_intake,outcome_date,outcome_type,outcome_subtype,sex_upon_outcome,age_upon_outcome,length_of_stay
0,A786884,*Brock,Dog,Beagle Mix,Tricolor,Stray,2017-01-03,2019-01-03,2501 Magin Meadow Dr in Austin (TX),Normal,Neutered Male,2 years,2019-01-08,Transfer,Partner,Neutered Male,2 years,5 days
1,A682524,Rio,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,2010-06-29,2014-06-29,800 Grove Blvd in Austin (TX),Normal,Neutered Male,4 years,2014-07-02,Return to Owner,,Neutered Male,4 years,3 days
2,A696408,*Pearl,Dog,Chihuahua Shorthair,Tricolor,Stray,2013-02-04,2015-02-04,9705 Thaxton in Austin (TX),Normal,Intact Female,2 years,2015-05-28,Adoption,Foster,Spayed Female,2 years,113 days
3,A736287,*Twilight,Cat,Domestic Shorthair Mix,Torbie,Stray,2016-08-08,2016-10-08,South First And Stassney in Austin (TX),Normal,Intact Female,1 month,2016-10-12,Adoption,,Spayed Female,2 months,4 days
4,A810994,,Other,Bat,Brown,Wildlife,2017-12-24,2019-12-25,7900 Rm 1826 Rd in Travis (TX),Normal,Unknown,2 years,2019-12-26,Disposal,,Unknown,2 years,1 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105362,A841042,185 Grams,Cat,Domestic Medium Hair,Black/White,Stray,2021-08-06,2021-08-19,15Th Street And San Jacinto in Austin (TX),Normal,Intact Male,1 weeks,2021-08-19,Transfer,Partner,Intact Male,1 weeks,0 days
105363,A840777,Sorla,Dog,Belgian Malinois Mix,Black/White,Stray,2020-12-13,2021-08-13,1217 August Drive in Austin (TX),Normal,Intact Female,7 months,2021-08-19,Adoption,,Spayed Female,8 months,6 days
105364,A841054,*Targaryen,Dog,German Shepherd,Brown/White,Owner Surrender,2021-07-04,2021-08-19,Austin (TX),Normal,Intact Male,1 month,2021-08-30,Adoption,,Neutered Male,1 month,11 days
105365,A841070,,Other,Bat,Brown,Wildlife,2020-08-19,2021-08-19,Austin (TX),Normal,Unknown,1 year,2021-08-23,Euthanasia,Rabies Risk,Unknown,1 year,4 days


In [68]:
# Label-encode the categorical columns into integer codes on a copy of df (df2).
#
# Each column gets its OWN LabelEncoder, kept in `label_encoders`, so the
# code -> label mapping of every column stays available afterwards, e.g.
#   label_encoders['breed'].inverse_transform(df2['breed'])
# Re-fitting one shared encoder would overwrite the mapping on every column
# and leave only the last one recoverable.
CATEGORICAL_COLUMNS = [
    'animal_type',
    'sex_upon_intake',
    'sex_upon_outcome',
    'outcome_type',
    'breed',
    'color',
    'intake_type',
    'intake_condition',
]

# Some of these columns contain NaN (see the null counts above). Passing NaN
# straight to LabelEncoder either fails or silently creates a hidden class,
# depending on the scikit-learn version, so missing values are given an
# explicit, visible label before encoding.
MISSING_LABEL = 'Unknown'

df2 = df.copy()

label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    encoder = LabelEncoder()
    df2[column] = encoder.fit_transform(df2[column].fillna(MISSING_LABEL))
    label_encoders[column] = encoder

# Backward compatibility: `le` previously ended up fitted on the last encoded
# column, so keep that name pointing at the same column's encoder.
le = label_encoders[CATEGORICAL_COLUMNS[-1]]

df2

Unnamed: 0,animal_id,animal_name,animal_type,breed,color,intake_type,date_of_birth,intake_date,found_location,intake_condition,sex_upon_intake,age_upon_intake,outcome_date,outcome_type,outcome_subtype,sex_upon_outcome,age_upon_outcome
0,A786884,*Brock,2,295,498,4,2017-01-03,2019-01-03,2501 Magin Meadow Dr in Austin (TX),6,2,2 years,2019-01-08,8,Partner,2,2 years
1,A682524,Rio,2,1055,460,4,2010-06-29,2014-06-29,800 Grove Blvd in Austin (TX),6,2,4 years,2014-07-02,6,,2,4 years
2,A696408,*Pearl,2,759,498,4,2013-02-04,2015-02-04,9705 Thaxton in Austin (TX),6,0,2 years,2015-05-28,0,Foster,3,2 years
3,A736287,*Twilight,1,1091,470,4,2016-08-08,2016-10-08,South First And Stassney in Austin (TX),6,0,1 month,2016-10-12,0,,3,2 months
4,A810994,,4,292,128,5,2017-12-24,2019-12-25,7900 Rm 1826 Rd in Travis (TX),6,4,2 years,2019-12-26,2,,4,2 years
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105362,A841042,185 Grams,1,1087,61,4,2021-08-06,2021-08-19,15Th Street And San Jacinto in Austin (TX),6,1,1 weeks,2021-08-19,8,Partner,1,1 weeks
105363,A840777,Sorla,2,348,61,4,2020-12-13,2021-08-13,1217 August Drive in Austin (TX),6,0,7 months,2021-08-19,0,,3,8 months
105364,A841054,*Targaryen,2,1204,200,2,2021-07-04,2021-08-19,Austin (TX),6,1,1 month,2021-08-30,0,,2,1 month
105365,A841070,,4,292,128,5,2020-08-19,2021-08-19,Austin (TX),6,4,1 year,2021-08-23,3,Rabies Risk,4,1 year
