### 2 Data Loading

In [600]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### 2.1 Data Collection

In [601]:
data = pd.read_csv('data/Animal_Shelter_Intake_and_Outcome.csv')

animal_data = data[['Breed', 'Color', "Sex", "Date Of Birth", "Intake Date", "Outcome Date", "Intake Type", "Type", "Outcome Type", "Intake Condition", "Days in Shelter"]].copy()

display(animal_data.head())

Unnamed: 0,Breed,Color,Sex,Date Of Birth,Intake Date,Outcome Date,Intake Type,Type,Outcome Type,Intake Condition,Days in Shelter
0,MALTESE/POODLE TOY,WHITE,Spayed,10/06/2014,07/05/2023,08/08/2023,STRAY,DOG,ADOPTION,UNKNOWN,34
1,DOMESTIC SH,ORG TABBY/WHITE,Spayed,05/07/2023,05/30/2023,08/08/2023,STRAY,CAT,ADOPTION,UNKNOWN,70
2,DOMESTIC SH,BRN TABBY/WHITE,Female,07/15/2012,07/15/2023,08/08/2023,STRAY,CAT,TRANSFER,UNKNOWN,24
3,POODLE MIN,WHITE,Neutered,,04/25/2023,07/19/2023,STRAY,DOG,ADOPTION,UNKNOWN,85
4,PUG,FAWN,Male,08/11/2021,08/08/2023,08/08/2023,STRAY,DOG,RETURN TO OWNER,HEALTHY,0


#### 2.1.1 Data Overview

In [602]:
print(animal_data.shape)
print(animal_data.info())
print(animal_data.isnull().sum())

(26187, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26187 entries, 0 to 26186
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Breed             26187 non-null  object
 1   Color             26187 non-null  object
 2   Sex               26187 non-null  object
 3   Date Of Birth     19720 non-null  object
 4   Intake Date       26187 non-null  object
 5   Outcome Date      25949 non-null  object
 6   Intake Type       26187 non-null  object
 7   Type              26187 non-null  object
 8   Outcome Type      25943 non-null  object
 9   Intake Condition  26187 non-null  object
 10  Days in Shelter   26187 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 2.2+ MB
None
Breed                  0
Color                  0
Sex                    0
Date Of Birth       6467
Intake Date            0
Outcome Date         238
Intake Type            0
Type                   0
Outcome Type         244


## 2.2 Data Cleaning

#### 2.2.1 Standardize Column Names

In [603]:
animal_data = animal_data.rename(
    columns={
    "Breed" : "Breed",
    "Color" : "Color",
    "Sex" : "Sex",
    "Date Of Birth" : "DOB",
    "Intake Date" : 'Intake_Date',
    "Outcome Date" : 'Outcome_Date',
    "Intake Type" : "Intake_Type",
    "Outcome Type" : "Outcome_Type",
    "Intake Condition" : "Intake_Condition",
    "Days in Shelter" : "Time_in_Shelter_Days",
    "Type": "Animal_Type"
})

#### 2.2.2 Data Type Conversion

In [604]:
animal_data = animal_data.astype({
    "Breed" : "object",
    "Color" : "category",
    "Sex" : "category",
    "Intake_Type" : "category",
    "Intake_Condition" : "category",
})

animal_data['DOB'] = pd.to_datetime(animal_data["DOB"], errors='coerce')
animal_data['Intake_Date'] = pd.to_datetime(animal_data["Intake_Date"], errors='coerce')
animal_data['Outcome_Date'] = pd.to_datetime(animal_data["Outcome_Date"], errors='coerce')

animal_data.dtypes

Breed                           object
Color                         category
Sex                           category
DOB                     datetime64[ns]
Intake_Date             datetime64[ns]
Outcome_Date            datetime64[ns]
Intake_Type                   category
Animal_Type                     object
Outcome_Type                    object
Intake_Condition              category
Time_in_Shelter_Days             int64
dtype: object

#### 2.2.4 Split & Standardize Color

In [605]:
animal_data['Color'] = animal_data['Color'].str.lower()
animal_data[['Primary_Color', 'Secondary_Color']] = animal_data['Color'].str.split('/', n=1, expand=True)

animal_data['Secondary_Color'] = animal_data['Secondary_Color'].fillna("none")

animal_data['Primary_Color'].astype('category')
animal_data['Secondary_Color'].astype('category')

print(dict(animal_data['Primary_Color'].value_counts()))
print(animal_data['Secondary_Color'].value_counts())

{'black': np.int64(7179), 'white': np.int64(3444), 'brown': np.int64(2469), 'brn tabby': np.int64(2148), 'tan': np.int64(2105), 'gray': np.int64(1536), 'tricolor': np.int64(768), 'gray tabby': np.int64(677), 'br brindle': np.int64(627), 'red': np.int64(608), 'org tabby': np.int64(558), 'blue': np.int64(478), 'tortie': np.int64(386), 'yellow': np.int64(312), 'calico': np.int64(294), 'fawn': np.int64(234), 'cream': np.int64(200), 'orange': np.int64(200), 'lynx pt': np.int64(196), 'seal pt': np.int64(193), 'buff': np.int64(175), 'chocolate': np.int64(170), 'agouti': np.int64(168), 'blue cream': np.int64(135), 'gold': np.int64(125), 'bl brindle': np.int64(109), 'blue merle': np.int64(96), 'blk tabby': np.int64(93), 'flame pt': np.int64(50), 'pink': np.int64(42), 'lilac pt': np.int64(34), 'blue pt': np.int64(34), 'silver': np.int64(33), 'blk smoke': np.int64(33), 'liver': np.int64(28), 'green': np.int64(28), 'brn merle': np.int64(27), 'slvr tabby': np.int64(23), 'apricot': np.int64(23), 'to

#### 2.2.5 Fill Outcome Types

In [606]:
animal_data['Outcome_Type'] = animal_data['Outcome_Type'].fillna('Not Available')
animal_data['Outcome_Type'] = animal_data['Outcome_Type'].str.lower()
animal_data['Outcome_Type'] = animal_data['Outcome_Type'].astype('category')

animal_data['Outcome_Type'].value_counts()

Outcome_Type
return to owner    8715
adoption           7645
transfer           5515
euthanize          3588
not available       244
disposal            236
died                176
rtos                 51
escaped/stolen       17
Name: count, dtype: int64

#### 2.2.7 Create Outcome Age

In [607]:
animal_data['Outcome_Age'] = animal_data['Outcome_Date'] - animal_data['DOB']
animal_data['Outcome_Age'] = animal_data['Outcome_Age'].astype(str).str.replace(' days', '', regex=False)

#### 2.2.8 Insert Shelter Name

In [611]:
animal_data['Shelter_Name'] = 'Sonoma County Animal Shelter'

#### 2.2.9 Drop Columns

In [612]:
animal_data.drop(columns=['Color'],inplace=True)

## 2.3 Data Inspection & Cleaning

In [613]:
animal_data.shape

(26187, 14)

#### 2.3.1 Remove Duplicated Values

In [614]:
print(f"Duplicated Values: {animal_data.duplicated().sum()}")
animal_data = animal_data.drop_duplicates()

Duplicated Values: 1312


#### 2.3.2 Null / Missing Values

In [615]:
animal_data.isna().sum()

Breed                      0
Sex                        0
DOB                     5796
Intake_Date                0
Outcome_Date             214
Intake_Type                0
Animal_Type                0
Outcome_Type               0
Intake_Condition           0
Time_in_Shelter_Days       0
Primary_Color              0
Secondary_Color            0
Outcome_Age                0
Shelter_Name               0
dtype: int64

#### 2.3.3 Placeholder DOB

In [619]:
animal_data['DOB'] = animal_data['DOB'].fillna('1900-01-01 00:00:00')

#### 2.3.5 Create Intake Age

In [620]:
animal_data['Intake_Age'] = (animal_data['Intake_Date'] - animal_data['DOB']).dt.days

#### 2.3.5 Impute Outcome Age (Days)

In [621]:
animal_data['Outcome_Age'] = animal_data['Outcome_Age'].fillna(animal_data['Intake_Age'] + animal_data['Time_in_Shelter_Days'])

#### 2.3.6 Placeholder Outcome Date

In [622]:
animal_data['Outcome_Date'] = animal_data['Outcome_Date'].fillna('2100-01-01 00:00:00')

#### 2.3.7 Final Shape

In [623]:
print(animal_data.shape)
animal_data = animal_data.reset_index(drop=True)
animal_data.isna().sum()

(24875, 15)


Breed                   0
Sex                     0
DOB                     0
Intake_Date             0
Outcome_Date            0
Intake_Type             0
Animal_Type             0
Outcome_Type            0
Intake_Condition        0
Time_in_Shelter_Days    0
Primary_Color           0
Secondary_Color         0
Outcome_Age             0
Shelter_Name            0
Intake_Age              0
dtype: int64

# 3. Export Sonoma Animal Shelter Data

In [625]:
animal_data.to_csv('Sonoma_Animal_Data.csv', index=False)
animal_data.columns

Index(['Breed', 'Sex', 'DOB', 'Intake_Date', 'Outcome_Date', 'Intake_Type',
       'Animal_Type', 'Outcome_Type', 'Intake_Condition',
       'Time_in_Shelter_Days', 'Primary_Color', 'Secondary_Color',
       'Outcome_Age', 'Shelter_Name', 'Intake_Age'],
      dtype='object')