In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# never truncate wide frames: render every column when a DataFrame is displayed
pd.set_option("display.max_columns", None)

In [2]:
# read the semicolon-delimited favorites sample into a DataFrame and preview the first rows
# NOTE(review): this file was annotated as "200k results", but df.info() further down
# reports 140,969 rows -- confirm which figure is current
df = pd.read_csv(
    "../petfinder_data/favorites_sample_small.csv",
    delimiter=";",
)
df.head()

Unnamed: 0,user_id,favorited_at,id,organization_id,organization_active,organization_enabled,organization_display_id,import_on,import_software,organization_type,location_type,is_public_location,state_code,country_code,adoption_status,created_at,updated_at,name,organization_animal_identifier,tags,adoption_fee,adoption_fee_waived,display_adoption_fee,published_date,mixed_breed,unknown_breed,pet_description,pet_description_length,special_needs_notes,age,animal_type,primary_breed,secondary_breed,primary_color,secondary_color,tertiary_color,gender,size,species,coat_length,latitude,longitude,postal_code,good_with_children,good_with_dogs,good_with_cats,good_with_other_animals,good_with_others_notes,house_trained,altered,declawed,special_needs,shots_current,num_photos,num_aifs,num_videos
0,4369629,2020-01-01 00:00:04,46861637,44719,1,1,OH1073,0,,Rescue Group / Foster-Based,Default Location,0,OH,US,Adoptable,2019-12-16 15:06:18,2020-01-24 13:12:54,Elsa,,,,0,0,2019-12-16 15:08:08,1,0,Meet Elsa!Elsa came to us as a stray. She had ...,639.0,DEAF,Baby,Dog,Pit Bull Terrier,,,,,Female,Medium,Dog,Short,41.457,-81.7875,44111,unknown,yes,unknown,unknown,,0,1,0,1,1,5,11,0
1,4369629,2020-01-01 00:00:07,46898514,35651,1,1,OH745,0,,Rescue Group / Foster-Based,Default Location,1,OH,US,Adoptable,2019-12-20 02:58:49,2020-01-30 03:11:07,Coco,,,,0,0,2019-12-20 03:02:07,1,0,Coco came into the rescue a couple years ago w...,945.0,,Young,Dog,Pit Bull Terrier,,,,,Female,Medium,Dog,,41.291,-82.0509,44044,yes,yes,no,unknown,,0,1,0,0,1,2,0,0
2,4112169,2020-01-01 00:00:08,46980809,40518,1,1,NY1044,0,,Rescue Group / Foster-Based,Default Location,0,NY,US,Adopted,2019-12-31 18:24:35,2020-01-12 16:28:59,Cooper,,,,0,0,2019-12-31 18:26:55,0,0,AGE: 1 YearSEX: Neutered MaleWEIGHT: 50 Pounds...,1005.0,,Young,Dog,Labrador Retriever,,,,,Male,Medium,Dog,Medium,42.9011,-76.7934,13148,yes,yes,yes,unknown,,1,1,0,0,1,6,0,0
3,4369680,2020-01-01 00:00:12,46956517,29207,1,1,FL806,0,,,Default Location,1,FL,US,Adoptable,2019-12-28 07:50:35,2020-01-01 20:18:10,Frankie,436,,,0,0,2019-12-28 07:53:26,1,0,Very sweet puppy will neuter micrchip and give...,151.0,,Baby,Dog,German Shepherd Dog,,,,,Male,Large,Dog,Medium,25.7739,-80.1939,33153,yes,yes,yes,unknown,,0,0,0,0,1,1,1,0
4,4369681,2020-01-01 00:00:13,46860620,48762,1,1,TN914,1,Petfinder Import Standard Template,Rescue Group / Foster-Based,Foster Home,0,CT,US,Adoptable,2019-12-16 07:30:36,2020-01-03 07:31:08,,FFRR-A-36,,,0,0,2019-12-16 07:30:36,0,0,,,,Baby,Dog,American Staffordshire Terrier,,,,,Female,Small,Dog,,41.6601,-72.6614,6067,unknown,unknown,unknown,unknown,,0,0,0,0,0,3,26,0


In [3]:
# replace both 'pet_description' and 'pet_description_length' columns with a new column 'has_description'
def create_has_description(row):
    """Flag whether a pet record carries a description.

    Only ``row['pet_description']`` is inspected; 'pet_description_length' is
    not used because it marks a missing description with either 0.0 or nan.

    Returns 0 when the description is null (per ``pd.isnull``), otherwise 1.
    """
    return 0 if pd.isnull(row['pet_description']) else 1
# Build the flag column-wise instead of calling a Python function once per row:
# 1 where 'pet_description' is present, 0 where it is missing (NaN/None).
# This yields the same 0/1 int64 values as applying create_has_description row by row.
df['has_description'] = df['pet_description'].notna().astype('int64')
# verify flag is working properly
df[['has_description', 'pet_description']]

Unnamed: 0,has_description,pet_description
0,1,Meet Elsa!Elsa came to us as a stray. She had ...
1,1,Coco came into the rescue a couple years ago w...
2,1,AGE: 1 YearSEX: Neutered MaleWEIGHT: 50 Pounds...
3,1,Very sweet puppy will neuter micrchip and give...
4,0,
...,...,...
140964,0,
140965,1,"Say hello to Duke. This handsome, very laid ba..."
140966,1,"My name is Samson and I'm a 2 yr old, 30 lb Ca..."
140967,0,


In [4]:
# columns the model will not train on; they are removed from df in place below
to_drop = [
    'adoption_fee', 'adoption_fee_waived', 'adoption_status',
    'country_code', 'created_at', 'display_adoption_fee',
    'favorited_at', 'good_with_others_notes', 'id',
    'import_on', 'import_software', 'is_public_location',
    'latitude', 'location_type', 'longitude',
    'name', 'num_aifs', 'organization_active',
    'organization_animal_identifier', 'organization_display_id', 'organization_enabled',
    'organization_id', 'organization_type', 'pet_description',
    'pet_description_length', 'postal_code', 'published_date',
    'secondary_breed', 'secondary_color', 'special_needs_notes',
    'state_code', 'tags', 'tertiary_color',
    'unknown_breed', 'updated_at', 'user_id',
]
# raises KeyError if any listed column is absent (e.g. when this cell is run a second time)
df.drop(columns=to_drop, inplace=True)
df.head()

Unnamed: 0,mixed_breed,age,animal_type,primary_breed,primary_color,gender,size,species,coat_length,good_with_children,good_with_dogs,good_with_cats,good_with_other_animals,house_trained,altered,declawed,special_needs,shots_current,num_photos,num_videos,has_description
0,1,Baby,Dog,Pit Bull Terrier,,Female,Medium,Dog,Short,unknown,yes,unknown,unknown,0,1,0,1,1,5,0,1
1,1,Young,Dog,Pit Bull Terrier,,Female,Medium,Dog,,yes,yes,no,unknown,0,1,0,0,1,2,0,1
2,0,Young,Dog,Labrador Retriever,,Male,Medium,Dog,Medium,yes,yes,yes,unknown,1,1,0,0,1,6,0,1
3,1,Baby,Dog,German Shepherd Dog,,Male,Large,Dog,Medium,yes,yes,yes,unknown,0,0,0,0,1,1,0,1
4,0,Baby,Dog,American Staffordshire Terrier,,Female,Small,Dog,,unknown,unknown,unknown,unknown,0,0,0,0,0,3,0,0


In [5]:
# Replace missing values in the remaining categorical columns with explicit sentinel labels.
# The result is assigned back to the column rather than calling fillna(inplace=True) on
# df[col]: that form is chained assignment, which emits a FutureWarning on recent pandas
# and leaves df unchanged once Copy-on-Write is enabled.
# fill nan values in primary_color (the only color column still present in df)
df['primary_color'] = df['primary_color'].fillna('NO_PRIMARY_COLOR')
# fill nan values in coat_length
df['coat_length'] = df['coat_length'].fillna('NO_COAT_LENGTH')
df.head()

Unnamed: 0,mixed_breed,age,animal_type,primary_breed,primary_color,gender,size,species,coat_length,good_with_children,good_with_dogs,good_with_cats,good_with_other_animals,house_trained,altered,declawed,special_needs,shots_current,num_photos,num_videos,has_description
0,1,Baby,Dog,Pit Bull Terrier,NO_PRIMARY_COLOR,Female,Medium,Dog,Short,unknown,yes,unknown,unknown,0,1,0,1,1,5,0,1
1,1,Young,Dog,Pit Bull Terrier,NO_PRIMARY_COLOR,Female,Medium,Dog,NO_COAT_LENGTH,yes,yes,no,unknown,0,1,0,0,1,2,0,1
2,0,Young,Dog,Labrador Retriever,NO_PRIMARY_COLOR,Male,Medium,Dog,Medium,yes,yes,yes,unknown,1,1,0,0,1,6,0,1
3,1,Baby,Dog,German Shepherd Dog,NO_PRIMARY_COLOR,Male,Large,Dog,Medium,yes,yes,yes,unknown,0,0,0,0,1,1,0,1
4,0,Baby,Dog,American Staffordshire Terrier,NO_PRIMARY_COLOR,Female,Small,Dog,NO_COAT_LENGTH,unknown,unknown,unknown,unknown,0,0,0,0,0,3,0,0


In [6]:
# validate that every column is fully populated (140,969 non-null values in this run)
df.info()
# df.info() only prints the counts; fail loudly if any column still contains missing values
missing_per_column = df.isna().sum()
assert not missing_per_column.any(), (
    "columns with missing values:\n"
    + missing_per_column[missing_per_column > 0].to_string()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140969 entries, 0 to 140968
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   mixed_breed              140969 non-null  int64 
 1   age                      140969 non-null  object
 2   animal_type              140969 non-null  object
 3   primary_breed            140969 non-null  object
 4   primary_color            140969 non-null  object
 5   gender                   140969 non-null  object
 6   size                     140969 non-null  object
 7   species                  140969 non-null  object
 8   coat_length              140969 non-null  object
 9   good_with_children       140969 non-null  object
 10  good_with_dogs           140969 non-null  object
 11  good_with_cats           140969 non-null  object
 12  good_with_other_animals  140969 non-null  object
 13  house_trained            140969 non-null  int64 
 14  altered             

In [7]:
# list the distinct raw values of each good_with_* column
for good_with_col in (
    "good_with_children",
    "good_with_dogs",
    "good_with_cats",
    "good_with_other_animals",
):
    print(df[good_with_col].unique())

['unknown' 'yes' 'no']
['yes' 'unknown' 'no']
['unknown' 'no' 'yes']
['unknown' 'yes']


In [8]:
# convert all good_with_xxx columns to 0 or 1 (include unknown with 0)
def convert_to_bin(row, col):
    """Return 1 if ``row[col]`` is the string "yes", otherwise 0.

    Parameters
    ----------
    row : mapping-like object indexable by ``col`` (for example the row that
        ``df.apply(..., axis=1)`` passes in).
    col : name of the field to read from ``row``.

    Only an explicit "yes" counts as positive. "no", "unknown" and any other
    value (including a missing value such as NaN or None) map to 0, so an
    unexpected entry is never silently recorded as 1.
    """
    return 1 if row[col] == "yes" else 0
# Encode every good_with_* column as 0/1 in one vectorized pass per column:
# 1 only for "yes"; "no", "unknown" and anything else become 0.
good_with_columns = [
    "good_with_children",
    "good_with_dogs",
    "good_with_cats",
    "good_with_other_animals",
]
for good_with_col in good_with_columns:
    # skip columns that are already numeric, so re-running this cell
    # does not re-encode (and thereby wipe out) the 0/1 values
    if pd.api.types.is_numeric_dtype(df[good_with_col]):
        continue
    df[good_with_col] = df[good_with_col].eq("yes").astype("int64")
df.head()

Unnamed: 0,mixed_breed,age,animal_type,primary_breed,primary_color,gender,size,species,coat_length,good_with_children,good_with_dogs,good_with_cats,good_with_other_animals,house_trained,altered,declawed,special_needs,shots_current,num_photos,num_videos,has_description
0,1,Baby,Dog,Pit Bull Terrier,NO_PRIMARY_COLOR,Female,Medium,Dog,Short,0,1,0,0,0,1,0,1,1,5,0,1
1,1,Young,Dog,Pit Bull Terrier,NO_PRIMARY_COLOR,Female,Medium,Dog,NO_COAT_LENGTH,1,1,0,0,0,1,0,0,1,2,0,1
2,0,Young,Dog,Labrador Retriever,NO_PRIMARY_COLOR,Male,Medium,Dog,Medium,1,1,1,0,1,1,0,0,1,6,0,1
3,1,Baby,Dog,German Shepherd Dog,NO_PRIMARY_COLOR,Male,Large,Dog,Medium,1,1,1,0,0,0,0,0,1,1,0,1
4,0,Baby,Dog,American Staffordshire Terrier,NO_PRIMARY_COLOR,Female,Small,Dog,NO_COAT_LENGTH,0,0,0,0,0,0,0,0,0,3,0,0


In [9]:
# encode the age column manually, since we want a specific order
# order: ["Baby", "Young", "Adult", "Senior"]

# ordinal code for each age label, in the order Baby < Young < Adult < Senior
_AGE_CODES = {"Baby": 0, "Young": 1, "Adult": 2, "Senior": 3}


def encode_age(row):
    """Return the ordinal code (0-3) for the age label stored in ``row["age"]``.

    Parameters
    ----------
    row : mapping-like object with an "age" entry (for example the row that
        ``df.apply(..., axis=1)`` passes in).

    Returns
    -------
    int
        0 for "Baby", 1 for "Young", 2 for "Adult", 3 for "Senior".

    Raises
    ------
    ValueError
        If the value is not one of the four known labels. Unrecognised values
        (typos, missing values, or integers left by an earlier run of the
        encoding) are rejected instead of being silently coded as Senior.
    """
    age = row["age"]
    if age not in _AGE_CODES:
        raise ValueError(
            f"unexpected age value {age!r}; expected one of {list(_AGE_CODES)}"
        )
    return _AGE_CODES[age]

# overwrite the textual age label with the integer code returned by encode_age, row by row
df['age'] = df.apply(encode_age, axis=1)
df.head()

Unnamed: 0,mixed_breed,age,animal_type,primary_breed,primary_color,gender,size,species,coat_length,good_with_children,good_with_dogs,good_with_cats,good_with_other_animals,house_trained,altered,declawed,special_needs,shots_current,num_photos,num_videos,has_description
0,1,0,Dog,Pit Bull Terrier,NO_PRIMARY_COLOR,Female,Medium,Dog,Short,0,1,0,0,0,1,0,1,1,5,0,1
1,1,1,Dog,Pit Bull Terrier,NO_PRIMARY_COLOR,Female,Medium,Dog,NO_COAT_LENGTH,1,1,0,0,0,1,0,0,1,2,0,1
2,0,1,Dog,Labrador Retriever,NO_PRIMARY_COLOR,Male,Medium,Dog,Medium,1,1,1,0,1,1,0,0,1,6,0,1
3,1,0,Dog,German Shepherd Dog,NO_PRIMARY_COLOR,Male,Large,Dog,Medium,1,1,1,0,0,0,0,0,1,1,0,1
4,0,0,Dog,American Staffordshire Terrier,NO_PRIMARY_COLOR,Female,Small,Dog,NO_COAT_LENGTH,0,0,0,0,0,0,0,0,0,3,0,0
