In [1]:
import os
import re
import time

import pandas as pd
import requests

### Going to scrape some shelter IDs to use for API calls

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
# Petfinder shelter search page with the location set to "New York City, New York"
# and the shelter_name filter left empty.
shelter_search = 'https://www.petfinder.com/animal-shelters-and-rescues/search/?location=New+York+City%2C+New+York&shelter_name='

In [4]:
# Start a Chrome session that Selenium drives for the scraping cells.
driver = webdriver.Chrome()

In [5]:
# Load the shelter search page in the browser session.
driver.get(shelter_search)

In [6]:
# Take the first <h4> on the page and read the href of the <a> inside it.
first_shelter_id_link = driver.find_elements_by_tag_name('h4')[0].find_element_by_tag_name('a').get_attribute('href')

In [7]:
first_shelter_id_link

'https://www.petfinder.com/shelters/NY374.html'

In [8]:
# Every <h4> element on the currently loaded page; each one is expected to
# contain a shelter link (the next cell reads the <a> inside it).
shelter_links = driver.find_elements_by_tag_name('h4')

In [9]:
# Read the href of the <a> inside each <h4> heading collected above.
shelter_ids = [
    heading.find_element_by_tag_name('a').get_attribute('href')
    for heading in shelter_links
]

In [10]:
shelter_ids

['https://www.petfinder.com/shelters/NY374.html',
 'https://www.petfinder.com/shelters/NY133.html',
 'https://www.petfinder.com/shelters/NY1286.html',
 'https://www.petfinder.com/shelters/NY17.html',
 'https://www.petfinder.com/shelters/NY114.html',
 'https://www.petfinder.com/shelters/NY1183.html',
 'https://www.petfinder.com/shelters/NY1100.html',
 'https://www.petfinder.com/shelters/NY864.html',
 'https://www.petfinder.com/shelters/NY139.html',
 'https://www.petfinder.com/shelters/NY955.html',
 'https://www.petfinder.com/shelters/NY447.html',
 'https://www.petfinder.com/shelters/NY1363.html',
 'https://www.petfinder.com/shelters/NY765.html',
 'https://www.petfinder.com/shelters/NY100.html',
 'https://www.petfinder.com/shelters/NY262.html',
 'https://www.petfinder.com/shelters/NY558.html',
 'https://www.petfinder.com/shelters/NY1319.html',
 'https://www.petfinder.com/shelters/NY557.html',
 'https://www.petfinder.com/shelters/NY1008.html',
 'https://www.petfinder.com/shelters/NY835.ht

In [11]:
# Locate the element matching 'span.fieldBtn-label'; it is clicked in the next cell.
button = driver.find_element_by_css_selector('span.fieldBtn-label')

In [12]:
# Click the button located in the previous cell.
button.click()

### Now I'm going to do it all in one loop, clicking the button a number of times to get a nice list of shelter IDs

In [13]:
# Reload the search page so the collection starts from its initial state.
driver.get(shelter_search)

shelter_ids = []

# 40 rounds of: collect every <h4> link currently on the page, then click the
# 'span.fieldBtn-label' button and give the page a second to update.
for _ in range(40):
    shelter_links = driver.find_elements_by_tag_name('h4')
    shelter_ids.extend(
        heading.find_element_by_tag_name('a').get_attribute('href')
        for heading in shelter_links
    )
    button = driver.find_element_by_css_selector('span.fieldBtn-label')
    # Bring the button into view before clicking it.
    driver.execute_script("arguments[0].scrollIntoView(true)", button)
    button.click()
    time.sleep(1)

In [14]:
#shelter_ids

### I don't want New Jersey shelters, so I'm going to delete those ones

In [15]:
# Keep only the links that contain the 'NY' marker; every other link
# (the New Jersey shelters among them) is left out.
test_string = 'NY'

shelter_ids_sans_nj = [url for url in shelter_ids if test_string in url]
        
#shelter_ids_sans_nj

In [16]:
len(shelter_ids_sans_nj)

700

In [17]:
# Measure the URL prefix that sits in front of the shelter ID in every link.
len('https://www.petfinder.com/shelters/')

35

In [18]:
# Cut each link down to the bare shelter ID by slicing off the fixed-length
# 'https://www.petfinder.com/shelters/' prefix and the '.html' suffix.
id_start = len('https://www.petfinder.com/shelters/')
id_stop = -len('.html')

shelter_ids_clean = [url[id_start:id_stop] for url in shelter_ids_sans_nj]
    
#shelter_ids_clean

### Now back to the API

This API call gets me a list of pets (and all their information) from each of the shelters in the list I scraped together above. Many of these will end up being repeats (apparently some pets are listed by multiple shelters, though I didn't investigate the cause), so I'm only going to keep the unique ones for my database.

In [19]:
# The API key is read from the environment so the credential is never stored in
# the notebook; set PETFINDER_API_KEY before running this cell.
petfinder_api_key = os.environ.get('PETFINDER_API_KEY')
if not petfinder_api_key:
    raise RuntimeError('Set the PETFINDER_API_KEY environment variable before running this cell.')

petfinder_get_pets_url = 'http://api.petfinder.com/shelter.getPets'

shelter_pet_data = []

# One shelter.getPets request per scraped shelter ID. The parsed JSON bodies are
# stored in the same order as shelter_ids_clean.
with requests.Session() as session:
    for shelter_id in shelter_ids_clean:
        response = session.get(
            petfinder_get_pets_url,
            # Let requests build and escape the query string.
            params={
                'key': petfinder_api_key,
                'id': shelter_id,
                'output': 'full',
                'format': 'json',
            },
            # Without a timeout a stalled connection would hang the cell forever.
            timeout=30,
        )
        # Stop on an HTTP error instead of trying to parse an error page as JSON.
        response.raise_for_status()
        shelter_pet_data.append(response.json())

#shelter_pet_data

In [20]:
all_pet_info = []

# Flatten the per-shelter responses into a single list of pet entries.
for shelter in shelter_pet_data:
    try:
        all_pet_info.extend(shelter['petfinder']['pets']['pet'])
    except (KeyError, TypeError):
        # No 'petfinder' -> 'pets' -> 'pet' entry in this response: nothing to
        # collect. Only lookup failures are caught, so interrupts and genuine
        # errors are no longer silently swallowed.
        continue

# NOTE(review): if 'pet' is ever a single dict rather than a list, the line above
# adds that dict's keys (strings) instead of the record. Confirm how the API
# encodes a shelter with exactly one pet. Fixing it changes which rows reach the
# DataFrame, so check downstream cells that rely on row positions first.
    
len(all_pet_info)

10160

### Making it a dataframe

Going to turn the API data into a list of dictionaries and then convert that list into a pandas DataFrame.

In [21]:
# Output column -> path of keys leading to the node that carries the value.
# The order is the order in which the fields are added to each row.
PET_FIELD_PATHS = (
    ('breeds', ('breeds', 'breed')),
    ('age', ('age',)),
    ('animal', ('animal',)),
    ('description', ('description',)),
    ('id', ('id',)),
    ('lastUpdate', ('lastUpdate',)),
    ('mixed', ('mix',)),
    ('name', ('name',)),
    ('sex', ('sex',)),
    ('shelterID', ('shelterId',)),
    ('pet_size', ('size',)),
    ('zipcode', ('contact', 'zip')),
    ('picture_links', ('media', 'photos', 'photo')),
    ('additional_info', ('options', 'option')),
)


def _extract_text(record, path):
    """Follow `path` through nested containers and return the '$t' text found there.

    A list node gives a list holding each item's '$t'; any other node gives
    its own '$t'. Raises KeyError or TypeError when part of the path, or the
    '$t' entry itself, is missing or has an unexpected shape.
    """
    node = record
    for key in path:
        node = node[key]
    if isinstance(node, list):
        return [item['$t'] for item in node]
    return node['$t']


pet_data_rows = []

for info in all_pet_info:
    pet_data_dict = {}
    for column, path in PET_FIELD_PATHS:
        try:
            pet_data_dict[column] = _extract_text(info, path)
        except (KeyError, TypeError):
            # This record has no usable value for the field; leave the key out
            # so the column is simply empty for this row.
            pass

    # Store each distinct record once; later identical records are skipped.
    if pet_data_dict not in pet_data_rows:
        pet_data_rows.append(pet_data_dict)

In [22]:
len(pet_data_rows)

477

In [23]:
# Build the DataFrame from the list of per-pet dicts and preview the first rows.
df = pd.DataFrame(pet_data_rows)
df.head()

Unnamed: 0,additional_info,age,animal,breeds,description,id,lastUpdate,mixed,name,pet_size,picture_links,sex,shelterID,zipcode
0,"[altered, hasShots, housetrained, noDogs, noKids]",Adult,Cat,Tabby,Lincoln (#A1017544) was saved from the ACC's A...,30806905,2014-11-14T17:51:56Z,no,Lincoln,M,[http://photos.petfinder.com/photos/pets/30806...,M,NY374,10150
1,"[altered, hasShots, housetrained]",Adult,Cat,Domestic Short Hair,ROOSIE (short for ROOSEVELT) is a nice easy-go...,32389630,2015-06-09T11:42:04Z,no,Roosie,M,[http://photos.petfinder.com/photos/pets/32389...,M,NY374,10150
2,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,Princess was a bottle baby rescued through ACC...,33623840,2015-10-26T21:06:00Z,no,Princess,M,[http://photos.petfinder.com/photos/pets/33623...,F,NY374,10150
3,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,"MURRY!! Oh, we love this adolescent. Murry ...",34111054,2015-12-27T21:56:31Z,no,Murry,M,[http://photos.petfinder.com/photos/pets/34111...,M,NY374,10150
4,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,ADRIAN and ARABELLA (now called ARIE) are brot...,39463373,2017-09-23T05:21:02Z,no,ADRIAN & Arabella,M,[http://photos.petfinder.com/photos/pets/39463...,M,NY374,10150


In [24]:
# Save the table as it stands before cleaning; index=False leaves the row index out of the file.
df.to_csv('petfinderNY_database.csv', index=False)

### After saving to a csv...

Time to clean the data!

In [27]:
# Spell out the size codes; values that are not one of these codes stay as they are.
size_labels = {'M': 'Medium', 'S': 'Small', 'L': 'Large', 'XL': 'Extra Large'}
df.pet_size = df.pet_size.replace(size_labels)
df.head()

Unnamed: 0,additional_info,age,animal,breeds,description,id,lastUpdate,mixed,name,pet_size,picture_links,sex,shelterID,zipcode
0,"[altered, hasShots, housetrained, noDogs, noKids]",Adult,Cat,Tabby,Lincoln (#A1017544) was saved from the ACC's A...,30806905,2014-11-14T17:51:56Z,no,Lincoln,Medium,[http://photos.petfinder.com/photos/pets/30806...,M,NY374,10150
1,"[altered, hasShots, housetrained]",Adult,Cat,Domestic Short Hair,ROOSIE (short for ROOSEVELT) is a nice easy-go...,32389630,2015-06-09T11:42:04Z,no,Roosie,Medium,[http://photos.petfinder.com/photos/pets/32389...,M,NY374,10150
2,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,Princess was a bottle baby rescued through ACC...,33623840,2015-10-26T21:06:00Z,no,Princess,Medium,[http://photos.petfinder.com/photos/pets/33623...,F,NY374,10150
3,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,"MURRY!! Oh, we love this adolescent. Murry ...",34111054,2015-12-27T21:56:31Z,no,Murry,Medium,[http://photos.petfinder.com/photos/pets/34111...,M,NY374,10150
4,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,ADRIAN and ARABELLA (now called ARIE) are brot...,39463373,2017-09-23T05:21:02Z,no,ADRIAN & Arabella,Medium,[http://photos.petfinder.com/photos/pets/39463...,M,NY374,10150


In [28]:
# Spell out the sex codes; values that are not one of these codes stay as they are.
sex_labels = {'M': 'Male', 'F': 'Female'}
df.sex = df.sex.replace(sex_labels)
df.head()

Unnamed: 0,additional_info,age,animal,breeds,description,id,lastUpdate,mixed,name,pet_size,picture_links,sex,shelterID,zipcode
0,"[altered, hasShots, housetrained, noDogs, noKids]",Adult,Cat,Tabby,Lincoln (#A1017544) was saved from the ACC's A...,30806905,2014-11-14T17:51:56Z,no,Lincoln,Medium,[http://photos.petfinder.com/photos/pets/30806...,Male,NY374,10150
1,"[altered, hasShots, housetrained]",Adult,Cat,Domestic Short Hair,ROOSIE (short for ROOSEVELT) is a nice easy-go...,32389630,2015-06-09T11:42:04Z,no,Roosie,Medium,[http://photos.petfinder.com/photos/pets/32389...,Male,NY374,10150
2,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,Princess was a bottle baby rescued through ACC...,33623840,2015-10-26T21:06:00Z,no,Princess,Medium,[http://photos.petfinder.com/photos/pets/33623...,Female,NY374,10150
3,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,"MURRY!! Oh, we love this adolescent. Murry ...",34111054,2015-12-27T21:56:31Z,no,Murry,Medium,[http://photos.petfinder.com/photos/pets/34111...,Male,NY374,10150
4,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,ADRIAN and ARABELLA (now called ARIE) are brot...,39463373,2017-09-23T05:21:02Z,no,ADRIAN & Arabella,Medium,[http://photos.petfinder.com/photos/pets/39463...,Male,NY374,10150


In [52]:
df.additional_info[0][0]

'altered'

The following function will pull out all of the 'additional info' into new columns so I can compare more stuff

In [76]:
def sort_additional_info(row):
    """Expand a row's `additional_info` into one labelled column per option.

    `row.additional_info` may be a list of option names or a single value.
    For every known option a column is written on `row`: the positive label
    when the option is present, the negative label otherwise. A value that
    matches no option (NaN, for example) gets the negative label everywhere.

    The row is modified in place and also returned, which is the shape
    `DataFrame.apply(..., axis=1)` expects.
    """
    # (option name in the data, output column, label if present, label if absent)
    option_columns = (
        ('altered', 'altered', 'Altered', 'Not altered'),
        ('hasShots', 'vaccinated', 'Vaccinated', 'Not vaccinated'),
        ('housetrained', 'housetrained', 'Housetrained', 'Not housetrained'),
        ('noDogs', 'cynophobic', 'Cynophobic', 'Not cynophobic'),
        ('noKids', 'hates_children', 'Definitely hates children', 'Does not hate children'),
        ('noCats', 'felinephobic', 'Felinephobic', 'Not felinephobic'),
        ('specialNeeds', 'specialNeeds', 'Has special needs', 'No special needs'),
    )

    info = row.additional_info
    # Treat a lone value as a one-item list so both shapes use the same option
    # names and the same code path.
    present = info if isinstance(info, list) else [info]

    for option, column, yes_label, no_label in option_columns:
        row[column] = yes_label if option in present else no_label
    return row

In [83]:
# Run sort_additional_info on every row (axis=1) and keep the returned rows,
# which carry the per-option columns the function writes.
df = df.apply(sort_additional_info, axis=1)
df.head()

Unnamed: 0,additional_info,age,animal,breeds,description,id,lastUpdate,mixed,name,pet_size,...,sex,shelterID,zipcode,altered,vaccinated,housetrained,cynophobic,hates_children,felinephobic,specialNeeds
0,"[altered, hasShots, housetrained, noDogs, noKids]",Adult,Cat,Tabby,Lincoln (#A1017544) was saved from the ACC's A...,30806905,2014-11-14T17:51:56Z,no,Lincoln,Medium,...,Male,NY374,10150,Altered,Vaccinated,Housetrained,Cynophobic,Definitely hates children,Not felinephobic,No special needs
1,"[altered, hasShots, housetrained]",Adult,Cat,Domestic Short Hair,ROOSIE (short for ROOSEVELT) is a nice easy-go...,32389630,2015-06-09T11:42:04Z,no,Roosie,Medium,...,Male,NY374,10150,Altered,Vaccinated,Housetrained,Not cynophobic,Does not hate children,Not felinephobic,No special needs
2,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,Princess was a bottle baby rescued through ACC...,33623840,2015-10-26T21:06:00Z,no,Princess,Medium,...,Female,NY374,10150,Altered,Vaccinated,Housetrained,Not cynophobic,Does not hate children,Not felinephobic,No special needs
3,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,"MURRY!! Oh, we love this adolescent. Murry ...",34111054,2015-12-27T21:56:31Z,no,Murry,Medium,...,Male,NY374,10150,Altered,Vaccinated,Housetrained,Not cynophobic,Does not hate children,Not felinephobic,No special needs
4,"[altered, hasShots, housetrained]",Young,Cat,Domestic Short Hair,ADRIAN and ARABELLA (now called ARIE) are brot...,39463373,2017-09-23T05:21:02Z,no,ADRIAN & Arabella,Medium,...,Male,NY374,10150,Altered,Vaccinated,Housetrained,Not cynophobic,Does not hate children,Not felinephobic,No special needs


In [84]:
df.hates_children.value_counts()

Does not hate children       436
Definitely hates children     41
Name: hates_children, dtype: int64

### Finally I'm going to check and make sure there are no NaN values, which also means dropping this one random row that is entirely NaN values

In [86]:
# Replace missing descriptions with a placeholder, then count how many nulls remain in the column.
df['description'] = df.description.fillna('No description')
df.description.isnull().value_counts()

False    477
Name: description, dtype: int64

In [91]:
# Replace missing additional_info values with a placeholder.
df['additional_info'] = df.additional_info.fillna('No additional info')
# Count remaining nulls in the column that was just filled.
df.additional_info.isnull().value_counts()

False    477
Name: description, dtype: int64

In [96]:
# Remove records that have no pet id (such as a row that came back entirely
# empty). Selecting by condition instead of by a fixed row label keeps the cell
# correct when the scraped data changes and lets it be re-run without error.
df = df.dropna(subset=['id'])

In [118]:
# Replace missing picture_links values with a placeholder.
df['picture_links'] = df.picture_links.fillna('No pics')
# Count remaining nulls in the column that was just filled.
df.picture_links.isnull().value_counts()

False    476
Name: description, dtype: int64

### Resaving the database in its clean form

In [119]:
# Write the cleaned table to the same file name used for the earlier save, replacing that file.
df.to_csv('petfinderNY_database.csv', index=False)