Cat Information Scraping Notebook

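This notebook uses Selenium (with ChromeDriverManager) together with requests and BeautifulSoup to scrape both adopted cats and cats currently up for adoption from the catrangers.org website. The scraped information is written to a CSV file (`raw_cat_data.csv`) for later cleaning.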

In [1]:
#importing modules 
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
# Loops through all the cats available for adoption and adds them to a dictionary 
def find_cat_ids_for_adoption():
    '''
    Collect the id and detail-page link of every cat on the "Available" list.

    The listing page is fetched with requests and parsed with BeautifulSoup.
    For each <td class="portalTableValue"> cell, the first link carrying an
    href is inspected; the text after the last '=' in that href is taken as
    the animal id.

    Ret: dictionary with the keys: ids (int) values: links (the href string
    exactly as it appears in the page)

    Raises:
        requests.RequestException: the listing page could not be fetched
            (connection error, timeout, or a non-success HTTP status).
    '''
    base_url = 'https://www.catrangers.org/animals/list?Status=Available'

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an HTTP error page from being parsed as if it
    # were the listing (which would silently yield an empty dictionary).
    response = requests.get(base_url, timeout=10)
    response.raise_for_status()
    adoption_content = BeautifulSoup(response.content, 'html.parser')

    id_dict = {}

    for cell in adoption_content.find_all('td', class_='portalTableValue'):
        link = cell.find('a', href=True)
        if not link:
            continue

        url = link['href']
        try:
            animal_id = int(url.split('=')[-1])
        except ValueError:
            # The href does not end in a numeric id, so it is not an animal
            # detail link; skip it instead of aborting the whole scan.
            continue

        id_dict[animal_id] = url

    return id_dict

In [None]:
#function that uses the dictionary with cat ids to write information into the csv file
import csv 
def scrub_cat_info(id_dict, start, end, mode_type='w'):
    '''
    params:
    id_dict (type: dict)  values are site-relative links to cat detail pages
    start (type: int)     index of the first link to process
    end (type: int)       index one past the last link to process; values
                          beyond the number of links are clamped by slicing
    mode_type (csv mode types)  file mode passed to open(), e.g. 'w' or 'a'

    purpose: writes each cat's info into the 'raw_cat_data' csv file.
    One row is written per cat: the page title, then the '::'-separated
    fields of the centred summary paragraph, then the text of every <li>
    on the page. The header row is written only when the file is empty, so
    appending with mode 'a' does not insert a second header in the middle
    of the data.

    Ret: None

    Raises:
        FileExistsError: raised when open() reports FileNotFoundError.
        Exception: wraps any other failure, including the RuntimeError
            raised when a detail page cannot be fetched.
    '''

    filename = 'raw_cat_data.csv'

    try:
        with open(filename, mode=mode_type, newline='') as file:
            writer = csv.writer(file)

            # preset headers
            headers = [
                "Name", "Breed", "Sex and Status", "Age", "Size",
                "Adoption Status", "Species", "Rescue ID", "General Color",
                "Current Age", "Fence Required", "Declawed", "Housetrained",
                "Exercise Needs", "Grooming Needs", "Shedding Amount",
                "Owner Experience Needed", "Reaction to New People"]

            # Move to the end of the file (whence=2) and check the position:
            # 0 means the file is new or was just truncated, so it still
            # needs a header. A non-empty file opened for append already
            # has one.
            file.seek(0, 2)
            if file.tell() == 0:
                writer.writerow(headers)

            created_list = list(id_dict.values())

            for count, value in enumerate(created_list[start:end], start=start):
                id_url = 'https://www.catrangers.org' + value

                try:
                    response = requests.get(id_url, timeout=10)
                    # raises an HTTPError for bad responses
                    response.raise_for_status()
                except requests.RequestException as e:
                    raise RuntimeError("Failed to fetch url") from e

                soup = BeautifulSoup(response.content, 'html.parser')

                name = soup.find('span', class_='pageCenterTitle')
                more_info = soup.find('p', style='text-align:center;')
                info = more_info.find('strong') if more_info else None

                # The last 11 characters of the title are dropped --
                # presumably a fixed suffix the site appends to the cat's
                # name; TODO confirm against a live page. A page without a
                # title span gets an empty name so the remaining columns
                # keep their positions.
                if name is not None:
                    entry = [name.get_text(strip=True)[:-11]]
                else:
                    entry = ['']

                if info:
                    info_text = info.get_text(strip=True)
                    entry.extend(info_text.replace('\xa0', '').split('::'))

                entry.extend(
                    item.get_text(strip=True) for item in soup.find_all('li')
                )

                writer.writerow(entry)

                print('Line:', count)

            print('Finished')

    except FileNotFoundError:
        # NOTE(review): FileExistsError is a misleading type for a missing
        # path; kept unchanged so existing callers that catch it still work.
        raise FileExistsError("File does not exist and must be created")
    except Exception as e:
        raise Exception('An error has occurred:', e)


In [None]:
def scrub_adopted_cats(url=None, max_pages=500):
    '''
    Scrapes the adopted animals' data and stores it in a dictionary.

    A Chrome session is started through Selenium and walks the paginated
    listing by clicking the 'Next Page »' link. On every page, each link
    found inside a <table align="center"> is recorded.

    params:
    url (type: str or None)  listing page to start from; when None (or
                             empty) the catrangers "successes" page is used
    max_pages (type: int)    upper bound on the number of pages visited

    Ret: dictionary with the keys: ids (int, the text after the last '=' in
    the href) values: links (the href string as found in the page)

    Raises:
        StaleElementReferenceException: the 'Next Page »' link went stale
            before it could be clicked.
        Exception: wraps any other failure while paging.
    '''
    start_url = url or 'https://www.catrangers.org/animals/successes?'
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    cat_dict = {}

    def _collect_ids_from_current_page():
        # Parse the currently loaded page and add its animal links to cat_dict.
        adopted_content = BeautifulSoup(driver.page_source, 'html.parser')
        for table in adopted_content.find_all('table', align='center'):
            for a in table.find_all('a', href=True):
                href = a['href']
                try:
                    animal_id = int(href.split('=')[-1])
                except ValueError:
                    # Not a link ending in a numeric id; skip it rather
                    # than abort the whole crawl.
                    continue
                cat_dict[animal_id] = href

    try:
        # Navigation happens inside the try so the browser is closed even
        # when the first page fails to load.
        driver.get(start_url)

        wait = WebDriverWait(driver, 10)
        current_page = 0

        while current_page < max_pages:
            current_page += 1
            try:
                try:
                    next_page_link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, 'Next Page »')))
                except TimeoutException:
                    # No 'Next Page »' link appeared within the wait: treat
                    # this as the final page. It is still scraped below, and
                    # the ids gathered so far are returned instead of lost.
                    next_page_link = None

                _collect_ids_from_current_page()

                if next_page_link is None:
                    break

                next_page_link.click()
            except StaleElementReferenceException:
                raise StaleElementReferenceException("Element was stale")

            except Exception as e:
                raise Exception(f"An error occurred: {e}") from e

    finally:
        driver.quit()

    return cat_dict


In [None]:
# Scrape the adopted-cats listing; this starts a Chrome session via Selenium.
adopted_dict = scrub_adopted_cats()

Clicked successfully 1
Clicked successfully 2
Clicked successfully 3
Clicked successfully 4
Clicked successfully 5
Clicked successfully 6
Clicked successfully 7
Clicked successfully 8
Clicked successfully 9
Clicked successfully 10
Clicked successfully 11
Clicked successfully 12
Clicked successfully 13
Clicked successfully 14
Clicked successfully 15
Clicked successfully 16
Clicked successfully 17
Clicked successfully 18
Clicked successfully 19
Clicked successfully 20
Clicked successfully 21
Clicked successfully 22
Clicked successfully 23
Clicked successfully 24
Clicked successfully 25
Clicked successfully 26
Clicked successfully 27
Clicked successfully 28
Clicked successfully 29
Clicked successfully 30
Clicked successfully 31
Clicked successfully 32
Clicked successfully 33
Clicked successfully 34
Clicked successfully 35
Clicked successfully 36
Clicked successfully 37
Clicked successfully 38
Clicked successfully 39
Clicked successfully 40
Clicked successfully 41
Clicked successfully 42
C

In [7]:
# Number of distinct adopted-cat ids collected.
len(adopted_dict)

2500

In [None]:
# Write the first 2000 adopted cats; the default mode 'w' creates or
# overwrites raw_cat_data.csv.
scrub_cat_info(adopted_dict, 0, 2000)

Line: 0
Line: 1
Line: 2
Line: 3
Line: 4
Line: 5
Line: 6
Line: 7
Line: 8
Line: 9
Line: 10
Line: 11
Line: 12
Line: 13
Line: 14
Line: 15
Line: 16
Line: 17
Line: 18
Line: 19
Line: 20
Line: 21
Line: 22
Line: 23
Line: 24
Line: 25
Line: 26
Line: 27
Line: 28
Line: 29
Line: 30
Line: 31
Line: 32
Line: 33
Line: 34
Line: 35
Line: 36
Line: 37
Line: 38
Line: 39
Line: 40
Line: 41
Line: 42
Line: 43
Line: 44
Line: 45
Line: 46
Line: 47
Line: 48
Line: 49
Line: 50
Line: 51
Line: 52
Line: 53
Line: 54
Line: 55
Line: 56
Line: 57
Line: 58
Line: 59
Line: 60
Line: 61
Line: 62
Line: 63
Line: 64
Line: 65
Line: 66
Line: 67
Line: 68
Line: 69
Line: 70
Line: 71
Line: 72
Line: 73
Line: 74
Line: 75
Line: 76
Line: 77
Line: 78
Line: 79
Line: 80
Line: 81
Line: 82
Line: 83
Line: 84
Line: 85
Line: 86
Line: 87
Line: 88
Line: 89
Line: 90
Line: 91
Line: 92
Line: 93
Line: 94
Line: 95
Line: 96
Line: 97
Line: 98
Line: 99
Line: 100
Line: 101
Line: 102
Line: 103
Line: 104
Line: 105
Line: 106
Line: 107
Line: 108
Line: 109
Line: 110


In [None]:
# Append the remaining adopted cats (index 2000 onward) to the same file.
# The end index may exceed the number of links: slicing clamps it.
scrub_cat_info(adopted_dict, 2000, len(adopted_dict)+1, 'a')

Line: 2000
Line: 2001
Line: 2002
Line: 2003
Line: 2004
Line: 2005
Line: 2006
Line: 2007
Line: 2008
Line: 2009
Line: 2010
Line: 2011
Line: 2012
Line: 2013
Line: 2014
Line: 2015
Line: 2016
Line: 2017
Line: 2018
Line: 2019
Line: 2020
Line: 2021
Line: 2022
Line: 2023
Line: 2024
Line: 2025
Line: 2026
Line: 2027
Line: 2028
Line: 2029
Line: 2030
Line: 2031
Line: 2032
Line: 2033
Line: 2034
Line: 2035
Line: 2036
Line: 2037
Line: 2038
Line: 2039
Line: 2040
Line: 2041
Line: 2042
Line: 2043
Line: 2044
Line: 2045
Line: 2046
Line: 2047
Line: 2048
Line: 2049
Line: 2050
Line: 2051
Line: 2052
Line: 2053
Line: 2054
Line: 2055
Line: 2056
Line: 2057
Line: 2058
Line: 2059
Line: 2060
Line: 2061
Line: 2062
Line: 2063
Line: 2064
Line: 2065
Line: 2066
Line: 2067
Line: 2068
Line: 2069
Line: 2070
Line: 2071
Line: 2072
Line: 2073
Line: 2074
Line: 2075
Line: 2076
Line: 2077
Line: 2078
Line: 2079
Line: 2080
Line: 2081
Line: 2082
Line: 2083
Line: 2084
Line: 2085
Line: 2086
Line: 2087
Line: 2088
Line: 2089
Line: 2090

In [None]:
# Collect ids and links for the cats currently listed as available.
adoption_dict = find_cat_ids_for_adoption()

In [14]:
# Number of distinct available-cat ids collected.
len(adoption_dict)

165

In [None]:
# Append the available cats to the same CSV that holds the adopted cats.
scrub_cat_info(adoption_dict, 0, len(adoption_dict)+1, 'a')

Line: 0
Line: 1
Line: 2
Line: 3
Line: 4
Line: 5
Line: 6
Line: 7
Line: 8
Line: 9
Line: 10
Line: 11
Line: 12
Line: 13
Line: 14
Line: 15
Line: 16
Line: 17
Line: 18
Line: 19
Line: 20
Line: 21
Line: 22
Line: 23
Line: 24
Line: 25
Line: 26
Line: 27
Line: 28
Line: 29
Line: 30
Line: 31
Line: 32
Line: 33
Line: 34
Line: 35
Line: 36
Line: 37
Line: 38
Line: 39
Line: 40
Line: 41
Line: 42
Line: 43
Line: 44
Line: 45
Line: 46
Line: 47
Line: 48
Line: 49
Line: 50
Line: 51
Line: 52
Line: 53
Line: 54
Line: 55
Line: 56
Line: 57
Line: 58
Line: 59
Line: 60
Line: 61
Line: 62
Line: 63
Line: 64
Line: 65
Line: 66
Line: 67
Line: 68
Line: 69
Line: 70
Line: 71
Line: 72
Line: 73
Line: 74
Line: 75
Line: 76
Line: 77
Line: 78
Line: 79
Line: 80
Line: 81
Line: 82
Line: 83
Line: 84
Line: 85
Line: 86
Line: 87
Line: 88
Line: 89
Line: 90
Line: 91
Line: 92
Line: 93
Line: 94
Line: 95
Line: 96
Line: 97
Line: 98
Line: 99
Line: 100
Line: 101
Line: 102
Line: 103
Line: 104
Line: 105
Line: 106
Line: 107
Line: 108
Line: 109
Line: 110
