Cat Adoption Project 

Web scraping through catrangers' database to scrape information about adopted and up for adoption cats to combine into a csv file 

*change vars later 

In [37]:
#importing modules 
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import csv

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException


In [38]:
# Loops through all the cats available for adoption and adds them to a dictionary 
def for_adoption():
    '''
    Ret: dictionary with the keys: ids values: links 
    '''
    master_url = 'https://www.catrangers.org/animals/list?Status=Available'
    response = requests.get(master_url)
    adoption_content = BeautifulSoup(response.content, 'html.parser')

    extracted_list = adoption_content.find_all('td', class_='portalTableValue')

    id_dict = {}

    for item in extracted_list:
        link = item.find('a', href=True)
        if link:
            url = link['href']
            animal_id = int(url.split('=')[-1])
            id_dict[animal_id] = url

    return id_dict

In [71]:

#function that uses the id_dict to write information into the csv file 
def animal_info(id_dict, start, end, mode_type='w'):
    '''
    params: 
    id_dict (type: dict)
    start (type: int)
    end (type: int)
    mode_type (csv mode types)

    purpose: writes into the cat information csv file 
    '''

    filename = 'cat_information.csv'


    try:
        with open(filename, mode=mode_type, newline='') as file:
            writer = csv.writer(file)
            
            # Write headers
            headers = [
                "Name", "Breed", "Sex and Status", "Age", "Size", 
                "Adoption Status", "Species", "Rescue ID", "General Color", 
                "Current Age", "Fence Required", "Declawed", "Housetrained", 
                "Exercise Needs", "Grooming Needs", "Shedding Amount", 
                "Owner Experience Needed", "Reaction to New People"]
            
            writer.writerow(headers)

            count = start
            created_list = list(id_dict.values())

            for value in created_list[start:end]:
                entry = []
                id_url = 'https://www.catrangers.org' + value

                try:
                    response = requests.get(id_url, timeout=10)
                    response.raise_for_status()  
                    # Raises an HTTPError for bad responses
                except requests.RequestException as e:
                    print(f"Failed to retrieve data for {id_url}: {e}")
                    continue

                soup = BeautifulSoup(response.content, 'html.parser')

                name = soup.find('span', class_='pageCenterTitle')
                more_info = soup.find('p', style='text-align:center;')
                info = more_info.find('strong') if more_info else None

                entry.append(name.get_text(strip=True)[:-11])

                if info:
                    info_text = info.get_text(strip=True)
                    text = info_text.replace('\xa0', '').split('::')
                  
                                    
                    entry.extend(text)
                

                elements = soup.find_all('li')
                
                for item in elements:
                    item_text = item.get_text(strip=True)
            
                    
                    entry.append(item_text)
                    
                    

                writer.writerow(entry)

                print('Line:', count)
                count += 1

            print('Finished')

    except FileNotFoundError:
        print('File does not exist')
    except Exception as e:
        print('An error has occurred:', e)


In [40]:
def scrub_adopted(url=None):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    driver.get('https://www.catrangers.org/animals/successes?')  # Replace with your URL

    wait = WebDriverWait(driver, 10)

    max_pages = 8
    current_page = 0

    dicty = {}

    try:
        while current_page < max_pages:
            current_page += 1

            try:
                next_page_link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, 'Next Page »')))

                master_url = next_page_link.get_attribute('href')
                response = requests.get(master_url)
                adopted_content = BeautifulSoup(response.content, 'html.parser')

                extracted_list = adopted_content.find_all('table', align='center')

                for table in extracted_list:
                    a_tags = table.find_all('a', href=True)
                    for a in a_tags:
                        href = a['href']
                        animal_id = int(href.split('=')[-1])
                        dicty[animal_id] = href

                next_page_link.click()

                print('Clicked successfully', current_page)

            except StaleElementReferenceException:
                continue
            except Exception as e:
                print(f"An error occurred: {e}")
                break

    finally:
        driver.quit()

    return dicty

In [41]:
adopted_dict = scrub_adopted()

Clicked successfully 1
Clicked successfully 2
Clicked successfully 3
Clicked successfully 4
Clicked successfully 5
Clicked successfully 6
Clicked successfully 7
Clicked successfully 8


In [72]:
animal_info(adopted_dict, 0, 2000)

Line: 0
Line: 1
Line: 2
Line: 3
Line: 4
Line: 5
Line: 6
Line: 7
Line: 8
Line: 9
Line: 10
Line: 11
Line: 12
Line: 13
Line: 14
Line: 15
Line: 16
Line: 17
Line: 18
Line: 19
Line: 20
Line: 21
Line: 22
Line: 23
Line: 24
Line: 25
Line: 26
Line: 27
Line: 28
Line: 29
Line: 30
Line: 31
Line: 32
Line: 33
Line: 34
Line: 35
Line: 36
Line: 37
Line: 38
Line: 39
Finished


In [35]:
animal_info(adopted_dict, 2000, len(adopted_dict)+1, 'a')

NameError: name 'adopted_dict' is not defined

In [34]:
adoption_dict = for_adoption()

In [22]:
len(adoption_dict)

138

In [35]:
animal_info(adoption_dict, 0, len(adoption_dict)+1, 'a')

Line: 0
Line: 1
Line: 2
Line: 3
Line: 4
Line: 5
Line: 6
Line: 7
Line: 8
Line: 9
Line: 10
Line: 11
Line: 12
Line: 13
Line: 14
Line: 15
Line: 16
Line: 17
Line: 18
Line: 19
Line: 20
Line: 21
Line: 22
Line: 23
Line: 24
Line: 25
Line: 26
Line: 27
Line: 28
Line: 29
Line: 30
Line: 31
Line: 32
Line: 33
Line: 34
Line: 35
Line: 36
Line: 37
Line: 38
Line: 39
Line: 40
Line: 41
Line: 42
Line: 43
Line: 44
Line: 45
Line: 46
Line: 47
Line: 48
Line: 49
Line: 50
Line: 51
Line: 52
Line: 53
Line: 54
Line: 55
Line: 56
Line: 57
Line: 58
Line: 59
Line: 60
Line: 61
Line: 62
Line: 63
Line: 64
Line: 65
Line: 66
Line: 67
Line: 68
Line: 69
Line: 70
Line: 71
Line: 72
Line: 73
Line: 74
Line: 75
Line: 76
Line: 77
Line: 78
Line: 79
Line: 80
Line: 81
Line: 82
Line: 83
Line: 84
Line: 85
Line: 86
Line: 87
Line: 88
Line: 89
Line: 90
Line: 91
Line: 92
Line: 93
Line: 94
Line: 95
Line: 96
Line: 97
Line: 98
Line: 99
Line: 100
Line: 101
Line: 102
Line: 103
Line: 104
Line: 105
Line: 106
Line: 107
Line: 108
Line: 109
Line: 110


In [50]:
#start checking at size, check if it is Status = Adopted, if it is replace with NAn 
import pandas as pd 
import numpy as np 

In [52]:
cat_df = pd.read_csv('cat_information2.csv')

cat_df.head(5)

ParserError: Error tokenizing data. C error: Expected 17 fields in line 10, saw 18
