In [51]:
import gspread
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import datetime
import os

def auth_gspread():
    """Open the first worksheet of the "CMACC_sheet" Google spreadsheet.

    Authenticates with the service-account key file ``gfile.json``.

    Returns:
        The ``sheet1`` worksheet of the opened spreadsheet.

    Raises:
        Exception: Whatever authentication or opening the spreadsheet raised;
            the error is printed first and then re-raised unchanged.
    """
    try:
        client = gspread.service_account(filename='gfile.json')
        worksheet = client.open("CMACC_sheet").sheet1
    except Exception as e:
        print(f"Error authenticating Google Sheets: {e}")
        raise
    else:
        print("Successfully authenticated and accessed the sheet.")
        return worksheet

# Classify one scrape record as adopted or still looking for a home
def assign_result(row, all_days_df):
    """Label a single record "Adopted" or "Looking for Furever Home".

    A dog counts as still looking when its record is from the most recent
    scrape date in ``all_days_df``, or when the same ID appears again on any
    later scrape date. Otherwise it is considered adopted.

    Args:
        row: Mapping/Series with "Scrape Date" and "ID" entries.
        all_days_df: DataFrame with "Scrape Date" and "ID" columns covering
            every scrape to compare against. It is only read, never modified.

    Returns:
        str: "Adopted" or "Looking for Furever Home".
    """
    # Work on a local, parsed copy of the dates so the caller's frame is left
    # untouched (this function is typically invoked once per row via apply).
    scrape_dates = pd.to_datetime(all_days_df["Scrape Date"]).dt.normalize()
    current_date = pd.to_datetime(row["Scrape Date"]).normalize()

    # Records from the latest scrape cannot have a "future" sighting yet.
    if current_date == scrape_dates.max():
        return "Looking for Furever Home"

    seen_later = (
        (scrape_dates > current_date) & (all_days_df["ID"] == row["ID"])
    ).any()
    return "Looking for Furever Home" if seen_later else "Adopted"

# Convert a textual age ("2 years", "1 year 6 months", "3 weeks") to years
def convert_age_to_years(age):
    """Convert a human-readable age string into a number of years.

    Every "<integer> <unit>" pair is summed, where the unit starts with
    "year", "month" or "week" (case-insensitive), so compound ages such as
    "1 year 6 months" are handled. Months are divided by 12 and weeks by 52.

    Args:
        age: The age text, or a null / non-string value.

    Returns:
        int | float | None: The age in years (an int when only whole years
        were given), or None when ``age`` is not a string or contains no
        parseable "<integer> <unit>" pair.
    """
    if not isinstance(age, str):
        return None

    units_per_year = {'year': 1, 'month': 12, 'week': 52}
    tokens = age.lower().replace(',', ' ').split()

    total = 0
    matched = False
    # Walk adjacent token pairs looking for "<quantity> <unit>".
    for quantity, unit in zip(tokens, tokens[1:]):
        per_year = next(
            (n for name, n in units_per_year.items() if unit.startswith(name)),
            None,
        )
        if per_year is None:
            continue
        try:
            count = int(quantity)
        except ValueError:
            # Non-numeric quantity (e.g. "two years"): skip instead of crashing.
            continue
        # Keep whole years as ints, as callers previously received.
        total += count if per_year == 1 else count / per_year
        matched = True

    return total if matched else None

def _kennel_side(location):
    """Return the trailing 'R'/'L' side marker of a kennel label, else ''."""
    if not isinstance(location, str) or not location or 'FOSTER' in location:
        return ''
    last_char = location[-1]
    return last_char if last_char in ('R', 'L') else ''


def wrangle(dogs_scrape):
    """Clean a raw scrape DataFrame and add the derived columns.

    The frame is modified in place (columns are added and 'Name' is renamed
    to 'Name_ID') and the same object is returned.

    Expected input columns: 'Name', 'Age', 'Weight', 'Kennel Location',
    'Brought to Shelter' and 'Scrape Date'.

    Derived columns: 'age_numeric', 'Weight_num', 'Name', 'ID',
    'location_simple', 'kennel_name', 'kennel_num', 'side',
    'Days_at_shelter' and 'result'.

    Args:
        dogs_scrape: DataFrame of scraped dog records.

    Returns:
        The same DataFrame, with date columns converted to ``datetime.date``
        values (NaT where unparseable) and the derived columns added.
    """
    # Parse each date column once. errors='coerce' turns placeholders such as
    # 'N/A' into NaT instead of aborting the whole run.
    brought = pd.to_datetime(dogs_scrape['Brought to Shelter'], errors='coerce')
    scraped = pd.to_datetime(dogs_scrape['Scrape Date'], errors='coerce')
    dogs_scrape['Brought to Shelter'] = brought.dt.date
    dogs_scrape['Scrape Date'] = scraped.dt.date

    # Convert age and extract numerical values from strings
    dogs_scrape['age_numeric'] = dogs_scrape['Age'].apply(convert_age_to_years)
    dogs_scrape['Weight_num'] = (
        dogs_scrape['Weight'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
    )

    # 'Name' arrives as "<name> (<id>)"; keep the raw value and split it.
    dogs_scrape.rename(columns={'Name': 'Name_ID'}, inplace=True)
    dogs_scrape['Name'] = dogs_scrape['Name_ID'].str.extract(r'([A-Za-z\s]+) \(', expand=False)
    dogs_scrape['ID'] = dogs_scrape['Name_ID'].str.extract(r'(\d+)', expand=False)

    # Simplify location descriptions. na=False everywhere so a missing kennel
    # falls through to its original value rather than matching a bucket.
    kennel = dogs_scrape['Kennel Location']
    is_toomey = kennel.str.contains('toom', case=False, na=False)
    is_byrum = kennel.str.contains(
        'CARE|LST|ADOPT|PUPPY|INTAKE|CLINIC|FOUND|FERRET|ADPT', case=False, na=False)
    is_foster = kennel.str.contains('Foster', case=False, na=False)
    dogs_scrape['location_simple'] = np.where(
        is_toomey, 'Toomey',
        np.where(is_byrum, 'Byrum',
                 np.where(is_foster, 'Foster', kennel)))

    # Extract components from Kennel Location
    dogs_scrape['kennel_name'] = kennel.str.extract(r'([A-Za-z\s]+)', expand=False)
    dogs_scrape['kennel_num'] = kennel.str.extract(r'(\d+)', expand=False)
    # Empty or missing kennel labels yield '' instead of raising on x[-1].
    dogs_scrape['side'] = kennel.apply(_kennel_side)

    # Whole days between intake and scrape; NaN when either date is missing.
    # Cast to float so the column dtype does not depend on whether NaT occurs.
    elapsed = scraped.dt.normalize() - brought.dt.normalize()
    dogs_scrape['Days_at_shelter'] = elapsed.dt.days.astype(float)

    # Apply function to assign results based on dates
    dogs_scrape["result"] = dogs_scrape.apply(assign_result, axis=1, all_days_df=dogs_scrape)
    return dogs_scrape




# Download a page and hand back its parsed HTML tree
def fetch_and_parse(url):
    """Fetch ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address to request (10 second timeout).

    Returns:
        A ``BeautifulSoup`` object built with the 'html.parser' backend, or
        None when the request fails or returns an HTTP error status; the
        failure is printed rather than raised.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        # HTTPError is a RequestException subclass; keep its distinct message.
        if isinstance(err, requests.exceptions.HTTPError):
            print(f"HTTP error occurred: {err}")
        else:
            print(f"Request error occurred: {err}")
        return None
    return BeautifulSoup(page.text, 'html.parser')

# Turn a parsed results page into a list of per-dog dictionaries
def extract_data(soup):
    """Collect the listed fields for every ``div.gridResult`` in ``soup``.

    Args:
        soup: Parsed page exposing ``find_all``; None is accepted.

    Returns:
        list[dict]: One dict per result card, keyed by field label. A field
        whose span is absent gets its fallback ('N/A', or '' for
        'Located At' and 'Kennel Location'). Empty list when ``soup`` is None.
    """
    if soup is None:
        return []

    # (output key, class attribute of the span, fallback when span is absent)
    field_specs = (
        ('Name', 'text_Name results', 'N/A'),
        ('Gender', 'text_Gender results', 'N/A'),
        ('Breed', 'text_Breed results', 'N/A'),
        ('Age', 'text_Age results', 'N/A'),
        ('Animal Type', 'text_Animaltype results', 'N/A'),
        ('Weight', 'text_Weight results', 'N/A'),
        ('Brought to Shelter', 'text_Broughttotheshelter results', 'N/A'),
        ('Located At', 'text_Locatedat results', ''),
        ('Kennel Location', 'text_KennelLocation results', ''),
        ('Qualified For', 'text_ViewType results', 'N/A'),
    )

    records = []
    for card in soup.find_all('div', class_='gridResult'):
        try:
            record = {}
            for key, css_class, fallback in field_specs:
                span = card.find('span', class_=css_class)
                record[key] = span.text.strip() if span else fallback
            records.append(record)
        except AttributeError as e:
            print(f"Error extracting data for one dog: {e}")
    return records

# Page through the listing until a fetch fails or a page has no dogs
def _fetch_all_dogs(base_url, page_size):
    """Return the raw dog records from every results page of ``base_url``."""
    all_dogs = []
    index = 0
    while True:
        print(f"Fetching data from index: {index}")
        soup = fetch_and_parse(f"{base_url}?index={index}&at=DOG")

        if soup is None:
            print("Error fetching page. Exiting loop.")
            break

        new_dogs = extract_data(soup)
        if not new_dogs:
            print("No more dogs found. Exiting loop.")
            break

        all_dogs.extend(new_dogs)
        index += page_size
        time.sleep(1)  # pause between page requests
    return all_dogs


# Replace the Google Sheet contents with the given frame (best effort)
def _upload_to_sheet(frame):
    """Overwrite the sheet with ``frame``; failures are printed, not raised."""
    try:
        # Build the full payload BEFORE clearing, so a conversion problem
        # cannot leave the sheet wiped with nothing to write back.
        as_text = frame.astype(str)
        payload = [as_text.columns.tolist()] + as_text.values.tolist()

        sheet = auth_gspread()
        sheet.clear()
        sheet.update(payload)
        print("Data saved to Google Sheet.")
    except Exception as e:
        print(f"Error updating Google Sheet: {e}")


# Main function to scrape dog data
def scrape_dog_data(base_url='https://24petconnect.com/CLTAdopt',
                    csv_file='dogs_at_shelter_test.csv',
                    page_size=30):
    """Scrape today's dog listings and merge them into the CSV history.

    When ``csv_file`` already exists, rows previously stored for today's date
    are dropped, today's records are appended, the CSV is rewritten and the
    combined data is pushed to the Google Sheet. When it does not exist, the
    records are only written to a new CSV (no sheet upload).

    Args:
        base_url: Listing URL; pages are requested as
            ``?index=<offset>&at=DOG``.
        csv_file: Path of the CSV holding the accumulated history.
        page_size: Offset increment between consecutive pages.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    # Take "today" exactly once so the rows being added and the rows being
    # filtered out can never disagree if the run crosses midnight.
    scrape_date = datetime.date.today().strftime('%Y-%m-%d')

    all_dogs = _fetch_all_dogs(base_url, page_size)
    print(f"Total dogs fetched: {len(all_dogs)}")

    if not all_dogs:
        print("No data fetched today.")
        return

    dogs_at_shelter = pd.DataFrame(all_dogs)
    dogs_at_shelter['Scrape Date'] = scrape_date
    # NOTE(review): wrangle() only sees today's rows here, so 'result' is
    # always "Looking for Furever Home" and rows already in the CSV are never
    # relabelled - confirm whether 'result' should be recomputed over the
    # merged history.
    dogs_at_shelter = wrangle(dogs_at_shelter)

    if not os.path.exists(csv_file):
        dogs_at_shelter.to_csv(csv_file, index=False)
        print("Data saved as new file.")
        return

    existing_data = pd.read_csv(csv_file)
    existing_data['Scrape Date'] = pd.to_datetime(existing_data['Scrape Date']).dt.strftime('%Y-%m-%d')
    print(f"Existing data before filtering: {existing_data.shape}")

    # Drop anything already stored for today so a re-run replaces it.
    filtered_data = existing_data[existing_data['Scrape Date'] != scrape_date]
    print(f"Data after filtering out today's date: {filtered_data.shape}")

    updated_data = pd.concat([filtered_data, dogs_at_shelter], ignore_index=True)
    updated_data['Scrape Date'] = updated_data['Scrape Date'].astype(str)  # Ensure Scrape Date is a string
    updated_data.to_csv(csv_file, index=False)
    print("Data updated and saved.")

    _upload_to_sheet(updated_data)

# Run one scrape immediately when this cell/module is executed.
scrape_dog_data()


Fetching data from index: 0
Fetching data from index: 30
Fetching data from index: 60
Fetching data from index: 90
Fetching data from index: 120
Fetching data from index: 150
Fetching data from index: 180
Fetching data from index: 210
Fetching data from index: 240
Fetching data from index: 270
Fetching data from index: 300
Fetching data from index: 330
HTTP error occurred: 500 Server Error: Internal Server Error for url: https://24petconnect.com/CLTAdopt?index=330&at=DOG
Error fetching page. Exiting loop.
Total dogs fetched: 310
Existing data before filtering: (10596, 21)
Data after filtering out today's date: (10506, 21)
Data updated and saved.
Successfully authenticated and accessed the sheet.
Data saved to Google Sheet.


In [None]:
import schedule

# Scheduling
def schedule_job(interval_minutes=30):
    """Run ``scrape_dog_data`` on a fixed interval until interrupted.

    Registers the job with the ``schedule`` module's default scheduler and
    then blocks, polling for due jobs once per second. The first run happens
    one full interval after registration.

    Args:
        interval_minutes: Minutes between consecutive runs.

    Returns:
        None, once a KeyboardInterrupt stops the loop.
    """
    job = schedule.every(interval_minutes).minutes.do(scrape_dog_data)
    print(f"Scheduled job to run every {interval_minutes} mins...")
    try:
        while True:
            schedule.run_pending()
            time.sleep(1)
    except KeyboardInterrupt:
        print("Script interrupted by user, exiting.")
    finally:
        # Unregister the job so calling this again (e.g. re-running the
        # notebook cell) does not stack duplicate scrapes.
        schedule.cancel_job(job)


# Entry point: only start the scheduler when executed directly, not on import.
if __name__ == '__main__':
    schedule_job()  # Blocks, running scheduled scrapes until interrupted



Scheduled job to run every 30 mins...
