In [56]:
import gspread
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import os

def auth_gspread():
    """Authenticate with Google Sheets and return the target worksheet.

    Uses the service-account key file ``gfile.json`` to build a gspread
    client, opens the spreadsheet titled "CMACC_sheet" and returns its
    ``sheet1`` worksheet.

    Raises:
        Exception: any error raised while authenticating or opening the
            sheet is printed and then re-raised unchanged.
    """
    try:
        client = gspread.service_account(filename='gfile.json')
        workbook = client.open("CMACC_sheet")
        first_tab = workbook.sheet1
        print("Successfully authenticated and accessed the sheet.")
    except Exception as auth_error:
        # Report the failure for the notebook log, but let the caller decide
        # how to react.
        print(f"Error authenticating Google Sheets: {auth_error}")
        raise
    return first_tab

def assign_result(row, all_days_df):
    """Label one scraped row as adopted or still waiting.

    A dog counts as "Adopted" when its ID does not appear on any later
    scrape date in ``all_days_df`` and the row's own scrape date is not the
    most recent date in that frame. Otherwise it is
    "Looking for Furever Home".

    Args:
        row: a row (Series or mapping) providing "Scrape Date" and "ID".
        all_days_df: DataFrame with "Scrape Date" and "ID" columns covering
            every scrape day to compare against. It is not modified.

    Returns:
        str: "Adopted" or "Looking for Furever Home".
    """
    current_date = pd.to_datetime(row["Scrape Date"]).date()
    dog_id = row["ID"]
    # Parse into a local Series instead of writing back into all_days_df:
    # this function is called once per row via DataFrame.apply, so assigning
    # to the caller's column would mutate the frame while it is being
    # iterated and silently change its dtype.
    scrape_dates = pd.to_datetime(all_days_df["Scrape Date"]).dt.date
    most_recent_date = scrape_dates.max()
    seen_later = ((scrape_dates > current_date) & (all_days_df["ID"] == dog_id)).any()
    if not seen_later and current_date != most_recent_date:
        return "Adopted"
    return "Looking for Furever Home"

def convert_age_to_years(age):
    """Convert an age string such as "3 years" or "5 months" to years.

    Only the leading integer and the first matching unit are used; units
    are checked in the order year, month, week.

    Args:
        age: the age text, or a null / non-string value.

    Returns:
        int for a year-based age, float for month- or week-based ages, or
        None when the value is null, not a string, names no known unit, or
        does not start with an integer.
    """
    if pd.isnull(age):
        return None
    if not isinstance(age, str):
        return None
    # (substring to look for, how many of that unit make up one year)
    for unit, per_year in (('year', 1), ('month', 12), ('week', 52)):
        if unit not in age:
            continue
        try:
            quantity = int(age.split()[0])
        except ValueError:
            # A malformed age (e.g. "unknown year") should not abort the
            # whole DataFrame.apply pass; treat it like any other
            # unparseable value.
            return None
        # Keep year-based ages as plain ints, as before.
        return quantity if per_year == 1 else quantity / per_year
    return None

def wrangle(dogs_scrape):
    """Clean a scraped dog table in place and add derived columns.

    Steps, in order:
      * re-format 'Brought to Shelter' and 'Scrape Date' as 'YYYY-MM-DD'
        strings (unparseable values are coerced to missing),
      * add 'age_numeric' from 'Age' via ``convert_age_to_years``,
      * add 'Weight_num' as the first number found in 'Weight',
      * rename 'Name' to 'Name_ID', then split it into 'Name' (the letters
        and whitespace before " (") and 'ID' (the first run of digits),
      * add 'result' by applying ``assign_result`` to every row.

    Args:
        dogs_scrape: DataFrame with at least the columns 'Name', 'Age',
            'Weight', 'Brought to Shelter' and 'Scrape Date'.

    Returns:
        The same DataFrame object, modified in place.
    """
    for date_col in ('Brought to Shelter', 'Scrape Date'):
        parsed = pd.to_datetime(dogs_scrape[date_col], errors='coerce')
        dogs_scrape[date_col] = parsed.dt.strftime('%Y-%m-%d')

    age_text = dogs_scrape['Age']
    dogs_scrape['age_numeric'] = age_text.apply(convert_age_to_years)

    weight_digits = dogs_scrape['Weight'].str.extract(r'(\d+\.?\d*)')
    dogs_scrape['Weight_num'] = weight_digits.astype(float)

    # The scraped 'Name' field holds both the name and the shelter ID, so it
    # is kept under a new label before being split apart.
    dogs_scrape.rename(columns={'Name': 'Name_ID'}, inplace=True)
    combined = dogs_scrape['Name_ID']
    dogs_scrape['Name'] = combined.str.extract(r'([A-Za-z\s]+) \(')
    dogs_scrape['ID'] = combined.str.extract(r'(\d+)')

    dogs_scrape["result"] = dogs_scrape.apply(
        lambda record: assign_result(record, all_days_df=dogs_scrape),
        axis=1,
    )
    return dogs_scrape

def fetch_and_parse(url):
    """Download ``url`` and parse the body as HTML.

    Args:
        url: address to GET (10 second timeout).

    Returns:
        A BeautifulSoup document built with the 'html.parser' backend, or
        None when the request fails or the server answers with an error
        status. Failures are printed rather than raised.
    """
    soup = None
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
    except requests.exceptions.HTTPError as http_err:
        # Checked first: HTTPError is handled separately from the broader
        # RequestException so that it gets its own message.
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.RequestException as req_err:
        print(f"Request error occurred: {req_err}")
    return soup

def extract_data(soup):
    """Pull one record per dog out of a parsed results page.

    Every ``div`` with class 'gridResult' is treated as one dog. For each,
    the stripped text of eight labelled ``span`` elements is collected. If
    any of those spans is missing, the dog is skipped and a message is
    printed.

    Args:
        soup: parsed page, or a falsy value (e.g. None) when fetching failed.

    Returns:
        list[dict]: one dict per fully-populated dog; empty when ``soup`` is
        falsy or no dog could be extracted.
    """
    if not soup:
        return []

    # (output key, CSS class string of the span holding the value)
    field_classes = (
        ('Name', 'text_Name results'),
        ('Gender', 'text_Gender results'),
        ('Breed', 'text_Breed results'),
        ('Age', 'text_Age results'),
        ('Animal Type', 'text_Animaltype results'),
        ('Weight', 'text_Weight results'),
        ('Brought to Shelter', 'text_Broughttotheshelter results'),
        ('Kennel Location', 'text_KennelLocation results'),
    )

    records = []
    for card in soup.find_all('div', class_='gridResult'):
        try:
            record = {
                label: card.find('span', class_=css_class).text.strip()
                for label, css_class in field_classes
            }
        except AttributeError as e:
            # find() returned None for a missing span; drop this dog only.
            print(f"Error extracting data for one dog: {e}")
        else:
            records.append(record)
    return records

def scrape_dog_data():
    """Scrape the adoptable-dog listing and persist today's snapshot.

    Pages through the listing 30 results at a time until a page fails to
    load or yields no dogs, pausing one second between requests. The dogs
    collected are stamped with today's date, cleaned by ``wrangle`` and
    written to 'dogs_at_shelter_test.csv':

      * if the CSV already exists, rows previously saved for today are
        replaced, the file is rewritten, and the whole table is pushed to
        the Google Sheet (errors there are printed, not raised);
      * otherwise the CSV is created and the Google Sheet is not touched.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    base_url = 'https://24petconnect.com/CLTAdopt'
    page_size = 30  # the listing is requested in steps of 30 results
    index = 0
    all_dogs = []
    while True:
        current_url = f"{base_url}?index={index}&at=DOG"
        print(f"Fetching data from index: {index}")
        soup = fetch_and_parse(current_url)
        if soup is None:
            print("Error fetching page. Exiting loop.")
            break
        new_dogs = extract_data(soup)
        if not new_dogs:
            print("No more dogs found. Exiting loop.")
            break
        all_dogs.extend(new_dogs)
        index += page_size
        time.sleep(1)  # be polite to the server between page requests

    if not all_dogs:
        print("No data fetched today.")
        return

    # Resolve "today" exactly once so the stamp written on the new rows and
    # the filter applied to the old rows can never disagree (e.g. when the
    # run crosses midnight).
    today = pd.to_datetime('today').strftime('%Y-%m-%d')

    dogs_at_shelter = pd.DataFrame(all_dogs)
    dogs_at_shelter['Scrape Date'] = today
    # NOTE(review): wrangle() only sees today's rows here, so every row's
    # scrape date equals the most recent date and assign_result() can only
    # ever return "Looking for Furever Home". Older rows in the CSV are not
    # re-labelled either. Confirm whether "result" should be recomputed over
    # the combined history.
    dogs_at_shelter = wrangle(dogs_at_shelter)

    csv_file = 'dogs_at_shelter_test.csv'
    if not os.path.exists(csv_file):
        dogs_at_shelter.to_csv(csv_file, index=False)
        print("Data saved as new file.")
        return

    existing_data = pd.read_csv(csv_file)
    # Drop anything already stored for today so re-runs replace, not append.
    filtered_data = existing_data[existing_data['Scrape Date'] != today]
    updated_data = pd.concat([filtered_data, dogs_at_shelter], ignore_index=True)
    updated_data.to_csv(csv_file, index=False)

    try:
        sheet = auth_gspread()
        # Build the full string payload on a copy *before* clearing, so a
        # conversion problem cannot leave the sheet wiped and updated_data
        # keeps its real dtypes.
        sheet_frame = updated_data.astype(str)
        payload = [sheet_frame.columns.values.tolist()] + sheet_frame.values.tolist()
        sheet.clear()
        sheet.update(payload)
        print("Data saved to Google Sheet.")
    except Exception as e:
        print(f"Error updating Google Sheet: {e}")

# Run a single scrape immediately when this cell is executed.
scrape_dog_data()



Fetching data from index: 0
Fetching data from index: 30
Fetching data from index: 60
Fetching data from index: 90
Fetching data from index: 120
Fetching data from index: 150
Fetching data from index: 180
Fetching data from index: 210
Fetching data from index: 240
Fetching data from index: 270
Fetching data from index: 300
Fetching data from index: 330
HTTP error occurred: 500 Server Error: Internal Server Error for url: https://24petconnect.com/CLTAdopt?index=330&at=DOG
Error fetching page. Exiting loop.
Successfully authenticated and accessed the sheet.
Data saved to Google Sheet.


In [55]:
import schedule

# Scheduling
def schedule_job():
    """Run ``scrape_dog_data`` every 30 minutes until interrupted.

    Registers the scrape with the ``schedule`` library and then blocks,
    polling for due jobs once per second. A KeyboardInterrupt (e.g.
    interrupting the kernel) ends the loop with a message instead of a
    traceback.

    The job is unregistered again on the way out, so calling this function
    repeatedly in the same kernel does not stack duplicate jobs on
    ``schedule``'s shared default scheduler.
    """
    job = schedule.every(30).minutes.do(scrape_dog_data)
    print("Scheduled job to run every 30 mins...")
    try:
        while True:
            schedule.run_pending()
            time.sleep(1)  # poll interval; keeps the loop from busy-waiting
    except KeyboardInterrupt:
        print("Script interrupted by user, exiting.")
    finally:
        # Always clean up, whether we were interrupted or the job raised.
        schedule.cancel_job(job)


# Entry point: scrape once right away, then hand control to the blocking
# scheduler loop, which only returns when the user interrupts it.
if __name__ == '__main__':
    scrape_dog_data()
    schedule_job()  # Then start the scheduled job



Fetching data from index: 0
Fetching data from index: 30
Fetching data from index: 60
Fetching data from index: 90
Fetching data from index: 120
Fetching data from index: 150
Fetching data from index: 180
Fetching data from index: 210
Fetching data from index: 240
Fetching data from index: 270
Fetching data from index: 300
Fetching data from index: 330
HTTP error occurred: 500 Server Error: Internal Server Error for url: https://24petconnect.com/CLTAdopt?index=330&at=DOG
Error fetching page. Exiting loop.
Total dogs fetched: 309
Existing data before filtering: (10815, 21)
Data after filtering out today's date: (10506, 21)
Data updated and saved.
Successfully authenticated and accessed the sheet.
Data saved to Google Sheet.
Scheduled job to run every 30 mins...
Script interrupted by user, exiting.
