This notebook job pulls data from horsebid.com

In [217]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import re
from selenium.common.exceptions import TimeoutException




In [225]:
def getDriver():
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    return driver

In [227]:
def getNumPages(url, driver):
    driver.get(url)
    number=None
    try:
        # Adjust the selector based on your element's class or ID
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "filter-item.pages-wrap.ng-binding"))
        )
        # Extract the text content
        number = re.findall(r'\d+', element.text)  # Finds all sequences of digits
        return number
    except Exception as e:
        print(f"Error while visiting {url}: {e}")

    


In [229]:
def getNumPagesAuction(url, driver):
    driver.get(url)
    number=None
    try:
        # Adjust the selector based on your element's class or ID
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "pages-label"))
        )
        # Extract the text content
        number = re.findall(r'\d+', element.text)  # Finds all sequences of digits
        return number
    except Exception as e:
        print(f"Error while visiting {url}: {e}")

In [199]:
def getAuctionsUrl(page, driver):
    curUrl=f"https://bid.horsebid.com/auctions/past?view=grid&page={page}&limit=20"
    driver.get(curUrl)
    try:
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "btn-custom")))
        # Find all elements with the class "btn-custom"
        link_elements = driver.find_elements(By.CLASS_NAME, "btn-custom")
        # Filter links whose text contains both "View" and "Items"
        filtered_links = []
        for link_element in link_elements:
            text = link_element.text
            if "VIEW" in text and "ITEMS" in text:
                href = link_element.get_attribute('href')  # Get the href attribute
                filtered_links.append((text,href))        
        urls=[]
        for text, href in filtered_links:
            urls.append(href)
        return urls 
    except Exception as e:
        print(f"Error while visiting {curUrl}: {e}")

  

In [200]:
def getHorseUrl(baseURL, page, driver):
    curUrl=f"{baseURL}?page={page}&limit=36"
    driver.get(curUrl)
    try:
        # Find all elements with the class "btn-custom"
        link_elements = driver.find_elements(By.CSS_SELECTOR, "a.block-link.ng-binding")
        hrefs=[]
        for link in link_elements:
            href=link.get_attribute("href")
            hrefs.append(href)
        return hrefs
    except Exception as e:
        print(f"Error while visiting {curUrl}: {e}")
  

In [234]:
def getHorsePrice(url, driver):
    driver.get(url)
    try:
        price = driver.find_element(By.CSS_SELECTOR, "span.sold-amount.amount.ng-binding")
        numeric_part = re.sub(r"[^\d.]", "", price.text)
        numeric_value = float(numeric_part)
        return numeric_value
    except TimeoutException:
        print(f"Timeout while trying to visit {url}")
    except Exception as e:
        print(f"Error while getting price at {url}: {e}")

In [202]:
def getHorseMetadata(url, driver):
    driver.get(url)
    try:
        # Adjust the selector based on your element's class or ID
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "description"))
        )
        #class="pages-label ng-binding"
        # Extract the text content
        text=element.text
        keyWords=["COLOR", "BREED", "LOCATION", "AGE"]
        cols=["Consignor contact", "Location","Breed", "Registered", "Registered name", "Color", "Height", "Age", "Gender"]
       
        if all(keyword in text for keyword in keyWords):
            #extract the consignor contact, location, breed, registered, color, height, age, and gender 
            lines = text.strip().split("\n")
            # Initialize an empty dictionary
            attributes_dict = {}

            # Loop through each line, split it by ": ", and add the key-value pair to the dictionary
            for line in lines:
                if ":" in line:
                    parts=line.split(": ",1)
                    if len(parts)==2:
                        key, value = parts
                        # Optionally, make the key more readable (e.g., capitalizing the first letter)
                        key = key.strip()
                        key=key.capitalize()
                        if key in cols:
                            value = value.strip()
                            attributes_dict[key] = value
            if (len(attributes_dict)>0):
                attributes_dict["Url"]=url
                attributes_dict["Price"]=getHorsePrice(url, driver)
            return attributes_dict
        else:
            print("insufficient information on page", url)
            return 
    except TimeoutException:
        print(f"Timeout while trying to visit {url}")
    except Exception as e:
        print(f"Error while visiting {url}: {e}")



    

In [230]:
baseUrl='https://bid.horsebid.com/auctions/past'
driver=getDriver()
noPages=int(getNumPages(baseUrl, driver)[0])
print(noPages)

7


In [231]:
auctionURLS=[]
for page in range(1,noPages+1):
    urlsForPage=getAuctionsUrl(page, getDriver())
    auctionURLS= auctionURLS+urlsForPage

In [232]:
allHorseUrls=[]
for auction in auctionURLS:
    #1. get the number of pages for this auction
    #2. for each page, get the horse urls
    #3. for each horse url, navigate to the page and extract the metadata about the horse
    noPages=getNumPagesAuction(auction, driver)
    noPages=int(noPages[0])
    # listOfHorseMeta=[]
    for page in range(1,noPages+1):
        #get the url of the horse
        horseURLs=getHorseUrl(auction, page, driver)
        allHorseUrls=allHorseUrls+horseURLs

print(f"have {len(allHorseUrls)} to go through ", allHorseUrls)
        


have 3930 to go through  ['https://bid.horsebid.com/lots/view/4-FIOBX5/how-it-works-the-how-tos-of-getting-started', 'https://bid.horsebid.com/lots/view/4-FIOC9E/-', 'https://bid.horsebid.com/lots/view/4-FIOA5P/indigo', 'https://bid.horsebid.com/lots/view/4-FIOA7F/swayze', 'https://bid.horsebid.com/lots/view/4-FIOA55/flash', 'https://bid.horsebid.com/lots/view/4-FIOA6N/president', 'https://bid.horsebid.com/lots/view/4-FIOA73/silver-bullet', 'https://bid.horsebid.com/lots/view/4-FIOA4J/bachelor', 'https://bid.horsebid.com/lots/view/4-FIOA6T/rita', 'https://bid.horsebid.com/lots/view/4-FIOA69/miss-quackers', 'https://bid.horsebid.com/lots/view/4-FIOA79/snoopy', 'https://bid.horsebid.com/lots/view/4-FIOA4P/biggin', 'https://bid.horsebid.com/lots/view/4-FIOA5X/legend', 'https://bid.horsebid.com/lots/view/4-FIOA5V/las-vegas', 'https://bid.horsebid.com/lots/view/4-FIOA5F/grizzly', 'https://bid.horsebid.com/lots/view/4-FIOA5D/goose', 'https://bid.horsebid.com/lots/view/4-FIOA7X/turtle', 'http

In [235]:
#take the list of all horse urls and process metadata
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
horseMetaList=[]
for url in allHorseUrls:
    horseMeta=getHorseMetadata(url, driver)
    if(horseMeta):
        horseMetaList.append(horseMeta)

insufficient information on page https://bid.horsebid.com/lots/view/4-FIOBX5/how-it-works-the-how-tos-of-getting-started
insufficient information on page https://bid.horsebid.com/lots/view/4-FIOC9E/-
Error while getting price at https://bid.horsebid.com/lots/view/4-FIOA5D/goose: Message: no such element: Unable to locate element: {"method":"css selector","selector":"span.sold-amount.amount.ng-binding"}
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000102d6a138 cxxbridge1$str$ptr + 3653888
1   chromedriver                        0x0000000102d62988 cxxbridge1$str$ptr + 3623248
2   chromedriver                        0x00000001027c8968 cxxbridge1$string$len + 89228
3   chromedriver                        0x000000010280cd4c cxxbridge1$string$len + 368752
4   chromedriver                        0

In [160]:
print(len(horseMetaList))

3719


In [161]:
df=pd.DataFrame(horseMetaList)
df.to_csv('horsebid.csv', index=False)

In [216]:
driver.quit()