This notebook scrapes expired-auction listings from platinumequineauction.com. For each listing it collects contact info, location, breed, foal date, sex, color, height, markings, and registration status, along with the price text and the listing URL.

In [5]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

In [6]:
def getNumPages(url):
    """Return the number of result pages advertised by the pagination on `url`.

    The page is fetched and every ``<a class="page-numbers">`` link is read.
    The largest numeric label is taken as the page count, so the result does
    not depend on where a non-numeric "next"/"previous" link sits in the list.

    Parameters
    ----------
    url : str
        Address of a paginated listing page.

    Returns
    -------
    int
        Highest page number found, or 1 when the page has no numeric
        pagination links (a single-page listing).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    # Without a timeout requests can block forever on a stalled connection.
    getReq=requests.get(url, timeout=30)
    getReq.raise_for_status()
    soup = BeautifulSoup(getReq.content, 'html.parser')
    pages=soup.find_all('a', class_='page-numbers')

    pageNumbers=[]
    for a in pages:
        # Drop thousands separators ("1,234") before testing for a number.
        label=a.get_text(strip=True).replace(',', '')
        if label.isdecimal():
            pageNumbers.append(int(label))

    # No numeric links means there is nothing beyond the first page.
    return max(pageNumbers, default=1)

In [7]:
def getColumns(url):
    """Fetch `url` and return the attribute labels found on it.

    Labels are the stripped text of every
    ``<th class="woocommerce-product-attributes-item__label">`` element, in
    document order. Returns an empty list when the page has none.
    """
    response = requests.get(url)
    document = BeautifulSoup(response.content, 'html.parser')

    labels = []
    for header in document.find_all('th', class_='woocommerce-product-attributes-item__label'):
        labels.append(header.get_text(strip=True))
    return labels

In [37]:
def determineLocality(dic, nearby_states=None):
    """Tell whether a listing's location falls in one of the nearby states.

    The state is read from the text after the first comma of
    ``dic['Location']`` (e.g. ``"Howell, MI"`` -> ``"MI"``). Matching ignores
    case and tolerates trailing text such as a ZIP code (``"NJ 08540"``).

    Parameters
    ----------
    dic : dict
        Listing record; only the ``'Location'`` key is consulted.
    nearby_states : iterable of str, optional
        State codes counted as local. Defaults to NJ, NY, PA, DE and CT.

    Returns
    -------
    bool
        True when the extracted state is in `nearby_states`. False when the
        location is missing, empty, not a string, has no comma, or names a
        different state.
    """
    if nearby_states is None:
        nearby_states = ['NJ', 'NY', 'PA', 'DE', 'CT']
    wanted = {str(code).strip().upper() for code in nearby_states}

    location = dic.get('Location')
    # Missing values can arrive as None or NaN (e.g. from a DataFrame row);
    # neither can be split, so treat anything that is not text as non-local.
    if not isinstance(location, str) or not location.strip():
        return False

    parts = location.split(",")
    if len(parts) < 2:
        # Only the location is echoed: the full record holds contact names
        # and phone numbers that should not be dumped into notebook output.
        print(f"Warning: Unable to extract state from location '{location}'")
        return False

    segment = parts[1].strip().upper()
    tokens = segment.split()
    # Accept either the whole segment ("NJ") or its first word ("NJ 08540").
    candidates = {segment}
    if tokens:
        candidates.add(tokens[0])
    return bool(candidates & wanted)

In [53]:
def _attributeValueText(td):
    """Return the text of a value cell: its first <p> if present, else the whole cell."""
    paragraph = td.find("p")
    return (paragraph if paragraph is not None else td).get_text(strip=True)


def scrapePage(pageNo, num_keys=9):
    """Scrape one page of the expired-auctions archive into listing records.

    Each record maps the capitalized attribute labels shown for a listing to
    their values and adds four extra keys: ``Price`` (raw price text), ``Url``
    (link to the listing), ``Page`` (`pageNo`) and ``Is_Local`` (result of
    ``determineLocality``).

    Listing boundaries are taken from the ``<table>`` that encloses each label
    cell, so listings may have differing numbers of attributes. This assumes
    the site renders one attributes table per listing (TODO confirm against
    the live markup). If any label cell has no enclosing table, the function
    falls back to cutting the attributes into groups of `num_keys`.

    Parameters
    ----------
    pageNo : int
        1-based page number of the archive.
    num_keys : int, optional
        Attributes per listing, used only by the fixed-size fallback.

    Returns
    -------
    list of dict
        One dict per listing, in page order. ``Price`` / ``Url`` are None
        when the page exposes fewer prices or links than listings.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    url=f"https://platinumequineauction.com/expired-auctions/page/{pageNo}"
    # Without a timeout requests can block forever on a stalled connection.
    r=requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    #extract the attributes, values, prices, link to more information
    tableAttributes=soup.find_all('th',class_='woocommerce-product-attributes-item__label')
    tableValues=soup.find_all('td',class_='woocommerce-product-attributes-item__value')
    biddingValue=soup.find_all('span', class_='price')
    listingUrls=soup.find_all('a', class_='woocommerce-LoopProduct-link woocommerce-loop-product__link')
    hrefs = [link['href'] for link in listingUrls if link.has_attr('href')]

    #parse text
    biddingValueTexts=[span.get_text(strip=True) for span in biddingValue]
    # Labels and values are paired by position in the document.
    pairs = [
        (th.get_text(strip=True).capitalize(), _attributeValueText(td))
        for th, td in zip(tableAttributes, tableValues)
    ]
    owners = [th.find_parent('table') for th in tableAttributes[:len(pairs)]]

    #split the flat attribute list into one group per listing
    listings = []
    if pairs and all(owner is not None for owner in owners):
        previous = None
        for pair, owner in zip(pairs, owners):
            # Identity, not equality: two tables with identical content
            # compare equal but are still different listings.
            if owner is not previous:
                listings.append([])
                previous = owner
            listings[-1].append(pair)
    else:
        listings = [pairs[i:i + num_keys] for i in range(0, len(pairs), num_keys)]

    if len(listings) > min(len(biddingValueTexts), len(hrefs)):
        print(
            f"Warning: page {pageNo} has {len(listings)} listings but only "
            f"{len(biddingValueTexts)} prices and {len(hrefs)} links; "
            "missing values are recorded as None"
        )

    #build results list for this page
    pageResults=[]
    for index, attributes in enumerate(listings):
        dictionary = dict(attributes)
        # Prices and links are matched to listings by their order on the page.
        dictionary["Price"] = biddingValueTexts[index] if index < len(biddingValueTexts) else None
        dictionary["Url"] = hrefs[index] if index < len(hrefs) else None
        dictionary["Page"]=pageNo
        dictionary["Is_Local"]=determineLocality(dictionary)
        pageResults.append(dictionary)
    return pageResults


In [39]:
# Landing page of the expired-auctions archive.
baseUrl="https://platinumequineauction.com/expired-auctions"
# Number of archive pages to scrape, read from the pagination links on baseUrl.
numPages=getNumPages(baseUrl)

In [54]:
# Collect the listing records of every archive page into one flat list.
allResults = []
for pageNo in range(1, numPages + 1):
    print(f"Scraping page {pageNo} of {numPages}...")
    # One-second pause before each request (presumably to go easy on the site).
    time.sleep(1)
    allResults.extend(scrapePage(pageNo))
    


Scraping page 1 of 153...
20
Scraping page 2 of 153...
20
Scraping page 3 of 153...
20
Scraping page 4 of 153...
20
Scraping page 5 of 153...
20
Scraping page 6 of 153...
20
Scraping page 7 of 153...
20
Scraping page 8 of 153...
20
Scraping page 9 of 153...
20
Scraping page 10 of 153...
20
Scraping page 11 of 153...
20
Scraping page 12 of 153...
20
Scraping page 13 of 153...
20
Scraping page 14 of 153...
20
Scraping page 15 of 153...
20
Scraping page 16 of 153...
20
Scraping page 17 of 153...
20
Scraping page 18 of 153...
20
Scraping page 19 of 153...
20
Scraping page 20 of 153...
20
Scraping page 21 of 153...
20
Scraping page 22 of 153...
20
Scraping page 23 of 153...
20
Scraping page 24 of 153...
20
Scraping page 25 of 153...
20
Scraping page 26 of 153...
20
Scraping page 27 of 153...
20
Scraping page 28 of 153...
20
Scraping page 29 of 153...
20
Scraping page 30 of 153...
20
Scraping page 31 of 153...
20
Scraping page 32 of 153...
20
Scraping page 33 of 153...
20
Scraping page 34 of

In [55]:
# One row per scraped listing; a listing that lacks an attribute gets NaN in that column.
df=pd.DataFrame(allResults)

In [56]:
# Preview the first rows to sanity-check the scraped columns.
df.head()

Unnamed: 0,Contact,Location,Breed,Registered,Foal date,Sex,Color,Height,Markings,Price,Url,Page,Is_Local,Name,Consignor,For more info call
0,Ellesse Schwartz248-390-6831,"Howell, MI",Miniature Horse,Yes,2017,Gelding,Chestnut,9.2,4 Stockings,Reserve price Not met!,https://platinumequineauction.com/product/aloh...,1,False,,,
1,Longhorn RanchDina 831-537-1915,"Afton, WY",Quarter Horse,No,2017,Gelding,Palomino,13.3,,"Winning Bid:$7,250.00",https://platinumequineauction.com/product/herb...,1,False,,,
2,Maxwell Quality HorsesLandon 704-689-9035,"Cherryville, NC",Quarter Horse,No,2018,Gelding,Bay,15.1,4 Socks,"Winning Bid:$4,450.00",https://platinumequineauction.com/product/ty/,1,False,,,
3,Buster HorsesAshley 214-605-7174,"Weatherford, TX",Friesian Sport Horse,Yes,2018,Gelding,Black,15.2,,"Winning Bid:$10,450.00",https://platinumequineauction.com/product/coun...,1,False,,,
4,Trent Barnett662-927-0796,"Calhoun City, MS",Quarter Horse,Yes,2020,Mare,Bay,14.3,Blaze,"Winning Bid:$7,000.00",https://platinumequineauction.com/product/swee...,1,False,,,


In [59]:
def determine_metrics_overall(df, breed=None, local=None, registered=None ):
    """Print mean, median and minimum sale price for a filtered set of listings.

    The first number found in each ``Price`` text (e.g.
    ``"Winning Bid:$7,250.00"`` -> 7250.0) is used as the price. Rows whose
    price text holds no number (e.g. ``"Reserve price Not met!"``) are left
    out of the statistics. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings with ``Price``, ``Breed`` and ``Registered`` columns, plus
        ``Is_Local`` when `local` is given.
    breed : str, optional
        Keep rows whose ``Breed`` contains this pattern (case-insensitive,
        interpreted as a regular expression by ``str.contains``).
    local : bool, optional
        Keep rows whose ``Is_Local`` equals this value.
    registered : str, optional
        Keep rows whose ``Registered`` contains this pattern
        (case-insensitive, regular expression).

    Returns
    -------
    None
        The statistics are printed. A short notice is printed instead when
        no row with a numeric price survives the filters.
    """
    # Work on a copy: converting Price and filling Breed/Registered on the
    # caller's frame would silently change what later cells see.
    listings = df.copy()

    #extract numerical value of the price
    price_text = listings['Price'].astype(str).str.extract(r'(\d[\d,\.]*)')[0]
    price_text = price_text.replace({',': ''}, regex=True)
    # Coerce so a malformed capture (e.g. "7250.00.") becomes NaN, not a crash.
    listings['Price'] = pd.to_numeric(price_text, errors='coerce').astype(float)
    listings['Registered'] = listings['Registered'].fillna('').astype(str)  # cSpell:ignore fillna
    listings['Breed'] = listings['Breed'].fillna('').astype(str)

    if breed is not None:
        listings = listings[listings['Breed'].str.contains(breed, case=False, na=False)]
    if local is not None:
        listings = listings[listings['Is_Local'] == local]
    if registered is not None:
        listings = listings[listings['Registered'].str.contains(registered, case=False, na=False)]

    prices = listings['Price'].dropna()
    if prices.empty:
        # Avoid printing "$nan" three times when nothing matches.
        print("No listings with a numeric price match the given filters.")
        return

    print(f"Mean Price: ${prices.mean():.2f}")
    print(f"Median Price: ${prices.median():.2f}")
    print(f"Minimum Price: ${prices.min():.2f}")

In [60]:
# Example query: price statistics for local listings whose breed contains "Quarter" and whose Registered value contains "No".
determine_metrics_overall(df, breed='Quarter', local=True, registered='No')


Mean Price: $7667.21
Median Price: $6675.00
Minimum Price: $3750.00


In [61]:
# Persist the listings without the row index. The Contact column holds seller names and phone numbers, so handle the file accordingly.
df.to_csv('platinum_auction.csv', index=False)