In [1]:
import os
import sys
import time
import math
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
players = pd.read_csv('players.csv',index_col = 0)
players = players.loc[players.pick_no <= 30].reset_index(drop = True)

In [12]:
PAGE_MAX = 250
LOTS_URL = "https://www.psacard.com/auctionprices/GetItemLots"
SEARCH_URL = "https://www.psacard.com/auctionprices/Search"
SET_NAME = 'Bowman Chrome Prospects'
CARD_SEARCH_CSV_FN = 'bcbb_card_search.csv'
CARD_LOTS_CSV_FN = 'bcbb_card_lots.csv'
CARD_CATEGORY_ID = 1

In [13]:
def scrape_lots(card_id):

    total_sales, sales, curr_page = 1, [], 1
    while total_sales > len(sales):

        form_data = {
            "specID": str(card_id),
            "draw": curr_page,
            "start": PAGE_MAX * (curr_page - 1),
            "length": PAGE_MAX
        }

        with requests.Session() as sess:
            response = sess.post(LOTS_URL, data=form_data)
            response.raise_for_status()
            lots_data = response.json()

        sales += lots_data["data"]
        total_sales = lots_data['recordsTotal']
        curr_page += 1

    return sales

def check_value_in_csv(file_path, target_value):
    with open(file_path, 'r') as f:
        for line in f:
            if target_value in line:
                return True
    return False

def scrape_search(search_term):
    POST_DATA = {
        'draw': 1,
        'filterCategoryID': CARD_CATEGORY_ID, #basketball cards
        'pageNumber': 1,
        'pageSize': PAGE_MAX,
        'search': search_term,
        'pricesOnly': True,
        'searchSequence': 1
    }

    with requests.Session() as sess:
        response = sess.post(SEARCH_URL, data=POST_DATA)
        response.raise_for_status()
        search_results = response.json()
        
    return search_results

def normalize_name(name):
    """Normalize a name for comparison."""
    name = name.lower()
    for term in ['mr', 'mrs', 'ms', 'dr', 'jr', 'sr', '.']:
        name = name.replace(term, '')
    return ' '.join(name.split())  # Remove extra whitespace

In [14]:
if os.path.exists(CARD_SEARCH_CSV_FN) == False:
    df = pd.DataFrame(columns = ['CardNumber', 'CategoryName', 'CategoryNameSEO', 'HeadingID',
                                 'ItemNameSEO', 'LotsFound', 'SetName', 'SetNameSEO', 'SpecDescription',
                                 'SpecID', 'SpecSubjectName', 'SportCategoryID', 'Variety',
                                 'YearIssued'])
else:
    df = pd.read_csv(CARD_SEARCH_CSV_FN)

In [28]:
search_term = SET_NAME + ' autograph'
search_term = "2019 Bowman Prospect Autographs Chrome julio rodriguez"

search_results = scrape_search(search_term)

In [29]:
df = pd.DataFrame(search_results['data'])

In [30]:
df.SetNameSEO.unique()

array(['bowman-dual-prospect-autographs-chrome',
       'bowman-prospect-autographs-chrome', 'bowman-prospects-chrome',
       'bowman-sterling-prospect-autographs',
       'bowman-chrome-2018-afl-fall-stars-autographs',
       'bowman-chrome-prime-autograph', 'bowman-chrome-prospects',
       'bowman-chrome-prospects-autographs'], dtype=object)

min_ann_trade_ct = df.LotsFound/(2024 - df.YearIssued.astype(int)) > 10

set_search = df['SetNameSEO'].apply(lambda x: all([y in x for y in ['bowman','chrome','prospect','autograph']])&
                    all(y not in x for y in ['dual']))
df.loc[min_ann_trade_ct&set_search]

## BGS

In [281]:
BASE_URL = "https://www.beckett.com"
SET_ITEMS_ENDPOINT = "/detail_services/get_items_pricing_by_set_id"

SET_EX = 16663420

In [283]:
POST_DATA = {
    'set_id': SET_EX,
    'offset': 2,
    'pricing_type': 'default_grid'
}

with requests.Session() as sess:
    response = sess.post(BASE_URL + SET_ITEMS_ENDPOINT, data=POST_DATA, headers=headers)
    response.raise_for_status()
    print(response.status_code)
    results = response.json()

200


JSONDecodeError: Extra data: line 1 column 33248 (char 33247)

In [297]:
response.content

b'[{"result_type_id":"59_16663427","disp_title":"<div class=\\"description\\"><a href=\\"https:\\/\\/www.beckett.com\\/baseball\\/2019\\/bowman-chrome-prospect-autographs\\/cpaac-aaron-civale-16663427\\">2019 Bowman Chrome Prospect Autographs #CPAAC Aaron Civale<\\/a><\\/div><div class=\\"team\\"><strong>Team:<\\/strong> Cleveland Indians<\\/div><div class=\\"ser-au-icon\\"><ul><li class=\\"au\\">au<\\/li><\\/ul><\\/div>","photo_url":"<img title=\'item image\' width=\'28\' height=\'39\' class=\'img-responsive\' src=\'https:\\/\\/www.beckett.com\\/timthumb.php?src=https:\\/\\/d3mop092koehdk.cloudfront.net\\/images\\/no-image-new.jpg&w=28&h=39&q=90\'>","print_run":"-","price_high":"<a class=\'currency_alert\' href=\\"https:\\/\\/www.beckett.com\\/subscriptions\\/buy\\/item\\/16663427\\">Buy Pricing<\\/a>","price_low":"<a class=\'currency_alert\' href=\\"https:\\/\\/www.beckett.com\\/subscriptions\\/buy\\/item\\/16663427\\">Buy Pricing<\\/a>","price_available":1,"ser":"-","actions":"<div 

In [None]:
all_cards_list = []

for i in range(len(results)):
    soup = BeautifulSoup(results[i]['disp_title'])
    card_link = soup.find('a', href=True)['href']
    card_name = soup.find('a', href=True).text
    link_details = card_link[card_link.find(r'.com') + 5 :].split('/')
    card_info = {
        'card_name': card_name,
        'card_id': link_details[-1].split('-')[-1], 
        'card_id_long': link_details[-1], 
        'card_set': link_details[-2], 
        'card_year': link_details[-3], 
        'sport': link_details[0]
    }
    all_cards_list.append(card_info)

# pd.DataFrame(all_cards_list)

In [242]:
response = requests.get("https://www.beckett.com/baseball/2019/bowman-chrome-prospect-autographs/", headers = headers)

In [267]:
soup = BeautifulSoup(response.text)
stats_zone = soup.find('div', attrs={'class': 'statsZoneBox'}).find_all('li')#[1]
while stats_zone[0].text != 'Total Cards':
    stats_zone = stats_zone[1:]
total_cards = int(stats_zone[1].text.strip())

In [270]:
MAX_OFFSET = math.ceil(total_cards/20) - 1
MAX_OFFSET

7

In [181]:
import requests
COOKIES  ="ga_campaign=Twitter; ga_medium=Twitter+Traffic; ga_source=Twitter; PHPSESSID=o7vqridf99aok51nf05op19785; ga_campaign=Twitter; ga_medium=Twitter+Traffic; ga_source=Twitter; _gid=GA1.2.434806967.1696789489; __attentive_id=575c6453b4974989bcf4d07a98e10f20; _attn_=eyJ1Ijoie1wiY29cIjoxNjk2Nzg5NDg4OTg3LFwidW9cIjoxNjk2Nzg5NDg4OTg3LFwibWFcIjoyMTkwMCxcImluXCI6ZmFsc2UsXCJ2YWxcIjpcIjU3NWM2NDUzYjQ5NzQ5ODliY2Y0ZDA3YTk4ZTEwZjIwXCJ9In0=; __attentive_cco=1696789488989; _tt_enable_cookie=1; _ttp=B05JVJUTsKPkTizHKvqZxD8RTuf; _clck=kywqlq|2|ffo|0|1376; __attentive_dv=1; __qca=P0-716074945-1696789488870; hubspotutk=e02472ad1c0a25bd23a2d9b04a3e9209; __hssrc=1; last_post_id=294898; last_post_id=294898; bk_session=a%3A4%3A%7Bs%3A10%3A%22session_id%22%3Bs%3A32%3A%225deb8e318bdba5b6d96902196c12b9cc%22%3Bs%3A10%3A%22ip_address%22%3Bs%3A12%3A%2269.126.47.83%22%3Bs%3A10%3A%22user_agent%22%3Bs%3A50%3A%22Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+Ap%22%3Bs%3A13%3A%22last_activity%22%3Bs%3A10%3A%221696799871%22%3B%7D90c6cc48d42b92cc656d9e012995488c; __attentive_ss_referrer=https://www.google.com/; __hstc=239519004.e02472ad1c0a25bd23a2d9b04a3e9209.1696789490108.1696789490108.1696799873094.2; attntv_mstore_email=limjoe25@gmail.com:0; mybb[lastvisit]=1696799909; mybb[lastactive]=1696799909; loginattempts=1; mybbuser=1691108_JZTtB2AVeDQZcMsCBk7LxufK7DhHhCY3zjMihB6OjtJBqN76ah; sid=44b98aef753f57a4d749c83a85ad414f; mmr=QDkzLFE4Q2BQOUYsWSwjOUMtRihSLFNAUSxDNFktNiRYLCYwUDk2KUQuMyRgCmAK; __unam=94894f2-18b1129e4a5-dfe8b7f-2; _gcl_au=1.1.143432214.1696789489.934266431.1696799912.1696800209; _ga=GA1.1.76779320.1696789489; _uetsid=f6ec9690660711ee929e87f8fca600b8; _uetvid=f6ec7ce0660711ee988b2764d338eb14; __attentive_pv=11; __hssc=239519004.10.1696799873094; _clsk=nrhwfx|1696800210094|12|1|v.clarity.ms/collect; _ga_VMXYB2L7XG=GS1.1.1696799872.2.1.1696800244.24.0.0"
COOKIES = "ga_campaign=Twitter; ga_medium=Twitter+Traffic; ga_source=Twitter; PHPSESSID=o7vqridf99aok51nf05op19785; ga_campaign=Twitter; ga_medium=Twitter+Traffic; ga_source=Twitter; _gid=GA1.2.434806967.1696789489; __attentive_id=575c6453b4974989bcf4d07a98e10f20; _attn_=eyJ1Ijoie1wiY29cIjoxNjk2Nzg5NDg4OTg3LFwidW9cIjoxNjk2Nzg5NDg4OTg3LFwibWFcIjoyMTkwMCxcImluXCI6ZmFsc2UsXCJ2YWxcIjpcIjU3NWM2NDUzYjQ5NzQ5ODliY2Y0ZDA3YTk4ZTEwZjIwXCJ9In0=; __attentive_cco=1696789488989; _tt_enable_cookie=1; _ttp=B05JVJUTsKPkTizHKvqZxD8RTuf; _clck=kywqlq|2|ffo|0|1376; __attentive_dv=1; __qca=P0-716074945-1696789488870; hubspotutk=e02472ad1c0a25bd23a2d9b04a3e9209; __hssrc=1; last_post_id=294898; last_post_id=294898; __hstc=239519004.e02472ad1c0a25bd23a2d9b04a3e9209.1696789490108.1696789490108.1696799873094.2; attntv_mstore_email=limjoe25@gmail.com:0; mybb[lastvisit]=1696799909; mybb[lastactive]=1696799909; loginattempts=1; mybbuser=1691108_JZTtB2AVeDQZcMsCBk7LxufK7DhHhCY3zjMihB6OjtJBqN76ah; sid=44b98aef753f57a4d749c83a85ad414f; mmr=QDkzLFE4Q2BQOUYsWSwjOUMtRihSLFNAUSxDNFktNiRYLCYwUDk2KUQuMyRgCmAK; common_web_guid=683c7a54-1175-6668-9967-2b80b64503ad; _ga_2KH2PDC5RQ=GS1.2.1696803598.1.0.1696803598.60.0.0; bktShield=1; __attentive_ss_referrer=https://www.beckett.com/baseball/2019/bowman-chrome-prospect-autographs/cpajro-julio-rodriguez-16663484; bk_session=a%3A4%3A%7Bs%3A10%3A%22session_id%22%3Bs%3A32%3A%2284b9addd6441620acbed8aca63ca1c19%22%3Bs%3A10%3A%22ip_address%22%3Bs%3A12%3A%2269.126.47.83%22%3Bs%3A10%3A%22user_agent%22%3Bs%3A50%3A%22Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_15_7%29+Ap%22%3Bs%3A13%3A%22last_activity%22%3Bs%3A10%3A%221696804317%22%3B%7D67993a121eb8579187272f6e636b9119; _gcl_au=1.1.143432214.1696789489.934266431.1696799912.1696804368; _uetsid=f6ec9690660711ee929e87f8fca600b8; _uetvid=f6ec7ce0660711ee988b2764d338eb14; _ga=GA1.1.76779320.1696789489; __attentive_pv=34; __hssc=239519004.31.1696799873094; _clsk=nrhwfx|1696804368682|41|1|v.clarity.ms/collect; _ga_VMXYB2L7XG=GS1.1.1696799872.2.1.1696804620.60.0.0; __unam=94894f2-18b1129e4a5-dfe8b7f-10"
headers = {
    'Accept': 'text/html, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Dnt': '1',
    'Referer': 'https://www.beckett.com/baseball/2019/bowman-chrome-prospect-autographs/cpajro-julio-rodriguez-16663484',
    'Sec-Ch-Ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"macOS"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    # Include sanitized cookies here
    'Cookie': COOKIES
}
PAGE_NO = 0
url = f'https://www.beckett.com/pgs_search/ajax_mkt_get_list/16663484/{PAGE_NO}/desc/datesold/1/item'
url = "https://www.beckett.com/baseball/2019/bowman-chrome-prospect-autographs/cpajro-julio-rodriguez-16663484"
response = requests.get(url, headers=headers)
print(response.status_code)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    result_pages = soup.find_all('li')
    total_pages = int(result_pages[-2].text)
    data = [[y.text.strip() for y in x.find_all('td') ]for x in soup.find('table').find_all('tr') if x.find('td') != None]



200


ValueError: invalid literal for int() with base 10: '\n\n\n'

In [104]:
soup = BeautifulSoup(response.text, 'html.parser')

In [156]:
detail_list = soup.find_all(lambda tag: tag.name and "Sport:" in tag.get_text())[-1]

while detail_list.find('ul') == None:
    detail_list = detail_list.parent

item_details = {}    
for x in detail_list.find_all('li'):
    attr_name = x.find('strong').text[:-1]
    try:
        attr_value = x.find('a').text
    except:
        attr_value = x.text.replace(attr_name + ': ', '')
    item_details[attr_name] = attr_value

In [188]:
url = "https://www.beckett.com/baseball/2019/bowman-chrome-prospect-autographs/"
response = requests.get(url, headers = headers)

In [189]:
soup = BeautifulSoup(response.text)

In [190]:
soup

<!DOCTYPE html>
<html itemscope="" itemtype="https://schema.org/WebPage" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;" name="viewport"/>
<meta content="https://d3mop092koehdk.cloudfront.net/images/beckett_logo_128x128.png" itemprop="image"/>
<meta content="detailhttps%3A%2F%2Fwww.beckett.com%2Fonline-price-guide%3Frefer%3DOTk5OTk0MDQxMzQ5MzAxMA%3D%3D%26tm%3D1696804702" property="og:url"/>
<meta content="website" property="og:type"/>
<meta content="Beckett - Most Trusted Name in Grading, Authentication &amp; Pricing" property="og:title"/>
<meta content="With over 35 years of experience, Beckett Collectibles is a one-stop shop for all your card grading, authentication, pricing, and marketplace needs. Visit the website to learn more about our newly launched Beckett Vaulting Service." property="og:description"/>
<meta content="https://www.beckett.com

if os.path.exists(CARD_SEARCH_CSV_FN) == False:
    df = pd.DataFrame(columns = ['CardNumber', 'CategoryName', 'CategoryNameSEO', 'HeadingID',
                                 'ItemNameSEO', 'LotsFound', 'SetName', 'SetNameSEO', 'SpecDescription',
                                 'SpecID', 'SpecSubjectName', 'SportCategoryID', 'Variety',
                                 'YearIssued'])
    
else:
    df = pd.read_csv(CARD_SEARCH_CSV_FN)
    
for i, row in players.iterrows():
    normalized_player_name = normalize_name(row.player_name)
    player_match = df.loc[df.SpecSubjectName.apply(lambda x: normalize_name(x)) == normalized_player_name]

    if len(player_match) == 0:
        search_term = f'{row.draft_year} {SET_NAME} {row.player_name}'
        search_results = scrape_search(search_term)
        
        df_temp = pd.DataFrame(search_results['data'])
        df_temp = df_temp.loc[
            (df_temp.SpecSubjectName.apply(lambda x: normalize_name(x)) == normalized_player_name) & # player name exact
            (df_temp.SetNameSEO == SET_NAME.lower().replace(' ','-')) & # set name match
            (df_temp.LotsFound >= 1)  # traded only
        ] # basketball cards only filtered by search
        # rookie cards only
        if len(df_temp) > 0:
            df_temp = df_temp.loc[df_temp.YearIssued == df_temp.YearIssued.astype(int).min().astype(str)]
            df = pd.concat([df, df_temp], ignore_index=True)
            df.to_csv(CARD_SEARCH_CSV_FN, index=False)

        print(f'scraped {len(df_temp)} {SET_NAME} cards for {row.player_name} {CARD_SEARCH_CSV_FN} length is now {len(df) + len(df_temp)}')
    time.sleep(0.25)

df = pd.read_csv(CARD_SEARCH_CSV_FN)

df.YearIssued = df.YearIssued.astype(int)
df['YearsFromIssue'] = 2023 - df.YearIssued
df['TotalLotsFound'] = df.groupby('ItemNameSEO').LotsFound.transform('sum')
df['MaxLotsFoundSingleCard'] = df.groupby('ItemNameSEO').LotsFound.transform('max')

df['MaxLotsFoundSingleCardPerYear'] = df['MaxLotsFoundSingleCard']/df['YearsFromIssue']
df['LotsFoundPerYear'] = df['TotalLotsFound']/df['YearsFromIssue']

""" 
Only looking at players who's most traded prizm card sold an average of 50 times or more per year
This most traded card will play role as the reference card for pricing 
And the threshold will serve to insure some type of accuracy of said reference price
"""

card_universe = df.loc[df.MaxLotsFoundSingleCardPerYear >= 50]
print(card_universe.LotsFound.sum())

if os.path.exists(CARD_LOTS_CSV_FN) == False:
    df_lots = pd.DataFrame(columns = [['SpecID', 'AuctionItemID', 'AuctionName', 'AuctionType', 'CertNo', 'EndDate',
                                       'GradeString', 'HasQualifier', 'ImageURL', 'IsPSADNA', 'LotNo', 'Name',
                                       'NoGradeDescription', 'Qualifier', 'SalePrice', 'URL']])
    df_lots.to_csv(CARD_LOTS_CSV_FN, header = True, index = False)
else:
    df_lots = pd.read_csv(CARD_LOTS_CSV_FN)

current_db_size = len(df_lots)
print(f'Starting Length of lots database {current_db_size}')

card_universe_unscraped = card_universe[~card_universe.SpecID.isin(df_lots.SpecID)]
    
for i, row in card_universe_unscraped.iterrows():
    spec_id_match = df_lots.loc[df_lots.SpecID == row.SpecID]
    if len(spec_id_match) == 0:
        try:
            lot_data = scrape_lots(row.SpecID)
            df_temp = pd.DataFrame(lot_data)
            if 'SpecID' not in df_temp.columns:
                df_temp.insert(loc=0, column='SpecID', value=row.SpecID)

            df_lots = pd.concat([df_lots, df_temp], ignore_index=True)
            df_lots.to_csv(CARD_LOTS_CSV_FN, index=False)
            print(f'appending {len(df_temp)} lots found for {row.SpecSubjectName} {row.Variety} card (spec id: {row.SpecID}) TOTAL: {len(df_lots)}')
        except:
            print(f'failed to scrape {row.LotsFound} lots found for {row.SpecSubjectName} {row.Variety} card (spec id: {row.SpecID} ) ')
            
            