## Pull Firebase Items List

In [1]:
FIREBASE_CLIENT_EMAIL="firebase-adminsdk-xxx@xxxx"
FIREBASE_PRIVATE_KEY=""
FIREBASE_PROJECT_ID="strustore-dev"

In [2]:
import firebase_admin
from firebase_admin import credentials, firestore
import csv

# Build service account info dictionary from your variables
service_account_info = {
    "type": "service_account",
    "project_id": FIREBASE_PROJECT_ID,
    "private_key_id": "",
    "private_key": FIREBASE_PRIVATE_KEY.replace('\\n', '\n'),
    "client_email": FIREBASE_CLIENT_EMAIL,
    "client_id": "",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": ""
}

# Initialize Firebase app only once
if not firebase_admin._apps:
    cred = credentials.Certificate(service_account_info)
    firebase_admin.initialize_app(cred)

# Firestore client
db = firestore.client()

# Reference the 'items' collection
items_ref = db.collection('items')
docs = items_ref.stream()

csv_file_path = 'items.csv'

# Open CSV in write mode (clears the file)
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['id', 'db_id', 'item'])  # Header: sequential id, Firestore id, item name

    names_written = set()
    counter = 1  # Sequential ID counter

    for doc in docs:
        data = doc.to_dict()
        name = data.get('name', '')
        deleted = data.get('deleted', '')
        doc_id = doc.id

        if name == "New Item":
            print(f"🟡 Skipping 'New Item': Document ID = {doc_id}")
            continue

        if name in names_written:
            print(f"🔴 Duplicate in this run '{name}': Document ID = {doc_id}")
            continue

        if not deleted:
            writer.writerow([counter, doc_id, name])
            names_written.add(name)
            counter += 1

print("✅ items.csv has been created (overwritten).")


🟡 Skipping 'New Item': Document ID = 0cVfA7cKl81hJzeYJeom
🔴 Duplicate in this run 'Inazuma Eleven Strikers 2013 (Wii)': Document ID = 2Q4SfkYkCGo9DbkGLPL9
🟡 Skipping 'New Item': Document ID = 30ipZsRWdB2s7Ur9KHmm
🔴 Duplicate in this run 'DDR Pad (boxed)': Document ID = 56kNwe7QwO0UG5KnIPLN
🟡 Skipping 'New Item': Document ID = 6EVr8m5dMCqBbFTlawD8
🟡 Skipping 'New Item': Document ID = 8KumJVAauDHs7V6k7KeW
🔴 Duplicate in this run 'Dragon Quest V (PS2)': Document ID = 9GevT39JnBUbBRxSUTOm
🟡 Skipping 'New Item': Document ID = 9dhSQ1nh7z1Dawxgi4yi
🔴 Duplicate in this run 'NFC Reader': Document ID = BQFije9nEgpgAILS2kRj
🟡 Skipping 'New Item': Document ID = Ca1FLv7Odsjsg3AcNE0d
🟡 Skipping 'New Item': Document ID = DgaPN3SGa5G1v2ERokjb
🔴 Duplicate in this run 'Royds Stick': Document ID = FU4iq83itVNXL5iERr8F
🔴 Duplicate in this run 'DRM V0/2': Document ID = Gvh6hL230MY787xMr35q
🔴 Duplicate in this run 'GC Mem 59': Document ID = HYjGDe8fm7cetdJZb01V
🟡 Skipping 'New Item': Document ID = KOqVzaPnC

## Create Item Urls

In [44]:
ITEMS_LIST = "items-select.csv"
ITEMS_URL = "items-url.csv"
ITEMS_LISTING_MENU = "./items_listing_menu/"
ITEMS_LISTING_JSON = "./items_listing/json"
ITEMS_LISTING_IMG = "./items_listing/raw"
SCRAPE_LISTING_LIMIT = 50

In [22]:
import pandas as pd

items_file = ITEMS_LIST
url_file = ITEMS_URL


df = pd.read_csv(items_file, usecols=['id', 'search_jp', 'auccat_id', 'min_price_yen'])

with open(url_file, "w", encoding="utf-8") as f:
    # File Header
    f.write("id,url\n")
    for index, row in df.iterrows():
        min_price_yen = row['min_price_yen']
        min_price_yen = int(min_price_yen) if min_price_yen else 0
        search_jp = str(row['search_jp']).strip()
        auccat_id = int(row['auccat_id'])
        item_id = row['id']

        url = (
            f"https://auctions.yahoo.co.jp/search/search?"
            f"min={min_price_yen}&max=&va={search_jp}&n=50&mode=1&auccat={auccat_id}&fixed=2"
        )

        print(url)  # optional: show on console

        # ✅ Save each line as: id, url
        f.write(f"{item_id},{url}\n")


https://auctions.yahoo.co.jp/search/search?min=0&max=&va=コントローラ&n=50&mode=1&auccat=22850&fixed=2
https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=22850&fixed=2
https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=22860&fixed=2
https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=2084237619&fixed=2
https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=2084290226&fixed=2
https://auctions.yahoo.co.jp/search/search?min=0&max=&va=つりコン64&n=50&mode=1&auccat=22850&fixed=2
https://auctions.yahoo.co.jp/search/search?min=0&max=&va=メモリーカード&n=50&mode=1&auccat=22860&fixed=2
https://auctions.yahoo.co.jp/search/search?min=10000&max=&va=コントローラ&n=50&mode=1&auccat=2084045784&fixed=2
https://auctions.yahoo.co.jp/search/search?min=0&max=&va=クラシックコントローラーPRO&n=50&mode=1&auccat=2084217064&fixed=2
https://auctions.yahoo.co.jp/search/search?min=3000&max=&va=リモコン&n=50&mode=1&auccat=2084217064&fixed=2


## Scrapy: Retrieve up to 50 listings from each url (5sec delay)

In [27]:
import csv
import requests
import time
import os

# Path to your global vals
items_file = ITEMS_URL
items_listing_menu = ITEMS_LISTING_MENU

# Ensure the folder exists
os.makedirs(items_listing_menu, exist_ok=True)

with open(items_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        item_id = row['id']
        url = row['url']

        print(f"Fetching {url}")
        
        # Making the Request
        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            continue

        # Save the HTML
        filename = os.path.join(items_listing_menu, f"{item_id}.html")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)

        print(f"Saved response to {filename}")

        # Wait 5 seconds before next requst
        time.sleep(5)
        
print("All listing menus retrieved!")

Fetching https://auctions.yahoo.co.jp/search/search?min=0&max=&va=コントローラ&n=50&mode=1&auccat=22850&fixed=2
Saved response to ./items_listing_menu/108.html
Fetching https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=22850&fixed=2
Saved response to ./items_listing_menu/37.html
Fetching https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=22860&fixed=2
Saved response to ./items_listing_menu/199.html
Fetching https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=2084237619&fixed=2
Saved response to ./items_listing_menu/193.html
Fetching https://auctions.yahoo.co.jp/search/search?min=0&max=&va=本体&n=50&mode=1&auccat=2084290226&fixed=2
Saved response to ./items_listing_menu/244.html
Fetching https://auctions.yahoo.co.jp/search/search?min=0&max=&va=つりコン64&n=50&mode=1&auccat=22850&fixed=2
Saved response to ./items_listing_menu/100.html
Fetching https://auctions.yahoo.co.jp/search/search?min=0&max=&va=メモリーカード&n=50&mode

## Read HTML and gather data from listings

In [49]:
import time

# Record the start time
start_time = time.time() 

In [50]:
import os
import csv
import json
import time
from bs4 import BeautifulSoup
from googletrans import Translator


# Inputs
items_file = ITEMS_URL
items_listing_menu = ITEMS_LISTING_MENU
items_listing_json = ITEMS_LISTING_JSON
items_listing_img = ITEMS_LISTING_IMG
scrape_listing_limit = 50

# Ensure output folder exists
os.makedirs(items_listing_json, exist_ok=True)
os.makedirs(items_listing_img, exist_ok=True)

# Read your CSV to get IDs (and to confirm which IDs you expect)
ids = []
with open(items_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ids.append(row['id'])

translator = Translator()

# For each id, parse HTML
for item_id in ids:
    html_path = os.path.join(items_listing_menu, f"{item_id}.html")
    if not os.path.exists(html_path):
        print(f"HTML file missing for {item_id}, skipping...")
        continue

    with open(html_path, encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")
    divs = soup.find_all("div", class_="Product__image")
    divs = divs[:scrape_listing_limit]

    listings = []
    for div in divs:
        # Instead of div.get(), get the <a> with the data I need
        a_tag = div.find("a")
        if not a_tag:
            print("No <a> tag found inside div, skipping this listing.")
            continue
        
        # Parse image url to a high res version  
        raw_img = a_tag.get("data-auction-img", "").strip()
        clean_img = raw_img.split("?", 1)[0] if raw_img else ""
        # Translate title JP > EN
        auction_title_jp = a_tag.get("data-auction-title", "").strip()
        auction_title_en = ""
        if auction_title_jp:
            try:
                translated = translator.translate(auction_title_jp, src="ja", dest="en")
                auction_title_en = translated.text
            except Exception as e:
                print(f"Translation error: {e}")
                auction_title_en = ""
        
        listings.append({
            "auction_id": a_tag.get("data-auction-id", "").strip(),
            "auction_category": a_tag.get("data-auction-category", "").strip(),
            "auction_title": auction_title_jp,
            "auction_title_en": auction_title_en,
            "auction_img": clean_img,
            "auction_price": a_tag.get("data-auction-price", "").strip(),
            "auction_isflea": a_tag.get("data-auction-isflea", "").strip(),
            "auction_isfreeshipping": a_tag.get("data-auction-isfreeshipping", "").strip()
        })

    # Save to JSON file named {id}.json
    output_path = os.path.join(items_listing_json, f"{item_id}.json")
    with open(output_path, "w", encoding="utf-8") as out_f:
        json.dump(listings, out_f, ensure_ascii=False, indent=2)

    print(f"Saved {len(listings)} listings to {output_path}")
    
    # Download images from each json
    item_img_folder = os.path.join(items_listing_img, item_id)
    os.makedirs(item_img_folder, exist_ok=True)
    for listing in listings:
        img_url = listing["auction_img"]
        auction_id = listing["auction_id"]
        if not img_url or not auction_id:
            print(f"Skipping image download for empty img or id in listing.")
            continue

        img_filename = os.path.join(item_img_folder, f"{auction_id}.jpg")

        try:
            response = requests.get(img_url, timeout=10)
            response.raise_for_status()
            with open(img_filename, "wb") as img_file:
                img_file.write(response.content)
            #print(f"Downloaded image for auction {auction_id}")
            # 1 second delay each image
            time.sleep(1)
        except Exception as e:
            print(f"Error downloading image for {auction_id}: {e}")
    print(f"Downloaded images for item {item_id}")
        

Saved 50 listings to ./items_listing/json/108.json
Downloaded image for auction n1190546392
Downloaded image for auction k1190812682
Downloaded image for auction u1190796085
Downloaded image for auction c1190908462
Downloaded image for auction n1190546392
Downloaded image for auction q1190614814
Downloaded image for auction j1190197725
Downloaded image for auction g1190619600
Downloaded image for auction e1190775844
Downloaded image for auction g1190939719
Downloaded image for auction w1190910649
Downloaded image for auction v1190912743
Downloaded image for auction n1190792262
Downloaded image for auction f1190128894
Downloaded image for auction r1190642791
Downloaded image for auction q1190444247
Downloaded image for auction k1190812682
Downloaded image for auction c1190415151
Downloaded image for auction g1190789755
Downloaded image for auction j1190792838
Downloaded image for auction u1159528119
Downloaded image for auction c1190182495
Downloaded image for auction f1190436459
Downlo

Downloaded image for auction h1191010892
Downloaded image for auction s1191009809
Downloaded image for auction t1190991446
Downloaded image for auction t1190995093
Downloaded image for auction x1191001937
Saved 50 listings to ./items_listing/json/244.json
Downloaded image for auction p1190786467
Downloaded image for auction h1191015247
Downloaded image for auction b1190791694
Downloaded image for auction 1190930220
Downloaded image for auction h1190962107
Downloaded image for auction t1190575716
Downloaded image for auction c1190957028
Downloaded image for auction u1190977111
Downloaded image for auction g1190456402
Downloaded image for auction o1190774957
Downloaded image for auction u1190843359
Downloaded image for auction j1190783682
Downloaded image for auction q1190765175
Downloaded image for auction f1190149742
Downloaded image for auction 1190433862
Downloaded image for auction 1190757894
Downloaded image for auction w1190762588
Downloaded image for auction n1190316691
Downloade

Downloaded image for auction r1190670621
Downloaded image for auction o1190668498
Downloaded image for auction d1190778503
Downloaded image for auction w1190345464
Downloaded image for auction g1190474054
Downloaded image for auction e1190615219
Downloaded image for auction c1191013907
Downloaded image for auction g1188239267
Downloaded image for auction p1190291360
Downloaded image for auction u1190174062
Downloaded image for auction v1190496587
Downloaded image for auction c1190676782
Downloaded image for auction n1190208234
Downloaded image for auction u1190380306
Downloaded image for auction h1190646455
Downloaded image for auction g1187345046
Downloaded image for auction h1190789783
Downloaded image for auction c1190311213
Downloaded image for auction n1189269829
Downloaded image for auction d1183182842
Downloaded image for auction m1190672838
Downloaded image for auction d1190132477
Downloaded image for auction o1190759904
Downloaded image for auction p1190907251
Downloaded image

In [51]:
# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.4f} seconds")

Elapsed time: 874.7346 seconds
