## Create Item URLs

In [2]:
# Notebook-wide configuration. Every path is relative to the working directory.

# Item catalogue (JSON input) and the generated id/url table (CSV output).
ITEMS_LIST, ITEMS_URL = "items.json", "items-url.csv"

# Folder that receives the downloaded search-result pages, one subfolder per item id.
MENU = "./menu/"

# Output folders for the parsed listings: per-item JSON files and per-item image folders.
LISTING_JSON, LISTING_IMG = "./auctions/json", "./auctions/raw"

# NOTE(review): not referenced by the cells visible in this notebook;
# presumably a cap on listings scraped per item -- confirm before relying on it.
SCRAPE_LISTING_LIMIT = 50

In [None]:
import csv
import json
import math

# Build the id/url table: one CSV row per search-result page to fetch.
items_file = ITEMS_LIST
url_file = ITEMS_URL
# NOTE(review): not used by the URL generation below; kept in case other cells read it.
listings_per_page = 20
# Result pages requested per subtype (the original hard-coded pages 1 and 2).
pages_per_subtype = 2


def build_search_url(code, search, page):
    """Return the ZenMarket Yahoo-auction search URL for one result page.

    `code` and `search` are inserted verbatim (no percent-encoding), exactly
    as they appear in the items file.
    """
    return f"https://zenmarket.jp/en/yahoo.aspx?c={code}&q={search}&p={page}"


# Read the catalogue inside a context manager so the handle is always closed,
# and decode explicitly as UTF-8 instead of the platform default.
with open(items_file, encoding="utf-8") as items_fh:
    data = json.load(items_fh)

# newline="" plus an explicit "\n" terminator keeps the output byte-stable
# across platforms; csv.writer quotes any field containing a comma or quote,
# so the table stays parseable by csv.DictReader.
with open(url_file, "w", encoding="utf-8", newline="") as url_fh:
    writer = csv.writer(url_fh, lineterminator="\n")
    # File header
    writer.writerow(["id", "url"])

    for d in data['DeviceFamilies']:
        item_id = d['id']
        code = d['code'] or ""
        search = d['search'] or ""
        children = d['children']

        if code or search:
            # The family has its own category/search: request
            # `pages_per_subtype` pages per child from the family-level query.
            # NOTE(review): a family with no children yields no URLs here.
            for page in range(1, len(children) * pages_per_subtype + 1):
                url = build_search_url(code, search, page)
                writer.writerow([item_id, url])
                print(url)  # optional: show on console
        else:
            # No family-level query: query each non-ignored child separately.
            for c in children:
                if c['ignore']:
                    continue
                # A null child code becomes "" (as for the family) rather than
                # the literal text "None" in the URL.
                child_code = c['code'] or ""
                child_search = c['search'] or ""
                for page in range(1, pages_per_subtype + 1):
                    url = build_search_url(child_code, child_search, page)
                    writer.writerow([item_id, url])
                    print(url)


## Scrape: Retrieve HTML from each URL (5-second delay between requests)

In [None]:
import csv
import requests
import time
import os

# Download every URL in the id/url table and store the HTML under
# <menu folder>/<item_id>/<n>.html, numbering the files sequentially.

# Inputs from the configuration cell
items_file = ITEMS_URL
menu = MENU
# Defined here so this cell runs on a fresh kernel instead of depending on a
# variable that is only assigned by a later cell.
items_listing_menu = menu

# Seconds to wait for the server before giving up on one request.
request_timeout_s = 30
# Pause after every request (successful or not) to stay polite to the server.
request_delay_s = 5

# Ensure the base folder exists
os.makedirs(items_listing_menu, exist_ok=True)

with open(items_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        item_id = row['id']
        url = row['url']

        print(f"Fetching {url}")

        # Making the request; without a timeout a stalled connection would
        # block the whole run indefinitely.
        try:
            response = requests.get(
                url,
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=request_timeout_s,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            # Still throttle, so a run of failures does not hammer the server.
            time.sleep(request_delay_s)
            continue

        # Create a folder per item_id
        item_folder = os.path.join(items_listing_menu, item_id)
        os.makedirs(item_folder, exist_ok=True)

        # Determine the next file number: one more than the highest existing
        # "<digits>.html" in the folder (so re-running the cell appends files).
        existing_numbers = [
            int(fname[:-5])
            for fname in os.listdir(item_folder)
            if fname.endswith('.html') and fname[:-5].isdigit()
        ]
        next_number = max(existing_numbers, default=0) + 1

        # Save the HTML
        filename = os.path.join(item_folder, f"{next_number}.html")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)

        print(f"Saved response to {filename}")

        # Wait before the next request
        time.sleep(request_delay_s)

print("All listing menus retrieved!")


## Read HTML and gather data from listings

In [None]:
import os
import json
import time
import glob
import requests
from bs4 import BeautifulSoup
from googletrans import Translator
from collections import defaultdict

# Parse every saved search-result page, translate the listing titles, write
# one JSON file per item id, then download each listing's thumbnail image.

# Inputs
items_listing_menu = MENU
items_listing_json = LISTING_JSON
items_listing_img = LISTING_IMG

# Ensure output folders exist
os.makedirs(items_listing_json, exist_ok=True)
os.makedirs(items_listing_img, exist_ok=True)

# Initialize translator
translator = Translator()


def page_sort_key(path):
    """Order "<n>.html" files by page number, then any other names alphabetically.

    A plain string sort would put "10.html" before "2.html".
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isdecimal():
        return (0, int(stem), path)
    return (1, 0, path)


# Find all HTML files recursively
html_files = glob.glob(os.path.join(items_listing_menu, "**/*.html"), recursive=True)

# Group files by item_id (the name of the folder that contains each file)
files_by_item = defaultdict(list)
for html_path in html_files:
    item_id = os.path.basename(os.path.dirname(html_path))
    files_by_item[item_id].append(html_path)

if not files_by_item:
    print("No HTML files found.")
else:
    for item_id, file_paths in files_by_item.items():
        listings = []

        for html_path in sorted(file_paths, key=page_sort_key):
            with open(html_path, encoding="utf-8") as f:
                html = f.read()

            soup = BeautifulSoup(html, "html.parser")
            parent_div = soup.find("div", id="yahoo-search-results")
            if not parent_div:
                print(f"No 'yahoo-search-results' div found in {html_path}, skipping...")
                continue

            # Every result on the page is kept; no limit is applied here.
            divs = parent_div.find_all("div", class_="yahoo-search-result")

            for div in divs:
                # Get TITLE
                a_tag = div.find("a", class_="auction-url")
                if not a_tag:
                    print("No <a> tag found inside div, skipping this listing.")
                    continue
                title = a_tag.get_text(strip=True)

                # Translate TITLE JP > EN; a failed translation leaves the
                # English title empty rather than aborting the run.
                auction_title_jp = title
                auction_title_en = ""
                if auction_title_jp:
                    try:
                        translated = translator.translate(auction_title_jp, src="ja", dest="en")
                        auction_title_en = translated.text
                    except Exception as e:
                        print(f"Translation error: {e}")
                        auction_title_en = ""

                # Get IMAGE URL. Both the wrapper div and the <img> may be
                # absent, and the <img> may lack a src attribute; each of
                # those cases yields an empty URL instead of an exception.
                img_wrap = div.find("div", class_="img-wrap")
                img_tag = img_wrap.find("img") if img_wrap is not None else None
                img_src = (img_tag.get("src") or "").strip() if img_tag is not None else ""
                # Drop the query string from the image URL.
                clean_img = img_src.split("?", 1)[0] if img_src else ""

                # Get PRICE SPAN
                price_div = div.find("div", class_="auction-price")
                amount_span = price_div.find("span", class_="amount") if price_div else None

                usd = amount_span.get("data-usd", "").strip() if amount_span else ""
                jpy = amount_span.get("data-jpy", "").strip() if amount_span else ""
                sgd = amount_span.get("data-sgd", "").strip() if amount_span else ""

                # Get AUCTION ID (read from the watch-list link's data attribute)
                remove_watchlist_a = div.find("a", class_="removeFromWatchList")
                auction_id = remove_watchlist_a.get("data-auctionid", "").strip() if remove_watchlist_a else ""

                listings.append({
                    "auction_id": auction_id,
                    "auction_title": auction_title_jp,
                    "auction_title_en": auction_title_en,
                    "auction_img": clean_img,
                    "auction_price_usd": usd,
                    "auction_price_jpy": jpy,
                    "auction_price_sgd": sgd
                })

        # Save all listings for this item_id to one JSON
        output_path = os.path.join(items_listing_json, f"{item_id}.json")
        with open(output_path, "w", encoding="utf-8") as out_f:
            json.dump(listings, out_f, ensure_ascii=False, indent=2)

        print(f"Saved {len(listings)} listings to {output_path}")

        # Download images
        item_img_folder = os.path.join(items_listing_img, item_id)
        os.makedirs(item_img_folder, exist_ok=True)

        for listing in listings:
            img_url = listing["auction_img"]
            auction_id = listing["auction_id"]
            if not img_url or not auction_id:
                print("Skipping image download for empty img or id.")
                continue

            # Files are always saved with a .jpg extension, named by auction id.
            img_filename = os.path.join(item_img_folder, f"{auction_id}.jpg")

            try:
                response = requests.get(img_url, timeout=10)
                response.raise_for_status()
                with open(img_filename, "wb") as img_file:
                    img_file.write(response.content)
            except Exception as e:
                print(f"Error downloading image for {auction_id}: {e}")
            finally:
                # 1-second delay between images, also after a failed attempt
                time.sleep(1)

        print(f"Downloaded images for item {item_id}")

print("All processing complete.")
