In [11]:
import requests
from bs4 import BeautifulSoup
import json
import time
import re

# 1. 爬取所有型號的網址
def scrape_sogi_links(url, timeout=10):
    """Collect the absolute URLs of every model card on a brand page.

    Args:
        url: Address of a brand listing page on www.sogi.com.tw.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled
            connection.

    Returns:
        A list of absolute model-page URLs in page order. The list is
        empty when the request fails, the response status is not 200, or
        the expected card container is not present in the page.
    """
    # Local import so this function does not depend on edits elsewhere.
    from urllib.parse import urljoin

    base_url = "https://www.sogi.com.tw"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report and skip, matching how a bad status code is handled below,
        # so that one unreachable page does not abort the caller's crawl.
        print(f"無法取得網頁: {url}，錯誤: {exc}")
        return []
    response.encoding = 'utf-8'

    if response.status_code != 200:
        print(f"無法取得網頁: {url}，狀態碼: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    mixitup_div = soup.find('div', id="mixitup-block_38-cellphone")

    if mixitup_div is None:
        print(f"找不到目標區塊: {url}")
        return []

    links = []
    for card in mixitup_div.find_all('div', class_="card shadow text-center p-2"):
        a_tag = card.find('a', class_="font-weight-bold text-danger text-decoration-none")
        href = a_tag.get('href') if a_tag is not None else None
        if href:
            # urljoin copes with site-relative, path-relative and already
            # absolute hrefs; plain concatenation only works for the first.
            links.append(urljoin(base_url + "/", href))

    return links

# 2. 爬取單個型號的所有店家資訊
def _split_slug(url):
    """Return (brand, model, storage) parsed from a listing URL's slug.

    The slug is the second-to-last path segment, for example
    ``samsung_galaxy_z_fold6-256gb`` or ``samsung-galaxy-s25-256gb``.
    """
    slug = url.split("/")[-2]
    # Storage is whatever follows the LAST hyphen; everything before it is
    # the model. Splitting on the first hyphen would truncate hyphen-only
    # slugs such as "samsung-galaxy-s25-256gb" down to "samsung".
    model, sep, storage = slug.rpartition("-")
    if not sep:
        model = storage = slug
    # The brand is the leading token, whichever separator the slug uses.
    brand = re.split(r"[_-]", slug, maxsplit=1)[0]
    return brand, model, storage


def _text_or_na(tag):
    """Return the stripped text of *tag*, or 'N/A' when the tag is missing."""
    return tag.text.strip() if tag is not None else 'N/A'


def _group_or_na(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or 'N/A'."""
    match = re.search(pattern, text)
    return match.group(1).strip() if match else 'N/A'


def scrape_sogi_data(url, timeout=10):
    """Scrape every shop listing shown on a single model page.

    Args:
        url: Address of a used-price model page; brand, model and storage
            size are derived from its slug (second-to-last path segment).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per listing card. Each dict has the keys
        '品牌', '機型', '金額', '容量', '顏色', '店名', '評分', '評價數',
        '地址', '更新時間', '外觀完整', '門市保固' and '電池健康度'. Any
        field whose markup is missing from the card is set to 'N/A'. The
        list is empty when the request fails or the status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report and skip so one bad page does not lose the whole crawl.
        print(f"無法取得 {url}，錯誤: {exc}")
        return []
    response.encoding = 'utf-8'

    if response.status_code != 200:
        print(f"無法取得 {url}，狀態碼: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find_all('div', class_='col-12 col-md-6 col-lg-3 my-2')

    brand, model_name, storage_size = _split_slug(url)

    results = []
    for item in items:
        color = _text_or_na(
            item.find('span', class_='badge badge-light rounded-0 mx-0'))
        shop_name = _text_or_na(
            item.find('a', class_='text-row-2 mb-2 w-75 w-lg-100'))

        # Rating and review count live in the same link; either the link
        # or its <small> child may be absent, so default both first.
        rating_value = 'N/A'
        review_count = 'N/A'
        rating = item.find('a', class_='text-dark text-decoration-none text-row-1')
        if rating is not None:
            rating_small = rating.find('small')
            if rating_small is not None:
                # The score is the first line of the <small> text.
                rating_value = rating_small.text.strip().split("\n")[0]
            # The review count is the number inside parentheses.
            review_count = _group_or_na(r"\((\d+)\)", rating.text)

        price_tag = item.find('span', class_='h4 text-danger font-weight-bold')
        price = (price_tag.text.strip().replace('$', '')
                 if price_tag is not None else 'N/A')

        address_box = item.find('div', class_='text-muted text-decoration-none')
        address = _text_or_na(
            address_box.find('small') if address_box is not None else None)

        # Keep only what follows the last full-width colon of the label.
        update_time = _text_or_na(
            item.find('div', class_='font-small text-muted mb-4'))
        date = update_time.split('：')[-1] if update_time != 'N/A' else 'N/A'

        # Reset per card: without these defaults a card lacking the
        # condition block would raise NameError or inherit the previous
        # card's values.
        appearance_value = 'N/A'
        warranty_value = 'N/A'
        battery_health_value = 'N/A'
        condition_tag = item.find('div', class_='text-row-3 font-small mb-2')
        if condition_tag is not None:
            condition = condition_tag.decode_contents().strip().replace('<br>', '\n')
            appearance_value = _group_or_na(r"外觀機況：([^<]+)", condition)
            warranty_value = _group_or_na(r"門市保固：([^<]+)", condition)
            battery_health_value = _group_or_na(r"電池健康度：([^%]+)", condition)

        results.append({
            '品牌': brand,
            '機型': model_name,
            '金額': price,
            '容量': storage_size,
            '顏色': color,
            '店名': shop_name,
            '評分': rating_value,
            '評價數': review_count,
            '地址': address,
            '更新時間': date,
            '外觀完整': appearance_value,
            '門市保固': warranty_value,
            '電池健康度': battery_health_value,
        })

    return results

# 3. 爬取所有品牌的所有型號的店家資訊，並存入 JSON 檔案
def scrape_all_brands(brand_urls, output_json):
    """Crawl every model of every brand and save the listings as JSON.

    For each brand page in *brand_urls* the model links are fetched with
    ``scrape_sogi_links`` and each model page is scraped with
    ``scrape_sogi_data``. Progress is printed as the crawl advances, and
    the combined records are written to *output_json* once all brands
    have been processed.

    Args:
        brand_urls: Iterable of brand listing page URLs.
        output_json: Path of the JSON file to create or overwrite.
    """
    collected = []

    for brand_page in brand_urls:
        print(f"\n====== 開始爬取品牌: {brand_page} ======")
        links = scrape_sogi_links(brand_page)
        total = len(links)

        position = 0
        for link in links:
            position += 1
            print(f"正在爬取第 {position}/{total} 個型號: {link}")
            collected += scrape_sogi_data(link)
            # Pause between requests to avoid being blocked for crawling
            # too quickly.
            time.sleep(1)

    # Persist everything in one file, keeping non-ASCII text readable.
    with open(output_json, 'w', encoding='utf-8') as handle:
        json.dump(collected, handle, ensure_ascii=False, indent=4)

    print(f"\n共爬取 {len(collected)} 筆資料，已成功存成 {output_json}")

# 4. 執行爬蟲
# Destination file for the combined listings of all brands.
output_json = "sogi_data.json"

# Brand listing pages used as the starting points of the crawl.
brand_urls = [
    "https://www.sogi.com.tw/brands/SAMSUNG/22",
    "https://www.sogi.com.tw/brands/Apple/116",
    "https://www.sogi.com.tw/brands/vivo/5603",
    "https://www.sogi.com.tw/brands/OPPO/5372",
    "https://www.sogi.com.tw/brands/ASUS/49",
    "https://www.sogi.com.tw/brands/Xiaomi/5368",
    "https://www.sogi.com.tw/brands/Sony/26",
    "https://www.sogi.com.tw/brands/realme/6012",
    "https://www.sogi.com.tw/brands/POCO/6016",
    "https://www.sogi.com.tw/brands/Google/4041",
    "https://www.sogi.com.tw/brands/HTC/154",
    "https://www.sogi.com.tw/brands/Motorola/15",
    "https://www.sogi.com.tw/brands/SHARP/39",
    "https://www.sogi.com.tw/brands/Nokia/17",
]

# Run the full crawl and write the results to disk.
scrape_all_brands(brand_urls=brand_urls, output_json=output_json)



正在爬取第 1/56 個型號: https://www.sogi.com.tw/usedprices/samsung-galaxy-s25-256gb/907
正在爬取第 2/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_z_fold6-256gb/817
正在爬取第 3/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_z_flip6-256gb/815
正在爬取第 4/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_a55_5g-256gb/772
正在爬取第 5/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_a55_5g-128gb/771
正在爬取第 6/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_a35_5g-128gb/770
正在爬取第 7/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_s24_ultra-512gb/753
正在爬取第 8/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_a15_5g-128gb/743
正在爬取第 9/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_a25_5g-128gb/744
正在爬取第 10/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_s24-256gb/745
正在爬取第 11/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_s24plus-256gb/746
正在爬取第 12/56 個型號: https://www.sogi.com.tw/usedprices/samsung_galaxy_s24_ultra-256gb/7