In [72]:
import os
import sys
import requests
import datetime
import re
from typing import Tuple, List
import csv
from cachecontrol.caches import FileCache
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import LastModified
from bs4 import BeautifulSoup
from datetime import datetime
from kanjize import kanji2number

# pip install lockfileが必要です



In [37]:
def get_famitsu_hwsales_page(url: str) -> Tuple[List[str], str]:
    """
    Fetch a Famitsu hardware-sales page and return its raw sales lines and period text.

    The page is downloaded through a ``requests`` session whose ``https://``
    traffic goes through a ``CacheControlAdapter`` backed by a file cache in
    the ``_webcache`` directory (with the ``LastModified`` heuristic).

    Args:
        url: Address of the article to download.

    Returns:
        A tuple ``(raw_hard_sales, raw_report_date)``:
        the stripped text of every ``span.article_detail_itemization_string``
        element, and the stripped text of the last
        ``span.article_detail_annotation`` element.
        NOTE(review): the last annotation is assumed to hold the reporting
        period; this depends on the page layout - confirm if the site changes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no annotation span to read the
            reporting period from.
    """
    adapter = CacheControlAdapter(heuristic=LastModified(), cache=FileCache('_webcache'))

    # Use the session as a context manager so its connections are released
    # even when the request fails.
    with requests.Session() as session:
        session.mount('https://', adapter)
        response = session.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page as data.
        response.raise_for_status()
        html = response.content

    soup = BeautifulSoup(html, "html.parser")

    # The data is pulled from elements with specific class names:
    # "article_detail_itemization_string" spans carry the hardware sales lines,
    # "article_detail_annotation" spans carry the reporting period text.
    items = soup.find_all("span", attrs={"class": "article_detail_itemization_string"})
    annotations = soup.find_all("span", attrs={"class": "article_detail_annotation"})

    if not annotations:
        raise ValueError(f"No 'article_detail_annotation' span found at {url}")

    raw_hard_sales = [item.get_text(strip=True) for item in items]
    raw_report_date = annotations[-1].get_text(strip=True)

    return raw_hard_sales, raw_report_date

In [None]:
# Article to scrape; the URL is hard-coded, so edit it here to process another report.
target_url = "https://www.famitsu.com/article/202507/48013"
# Download the page once: raw_sales holds the raw sales lines,
# raw_report_date the raw text that contains the reporting period.
raw_sales, raw_report_date = get_famitsu_hwsales_page(target_url)


In [40]:
# Display the scraped sales lines for a visual check.
raw_sales

['Switch2／15万2165台（累計175万4876台）',
 'Switch／2283台（累計2012万5249台）',
 'Switch Lite／4276台（累計661万3787台）',
 'Nintendo Switch（有機ELモデル）／5207台（累計909万7284台）',
 'PS5／3635台（累計572万1126台）',
 'PS5 デジタル・エディション／849台（累計97万9902台）',
 'PS5 Pro／1664台（累計23万1290台）',
 'Xbox Series X／61台（累計32万1043台）',
 'Xbox Series X デジタルエディション／30台（累計21093台）',
 'Xbox Series S／59台（累計33万8761台）',
 'PS4／22台（累計792万9774台）']

In [65]:
def parse_hard_sales_lines(lines: List[str]) -> List[list]:
    """
    Parse raw hardware sales lines into ``[hardware name, weekly units]`` pairs.

    Each line is expected to look like ``<name>／<units>台（累計<total>台）``.
    Only the name and the weekly figure are extracted; the cumulative part,
    if present, is ignored.

    Args:
        lines: Raw text lines, one per hardware model.

    Returns:
        A list with one ``[name, units]`` entry per input line, in input
        order. ``units`` is whatever ``kanji2number`` returns for the weekly
        figure (an integer for the inputs shown in this notebook).

    Raises:
        ValueError: If a line lacks the ``／`` separator or the ``台`` unit
            marker after the weekly figure.
    """
    hard_sales = []
    for line in lines:
        hw, separator, rest = line.partition("／")
        # The weekly figure is everything up to the first unit marker.
        weekly_units, unit_marker, _ = rest.partition("台")
        if not separator or not unit_marker:
            raise ValueError(f"Unexpected hardware sales line: {line!r}")

        hard_sales.append([hw.strip(), kanji2number(weekly_units.strip())])

    return hard_sales


In [67]:
def normalize_hw_units(hard_sales: List[list]) -> List[list]:
    """
    Sum weekly units per platform code.

    Individual model names are mapped to a platform code through ``HW_MAP``
    and their unit counts are added together.

    Args:
        hard_sales: ``[hardware name, units]`` pairs.

    Returns:
        ``[platform code, total units]`` pairs, ordered by first appearance
        of each platform in the input.

    Notes:
        A hardware name missing from ``HW_MAP`` is kept under its original
        name and a warning is emitted, so that new models are neither lost
        nor merged together under a ``None`` key.
    """
    import warnings  # local import keeps this cell self-contained

    HW_MAP = {
        "Switch2": "NS2",
        "Switch": "NSW",
        "Switch Lite": "NSW",
        "Nintendo Switch（有機ELモデル）": "NSW",
        "PS5": "PS5",
        "PS5 デジタル・エディション": "PS5",
        "PS5 Pro": "PS5",
        "PS4": "PS4",
        "Xbox Series X": "XSX",
        "Xbox Series X デジタルエディション": "XSX",
        "Xbox Series S": "XSX"
    }

    normalized_sales = {}
    for hw, units in hard_sales:
        hw_name = HW_MAP.get(hw)
        if hw_name is None:
            warnings.warn(
                f"Unknown hardware name {hw!r}; keeping it unmapped.",
                stacklevel=2,
            )
            hw_name = hw
        normalized_sales[hw_name] = normalized_sales.get(hw_name, 0) + units

    return [[hw_name, units] for hw_name, units in normalized_sales.items()]

In [69]:
# Turn each raw line into a [hardware name, weekly units] pair, then display the result.
newlist = parse_hard_sales_lines(raw_sales)
newlist

[['Switch2', 152165],
 ['Switch', 2283],
 ['Switch Lite', 4276],
 ['Nintendo Switch（有機ELモデル）', 5207],
 ['PS5', 3635],
 ['PS5 デジタル・エディション', 849],
 ['PS5 Pro', 1664],
 ['Xbox Series X', 61],
 ['Xbox Series X デジタルエディション', 30],
 ['Xbox Series S', 59],
 ['PS4', 22]]

In [70]:
# Aggregate the per-model figures by platform code, then display the result.
newlist2 = normalize_hw_units(newlist)
newlist2

[['NS2', 152165], ['NSW', 11766], ['PS5', 6148], ['XSX', 150], ['PS4', 22]]

In [42]:
def extract_date_range(date_string: str) -> Tuple[datetime, datetime]:
    """
    Extract the start and end dates of a period from Japanese date text.

    The text must contain ``YYYY年M月D日`` followed by a tilde-like separator
    (``～``, ``〜`` or ``~``) and an end date written either as ``M月D日`` or
    as ``YYYY年M月D日``.

    Args:
        date_string: Text containing the date range.

    Returns:
        ``(start_date, end_date)`` as ``datetime`` objects at midnight.

    Raises:
        ValueError: If no date range is found, or a matched date is not a
            valid calendar date.
    """
    date_format = "%Y年%m月%d日"

    # Locate the range; the year on the end date is optional.
    match = re.search(
        r"(\d{4}年\d{1,2}月\d{1,2}日)\s*[～〜~]\s*((?:\d{4}年)?\d{1,2}月\d{1,2}日)",
        date_string,
    )
    if not match:
        raise ValueError("日付範囲が見つかりませんでした。")

    start_date_str, end_date_str = match.groups()
    start_date = datetime.strptime(start_date_str, date_format)

    if "年" in end_date_str:
        end_date = datetime.strptime(end_date_str, date_format)
    else:
        # No year on the end date: borrow the start date's year ...
        end_date = datetime.strptime(f"{start_date.year}年{end_date_str}", date_format)
        # ... unless that puts the end before the start, which means the
        # period crosses New Year and the end belongs to the following year.
        if end_date < start_date:
            end_date = datetime.strptime(
                f"{start_date.year + 1}年{end_date_str}", date_format
            )

    return start_date, end_date

# Parse the scraped reporting period; a later cell repeats this exact call
# before the values are used.
start_date, end_date = extract_date_range(raw_report_date)

In [45]:
def calculate_date_range_days(start_date: datetime, end_date: datetime) -> Tuple[datetime, int]:
    """
    Measure the length of a period in days, counting both endpoints.

    Returns the ``end_date`` it was given together with the inclusive day count.
    """
    span = end_date - start_date
    inclusive_days = span.days + 1  # a period starting and ending the same day is 1 day long
    return end_date, inclusive_days

In [81]:
# Parse the reporting period, then compute its length.
# Note: despite its name, period_date holds the number of days in the period (inclusive).
start_date, end_date = extract_date_range(raw_report_date)
end_date, period_date = calculate_date_range_days(start_date, end_date)

# convert end_date to string for CSV output (YYYY-MM-DD)
end_date_str = end_date.strftime("%Y-%m-%d")

In [84]:
# One row per platform:
# [record key "<end date>_<platform>", period end date, period length in days, platform, units]
new_record = [
    [f"{end_date_str}_{hw_code}", end_date_str, period_date, hw_code, units]
    for hw_code, units in newlist2
]


In [85]:
# Display the assembled rows for a visual check.
new_record

[['2025-07-20_NS2', '2025-07-20', 7, 'NS2', 152165],
 ['2025-07-20_NSW', '2025-07-20', 7, 'NSW', 11766],
 ['2025-07-20_PS5', '2025-07-20', 7, 'PS5', 6148],
 ['2025-07-20_XSX', '2025-07-20', 7, 'XSX', 150],
 ['2025-07-20_PS4', '2025-07-20', 7, 'PS4', 22]]