In [1]:
import sys
import requests
from cachecontrol.caches import FileCache
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import LastModified
from bs4 import BeautifulSoup
import datetime
import re
from typing import List
import csv

# Requires: pip install lockfile

def hwdata_main(first_year: int = 2016, last_year: int = 2021) -> None:
    """Fetch, clean and save the hardware sales data for a range of years.

    Calls ``get_hwdata`` once per year, in ascending order.

    Args:
        first_year: First year to process (inclusive). Defaults to 2016.
        last_year: Last year to process (inclusive). Defaults to 2021.
    """
    for year in range(first_year, last_year + 1):
        get_hwdata(year)

def get_hwdata(year:int) -> None:
    """Scrape the sales table for ``year``, clean it and save it as CSV.

    Years up to and including 2016 are cleaned with ``clean_hwyear_data_2016``;
    later years are cleaned with ``clean_hwyear_data_2017``.
    """
    cleaner = clean_hwyear_data_2016 if year <= 2016 else clean_hwyear_data_2017
    raw_rows = get_hwyear_page(year)
    cleaned_rows = cleaner(raw_rows)
    save_hwdata(year, cleaned_rows)

def save_hwdata(year:int, data:List) -> None:
    """Write ``data`` to ``hwdata_<year>.csv`` in the current working directory.

    Args:
        year: Year used to build the output file name.
        data: Rows to write; each row is a sequence of cell values. Non-numeric
            cells are quoted (``csv.QUOTE_NONNUMERIC``).

    Prints a confirmation message once the file has been written and closed.
    """
    file_name = "hwdata_" + str(year) + ".csv"
    # newline="" leaves line endings to the csv module; without it the writer's
    # "\r\n" terminator is translated again on Windows, producing blank rows.
    with open(file_name, "w", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerows(data)
    print(file_name + " is saved")

def get_hwyear_page(year: int) -> List:
    """Download the hardware sales page for ``year`` and return its table as text rows.

    The page ``hard_<year>.html`` is fetched through a file-backed HTTP cache
    (directory ``_webcache``). Every ``<tr>`` of the ``table1`` table becomes one
    list holding the text of its ``<td>``/``<th>`` cells, with newlines and
    spaces removed. The first cell of the first row is replaced by ``""``.

    Args:
        year: Year whose page should be fetched.

    Returns:
        A list of rows, each a list of cell strings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``table1`` table or its first row is empty.
    """
    URLBASE = "http://www.teitengame.com/"
    data_url = URLBASE + "hard_" + str(year) + ".html"

    adapter = CacheControlAdapter(heuristic=LastModified(), cache=FileCache('_webcache'))
    # The context manager closes the session (and its connections) when done.
    with requests.Session() as session:
        session.mount('http://', adapter)
        # Without a timeout a stalled server would block the notebook forever.
        response = session.get(data_url, timeout=30)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        html = response.content

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", attrs={"class": "table1"})
    if table is None:
        raise ValueError("no <table class='table1'> found in " + data_url)

    lines = [
        [cell.get_text().replace("\n", "").replace(" ", "")
         for cell in row.find_all(['td', 'th'])]
        for row in table.find_all("tr")
    ]
    if not lines or not lines[0]:
        raise ValueError("table in " + data_url + " has no header cells")
    lines[0][0] = ""
    return lines

def clean_hwyear_data_2016(rawdata:List) -> List:
    """Clean a scraped table in the layout used for years up to 2016.

    The first row is kept as the header and rows ``rawdata[2:-1]`` form the body.
    Each body row starts with zero to three label cells, detected from the text
    of its first cell: three if it contains "販売台数", two if it contains "月",
    one if it contains "週", otherwise none. After the label cells only every
    second cell is kept (presumably the skipped cells hold share figures;
    TODO confirm against the page). The result is passed to ``clean_hwyear_data``.

    Args:
        rawdata: Rows of cell strings as returned by the page scraper.

    Returns:
        The value returned by ``clean_hwyear_data`` for the reduced rows.
    """
    header = rawdata[0]
    newdata = [header]
    for line in rawdata[2:-1]:
        top = line[0]
        if "販売台数" in top:
            label_count = 3
        elif "月" in top:
            label_count = 2
        elif "週" in top:
            label_count = 1
        else:
            label_count = 0

        # Keep the label cells, then every second cell after them.
        newdata.append(line[:label_count] + line[label_count::2])

    return clean_hwyear_data(newdata)

def clean_hwyear_data_2017(rawdata:List) -> List:
    """Clean a scraped table in the newer layout.

    Drops the last two rows and hands the rest to ``clean_hwyear_data``.
    """
    trimmed_rows = rawdata[:-2]
    return clean_hwyear_data(trimmed_rows)

def clean_hwyear_data(rawdata:List) -> List:
    """Turn label-prefixed weekly rows into ``[begin_date, days, value, ...]`` rows.

    ``rawdata[0]`` is the header: ``'begin_date'`` is prepended and its original
    first cell is replaced by ``'days'``. Every following row starts with label
    cells that are stripped before the values are read:

    * first cell contains "年販売台数": three label cells; the leading digits
      give the year and the row becomes week 0 of that year;
    * first cell contains "月": two label cells;
    * otherwise: one label cell.

    The begin date of a row is ``nth_aggregate_day(year, weeks since the year row)``.
    A row with no value cells produces no output row; it adds 7 days to the
    previous output row instead. Values have non-breaking spaces and commas
    removed; an empty value stays ``''`` and anything else is converted to ``int``.

    Returns:
        ``[header] + cleaned rows``.
    """
    header = ['begin_date'] + rawdata[0]
    header[1] = 'days'

    current_year = 2020  # used only if no "年販売台数" row has been seen yet
    year_row_index = 0
    cleaned_rows = []

    for index, row in enumerate(rawdata[1:]):
        label = row[0]
        if "年販売台数" in label:
            # First row of a year: read the year from the label and restart the week count.
            matched = re.match(r'([0-9]+)年販売台数', label)
            current_year = int(matched.group(1))
            week_start = nth_aggregate_day(current_year, 0)
            year_row_index = index
            values = row[3:]
        else:
            label_cells = 2 if "月" in label else 1
            values = row[label_cells:]
            week_start = nth_aggregate_day(current_year, index - year_row_index)

        if not values:
            # No figures for this week: extend the previous period by one week.
            cleaned_rows[-1][1] += 7
            continue

        parsed = []
        for raw_value in values:
            digits = raw_value.replace('\xa0', "").replace(",", "")
            parsed.append('' if digits == "" else int(digits))
        cleaned_rows.append([str(week_start.date()), 7] + parsed)

    return [header] + cleaned_rows
    

def nth_aggregate_day(year:int, nth:int) -> datetime.datetime:
    """Return the first day of the ``nth`` weekly aggregation period of ``year``.

    Period 0 is the seven-day span ending on the first Sunday of ``year``, so
    its first day is the Monday six days earlier (possibly in the previous
    year). Each further period starts seven days later.
    """
    jan_first = datetime.datetime(year, 1, 1)
    # weekday() is 0 for Monday and 6 for Sunday.
    days_until_sunday = (6 - jan_first.weekday()) % 7
    first_sunday = jan_first + datetime.timedelta(days=days_until_sunday)
    return first_sunday + datetime.timedelta(weeks=nth) - datetime.timedelta(days=6)


In [2]:
def get_hwyear_page2022() -> List:
    """Download the ``hard.html`` sales page and return its table as text rows.

    The page is fetched through a file-backed HTTP cache (directory
    ``_webcache``). Every ``<tr>`` of the ``table1`` table becomes one list
    holding the text of its ``<td>``/``<th>`` cells, with newlines and spaces
    removed. The first cell of the first row is replaced by ``""``.

    Returns:
        A list of rows, each a list of cell strings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``table1`` table or its first row is empty.
    """
    URLBASE = "http://www.teitengame.com/"
    data_url = URLBASE + "hard" + ".html"

    adapter = CacheControlAdapter(heuristic=LastModified(), cache=FileCache('_webcache'))
    # The context manager closes the session (and its connections) when done.
    with requests.Session() as session:
        session.mount('http://', adapter)
        # Without a timeout a stalled server would block the notebook forever.
        response = session.get(data_url, timeout=30)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        html = response.content

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", attrs={"class": "table1"})
    if table is None:
        raise ValueError("no <table class='table1'> found in " + data_url)

    lines = [
        [cell.get_text().replace("\n", "").replace(" ", "")
         for cell in row.find_all(['td', 'th'])]
        for row in table.find_all("tr")
    ]
    if not lines or not lines[0]:
        raise ValueError("table in " + data_url + " has no header cells")
    lines[0][0] = ""
    return lines

In [3]:
# Scrape the "hard.html" page into a list of rows of cell text.
lines = get_hwyear_page2022()

In [4]:
# Display the scraped rows to inspect the raw table layout.
lines

[['',
  'ニンテンドー3DS',
  'NINTENDOSWITCH',
  'プレイステーション4',
  'プレイステーション5',
  'XboxSeriesX/S'],
 ['2011年年間販売台数', '販売台数', '4,135,739', '\xa0', '', '\xa0', '\xa0'],
 ['シェア', '\xa0', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['2012年年間販売台数', '販売台数', '5,626,763', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['シェア', '\xa0', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['2013年年間販売台数', '販売台数', '4,931,509', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['シェア', '\xa0', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['2014年年間販売台数', '販売台数', '3,153,045', '\xa0', '925,570', '\xa0', '\xa0'],
 ['シェア', '\xa0', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['2015年年間販売台数', '販売台数', '2,189,900', '\xa0', '1,205,163', '\xa0', '\xa0'],
 ['シェア', '\xa0', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['2016年年間販売台数', '販売台数', '1,874,457', '\xa0', '1,790,883', '\xa0', '\xa0'],
 ['シェア', '\xa0', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['2017年年間販売台数',
  '販売台数',
  '1,827,131',
  '3,407,158',
  '1,935,247',
  '\xa0',
  '\xa0'],
 ['シェア', '\xa0', '\xa0', '\xa0', '\xa0', '\xa0'],
 ['2018年年間販売台数', '販売台数', '566

In [5]:
# The first scraped row holds the column names (its first cell was blanked by the scraper).
header = lines[0]

In [7]:
# Build the input for clean_hwyear_data: the header row plus every row from
# the "2022年販売台数" row onward. The last four scraped rows are left out.
body = lines[1:-4]
newdata = [header]
skip = True  # stays True until the 2022 row has been seen
for line in body:
    top = line[0]
    skip = ("2022年販売台数" not in top) and skip
    if not skip:
        newdata.append(line)

In [8]:
# Display the filtered rows (header plus the 2022 rows) before cleaning.
newdata

[['',
  'ニンテンドー3DS',
  'NINTENDOSWITCH',
  'プレイステーション4',
  'プレイステーション5',
  'XboxSeriesX/S'],
 ['2022年販売台数', '1月', '1週', '569', '195,926', '24', '46,677', '523'],
 ['2週', '609', '148,691', '15', '10,677', '263'],
 ['3週', '489', '94,523', '17', '14,453', '2,267'],
 ['4週', '325', '77,219', '14', '18,857', '2,627'],
 ['5週', '267', '99,078', '11', '23,339', '1,696'],
 ['2月', '1週', '264', '92,452', '8', '16,888', '5,990'],
 ['2週', '360', '118,065', '4', '6,418', '5,955'],
 ['3週', '364', '96,929', '11', '25,434', '4,799'],
 ['4週', '511', '70,232', '17', '17,032', '4,285'],
 ['3月', '1週', '511', '96,952', '11', '25,679', '1,321'],
 ['2週', '461', '98,246', '16', '16,286', '1,962'],
 ['3週', '287', '85,497', '10', '16,968', '3,554'],
 ['4週', '280', '83,800', '13', '35,474', '390'],
 ['4月', '1週', '305', '67,164', '12', '30,666', '4,409'],
 ['2週', '194', '61,162', '14', '11,224', '4,068'],
 ['3週', '224', '60,289', '15', '11,259', '5,742'],
 ['4週', '205', '57,490', '12', '17,681', '6,025'],
 ['5週', '

In [9]:
# Convert the filtered rows into [begin_date, days, counts...] rows and display them.
clean_hwyear_data(newdata)

[['begin_date',
  'days',
  'ニンテンドー3DS',
  'NINTENDOSWITCH',
  'プレイステーション4',
  'プレイステーション5',
  'XboxSeriesX/S'],
 ['2021-12-27', 7, 569, 195926, 24, 46677, 523],
 ['2022-01-03', 7, 609, 148691, 15, 10677, 263],
 ['2022-01-10', 7, 489, 94523, 17, 14453, 2267],
 ['2022-01-17', 7, 325, 77219, 14, 18857, 2627],
 ['2022-01-24', 7, 267, 99078, 11, 23339, 1696],
 ['2022-01-31', 7, 264, 92452, 8, 16888, 5990],
 ['2022-02-07', 7, 360, 118065, 4, 6418, 5955],
 ['2022-02-14', 7, 364, 96929, 11, 25434, 4799],
 ['2022-02-21', 7, 511, 70232, 17, 17032, 4285],
 ['2022-02-28', 7, 511, 96952, 11, 25679, 1321],
 ['2022-03-07', 7, 461, 98246, 16, 16286, 1962],
 ['2022-03-14', 7, 287, 85497, 10, 16968, 3554],
 ['2022-03-21', 7, 280, 83800, 13, 35474, 390],
 ['2022-03-28', 7, 305, 67164, 12, 30666, 4409],
 ['2022-04-04', 7, 194, 61162, 14, 11224, 4068],
 ['2022-04-11', 7, 224, 60289, 15, 11259, 5742],
 ['2022-04-18', 7, 205, 57490, 12, 17681, 6025],
 ['2022-04-25', 14, 452, 176592, 81, 49798, 14118]]