https://zukan.pokemon.co.jp/ の HTML をダウンロードしてきてゴニョって li だけの羅列にしたものをパースする

In [1]:
from dataclasses import dataclass, asdict
from html.parser import HTMLParser

In [2]:
@dataclass
class Poke:
    """One scraped list entry, stored entirely as strings.

    Instances are created by ``PParser`` each time an ``<li>`` element closes.
    """

    # Text collected from the entry's <p> element, with <span> text appended.
    name: str
    # Value of the ``alt`` attribute of the entry's <img>.
    alt: str
    # Value of the ``id`` attribute of the <li> itself.
    number: str
    # Value of the ``src`` attribute of the entry's <img>.
    img_url: str

def get_attr(attrs: list[tuple[str, str]], name: str) -> str:
    """Return the value of the first attribute in *attrs* called *name*.

    Args:
        attrs: ``(attribute, value)`` pairs, in the form ``HTMLParser`` passes
            to ``handle_starttag``.
        name: Attribute name to look up (compared with ``==``).

    Returns:
        The value paired with the first matching attribute name.

    Raises:
        ValueError: If no pair in *attrs* has that name; the missing name is
            the exception argument.
    """
    # A private sentinel distinguishes "not found" from a legitimately falsy
    # (or None) attribute value.
    not_found = object()
    value = next((val for key, val in attrs if key == name), not_found)
    if value is not_found:
        raise ValueError(name)
    return value

class PParser(HTMLParser):
    
    def __init__(self):
        super().__init__()
        self.data = []
        self.name = ''
        self.alt = ''
        self.number = ''
        self.img_url = ''
        self.state = 0
    
    def handle_starttag(self, tag, attrs):
        self.state = 0
        match tag:
            case 'li':
                self.name = ''
                self.alt = ''
                self.number = get_attr(attrs, 'id')
                self.img_url = ''
            case 'img':
                self.img_url = get_attr(attrs, 'src')
                self.alt = get_attr(attrs, 'alt')
            case 'p':
                self.state = 1
            case 'span':
                self.state = 2

    def handle_endtag(self, tag):
        match tag:
            case 'li':
                self.data.append(Poke(self.name, self.alt, self.number, self.img_url))

    def handle_data(self, data):
        match self.state:
            case 1:
                self.name = data
            case 2:
                self.name += data

In [3]:
# Parse the pre-processed HTML file; the resulting records end up in
# ``parser.data``.
parser = PParser()
# Decode explicitly instead of relying on the platform's locale encoding,
# which may not be UTF-8 (assumes poke.html was saved as UTF-8).
with open('poke.html', encoding='utf-8') as f:
    parser.feed(f.read())
# Flush anything the parser is still buffering at end of input.
parser.close()

In [4]:
import csv
from dataclasses import fields

# Dump every parsed record to poke.csv, one row per record, with a header
# row made of the Poke field names in declaration order.
# newline='' is required by the csv module so it controls line endings itself
# (otherwise blank rows appear on Windows); the encoding is pinned so the
# output does not depend on the platform's locale.
with open('poke.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, [field.name for field in fields(Poke)])
    w.writeheader()
    w.writerows(asdict(d) for d in parser.data)