In [14]:
import requests
from bs4 import BeautifulSoup
import json
import os
import re
from tqdm import tqdm

In [3]:
base_url = 'https://en.onepiece-cardgame.com/cardlist/?series=56910'
image_base_url = 'https://en.onepiece-cardgame.com'

In [4]:
# Function to save the image
def save_image(series, card_id, image_url):
    image_response = requests.get(image_url)
    image_response.raise_for_status()
    image_path = f'cards/series-{series}/{card_id}.jpg'

    with open(image_path, 'wb') as image_file:
        image_file.write(image_response.content)

    print(f'Successfully saved image: {image_path}')


In [10]:
def load_html_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [11]:
def extract_prefix(card_id):
    match = re.match(r'^[^-]+', card_id)
    return match.group(0) if match else ''

In [17]:
# Function to fetch and parse card details from a single page
def parse_card_details(html_file_path):
    html_content = load_html_from_file(html_file_path)
    soup = BeautifulSoup(html_content, 'html.parser')
    card_blocks = soup.find_all('dl', class_='modalCol')

    for i, card_block in tqdm(enumerate(card_blocks)):
        print(f'{i} / {len(card_blocks)}')
        card_id = card_block.get('id')
        card_name = card_block.find('div', class_='cardName').text.strip()
        cost = card_block.find('div', class_='cost').text.strip().replace('Cost', '').strip()
        power = card_block.find('div', class_='power').text.strip().replace('Power', '').strip()
        attribute = card_block.find('div', class_='attribute').find('i').text.strip()
        image_relative_url = card_block.find('img', {'class': 'lazy'}).get('data-src')
        image_url = f"{image_base_url}{image_relative_url.replace('..', '')}"
        
        card_data = {
            'id': card_id,
            'name': card_name,
            'cost': cost,
            'power': power,
            'attribute': attribute,
            'image_url': image_url
        }
        
        prefix = extract_prefix(card_id)
        directory = f'cards/{prefix}'
        os.makedirs(directory, exist_ok=True)
    
        with open(f'{directory}/{card_id}.json', 'w') as json_file:
            json.dump(card_data, json_file, indent=4)

        image_response = requests.get(image_url)
        image_response.raise_for_status()
        with open(f'{directory}/{card_id}.jpg', 'wb') as image_file:
            image_file.write(image_response.content)

In [22]:
for html_file in ["htmls/ST10.html"]:    
    parse_card_details(html_file)

0it [00:00, ?it/s]

0 / 19


1it [00:02,  2.14s/it]

1 / 19


2it [00:04,  2.09s/it]

2 / 19


3it [00:06,  2.09s/it]

3 / 19


4it [00:08,  2.10s/it]

4 / 19


5it [00:10,  2.10s/it]

5 / 19


6it [00:12,  2.11s/it]

6 / 19


7it [00:14,  2.12s/it]

7 / 19


8it [00:16,  2.12s/it]

8 / 19


9it [00:19,  2.12s/it]

9 / 19


10it [00:21,  2.11s/it]

10 / 19


11it [00:23,  2.10s/it]

11 / 19


12it [00:25,  2.13s/it]

12 / 19


13it [00:27,  2.13s/it]

13 / 19


14it [00:29,  2.14s/it]

14 / 19


15it [00:31,  2.13s/it]

15 / 19


16it [00:33,  2.13s/it]

16 / 19


17it [00:36,  2.12s/it]

17 / 19


18it [00:38,  2.12s/it]

18 / 19


19it [00:40,  2.12s/it]
