In [3]:
from bs4 import BeautifulSoup
import json
import re

# Path to your saved HTML file
html_file_path = "base_singles.html"

# Base URL for generating links
base_url = "https://www.cardmarket.com/en/Pokemon/Products/Singles/"


def build_set_link(set_name, url_prefix):
    """Build a first-guess listing URL for an expansion name.

    Every character that is not a word character, whitespace or a hyphen is
    dropped (this already covers '+', quotes, colons, ...), then each space
    is turned into a hyphen and the result is appended to ``url_prefix``.

    :param set_name: Expansion name as shown in the <option> text.
    :param url_prefix: URL prefix the formatted name is appended to.
    :return: ``url_prefix`` followed by the formatted name.
    """
    formatted_name = re.sub(r'[^\w\s-]', '', set_name)
    return f"{url_prefix}{formatted_name.replace(' ', '-')}"


try:
    # Load the HTML file
    with open(html_file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the select element with name="idExpansion"
    sets = []
    select_tag = soup.find("select", {"name": "idExpansion"})
    if select_tag is None:
        # Fail loudly instead of overwriting pokemon_sets.json with an empty
        # list; the handler below reports the message like any other error.
        raise ValueError(
            f'No <select name="idExpansion"> element found in {html_file_path}'
        )

    for option in select_tag.find_all("option"):
        set_name = option.text.strip()
        set_id = option.get("value")
        if set_name and set_id and set_id != "0":  # Exclude "All" or empty values
            sets.append({
                "name": set_name,
                "id": set_id,
                "link": build_set_link(set_name, base_url)
            })

    # Save the extracted data to a JSON file
    with open("pokemon_sets.json", "w", encoding="utf-8") as json_file:
        json.dump(sets, json_file, indent=2, ensure_ascii=False)

    # Print the extracted data
    print("Extracted sets:")
    for set_data in sets:
        print(set_data)

    print("\nSet data saved to 'pokemon_sets.json'")
except Exception as e:
    # Best-effort cell: report the problem without a traceback.
    print(f"Error: {e}")


Extracted sets:
{'name': '"W" Promos', 'id': '1606', 'link': 'https://www.cardmarket.com/en/Pokemon/Products/Singles/W-Promos'}
{'name': '10th Movie Commemoration Set', 'id': '4310', 'link': 'https://www.cardmarket.com/en/Pokemon/Products/Singles/10th-Movie-Commemoration-Set'}
{'name': '11th Movie Commemoration Set', 'id': '4299', 'link': 'https://www.cardmarket.com/en/Pokemon/Products/Singles/11th-Movie-Commemoration-Set'}
{'name': '151', 'id': '5402', 'link': 'https://www.cardmarket.com/en/Pokemon/Products/Singles/151'}
{'name': '25th Anniversary Collection', 'id': '4345', 'link': 'https://www.cardmarket.com/en/Pokemon/Products/Singles/25th-Anniversary-Collection'}
{'name': '25th Anniversary Edition', 'id': '4346', 'link': 'https://www.cardmarket.com/en/Pokemon/Products/Singles/25th-Anniversary-Edition'}
{'name': '25th Anniversary Golden Box', 'id': '4389', 'link': 'https://www.cardmarket.com/en/Pokemon/Products/Singles/25th-Anniversary-Golden-Box'}
{'name': 'ADV Expansion Pack', 'id

In [19]:
import json
import unidecode

def correct_links(input_file, output_file,
                  base_url='https://www.cardmarket.com/en/Pokemon/Products/Singles/',
                  query='?idRarity=0&perSite=20'):
    """
    Reads a JSON file of set entries, rebuilds every link from the entry's
    name, and writes the entries with the rebuilt links to a new JSON file.

    The link is ``base_url`` + slug + ``query``, where the slug is the name
    transliterated with ``unidecode`` and then cleaned: " & " becomes "-";
    the characters : . , " ' ! and the bullet are removed; + * and spaces
    become "-"; runs of hyphens are collapsed to a single hyphen.

    :param input_file: Path to the original JSON file (a list of objects
        with at least 'name' and 'id' keys).
    :param output_file: Path to the new JSON file with corrected links.
    :param base_url: Prefix placed in front of the slug.
    :param query: Suffix appended after the slug (query string).
    :raises KeyError: If an entry has no 'name' or 'id' key.
    """
    # One-pass equivalent of the single-character rewrites. Removing '.'
    # already covers ellipses, so no separate '...' rule is needed.
    char_table = str.maketrans({
        ':': None, '.': None, ',': None, '"': None, "'": None,
        '!': None, '•': None,
        '+': '-', '*': '-', ' ': '-',
    })

    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    corrected_data = []

    for entry in data:
        slug = (
            unidecode.unidecode(entry['name'])  # Normalize accented characters
                .replace(' & ', '-')            # Must run before spaces are rewritten
                .translate(char_table)
        )
        # Ensure no double hyphens (only in the slug, so base_url is never altered)
        while '--' in slug:
            slug = slug.replace('--', '-')

        corrected_data.append({
            'name': entry['name'],
            'id': entry['id'],
            'link': base_url + slug + query
        })

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(corrected_data, file, indent=2, ensure_ascii=False)

    print(f"Corrected links saved to {output_file}")

# Example usage: rebuild the links for the entries stored in 'pokemon_sets.json'
# (the file name written by the set-extraction cell) and save them separately,
# leaving the input file untouched.
input_file = 'pokemon_sets.json'  # Replace with the path to your original JSON file
output_file = 'corrected_pokemon_sets.json'  # Path for the new JSON file with corrected links
correct_links(input_file, output_file)


Corrected links saved to corrected_pokemon_sets.json


In [20]:
import json
import re

def validate_links(file_path):
    """
    Validates the links in a JSON file to ensure they do not contain problematic characters.
    Prints every entry whose link contains a character outside the allowed set
    (ASCII letters, digits and ``: / ? & = _ - % .``), or prints a success
    message when all links pass. Only the characters are checked; the links
    are not requested.

    :param file_path: Path to the JSON file with links to validate (a list of
        objects with 'name', 'id' and 'link' keys).
    """
    # Allowed URL characters. fullmatch() is used instead of '^...$' with
    # match(), because '$' also matches just before a trailing newline and
    # would let a link ending in '\n' pass.
    allowed_link = re.compile(r'[a-zA-Z0-9:/?&=_\-%.]+')

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    problematic_links = [
        {'name': entry['name'], 'id': entry['id'], 'link': entry['link']}
        for entry in data
        if not allowed_link.fullmatch(entry['link'])
    ]

    if problematic_links:
        print(f"Found {len(problematic_links)} problematic links:")
        for problem in problematic_links:
            print(f"Name: {problem['name']}, ID: {problem['id']}, Link: {problem['link']}")
    else:
        print("All links are valid!")

# Example usage: check the links in 'corrected_pokemon_sets.json', the file
# name that the correct_links() call earlier in this notebook writes to.
file_path = 'corrected_pokemon_sets.json'  # Replace with the path to your JSON file
validate_links(file_path)


All links are valid!


## Now check the console output and identify the problematic sets:
### They have either more than 15 pages of data or an invalid link

In [21]:
import re

def extract_problematic_sets(file_path, output_file, page_limit=15):
    """
    Extracts names and links of sets whose highest processed page number is
    exactly ``page_limit`` at the moment their "Reached the last page" line
    appears in a console output log file. Saves those sets to a JSON file.

    Three kinds of log lines are recognised (matched at the start of a line):
    ``Fetching: <name> (<https link>)``, ``Processing page <n> for <name>``
    and ``Reached the last page for <name>``. Only the page-count condition
    is detected here; sets with an invalid link are not identified.

    :param file_path: Path to the console output file.
    :param output_file: Path to save the extracted problematic sets as JSON.
    :param page_limit: Page count that marks a set as problematic (default 15).
    """
    import json

    # Regular expressions to match relevant lines
    fetch_pattern = re.compile(r"Fetching: (.+?) \((https://.+?)\)")
    processing_pattern = re.compile(r"Processing page (\d+) for (.+)")
    last_page_pattern = re.compile(r"Reached the last page for (.+)")

    # Per-set link and highest page seen, keyed by set name
    sets = {}
    problematic_sets = []

    # The three patterns start with different literal text, so a line can
    # match at most one of them and the rest can be skipped after a hit.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fetch_match = fetch_pattern.match(line)
            if fetch_match:
                name, link = fetch_match.groups()
                # A repeated "Fetching" line restarts the page count for that set.
                sets[name.strip()] = {"link": link, "pages": 0}
                continue

            processing_match = processing_pattern.match(line)
            if processing_match:
                page_number, name = processing_match.groups()
                # (.+) runs to the end of the line, so drop stray trailing
                # whitespace that would otherwise break the lookup.
                name = name.strip()
                if name in sets:
                    sets[name]["pages"] = max(sets[name]["pages"], int(page_number))
                continue

            last_page_match = last_page_pattern.match(line)
            if last_page_match:
                name = last_page_match.group(1).strip()
                if name in sets and sets[name]["pages"] == page_limit:
                    problematic_sets.append({"name": name, "link": sets[name]["link"]})

    # Save problematic sets to JSON
    with open(output_file, 'w', encoding='utf-8') as output:
        json.dump(problematic_sets, output, indent=2, ensure_ascii=False)

    print(f"Extracted {len(problematic_sets)} problematic sets.")
    print(f"Results saved to {output_file}")

# Example usage. NOTE(review): 'console_output.txt' is not produced by any cell
# visible here; presumably it is the saved log of the scraping run - confirm.
console_output_file = 'console_output.txt'  # Path to your console output file
problematic_sets_file = 'problematic_sets.json'  # Path to save problematic sets
extract_problematic_sets(console_output_file, problematic_sets_file)


Extracted 30 problematic sets.
Results saved to problematic_sets.json
