In [233]:
import os
import re
import json
import time
import requests
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

In [234]:
# Base name of this run; the output file name below is derived from it.
project_name = 'pokemon-test-1'
# JSON Lines file that the last cell writes the generated documents to.
output_file = Path(f'{project_name}.jsonl')

# When True, the scrape covers every entry of the Pokédex list; otherwise only
# the first `part_dataset` entries are processed.
full_dataset = False
part_dataset = 236

In [235]:
POKEMON_LIST = "https://pokemondb.net/pokedex/all"

# Fetch the Pokédex index page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() aborts on an HTTP error instead
# of silently parsing an error page into an empty list.
response = requests.get(POKEMON_LIST, timeout=30)
response.raise_for_status()
pokemon_soup_list = BeautifulSoup(response.text, "html.parser")

# dict.fromkeys() de-duplicates the name links (by element equality) while
# keeping their first-seen order.
pokemon_list = list(dict.fromkeys(pokemon_soup_list.find_all('a', class_="ent-name")))

# Number of list entries the scraping loop will process.
if full_dataset:
    pokemon_scope = len(pokemon_list)
else:
    pokemon_scope = part_dataset

In [236]:
def list_to_str(l: list):
    """Join items into an English enumeration.

    [] -> "", ["a"] -> "a", ["a", "b"] -> "a and b",
    ["a", "b", "c"] -> "a, b, and c".

    A single-element list returns that element unchanged; the other cases
    expect string items because they go through ``str.join``.
    """
    # An empty list used to fall through to the final branch and raise
    # IndexError on l[-1]; return an empty string instead.
    if not l:
        return ""
    if len(l) == 1:
        return l[0]
    if len(l) == 2:
        return " and ".join(l)
    return ", ".join(l[:-1]) + f", and {l[-1]}"

In [237]:
def build_core_doc(info: dict):
    """Build the "core" summary document for one Pokémon.

    All fields are read from ``info`` with ``dict.get`` (name, id, description,
    species, types, generation, height, weight, abilities, name_etymology);
    a missing or empty optional value simply drops the matching sentence.
    ``abilities`` may be a comma-separated string or a list/tuple/set.
    Height and weight are inserted into the text verbatim, without a unit.

    Returns a dict with the keys id, pokemon, section, description, text and
    metadata.
    """
    name = info.get("name")
    natdex = info.get("id")
    desc = info.get("description")
    species = info.get("species")
    generation = info.get("generation")
    height = info.get("height")
    weight = info.get("weight")
    name_etymology = info.get("name_etymology")

    # Clean types (remove None/empty)
    clean_types = [t for t in (info.get("types") or []) if t]
    type_str = "/".join(clean_types) if clean_types else "unknown"
    # "an Electric-type" / "an unknown-type" rather than "a Electric-type"
    article = "an" if type_str[0].lower() in "aeiou" else "a"

    # Generation string
    gen_str = f"Generation {generation}" if generation else "an unknown generation"

    # Core identity sentence
    text_parts = [f"{name} is {article} {type_str}-type Pokémon introduced in {gen_str}."]

    # National Pokédex number
    if natdex is not None:
        text_parts.append(f"It is number {natdex} in the National Pokédex.")

    # Species line
    if species:
        text_parts.append(f"{name} is classified as the {species} Pokémon.")

    # Height and weight
    hw_parts = []
    if height is not None:
        hw_parts.append(f"is {height} tall")
    if weight is not None:
        hw_parts.append(f"weighs {weight}")
    if hw_parts:
        text_parts.append(f"{name} " + " and ".join(hw_parts) + ".")

    # Abilities: accept either a comma-separated string or a collection
    raw_abilities = info.get("abilities")
    if isinstance(raw_abilities, str):
        abilities_list = [a.strip() for a in raw_abilities.split(",") if a.strip()]
    elif isinstance(raw_abilities, (list, tuple, set)):
        abilities_list = [str(a).strip() for a in raw_abilities if a]
    else:
        abilities_list = []

    if abilities_list:
        abil_str = list_to_str(abilities_list)
        text_parts.append(f"{name} can have the abilities {abil_str}.")

    # Name etymology
    if name_etymology:
        text_parts.append(f"Its name is derived from {name_etymology}.")

    text = " ".join(text_parts).strip()

    return {
        "id": f"{name}-core" if name else f"{natdex}-core",
        "pokemon": name,
        "section": "core",
        "description": desc,
        "text": text,
        "metadata": {
            "National Dex Number": natdex,
            "Types": clean_types,
            # key was previously misspelled "Generatin"
            "Generation": generation,
            "Species": species,
            "Height": height,
            "Weight": weight,
            "Abilities": abilities_list,
            "Name Etymology": name_etymology,
        },
    }


In [238]:
def build_training_doc(info: dict):
    """Build the "training" document for one Pokémon.

    Requires the keys name, ev_yield, catch_rate, base_friendship, base_exp and
    growth_rate in ``info`` (KeyError if one is missing). ``ev_yield`` may be a
    comma-separated string or a list/tuple/set; the original value is kept
    unchanged in the metadata.

    Returns a dict with the keys id, pokemon, section, text and metadata.
    """
    name = info["name"]
    ev_yield = info["ev_yield"]
    catch_rate = info["catch_rate"]
    base_friendship = info["base_friendship"]
    base_exp = info["base_exp"]
    growth_rate = info["growth_rate"]

    if isinstance(ev_yield, str):
        ev_list = [a.strip() for a in ev_yield.split(",") if a.strip()]
    elif isinstance(ev_yield, (list, tuple, set)):
        ev_list = [str(a).strip() for a in ev_yield if a]
    else:
        ev_list = []

    # ev_str used to stay unbound for an empty/None yield, which crashed the
    # sentence below with UnboundLocalError; fall back to a placeholder.
    ev_str = list_to_str(ev_list) if ev_list else "unknown"

    text_parts = [
        f"For training purposes, defeating {name} yields {ev_str} EVs.",
        f"It grants {base_exp} base experience points and follows a {growth_rate} experience growth rate.",
        f"{name} has a catch rate of {catch_rate} and a base friendship of {base_friendship}.",
    ]

    text = " ".join(text_parts)

    return {
        "id": f"{name}-training",
        "pokemon": name,
        "section": "training",
        "text": text,
        "metadata": {
            "EV Yield": ev_yield,
            "Catch Rate": catch_rate,
            "Base Friendship": base_friendship,
            "Base Experience": base_exp,
            "Growth Rate": growth_rate,
        },
    }


In [239]:
def build_breeding_doc(info: dict):
    """Build the "breeding" document for one Pokémon.

    Requires the keys name, egg_groups, gender_male, gender_female and
    egg_cycles in ``info`` (KeyError if one is missing). ``egg_groups`` may be
    a comma-separated string or a list/tuple/set. The gender percentages may be
    numbers or numeric strings. ``egg_cycles`` is split at its first space into
    the cycle count and an optional trailing step text.

    Returns a dict with the keys id, pokemon, section, text and metadata; the
    metadata keeps the input values unchanged.
    """

    def _is_zero(value) -> bool:
        # Numeric comparison so that 0, 0.0, "0" and "0.0" all count as zero.
        try:
            return float(value) == 0
        except (TypeError, ValueError):
            return False

    name = info["name"]
    egg_groups = info["egg_groups"]
    gender_male = info["gender_male"]
    gender_female = info["gender_female"]
    egg_cycles = info["egg_cycles"]

    # Egg groups text
    if isinstance(egg_groups, str):
        egg_list = [a.strip() for a in egg_groups.split(",") if a.strip()]
    elif isinstance(egg_groups, (list, tuple, set)):
        egg_list = [str(a).strip() for a in egg_groups if a]
    else:
        egg_list = []

    # egg_str used to stay unbound for an empty list (UnboundLocalError).
    if egg_list:
        egg_str = list_to_str(egg_list)
        egg_text = f"{name} belongs to the {egg_str} Egg Group."
    else:
        egg_text = f"{name} belongs to an unknown Egg Group."

    # Gender ratio text (handle genderless species). The previous `== 0` test
    # never matched when the percentages arrived as the strings '0'.
    if _is_zero(gender_male) and _is_zero(gender_female):
        gender_text = f"{name} is a genderless species."
    else:
        gender_text = (
            f"The typical gender ratio for {name} is "
            f"{gender_male}% male and {gender_female}% female."
        )

    # Egg cycles / hatch steps: first token is the cycle count, the rest (if
    # any) is appended verbatim.
    egg_cycles_str, _, steps = str(egg_cycles).partition(" ")
    if steps:
        hatch_text = f"Eggs take {egg_cycles_str} egg cycles {steps} to hatch."
    else:
        # avoid the double space an empty step text used to leave behind
        hatch_text = f"Eggs take {egg_cycles_str} egg cycles to hatch."

    text = " ".join([egg_text, gender_text, hatch_text])

    return {
        "id": f"{name}-breeding",
        "pokemon": name,
        "section": "breeding",
        "text": text,
        "metadata": {
            "eggGroups": egg_groups,
            "genderMale": gender_male,
            "genderFemale": gender_female,
            "eggCycles": egg_cycles,
        },
    }


In [240]:
def build_statistics_doc(info: dict):
    """Build the "statistics" document for one Pokémon.

    Reads ``name`` plus, for each of hp/atk/def/satk/sdef/spd, the keys
    ``base_<stat>``, ``min_<stat>`` and ``max_<stat>`` from ``info`` (KeyError
    if one is missing). The base stat total is the sum of the six base values
    converted with ``int``; all values are stored in the metadata as given.

    Returns a dict with the keys id, pokemon, section, text and metadata.
    """
    pokemon = info["name"]

    # (suffix used in `info`, key used in the output metadata), in output order
    stat_labels = (
        ("hp", "hp"),
        ("atk", "attack"),
        ("def", "defense"),
        ("satk", "spAttack"),
        ("sdef", "spDefense"),
        ("spd", "speed"),
    )
    stat_keys = [key for key, _ in stat_labels]

    # Base stats
    base = {key: info[f"base_{key}"] for key in stat_keys}

    # Level 100 min–max stats
    low, high = {}, {}
    for key in stat_keys:
        low[key] = info[f"min_{key}"]
        high[key] = info[f"max_{key}"]

    # Base stat total
    bst = sum(int(base[key]) for key in stat_keys)

    base_sentence = (
        f"{pokemon} has a base stat total of {bst}, with base stats of "
        f"{base['hp']} HP, {base['atk']} Attack, {base['def']} Defense, {base['satk']} Special Attack, "
        f"{base['sdef']} Special Defense, and {base['spd']} Speed. "
    )
    range_sentence = (
        f"At level 100, {pokemon}'s HP can range from {low['hp']} to {high['hp']}, "
        f"Attack from {low['atk']} to {high['atk']}, Defense from {low['def']} to {high['def']}, "
        f"Special Attack from {low['satk']} to {high['satk']}, "
        f"Special Defense from {low['sdef']} to {high['sdef']}, "
        f"and Speed from {low['spd']} to {high['spd']}, depending on its nature, IVs, and EVs."
    )

    def _labelled(values):
        # Re-key one stat mapping with the metadata labels, keeping the order.
        return {label: values[key] for key, label in stat_labels}

    return {
        "id": f"{pokemon}-statistics",
        "pokemon": pokemon,
        "section": "statistics",
        "text": base_sentence + range_sentence,
        "metadata": {
            "baseStats": _labelled(base),
            "baseStatTotal": bst,
            "minStatsLevel100": _labelled(low),
            "maxStatsLevel100": _labelled(high),
        },
    }


In [241]:
def extract_evo_card(card):
    """
    Pull the name, dex number and type names out of one evolution 'card'.

    Looks up "a.ent-name" for the name, "span.infocard-lg-data small" for the
    dex number (None when absent) and every "a.itype" for the types.
    Returns {} when the card has no name link.
    """
    name_link = card.select_one("a.ent-name")
    if not name_link:
        return {}

    card_info = {"name": name_link.get_text(strip=True)}

    number_tag = card.select_one("span.infocard-lg-data small")
    card_info["dex_number"] = number_tag.get_text(strip=True) if number_tag else None

    card_info["types"] = [
        type_link.get_text(strip=True) for type_link in card.select("a.itype")
    ]
    return card_info


def parse_all_evolution_edges(soup):
    """
    Collect evolution edges from every <span class="infocard-arrow"> in `soup`.

    Each arrow with non-empty text is paired with the nearest preceding and the
    nearest following <div class="infocard">; arrows lacking either card, or
    whose cards yield no name, are skipped.

    Each edge:
      { "from": <name>, "from_dex": <dex>, "to": <name>, "to_dex": <dex>, "method": <text> }
    """
    # NOTE(review): pairing by nearest card presumably picks the wrong source
    # when several arrows leave the same card (branched chains) — verify.

    def _edge_for(arrow):
        # Returns the edge dict for one arrow, or None when it must be skipped.
        method_text = arrow.get_text(" ", strip=True)
        if not method_text:
            return None

        source_card = arrow.find_previous("div", class_="infocard")
        target_card = arrow.find_next("div", class_="infocard")
        if not (source_card and target_card):
            return None

        source = extract_evo_card(source_card)
        target = extract_evo_card(target_card)
        if not (source and target):
            return None

        return {
            "from": source["name"],
            "from_dex": source["dex_number"],
            "to": target["name"],
            "to_dex": target["dex_number"],
            "method": method_text,
        }

    candidates = (_edge_for(arrow) for arrow in soup.select("span.infocard-arrow"))
    return [edge for edge in candidates if edge is not None]


def get_no_evolution_text(soup, pokemon_name):
    """
    Return the text of the first <li>/<p> containing 'does not evolve';
    when none matches, return the generic '<pokemon_name> does not evolve.'.
    """
    element_texts = (
        element.get_text(" ", strip=True) for element in soup.find_all(["li", "p"])
    )
    fallback = f"{pokemon_name} does not evolve."
    return next((line for line in element_texts if "does not evolve" in line), fallback)


def clean_method(method_raw: str) -> str:
    """
    Strip one pair of wrapping parentheses from '(Level 16)' etc.

    Only a pair that encloses the whole text is removed, so parentheses that
    belong to the content survive: '(trade (holding X))' -> 'trade (holding X)'
    and '(a) or (b)' is returned unchanged. Surrounding whitespace is trimmed.
    """
    text = method_raw.strip()
    if not text.startswith("("):
        return text

    # Find where the opening parenthesis at index 0 is closed; the text is
    # "wrapped" only when that happens on the very last character.
    depth = 0
    for position, char in enumerate(text):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth == 0:
                if position == len(text) - 1:
                    return text[1:-1].strip()
                break
    return text


def build_evolution_doc(pokemon_data, soup):
    """
    Build the "evolutions" document for one Pokémon from the edges found in
    `soup`, keeping only the edges whose source or target is THIS Pokémon.

    Reads "name" and "id" from `pokemon_data`. When no edge involves the
    Pokémon, the text comes from get_no_evolution_text() and the metadata
    reports no evolutions.
    """
    pokemon_name = pokemon_data["name"]
    pokemon_id = pokemon_data["id"]

    edges = parse_all_evolution_edges(soup)

    # Only edges this Pokémon participates in
    into_this = [edge for edge in edges if edge["to"] == pokemon_name]
    out_of_this = [edge for edge in edges if edge["from"] == pokemon_name]
    involved = into_this + out_of_this

    if involved:
        # 1) Pre-evolutions -> this Pokémon
        lines = [
            f"{edge['from']} evolves into {pokemon_name} via {clean_method(edge['method'])}."
            for edge in into_this
        ]

        # 2) This Pokémon -> later evolutions
        if len(out_of_this) == 1:
            only = out_of_this[0]
            lines.append(
                f"{pokemon_name} evolves into {only['to']} via {clean_method(only['method'])}."
            )
        elif out_of_this:
            # two or more branches: "A via x, B via y, or C via z"
            options = [
                f"{edge['to']} via {clean_method(edge['method'])}" for edge in out_of_this
            ]
            branches_text = ", ".join(options[:-1]) + f", or {options[-1]}"
            lines.append(f"{pokemon_name} can evolve into {branches_text}.")

        doc_text = " ".join(lines)
    else:
        # Truly standalone / no evolution family
        doc_text = get_no_evolution_text(soup, pokemon_name)

    return {
        "id": f"{pokemon_name.lower()}-evolutions",
        "pokemon": pokemon_name,
        "section": "evolutions",
        "text": doc_text,
        "metadata": {
            "pokemon_id": pokemon_id,
            "pokemon_name": pokemon_name,
            "evolution_edges": involved,
            "has_evolutions": bool(involved),
        },
    }

In [242]:
REQUEST_TIMEOUT = 30    # seconds allowed for each page request
LATEST_GENERATION = 9   # used when a page has no "In other generations" list


def _td_after(soup, header):
    """Return the first <td> that follows the <th> whose string equals `header`."""
    return soup.find("th", string=header).find_next("td")


def _leading_number_x100(raw_text):
    """Return the first whitespace-separated number in `raw_text`, times 100.

    round() is used instead of int() because float error otherwise truncates
    values such as 2.3 * 100 == 229.99999999999997 down to 229.
    """
    return round(float(raw_text.split()[0]) * 100)


def _stat_triplet(soup, header):
    """Return the three "cell-num" values of the row headed by `header`, in page order."""
    cells = soup.find("th", string=header).find_next_siblings("td", class_="cell-num")
    first, second, third = (cell.text.strip() for cell in cells)
    return first, second, third


chunks = []

for pokemon in pokemon_list[:pokemon_scope]:
    pokemon_url = "https://pokemondb.net" + pokemon["href"]

    # Time out instead of hanging, and stop on an HTTP error rather than
    # trying to parse an error page.
    response = requests.get(pokemon_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    pokemon_soup = BeautifulSoup(response.text, "html.parser")

    ###### Data Parsing
    ### Pokemon Info
    pokemon_id = int(_td_after(pokemon_soup, "National №").text)

    pokemon_name = pokemon_soup.find("h1").text.strip()

    # find_all_previous() yields the nearest paragraph first, so reverse it to
    # restore document order. (The old join/split on '|' scrambled any
    # paragraph that itself contained a '|'.)
    intro_paragraphs = pokemon_soup.find('div', class_='tabset-basics').find_all_previous("p")
    pokemon_desc = ' '.join(p.text.strip() for p in reversed(intro_paragraphs))

    species_data = _td_after(pokemon_soup, "Species").text.strip().replace(" Pokémon", "")

    # NOTE(review): both values are the page's leading number times 100;
    # presumably metres -> centimetres for height. Confirm that x100 is also
    # the intended scale for weight.
    height = _leading_number_x100(_td_after(pokemon_soup, "Height").text)
    weight = _leading_number_x100(_td_after(pokemon_soup, "Weight").text)

    type_names = [a.text.strip() for a in _td_after(pokemon_soup, "Type").find_all("a")]
    # first two types, padded with None when the page lists fewer
    type_1, type_2 = (type_names + [None, None])[:2]

    # The first number in the "In other generations" list is taken as the
    # generation; pages without that list fall back to LATEST_GENERATION.
    generation = LATEST_GENERATION
    generation_title_element = pokemon_soup.find(class_="list-nav-title", string='In other generations')
    if generation_title_element:
        generation_all = generation_title_element.find_next_siblings('li')
        in_generation = ', '.join(item.text.strip() for item in generation_all)
        first_number = re.search(r"\d+", in_generation)
        if first_number:
            generation = int(first_number.group())

    # Look for the etymology list itself. The previous check tested for the
    # "In other generations" heading instead, which left name_etymology
    # undefined (or still holding the previous Pokémon's value) on pages
    # without that heading.
    name_etymology = None
    etymology_list = pokemon_soup.find("dl", class_="etymology")
    if etymology_list:
        etymology_pairs = zip(etymology_list.find_all('dt'), etymology_list.find_all('dd'))
        name_etymology = " | ".join(
            f"{dt.text.strip()}: {dd.text.strip()}" for dt, dd in etymology_pairs
        ) or None

    ability_elements = _td_after(pokemon_soup, "Abilities").find_all("a")
    abilities = ', '.join(ability_element.text.strip() for ability_element in ability_elements)

    ### Training Info
    ev_yield = _td_after(pokemon_soup, "EV yield").text.strip()

    catch_rate = _td_after(pokemon_soup, "Catch rate").text.strip().split()[0]

    # Base friendship is read from the <td> immediately before the
    # "Base Exp." header, base experience from the one after it.
    base_exp_header = pokemon_soup.find("th", string="Base Exp.")
    base_friendship = base_exp_header.find_previous("td").text.strip().split()[0]
    base_exp = base_exp_header.find_next("td").text.strip().split()[0]

    growth_rate = _td_after(pokemon_soup, "Growth Rate").text.strip()

    ### Breeding Info
    # Two comma-separated parts are read as "<male>% ..., <female>% ...";
    # anything else is stored as '0'/'0'.
    gender = _td_after(pokemon_soup, "Gender").text.strip().split(', ')
    if len(gender) > 1:
        gender_male = gender[0].split('%')[0]
        gender_female = gender[1].split('%')[0]
    else:
        gender_male = gender_female = '0'

    egg_groups = _td_after(pokemon_soup, "Egg Groups").text.strip().split(', ')

    egg_cycles = _td_after(pokemon_soup, "Egg cycles").text.replace("\t\t\t\t", " ").strip()

    ### Pokemon Stats
    base_hp, min_hp, max_hp = _stat_triplet(pokemon_soup, "HP")
    base_atk, min_atk, max_atk = _stat_triplet(pokemon_soup, "Attack")
    base_def, min_def, max_def = _stat_triplet(pokemon_soup, "Defense")
    base_satk, min_satk, max_satk = _stat_triplet(pokemon_soup, "Sp. Atk")
    base_sdef, min_sdef, max_sdef = _stat_triplet(pokemon_soup, "Sp. Def")
    base_spd, min_spd, max_spd = _stat_triplet(pokemon_soup, "Speed")

    pokemon_data = {
        "description": pokemon_desc,

        "id": pokemon_id,
        "name": pokemon_name,
        "species": species_data,
        "height": height,
        "weight": weight,
        "types": [type_1, type_2],
        "generation": generation,
        "name_etymology": name_etymology,
        "abilities": abilities,

        "ev_yield": ev_yield,
        "catch_rate": catch_rate,
        "base_friendship": base_friendship,
        "base_exp": base_exp,
        "growth_rate": growth_rate,

        "egg_groups": egg_groups,
        "gender_male": gender_male,
        "gender_female": gender_female,
        "egg_cycles": egg_cycles,

        "base_hp": base_hp,
        "min_hp": min_hp,
        "max_hp": max_hp,
        "base_atk": base_atk,
        "min_atk": min_atk,
        "max_atk": max_atk,
        "base_def": base_def,
        "min_def": min_def,
        "max_def": max_def,
        "base_satk": base_satk,
        "min_satk": min_satk,
        "max_satk": max_satk,
        "base_sdef": base_sdef,
        "min_sdef": min_sdef,
        "max_sdef": max_sdef,
        "base_spd": base_spd,
        "min_spd": min_spd,
        "max_spd": max_spd,
    }

    chunks.append(build_core_doc(pokemon_data))
    chunks.append(build_training_doc(pokemon_data))
    chunks.append(build_breeding_doc(pokemon_data))
    chunks.append(build_statistics_doc(pokemon_data))
    # Evolution documents are currently disabled:
    # chunks.append(build_evolution_doc(pokemon_data, pokemon_soup))

In [243]:
# Make sure the target directory exists before opening the file.
target_dir = output_file.parent
target_dir.mkdir(parents=True, exist_ok=True)

# One JSON object per line (JSON Lines); ensure_ascii=False keeps characters
# such as "é" readable instead of escaping them.
with output_file.open("w", encoding="utf-8") as f:
    f.writelines(json.dumps(c, ensure_ascii=False) + "\n" for c in chunks)

print(f"Wrote {len(chunks)} documents to {output_file.resolve()}")


Wrote 944 documents to /content/pokemon-test-1.jsonl
