In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

BASE_URL = "https://pokemondb.net"
GEN_IDS = ['gen-1', 'gen-2', 'gen-3', 'gen-4', 'gen-5', 'gen-6', 'gen-7', 'gen-8', 'gen-9']

def get_pokemon_links():
    links = []
    for gen_id in GEN_IDS:
        print(f"ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour {gen_id}...")
        pokedex_url = f"{BASE_URL}/pokedex/national#{gen_id}"
        res = requests.get(pokedex_url)
        soup = BeautifulSoup(res.text, 'html.parser')
        
        for a in soup.select("main div.infocard a.ent-name"):
            name = a.text.strip()
            link = BASE_URL + a['href']
            links.append((name, link))
    return links

def parse_pokemon_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    
    data = {"url": url}
    
    # PokÃ©dex Data Table
    dex_table = soup.find("table", class_="vitals-table")
    if dex_table:
        for row in dex_table.find_all("tr"):
            th = row.find("th")
            td = row.find("td")
            if th and td:
                label = th.text.strip()
                value = td.text.strip()
                data[label] = value

    # Base Stats Table
    stats_tables = soup.find_all("table", class_="vitals-table")
    if len(stats_tables) > 1:
        for row in stats_tables[1].find_all("tr"):
            cells = row.find_all(["th", "td"])
            if len(cells) >= 2:
                stat_name = cells[0].text.strip()
                stat_value = cells[1].text.strip()
                data[stat_name] = stat_value
    return data

def main(limit=None):
    all_pokemon_data = []
    pokemon_links = get_pokemon_links()
    print(f"{len(pokemon_links)} PokÃ©mon trouvÃ©s.")

    for i, (name, link) in enumerate(pokemon_links):
        if limit and i >= limit:
            break
        print(f"[{i+1}] {name} - {link}")
        try:
            info = parse_pokemon_page(link)
            info["Nom"] = name
            all_pokemon_data.append(info)
        except Exception as e:
            print(f"Erreur pour {name} : {e}")
        time.sleep(1)
    
    df = pd.DataFrame(all_pokemon_data)
    df.to_csv("pokemon_data.csv", index=False)
    print("Fichier 'pokemon_data.csv' gÃ©nÃ©rÃ©.")
    return df

if __name__ == "__main__":
    df = main(limit=None)  # EnlÃ¨ve le `limit` pour tout rÃ©cupÃ©rer

ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-1...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-2...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-3...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-4...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-5...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-6...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-7...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-8...
ðŸ”Ž RÃ©cupÃ©ration des PokÃ©mon pour gen-9...
9225 PokÃ©mon trouvÃ©s.
[1] Bulbasaur - https://pokemondb.net/pokedex/bulbasaur
[2] Ivysaur - https://pokemondb.net/pokedex/ivysaur
[3] Venusaur - https://pokemondb.net/pokedex/venusaur
[4] Charmander - https://pokemondb.net/pokedex/charmander
[5] Charmeleon - https://pokemondb.net/pokedex/charmeleon
[6] Charizard - https://pokemondb.net/pokedex/charizard
[7] Squirtle - https://pokemondb.net/pokedex/squirtle
[8] Wartortle - https://pokemondb.net/pokedex/wartortle
[9] Blastoise - https://pokemondb.net/pokedex/blastoise
[10] Caterpie - https://pokemondb.net/pokedex/caterpi

In [21]:
df 


Unnamed: 0,url,National â„–,Type,Species,Height,Weight,Abilities,Local â„–,EV yield,Catch rate,Base Friendship,Base Exp.,Growth Rate,Nom
0,https://pokemondb.net/pokedex/bulbasaur,0001,Grass Poison,Seed PokÃ©mon,0.7Â m (2â€²04â€³),6.9Â kg (15.2Â lbs),1. OvergrowChlorophyll (hidden ability),0001 (Red/Blue/Yellow)0226 (Gold/Silver/Crysta...,1 Sp. Atk,"45 (5.9% with PokÃ©Ball, full HP)",50 (normal),64,Medium Slow,Bulbasaur
1,https://pokemondb.net/pokedex/ivysaur,0002,Grass Poison,Seed PokÃ©mon,1.0Â m (3â€²03â€³),13.0Â kg (28.7Â lbs),1. OvergrowChlorophyll (hidden ability),0002 (Red/Blue/Yellow)0227 (Gold/Silver/Crysta...,"1 Sp. Atk, 1 Sp. Def","45 (5.9% with PokÃ©Ball, full HP)",50 (normal),142,Medium Slow,Ivysaur
2,https://pokemondb.net/pokedex/venusaur,0003,Grass Poison,Seed PokÃ©mon,2.0Â m (6â€²07â€³),100.0Â kg (220.5Â lbs),1. OvergrowChlorophyll (hidden ability),0003 (Red/Blue/Yellow)0228 (Gold/Silver/Crysta...,"2 Sp. Atk, 1 Sp. Def","45 (5.9% with PokÃ©Ball, full HP)",50 (normal),236,Medium Slow,Venusaur
3,https://pokemondb.net/pokedex/charmander,0004,Fire,Lizard PokÃ©mon,0.6Â m (2â€²00â€³),8.5Â kg (18.7Â lbs),1. BlazeSolar Power (hidden ability),0004 (Red/Blue/Yellow)0229 (Gold/Silver/Crysta...,1 Speed,"45 (5.9% with PokÃ©Ball, full HP)",50 (normal),62,Medium Slow,Charmander
4,https://pokemondb.net/pokedex/charmeleon,0005,Fire,Flame PokÃ©mon,1.1Â m (3â€²07â€³),19.0Â kg (41.9Â lbs),1. BlazeSolar Power (hidden ability),0005 (Red/Blue/Yellow)0230 (Gold/Silver/Crysta...,"1 Sp. Atk, 1 Speed","45 (5.9% with PokÃ©Ball, full HP)",50 (normal),142,Medium Slow,Charmeleon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9220,https://pokemondb.net/pokedex/raging-bolt,1021,Electric Dragon,Paradox PokÃ©mon,5.2Â m (17â€²01â€³),480.0Â kg (1058.2Â lbs),1. Protosynthesis,0237 (The Indigo Disk),3 Sp. Atk,"10 (1.3% with PokÃ©Ball, full HP)",â€”,â€”,Slow,Raging Bolt
9221,https://pokemondb.net/pokedex/iron-boulder,1022,Rock Psychic,Paradox PokÃ©mon,1.5Â m (4â€²11â€³),162.5Â kg (358.3Â lbs),1. Quark Drive,0239 (The Indigo Disk),3 Speed,"10 (1.3% with PokÃ©Ball, full HP)",â€”,â€”,Slow,Iron Boulder
9222,https://pokemondb.net/pokedex/iron-crown,1023,Steel Psychic,Paradox PokÃ©mon,1.6Â m (5â€²03â€³),156.0Â kg (343.9Â lbs),1. Quark Drive,0238 (The Indigo Disk),3 Sp. Atk,"10 (1.3% with PokÃ©Ball, full HP)",â€”,â€”,Slow,Iron Crown
9223,https://pokemondb.net/pokedex/terapagos,1024,Normal,Tera PokÃ©mon,0.2Â m (0â€²08â€³),6.5Â kg (14.3Â lbs),1. Tera Shift,0240 (The Indigo Disk),1 Defense,"255 (33.3% with PokÃ©Ball, full HP)",â€”,â€”,Slow,Terapagos


In [22]:
df.to_csv(r"C:\Users\dthia\Music\dataset_pokemon.csv", index=False)
print("âœ… DataFrame enregistrÃ© dans 'C:\\Users\\dthia\\Music\\output.csv'")

âœ… DataFrame enregistrÃ© dans 'C:\Users\dthia\Music\output.csv'


In [23]:
import os
import re
import pandas as pd
from google.cloud import bigquery

# === Ã‰tape 1 : Nettoyage du CSV ===

# Chemin original
original_csv_path = r"C:\Users\dthia\Music\dataset_pokemon.csv"
# Chemin temporaire (fichier nettoyÃ©)
cleaned_csv_path = r"C:\Users\dthia\Music\dataset_pokemon_cleaned.csv"

# Lire le CSV
df = pd.read_csv(original_csv_path)

# Nettoyer les noms de colonnes
def clean_column_name(col):
    col = col.strip()
    col = col.replace("â„–", "No")  # Cas spÃ©cifique dÃ©jÃ  identifiÃ©
    col = re.sub(r"[^\w]", "_", col)  # Remplace tout caractÃ¨re non alphanumÃ©rique par "_"
    col = re.sub(r"__+", "_", col)    # RÃ©duit les doubles/triples "_" en un seul
    col = col.strip("_")              # EnlÃ¨ve "_" au dÃ©but ou Ã  la fin
    return col

df.columns = [clean_column_name(col) for col in df.columns]

# Sauvegarder dans un nouveau fichier
df.to_csv(cleaned_csv_path, index=False)

# === Ã‰tape 2 : Chargement dans BigQuery ===

# Authentification
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\dthia\Music\absolute-code-459720-c0-527bb2919d37.json"

# Identifiants GCP
project_id = "absolute-code-459720-c0"
dataset_id = "laka_dataset_pokemon"
table_name = "dataset_pokemon"

# Initialiser le client BigQuery
client = bigquery.Client(project=project_id)

# Config du job de chargement
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition="WRITE_TRUNCATE"
)

# RÃ©fÃ©rence complÃ¨te Ã  la table
table_ref = f"{project_id}.{dataset_id}.{table_name}"

# Charger les donnÃ©es
with open(cleaned_csv_path, "rb") as source_file:
    load_job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

# Attendre la fin du job
load_job.result()

# VÃ©rifier le rÃ©sultat
table = client.get_table(table_ref)
print(f"âœ… {table.num_rows} lignes importÃ©es dans {table_ref}.")


âœ… 9225 lignes importÃ©es dans absolute-code-459720-c0.laka_dataset_pokemon.dataset_pokemon.
