In [1]:
import pandas as pd
import numpy as np
import sys
import time
import logging
import warnings
from pathlib import Path
from datetime import datetime

# Suppress ALL logs and warnings so the scraping cells only show their own prints.
# NOTE(review): this also hides genuine errors and deprecation notices emitted by
# the wrappers; consider relaxing it while debugging.
logging.disable(logging.CRITICAL)
warnings.filterwarnings('ignore')

# The project root is taken to be two directories above the current working directory.
project_root = Path.cwd().parent.parent
# Guarded append: re-running this cell must not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Project-local imports; presumably resolved through the path entry added above,
# so they have to stay below the sys.path tweak.
from wrappers.fbref_data import extract_data as fbref_extract_data
from wrappers.understat_data import extract_data as understat_extract_data
from scrappers._config import LEAGUE_DICT

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print(f"Setup completo - Ligas disponibles: {len(LEAGUE_DICT)}")

Setup completo - Ligas disponibles: 24


In [2]:
# Full BDO winner configuration, including ALL competitions to scrape.
# Keyed by award year; every entry carries the player name, the club(s), the
# season label handed to the scrapers and the list of competition codes.
# NOTE(review): 2021 and 2022 both map to season "21-22" -- confirm the intended
# award-year -> season convention before relying on these labels.
BDO_WINNERS = {
    2017: dict(
        player="Cristiano Ronaldo",
        teams=["Real Madrid"],
        season="17-18",
        competitions=["ESP-La Liga", "INT-Champions League", "ESP-Copa del Rey"],
    ),
    2018: dict(
        player="Luka Modric",
        teams=["Real Madrid"],
        season="18-19",
        competitions=[
            "ESP-La Liga", "INT-Champions League", "ESP-Copa del Rey", "INT-World Cup",
        ],
    ),
    2019: dict(
        player="Lionel Messi",
        teams=["Barcelona"],
        season="19-20",
        competitions=[
            "ESP-La Liga", "INT-Champions League", "ESP-Copa del Rey", "INT-Copa America",
        ],
    ),
    2020: dict(
        player="Robert Lewandowski",
        teams=["Bayern Munich"],
        season="20-21",
        competitions=["GER-Bundesliga", "INT-Champions League", "GER-DFB-Pokal"],
    ),
    2021: dict(
        player="Lionel Messi",
        teams=["Barcelona", "Paris S-G"],
        season="21-22",
        competitions=[
            "ESP-La Liga", "FRA-Ligue 1", "INT-Champions League",
            "ESP-Copa del Rey", "FRA-Coupe de France", "INT-Copa America",
        ],
    ),
    2022: dict(
        player="Karim Benzema",
        teams=["Real Madrid"],
        season="21-22",
        competitions=["ESP-La Liga", "INT-Champions League", "ESP-Copa del Rey"],
    ),
    2023: dict(
        player="Lionel Messi",
        teams=["Paris S-G", "Inter Miami"],
        season="22-23",
        competitions=[
            "FRA-Ligue 1", "INT-Champions League", "FRA-Coupe de France", "INT-World Cup",
        ],
    ),
    2024: dict(
        player="Rodri",
        teams=["Manchester City"],
        season="23-24",
        competitions=[
            "ENG-Premier League", "INT-Champions League", "ENG-FA Cup",
            "ENG-EFL Cup", "INT-European Championship",
        ],
    ),
}

# Total number of (winner, competition) pairs configured above.
total_competitions = sum(
    map(lambda cfg: len(cfg['competitions']), BDO_WINNERS.values())
)
print(f"Configurados {len(BDO_WINNERS)} ganadores, {total_competitions} competiciones totales")

Configurados 8 ganadores, 32 competiciones totales


In [3]:
# Competitions for which an Understat lookup is attempted on top of FBref.
_UNDERSTAT_LEAGUES = (
    "ESP-La Liga", "ENG-Premier League", "ITA-Serie A",
    "GER-Bundesliga", "FRA-Ligue 1",
)

# Understat fields copied onto the FBref record when they are present.
_UNDERSTAT_FIELDS = (
    'understat_xg_chain', 'understat_xg_buildup',
    'understat_npxg_plus_xa', 'understat_key_passes',
)


def _extract_competition_record(bdo_year, player, season, comp):
    """Fetch one player/competition record; return None when nothing usable came back."""
    try:
        print(f"  {comp}: extrayendo...")

        # FBref is the primary source; a falsy result means the player was not found.
        fbref_data = fbref_extract_data(player, "player", comp, season)
        if not fbref_data:
            print(f"    No encontrado")
            return None

        # Work on a copy so the wrapper's object is not mutated
        # (assumes a dict-like record -- TODO confirm against the wrapper).
        final_data = fbref_data.copy()

        # Understat enrichment is best effort: a failure here keeps the FBref
        # record, but it is reported instead of being silently swallowed, and
        # KeyboardInterrupt/SystemExit are no longer caught.
        if comp in _UNDERSTAT_LEAGUES:
            try:
                understat_data = understat_extract_data(player, "player", comp, season)
                if understat_data:
                    for field in _UNDERSTAT_FIELDS:
                        if field in understat_data:
                            final_data[field] = understat_data[field]
                    print(f"    + Understat")
            except Exception as exc:
                print(f"    Understat omitido: {str(exc)[:50]}")

        # Metadata. Competition codes carry their scope as a prefix, so test the
        # prefix rather than a substring anywhere in the code.
        final_data.update({
            'bdo_year': bdo_year,
            'bdo_player': player,
            'comp_type': 'international' if comp.startswith('INT-') else 'domestic'
        })

        print(f"    OK ({len(final_data)} campos)")
        return final_data

    except Exception as e:
        # Keep the overall run alive; only a truncated message is shown.
        print(f"    Error: {str(e)[:50]}")
        return None


def extract_winner_data(bdo_year, config, delay=8):
    """Extract the full data set of one BDO winner.

    Args:
        bdo_year: Award year; stored on every record as ``bdo_year``.
        config: Mapping with the keys ``"player"``, ``"season"`` and
            ``"competitions"`` (an iterable of competition codes).
        delay: Seconds to pause after every competition that was actually
            requested. Defaults to 8.

    Returns:
        A list with one record per competition for which FBref returned data.
        Each record is a copy of the FBref result, optionally enriched with
        selected Understat fields, plus ``bdo_year``, ``bdo_player`` and
        ``comp_type``. Competitions missing from ``LEAGUE_DICT``, not found, or
        failing with an exception are skipped.
    """
    player = config["player"]
    season = config["season"]
    competitions = config["competitions"]

    print(f"\n{player} ({bdo_year}) - {len(competitions)} competiciones")
    all_data = []

    for comp in competitions:
        # Unknown competitions trigger no request, hence no pause either.
        if comp not in LEAGUE_DICT:
            print(f"  {comp}: no configurada")
            continue

        record = _extract_competition_record(bdo_year, player, season, comp)
        if record is not None:
            all_data.append(record)

        # Pause after EVERY attempted request (including "not found" and
        # errors) so a run of misses cannot fire back-to-back requests.
        time.sleep(delay)

    print(f"  Completado: {len(all_data)}/{len(competitions)}")
    return all_data

print("Función extract_winner_data definida")

Función extract_winner_data definida


In [None]:
# Systematic scrape: process every configured winner in ascending year order
# and pool the resulting records into all_data.
start_time = datetime.now()
all_data = []

print(f"Inicio scraping: {start_time:%H:%M:%S}")

for year in sorted(BDO_WINNERS):
    try:
        winner_data = extract_winner_data(year, BDO_WINNERS[year])
        if winner_data:
            all_data += winner_data
            print(f"  Añadidos {len(winner_data)} registros")
        # Extra pause between winners, on top of the per-competition delay.
        time.sleep(10)
    except Exception as exc:
        # One failing winner must not abort the remaining years.
        print(f"  Error crítico: {exc!s}")

end_time = datetime.now()
print(f"\nScraping completado: {end_time:%H:%M:%S}")
print(f"Tiempo total: {end_time - start_time}")
print(f"Registros recolectados: {len(all_data)}")

Inicio scraping: 09:49:46

Cristiano Ronaldo (2017) - 3 competiciones
  ESP-La Liga: extrayendo...
    + Understat
    OK (129 campos)
  INT-Champions League: extrayendo...
    OK (147 campos)
  ESP-Copa del Rey: extrayendo...
    No encontrado
  Completado: 2/3
  Añadidos 2 registros

Luka Modric (2018) - 4 competiciones
  ESP-La Liga: extrayendo...
    + Understat
    OK (160 campos)
  INT-Champions League: extrayendo...
    OK (156 campos)
  ESP-Copa del Rey: extrayendo...
    OK (131 campos)
  INT-World Cup: extrayendo...
    OK (156 campos)
  Completado: 4/4
  Añadidos 4 registros

Lionel Messi (2019) - 4 competiciones
  ESP-La Liga: extrayendo...
    + Understat
    OK (160 campos)
  INT-Champions League: extrayendo...
    OK (156 campos)
  ESP-Copa del Rey: extrayendo...
    OK (131 campos)
  INT-Copa America: extrayendo...
    No encontrado
  Completado: 3/4
  Añadidos 3 registros

Robert Lewandowski (2020) - 3 competiciones
  GER-Bundesliga: extrayendo...
    + Understat
    O

In [None]:
# Consolidation: turn the scraped records into one DataFrame and print a summary.
# df is always rebuilt from this run's records, so a frame left over from an
# earlier execution is never mistaken for fresh data by later cells.
df = pd.DataFrame(all_data)

if df.empty:
    print("Sin datos recolectados")
else:
    print(f"DataFrame: {df.shape[0]} filas x {df.shape[1]} columnas")

    print(f"\nGanadores: {sorted(df['bdo_player'].unique())}")
    print(f"Años: {sorted(df['bdo_year'].unique())}")
    # 'league' comes from the scraped records, not from this notebook, so guard
    # it the same way the metric columns are guarded below.
    if 'league' in df.columns:
        print(f"Competiciones: {df['league'].nunique()}")
    else:
        print("Competiciones: columna 'league' no disponible")

    # Records per winner
    counts = df.groupby(['bdo_year', 'bdo_player']).size()
    print(f"\nRegistros por ganador:")
    for (year, player), count in counts.items():
        print(f"  {year}: {player} - {count}")

    # Main metrics that are actually present in the scraped data
    key_metrics = ['goals', 'assists', 'minutes_played', 'expected_goals']
    available = [m for m in key_metrics if m in df.columns]
    print(f"\nMétricas clave disponibles: {available}")

    if available:
        print(f"\nTotales por ganador:")
        # Coerce to numeric on a temporary copy before summing: string-typed
        # values would otherwise be concatenated instead of added. Values that
        # cannot be parsed become NaN and are skipped by sum().
        numeric_metrics = {m: pd.to_numeric(df[m], errors='coerce') for m in available}
        summary = (
            df.assign(**numeric_metrics)
            .groupby(['bdo_year', 'bdo_player'])[available]
            .sum()
        )
        print(summary)

In [None]:
# Export: write the consolidated frame to a timestamped CSV in the working
# directory, plus one CSV per comp_type value.
if 'df' not in locals() or df.empty:
    print("Sin datos para exportar")
else:
    # The same timestamp is shared by every file written in this run.
    timestamp = format(datetime.now(), '%Y%m%d_%H%M%S')
    filename = f"bdo_complete_dataset_{timestamp}.csv"

    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Dataset exportado: {filename}")
    print(f"Tamaño: {df.shape[0]} registros x {df.shape[1]} columnas")

    # Per-type export, in order of first appearance of each comp_type value
    if 'comp_type' in df.columns:
        for comp_type in df['comp_type'].unique():
            subset = df.loc[df['comp_type'] == comp_type]
            subset_file = f"bdo_{comp_type}_{timestamp}.csv"
            subset.to_csv(subset_file, index=False, encoding='utf-8')
            print(f"  {comp_type}: {subset_file} ({len(subset)} registros)")

    print(f"\nDataset completo para análisis BDO 2024 listo")