In [1]:
import pandas as pd
import numpy as np
import sys
import time
import logging
import warnings
from pathlib import Path
from datetime import datetime

# Suprimir TODOS los logs y warnings
logging.disable(logging.CRITICAL)
warnings.filterwarnings('ignore')

project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))

from wrappers.fbref_data import extract_data as fbref_extract_data
from wrappers.understat_data import extract_data as understat_extract_data
from scrappers._config import LEAGUE_DICT

pd.set_option('display.max_columns', None)
print(f"Setup completo - Ligas disponibles: {len(LEAGUE_DICT)}")

Setup completo - Ligas disponibles: 24


In [2]:
BDO_WINNERS = {
    2014: {
        "player": "Cristiano Ronaldo",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "14-15",
            "INT-Champions League": "14-15",
            "ESP-Copa del Rey": "14-15"
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey"
        ]
    },
    2015: {
        "player": "Lionel Messi",
        "teams": ["Barcelona"],
        "seasons": {
            "ESP-La Liga": "14-15",
            "INT-Champions League": "14-15",
            "ESP-Copa del Rey": "14-15"
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey"
        ]
    },
    2016: {
        "player": "Cristiano Ronaldo",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "15-16",
            "INT-Champions League": "15-16",
            "ESP-Copa del Rey": "15-16",
            "INT-European Championship": "2016"  
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey",
            "INT-European Championship"
        ]
    },
    2017: {
        "player": "Cristiano Ronaldo",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "16-17",
            "INT-Champions League": "16-17",
            "ESP-Copa del Rey": "16-17"
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey"
        ]
    },
    2018: {
        "player": "Luka Modrić",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "17-18",
            "INT-Champions League": "17-18",
            "ESP-Copa del Rey": "17-18",
            "INT-World Cup": "2018" 
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey",
            "INT-World Cup"
        ]
    },
    2019: {
        "player": "Lionel Messi",
        "teams": ["Barcelona"],
        "seasons": {
            "ESP-La Liga": "18-19",
            "INT-Champions League": "18-19",
            "ESP-Copa del Rey": "18-19",
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey",
        ]
    },
    2020: {
        "player": "Robert Lewandowski",
        "teams": ["Bayern Munich"],
        "seasons": {
            "GER-Bundesliga": "19-20",
            "INT-Champions League": "19-20",
            "GER-DFB-Pokal": "19-20"
        },
        "competitions": [
            "GER-Bundesliga",
            "INT-Champions League",
            "GER-DFB-Pokal"
        ]
    },
    2021: {
        "player": "Lionel Messi",
        "teams": ["Barcelona"],
        "seasons": {
            "ESP-La Liga": "20-21",
            "INT-Champions League": "20-21",
            "ESP-Copa del Rey": "20-21",
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey",
        ]
    },
    2022: {
        "player": "Karim Benzema",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "21-22",
            "INT-Champions League": "21-22",
            "ESP-Copa del Rey": "21-22"
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey"
        ]
    },
    2023: {
        "player": "Lionel Messi",
        "teams": ["Paris S-G"],
        "seasons": {
            "FRA-Ligue 1": "22-23",
            "INT-Champions League": "22-23",
            "FRA-Coupe de France": "22-23",
            "INT-World Cup": "2022"  
        },
        "competitions": [
            "FRA-Ligue 1",
            "INT-Champions League",
            "FRA-Coupe de France",
            "INT-World Cup"
        ]
    },
    2024: {
        "player": "Rodri",
        "teams": ["Manchester City"],
        "seasons": {
            "ENG-Premier League": "23-24",
            "INT-Champions League": "23-24",
            "ENG-FA Cup": "23-24",
            "ENG-EFL Cup": "23-24",
            "INT-European Championship": "2024"  # CONFIRMADO disponible
        },
        "competitions": [
            "ENG-Premier League",
            "INT-Champions League",
            "ENG-FA Cup",
            "ENG-EFL Cup",
            "INT-European Championship"
        ]
    }
}

total_competitions = sum(len(w['competitions']) for w in BDO_WINNERS.values())
print(f"Configurados {len(BDO_WINNERS)} ganadores (2014-2024), {total_competitions} competiciones totales")
print("Solo incluye años CONFIRMADAMENTE disponibles en FBref")

Configurados 11 ganadores (2014-2024), 38 competiciones totales
Solo incluye años CONFIRMADAMENTE disponibles en FBref


In [3]:
def extract_winner_data(bdo_year, config):
    """Extrae datos completos de un ganador BDO."""
    player = config["player"]
    seasons = config["seasons"]
    competitions = config["competitions"]
    
    print(f"\n{player} ({bdo_year}) - {len(competitions)} competiciones")
    all_data = []
    
    for comp in competitions:
        if comp not in LEAGUE_DICT:
            print(f"  {comp}: no configurada")
            continue
        
        season = seasons.get(comp)
        if not season:
            print(f"  {comp}: sin temporada definida")
            continue
            
        try:
            print(f"  {comp} ({season}): extrayendo...")
            
            # Detectar competiciones internacionales por temporada de 4 dígitos
            if len(season) == 4 and season.isdigit():
                print(f"    Usando scraper directo")
                try:
                    from scrappers.fbref import FBref
                    
                    scraper = FBref(leagues=[comp], seasons=[season])
                    players_data = scraper.read_player_season_stats(stat_type="standard")
                    
                    if players_data is not None and not players_data.empty:
                        # CRUCIAL: Reset index para convertir jugador de index a columna
                        df_reset = players_data.reset_index()
                        print(f"    Encontrados {len(df_reset)} jugadores")
                        
                        # Buscar jugador con múltiples variaciones
                        player_variants = [player, player.split()[0], player.split()[-1]]
                        
                        player_match = None
                        for variant in player_variants:
                            matches = df_reset[
                                df_reset['player'].str.contains(variant, case=False, na=False, regex=False)
                            ]
                            if not matches.empty:
                                # Flatten MultiIndex columns si es necesario
                                if hasattr(matches.columns, 'levels'):
                                    matches.columns = [f'{col[0]}_{col[1]}' if col[1] else col[0] 
                                                     for col in matches.columns]
                                
                                player_match = matches.iloc[0].to_dict()
                                print(f"    Encontrado: {matches.iloc[0]['player']}")
                                break
                        
                        fbref_data = player_match
                    else:
                        print(f"    Sin datos para {comp} {season}")
                        fbref_data = None
                        
                except Exception as e:
                    print(f"    Error scraper: {str(e)}")
                    import traceback
                    traceback.print_exc()
                    fbref_data = None
            else:
                # Wrapper para competiciones domésticas
                fbref_data = fbref_extract_data(player, "player", comp, season)
                    
            if not fbref_data:
                print(f"    No encontrado")
                continue
                
            final_data = fbref_data.copy()
            
            # Understat para ligas domésticas
            if comp in ["ESP-La Liga", "ENG-Premier League", "ITA-Serie A", 
                       "GER-Bundesliga", "FRA-Ligue 1"] and "-" in season:
                try:
                    understat_data = understat_extract_data(player, "player", comp, season)
                    if understat_data:
                        for field in ['understat_xg_chain', 'understat_xg_buildup', 
                                    'understat_npxg_plus_xa', 'understat_key_passes']:
                            if field in understat_data:
                                final_data[field] = understat_data[field]
                        print(f"    + Understat")
                except:
                    pass
            
            # Metadatos
            final_data.update({
                'bdo_year': bdo_year,
                'bdo_player': player,
                'comp_type': 'international' if len(season) == 4 else 'domestic',
                'season_used': season
            })
            
            all_data.append(final_data)
            print(f"    OK ({len(final_data)} campos)")
            time.sleep(8)
            
        except Exception as e:
            print(f"    Error: {str(e)}")
            import traceback
            traceback.print_exc()
    
    print(f"  Completado: {len(all_data)}/{len(competitions)}")
    return all_data

print("Función CORREGIDA: maneja MultiIndex correctamente")

Función CORREGIDA: maneja MultiIndex correctamente


In [4]:
# Scraping sistemático
start_time = datetime.now()
all_data = []

print(f"Inicio scraping: {start_time.strftime('%H:%M:%S')}")

for year in sorted(BDO_WINNERS.keys()):
    try:
        winner_data = extract_winner_data(year, BDO_WINNERS[year])
        if winner_data:
            all_data.extend(winner_data)
            print(f"  Añadidos {len(winner_data)} registros")
        time.sleep(10)
    except Exception as e:
        print(f"  Error crítico: {str(e)}")

end_time = datetime.now()
print(f"\nScraping completado: {end_time.strftime('%H:%M:%S')}")
print(f"Tiempo total: {end_time - start_time}")
print(f"Registros recolectados: {len(all_data)}")

Inicio scraping: 15:39:14

Cristiano Ronaldo (2014) - 3 competiciones
  ESP-La Liga (14-15): extrayendo...
Loading Cristiano Ronaldo from cache
Loading Cristiano Ronaldo from Understat cache
    + Understat
    OK (136 campos)
  INT-Champions League (14-15): extrayendo...
Loading Cristiano Ronaldo from cache
    OK (132 campos)
  ESP-Copa del Rey (14-15): extrayendo...
Loading Cristiano Ronaldo from cache
    OK (132 campos)
  Completado: 3/3
  Añadidos 3 registros

Lionel Messi (2015) - 3 competiciones
  ESP-La Liga (14-15): extrayendo...
Loading Lionel Messi from cache
Loading Lionel Messi from Understat cache
    + Understat
    OK (136 campos)
  INT-Champions League (14-15): extrayendo...
Loading Lionel Messi from cache
    OK (132 campos)
  ESP-Copa del Rey (14-15): extrayendo...
Loading Lionel Messi from cache
    OK (132 campos)
  Completado: 3/3
  Añadidos 3 registros

Cristiano Ronaldo (2016) - 4 competiciones
  ESP-La Liga (15-16): extrayendo...
Loading Cristiano Ronaldo from

In [5]:
# Consolidación
if all_data:
    df = pd.DataFrame(all_data)
    print(f"DataFrame: {df.shape[0]} filas x {df.shape[1]} columnas")
    
    print(f"\nGanadores: {sorted(df['bdo_player'].unique())}")
    print(f"Años: {sorted(df['bdo_year'].unique())}")
    print(f"Competiciones: {df['league'].nunique()}")
    
    # Registros por ganador
    counts = df.groupby(['bdo_year', 'bdo_player']).size()
    print(f"\nRegistros por ganador:")
    for (year, player), count in counts.items():
        print(f"  {year}: {player} - {count}")
    
    # Métricas principales disponibles
    key_metrics = ['goals', 'assists', 'minutes_played', 'expected_goals']
    available = [m for m in key_metrics if m in df.columns]
    print(f"\nMétricas clave disponibles: {available}")
    
    if available:
        print(f"\nTotales por ganador:")
        summary = df.groupby(['bdo_year', 'bdo_player'])[available].sum()
        print(summary)
else:
    print("Sin datos recolectados")

DataFrame: 38 filas x 233 columnas

Ganadores: ['Cristiano Ronaldo', 'Karim Benzema', 'Lionel Messi', 'Luka Modrić', 'Robert Lewandowski', 'Rodri']
Años: [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Competiciones: 12

Registros por ganador:
  2014: Cristiano Ronaldo - 3
  2015: Lionel Messi - 3
  2016: Cristiano Ronaldo - 4
  2017: Cristiano Ronaldo - 3
  2018: Luka Modrić - 4
  2019: Lionel Messi - 3
  2020: Robert Lewandowski - 3
  2021: Lionel Messi - 3
  2022: Karim Benzema - 3
  2023: Lionel Messi - 4
  2024: Rodri - 5

Métricas clave disponibles: ['goals', 'assists', 'minutes_played', 'expected_goals']

Totales por ganador:
                             goals  assists  minutes_played  expected_goals
bdo_year bdo_player                                                        
2014     Cristiano Ronaldo    59.0     20.0          4283.0             0.0
2015     Lionel Messi         58.0     27.0          5062.0             0.0
2016     Cristiano Ronaldo    51.0  

In [6]:
# Export
if 'df' in locals() and not df.empty:
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"bdo_complete_dataset_{timestamp}.csv"
    
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Dataset exportado: {filename}")
    print(f"Tamaño: {df.shape[0]} registros x {df.shape[1]} columnas")
    
    # Export por tipo
    if 'comp_type' in df.columns:
        for comp_type in df['comp_type'].unique():
            subset = df[df['comp_type'] == comp_type]
            subset_file = f"bdo_{comp_type}_{timestamp}.csv"
            subset.to_csv(subset_file, index=False, encoding='utf-8')
            print(f"  {comp_type}: {subset_file} ({len(subset)} registros)")
    
    print(f"\nDataset completo para análisis BDO 2024 listo")
else:
    print("Sin datos para exportar")

Dataset exportado: bdo_complete_dataset_20250909_154637.csv
Tamaño: 38 registros x 233 columnas


  domestic: bdo_domestic_20250909_154637.csv (34 registros)
  international: bdo_international_20250909_154637.csv (4 registros)

Dataset completo para análisis BDO 2024 listo
