In [1]:
import pandas as pd
import numpy as np
import sys
import time
import logging
import warnings
from pathlib import Path
from datetime import datetime

# Suprimir TODOS los logs y warnings
logging.disable(logging.CRITICAL)
warnings.filterwarnings('ignore')

project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))

from wrappers.fbref_data import extract_data as fbref_extract_data
from wrappers.understat_data import extract_data as understat_extract_data
from scrappers._config import LEAGUE_DICT

pd.set_option('display.max_columns', None)
print(f"Setup completo - Ligas disponibles: {len(LEAGUE_DICT)}")

Setup completo - Ligas disponibles: 24


In [2]:
# Configuración completa ganadores BDO con temporadas correctas según FBref
BDO_WINNERS = {
    2017: {
        "player": "Cristiano Ronaldo",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "17-18",
            "INT-Champions League": "17-18",
            "INT-World Cup": "2018"  # Mundial usa año completo
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League", 
            "INT-World Cup"
        ]
    },
    2018: {
        "player": "Luka Modric",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "18-19",
            "INT-Champions League": "18-19",
            "ESP-Copa del Rey": "18-19",
            "INT-World Cup": "2018"  # Mundial 2018
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey",
            "INT-World Cup"
        ]
    },
    2019: {
        "player": "Lionel Messi",
        "teams": ["Barcelona"],
        "seasons": {
            "ESP-La Liga": "18-19",  # Temporada BDO 2019
            "INT-Champions League": "18-19",
            "ESP-Copa del Rey": "18-19",
            "INT-Copa America": "2019"  # Copa América usa año completo
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey",
            "INT-Copa America"
        ]
    },
    2020: {
        "player": "Robert Lewandowski",
        "teams": ["Bayern Munich"],
        "seasons": {
            "GER-Bundesliga": "19-20",
            "INT-Champions League": "19-20",
            "GER-DFB-Pokal": "19-20"
        },
        "competitions": [
            "GER-Bundesliga",
            "INT-Champions League",
            "GER-DFB-Pokal"
        ]
    },
    2021: {
        "player": "Lionel Messi",
        "teams": ["Barcelona", "Paris S-G"],
        "seasons": {
            "ESP-La Liga": "20-21",  # Con Barcelona
            "FRA-Ligue 1": "21-22",  # Con PSG
            "INT-Champions League": "21-22",  # Con PSG
            "ESP-Copa del Rey": "20-21",  # Con Barcelona - ganó
            "INT-Copa America": "2021"  # Copa América 2021 - ganó
        },
        "competitions": [
            "ESP-La Liga",
            "FRA-Ligue 1",
            "INT-Champions League", 
            "ESP-Copa del Rey",
            "INT-Copa America"
        ]
    },
    2022: {
        "player": "Karim Benzema",
        "teams": ["Real Madrid"],
        "seasons": {
            "ESP-La Liga": "21-22",
            "INT-Champions League": "21-22",
            "ESP-Copa del Rey": "21-22"
        },
        "competitions": [
            "ESP-La Liga",
            "INT-Champions League",
            "ESP-Copa del Rey"
        ]
    },
    2023: {
        "player": "Lionel Messi",
        "teams": ["Paris S-G", "Inter Miami"],
        "seasons": {
            "FRA-Ligue 1": "22-23",  # Con PSG
            "INT-Champions League": "22-23",  # Con PSG
            "FRA-Coupe de France": "22-23",  # Con PSG
            "INT-World Cup": "2022"  # Mundial Qatar 2022 - ganó
        },
        "competitions": [
            "FRA-Ligue 1",
            "INT-Champions League",
            "FRA-Coupe de France",
            "INT-World Cup"
        ]
    },
    2024: {
        "player": "Rodri",
        "teams": ["Manchester City"],
        "seasons": {
            "ENG-Premier League": "23-24",
            "INT-Champions League": "23-24",
            "ENG-FA Cup": "23-24",
            "ENG-EFL Cup": "23-24",
            "INT-European Championship": "2024"  # Euro 2024 - ganó
        },
        "competitions": [
            "ENG-Premier League",
            "INT-Champions League",
            "ENG-FA Cup", 
            "ENG-EFL Cup",
            "INT-European Championship"
        ]
    }
}

total_competitions = sum(len(w['competitions']) for w in BDO_WINNERS.values())
print(f"Configurados {len(BDO_WINNERS)} ganadores, {total_competitions} competiciones totales")

Configurados 8 ganadores, 31 competiciones totales


In [3]:
def extract_winner_data(bdo_year, config):
    """Extrae datos completos de un ganador BDO."""
    player = config["player"]
    seasons = config["seasons"]  # Ahora usa seasons dict
    competitions = config["competitions"]
    
    print(f"\n{player} ({bdo_year}) - {len(competitions)} competiciones")
    all_data = []
    
    for comp in competitions:
        if comp not in LEAGUE_DICT:
            print(f"  {comp}: no configurada")
            continue
        
        # Obtener temporada específica para esta competición
        season = seasons.get(comp)
        if not season:
            print(f"  {comp}: sin temporada definida")
            continue
            
        try:
            print(f"  {comp} ({season}): extrayendo...")
            
            # FBref data
            fbref_data = fbref_extract_data(player, "player", comp, season)
            if not fbref_data:
                print(f"    No encontrado")
                continue
                
            final_data = fbref_data.copy()
            
            # Understat para ligas domésticas
            if comp in ["ESP-La Liga", "ENG-Premier League", "ITA-Serie A", 
                       "GER-Bundesliga", "FRA-Ligue 1"]:
                try:
                    understat_data = understat_extract_data(player, "player", comp, season)
                    if understat_data:
                        understat_fields = [
                            'understat_xg_chain', 'understat_xg_buildup', 
                            'understat_npxg_plus_xa', 'understat_key_passes'
                        ]
                        for field in understat_fields:
                            if field in understat_data:
                                final_data[field] = understat_data[field]
                        print(f"    + Understat")
                except:
                    pass
            
            # Metadatos
            final_data.update({
                'bdo_year': bdo_year,
                'bdo_player': player,
                'comp_type': 'domestic' if 'INT-' not in comp else 'international',
                'season_used': season
            })
            
            all_data.append(final_data)
            print(f"    OK ({len(final_data)} campos)")
            
            time.sleep(8)
            
        except Exception as e:
            print(f"    Error: {str(e)[:50]}")
    
    print(f"  Completado: {len(all_data)}/{len(competitions)}")
    return all_data

print("Función extract_winner_data actualizada")

Función extract_winner_data actualizada


In [4]:
# Scraping sistemático
start_time = datetime.now()
all_data = []

print(f"Inicio scraping: {start_time.strftime('%H:%M:%S')}")

for year in sorted(BDO_WINNERS.keys()):
    try:
        winner_data = extract_winner_data(year, BDO_WINNERS[year])
        if winner_data:
            all_data.extend(winner_data)
            print(f"  Añadidos {len(winner_data)} registros")
        time.sleep(10)
    except Exception as e:
        print(f"  Error crítico: {str(e)}")

end_time = datetime.now()
print(f"\nScraping completado: {end_time.strftime('%H:%M:%S')}")
print(f"Tiempo total: {end_time - start_time}")
print(f"Registros recolectados: {len(all_data)}")

Inicio scraping: 11:47:34

Cristiano Ronaldo (2017) - 3 competiciones
  ESP-La Liga (17-18): extrayendo...
Loading Cristiano Ronaldo from cache
Loading Cristiano Ronaldo from Understat cache
    + Understat
    OK (130 campos)
  INT-Champions League (17-18): extrayendo...
Loading Cristiano Ronaldo from cache
    OK (148 campos)
  INT-World Cup (2018): extrayendo...
Input validation failed: season must be in YY-YY format, got '2018'
Suggestions:
  - Use season format like '23-24', '22-23', etc.
    No encontrado
  Completado: 2/3
  Añadidos 2 registros

Luka Modric (2018) - 4 competiciones
  ESP-La Liga (18-19): extrayendo...
Loading Luka Modric from cache
Loading Luka Modric from Understat cache
    + Understat
    OK (161 campos)
  INT-Champions League (18-19): extrayendo...
Loading Luka Modric from cache
    OK (157 campos)
  ESP-Copa del Rey (18-19): extrayendo...
Loading Luka Modric from cache
    OK (132 campos)
  INT-World Cup (2018): extrayendo...
Input validation failed: season

In [5]:
# Consolidación
if all_data:
    df = pd.DataFrame(all_data)
    print(f"DataFrame: {df.shape[0]} filas x {df.shape[1]} columnas")
    
    print(f"\nGanadores: {sorted(df['bdo_player'].unique())}")
    print(f"Años: {sorted(df['bdo_year'].unique())}")
    print(f"Competiciones: {df['league'].nunique()}")
    
    # Registros por ganador
    counts = df.groupby(['bdo_year', 'bdo_player']).size()
    print(f"\nRegistros por ganador:")
    for (year, player), count in counts.items():
        print(f"  {year}: {player} - {count}")
    
    # Métricas principales disponibles
    key_metrics = ['goals', 'assists', 'minutes_played', 'expected_goals']
    available = [m for m in key_metrics if m in df.columns]
    print(f"\nMétricas clave disponibles: {available}")
    
    if available:
        print(f"\nTotales por ganador:")
        summary = df.groupby(['bdo_year', 'bdo_player'])[available].sum()
        print(summary)
else:
    print("Sin datos recolectados")

DataFrame: 25 filas x 202 columnas

Ganadores: ['Cristiano Ronaldo', 'Karim Benzema', 'Lionel Messi', 'Luka Modric', 'Robert Lewandowski', 'Rodri']
Años: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Competiciones: 10

Registros por ganador:


  2017: Cristiano Ronaldo - 2
  2018: Luka Modric - 3
  2019: Lionel Messi - 3
  2020: Robert Lewandowski - 3
  2021: Lionel Messi - 4
  2022: Karim Benzema - 3
  2023: Lionel Messi - 3
  2024: Rodri - 4

Métricas clave disponibles: ['goals', 'assists', 'minutes_played', 'expected_goals']

Totales por ganador:
                             goals  assists  minutes_played  expected_goals
bdo_year bdo_player                                                        
2017     Cristiano Ronaldo      41        7            3455            40.0
2018     Luka Modric             3        6            3120             2.7
2019     Lionel Messi           51       18            3939            30.3
2020     Robert Lewandowski     55       10            4040            45.2
2021     Lionel Messi           44       24            6316            36.9
2022     Karim Benzema          42       13            3982            32.2
2023     Lionel Messi           20       20            3540            18.1
2024

In [6]:
# Export
if 'df' in locals() and not df.empty:
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"bdo_complete_dataset_{timestamp}.csv"
    
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Dataset exportado: {filename}")
    print(f"Tamaño: {df.shape[0]} registros x {df.shape[1]} columnas")
    
    # Export por tipo
    if 'comp_type' in df.columns:
        for comp_type in df['comp_type'].unique():
            subset = df[df['comp_type'] == comp_type]
            subset_file = f"bdo_{comp_type}_{timestamp}.csv"
            subset.to_csv(subset_file, index=False, encoding='utf-8')
            print(f"  {comp_type}: {subset_file} ({len(subset)} registros)")
    
    print(f"\nDataset completo para análisis BDO 2024 listo")
else:
    print("Sin datos para exportar")

Dataset exportado: bdo_complete_dataset_20250909_120011.csv
Tamaño: 25 registros x 202 columnas
  domestic: bdo_domestic_20250909_120011.csv (17 registros)
  international: bdo_international_20250909_120011.csv (8 registros)

Dataset completo para análisis BDO 2024 listo
