In [None]:
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../..')))

from viz.match_data import extract_match_complete
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Relative directory (resolved against the notebook's working directory) that
# later cells write their CSV exports into. It is created here so those writes
# never fail on a missing folder; exist_ok=True makes re-running this cell safe.
OUTPUT_DIR = "downloaded_files/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
# --- Extraction scope -------------------------------------------------------
# League / season identifiers are passed verbatim to the extractor; TEAM is the
# club whose events are kept; NUM_MATCHES is the number of fixtures expected.
LEAGUE = "ESP-La Liga"
SEASON = "25-26"
TEAM = "Barcelona"
NUM_MATCHES = 14

# --- Output location ---------------------------------------------------------
OUTPUT_DIR = "downloaded_files/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the active configuration so the saved notebook records what was run.
print("Configuración:")
for label, value in (
    ("Liga", LEAGUE),
    ("Temporada", SEASON),
    ("Equipo", TEAM),
    ("Partidos", NUM_MATCHES),
):
    print(f"  {label}: {value}")

Configuración:
  Liga: ESP-La Liga
  Temporada: 25-26
  Equipo: Barcelona
  Partidos: 14


In [3]:
# Fixture table: one row per match, in chronological order.
# Columns follow _MATCH_FIELDS. ws_id / us_id are the two match identifiers
# handed to the extractor (presumably one per upstream data provider — confirm
# against viz.match_data).
_MATCH_FIELDS = ('date', 'home_team', 'away_team', 'ws_id', 'us_id')
_FIXTURE_ROWS = (
    ('2025-08-16', 'Mallorca',       'Barcelona',     1913918, 29160),
    ('2025-08-23', 'Levante',        'Barcelona',     1913888, 29171),
    ('2025-08-31', 'Rayo Vallecano', 'Barcelona',     1913904, 29188),
    ('2025-09-14', 'Barcelona',      'Valencia',      1913922, 29194),
    ('2025-09-21', 'Barcelona',      'Getafe',        1913903, 29199),
    ('2025-09-25', 'Real Oviedo',    'Barcelona',     1913936, 29213),
    ('2025-09-28', 'Barcelona',      'Real Sociedad', 1913949, 29220),
    ('2025-10-05', 'Sevilla',        'Barcelona',     1913956, 29233),
    ('2025-10-18', 'Barcelona',      'Girona',        1913969, 29240),
    ('2025-10-26', 'Real Madrid',    'Barcelona',     1913998, 29254),
    ('2025-11-02', 'Barcelona',      'Elche',         1913963, 29260),
    ('2025-11-09', 'Celta Vigo',     'Barcelona',     1913991, 29274),
    ('2025-11-22', 'Barcelona',      'Athletic Club', 1914008, 29284),
    ('2025-11-29', 'Barcelona',      'Alaves',        1914018, 29294),
)

# Expand each row into the dict shape the extraction loop consumes.
match_data = [dict(zip(_MATCH_FIELDS, row)) for row in _FIXTURE_ROWS]

print(f"Total partidos: {len(match_data)}")

Total partidos: 14


In [None]:
# Hoisted out of the retry loop: used to dump the stack once a match has
# exhausted all of its attempts.
import traceback

# One DataFrame of TEAM events per successfully extracted match, in fixture order.
all_events = []

# Absolute path to viz/data, the directory extract_match_complete writes its CSVs to.
VIZ_DATA_PATH = os.path.abspath(os.path.join(os.getcwd(), '../../viz/data'))
# The events CSV lives at a fixed path that every extraction shares.
EVENTS_CSV_PATH = os.path.join(VIZ_DATA_PATH, 'match_events.csv')

# Retry / throttling parameters (delays are in seconds)
MAX_RETRIES = 3
DELAY_BETWEEN_MATCHES = 15
DELAY_BETWEEN_RETRIES = 30


def _mtime_ns(path):
    """Return the modification time of ``path`` in nanoseconds, or None if it does not exist."""
    try:
        return os.stat(path).st_mtime_ns
    except FileNotFoundError:
        return None


def _pause_before_retry(retry, counter=True):
    """Announce the next attempt and sleep; do nothing after the final attempt.

    Args:
        retry: zero-based index of the attempt that just failed.
        counter: when True the message includes the "n/MAX_RETRIES" counter.
    """
    if retry >= MAX_RETRIES - 1:
        return
    progress = f" {retry + 1}/{MAX_RETRIES}" if counter else ""
    print(f"  ⟳ Reintento{progress} en {DELAY_BETWEEN_RETRIES}s...")
    time.sleep(DELAY_BETWEEN_RETRIES)


print("Iniciando extracción...\n")
print("=" * 70)

for i, match in enumerate(match_data):
    print(f"\n[{i+1}/{len(match_data)}] {match['home_team']} vs {match['away_team']}")

    team_is_home = TEAM in match['home_team']

    success = False
    for retry in range(MAX_RETRIES):
        try:
            # Snapshot the shared CSV's timestamp so a leftover file from the
            # previous match is not mistaken for this match's output.
            stamp_before = _mtime_ns(EVENTS_CSV_PATH)

            # 1. Extract the match (writes CSVs into viz/data/)
            result = extract_match_complete(
                ws_id=match['ws_id'],
                us_id=match['us_id'],
                league=LEAGUE,
                season=SEASON,
                home_team=match['home_team'],
                away_team=match['away_team'],
                match_date=match['date']
            )

            # 2. Read match_events.csv (xG is already merged in), but only if
            #    this call actually (re)wrote it.
            #    NOTE(review): assumes the extractor rewrites the file on every
            #    call — confirm against viz.match_data.
            stamp_after = _mtime_ns(EVENTS_CSV_PATH)
            if stamp_after is None or stamp_after == stamp_before:
                print(f"  ✗ CSV no generado: {EVENTS_CSV_PATH}")
                _pause_before_retry(retry)
                continue

            events = pd.read_csv(EVENTS_CSV_PATH)

            # 3. Keep only the configured team's events (literal, case-insensitive match)
            team_mask = events['team'].str.contains(TEAM, case=False, na=False, regex=False)
            barca_events = events[team_mask].copy()

            if barca_events.empty:
                print(f"  Sin eventos de {TEAM}")
                _pause_before_retry(retry)
                continue

            # 4. Add match metadata
            barca_events['match_date'] = match['date']
            barca_events['opponent'] = match['away_team'] if team_is_home else match['home_team']
            barca_events['venue'] = 'Home' if team_is_home else 'Away'
            barca_events['match_id_ws'] = match['ws_id']
            barca_events['match_id_us'] = match['us_id']
            barca_events['match_number'] = i + 1

            # 5. Save the per-match checkpoint
            checkpoint_name = f"match_{i+1:02d}_{match['home_team']}_vs_{match['away_team']}.csv"
            checkpoint_file = os.path.join(OUTPUT_DIR, checkpoint_name)
            barca_events.to_csv(checkpoint_file, index=False, encoding='utf-8')

            xg_total = barca_events['xg'].sum() if 'xg' in barca_events.columns else 0
            print(f"  ✓ {len(barca_events)} eventos | xG: {xg_total:.2f}")

            # Register the match only after every fallible step has succeeded;
            # appending earlier would duplicate its events if a later step
            # raised and the attempt was retried.
            all_events.append(barca_events)
            success = True
            break

        except Exception as e:
            print(f"  ✗ Error (intento {retry + 1}/{MAX_RETRIES}): {str(e)[:100]}")
            if retry == MAX_RETRIES - 1:
                # Out of attempts: show the full stack for diagnosis.
                traceback.print_exc()
            _pause_before_retry(retry, counter=False)

    if not success:
        print(f"  ✗ FALLO FINAL tras {MAX_RETRIES} intentos")

    # Pause between matches (skipped after the last one)
    if i < len(match_data) - 1:
        print(f"  ⏸ Esperando {DELAY_BETWEEN_MATCHES}s antes del siguiente partido...")
        time.sleep(DELAY_BETWEEN_MATCHES)

print("\n" + "=" * 70)
print(f"Extracción completada: {len(all_events)}/{len(match_data)} partidos")

Iniciando extracción...


[1/14] Mallorca vs Barcelona

Extracting: Mallorca vs Barcelona (2025-08-16)
--------------------------------------------------


In [None]:
# Combine the per-match frames into a single DataFrame. When no match was
# extracted, df_all_events is an empty frame so later cells can test `.empty`.
if all_events:
    df_all_events = pd.concat(all_events, ignore_index=True)
    print(f"Total eventos Barcelona: {len(df_all_events):,}")
    # The extraction cell tolerates a missing 'xg' column; mirror that here
    # instead of raising KeyError.
    combined_xg = df_all_events['xg'].sum() if 'xg' in df_all_events.columns else 0
    print(f"xG total: {combined_xg:.2f}")
else:
    df_all_events = pd.DataFrame()

In [None]:
# Print summary statistics for the combined events (skipped when nothing was extracted).
if not df_all_events.empty:
    # The extraction cell tolerates a missing 'xg' column; fall back to zeros
    # here as well instead of raising KeyError.
    if 'xg' in df_all_events.columns:
        xg_values = df_all_events['xg']
    else:
        xg_values = pd.Series(0.0, index=df_all_events.index)

    print("ESTADÍSTICAS")
    print("=" * 70)
    print("\nEventos por tipo (top 10):")
    for event_type, count in df_all_events['event_type'].value_counts().head(10).items():
        print(f"  {event_type}: {count:,}")
    # "Shots" are counted as the events that carry a positive xG value.
    print(f"\nTotal disparos: {int((xg_values > 0).sum())}")
    print(f"xG total: {xg_values.sum():.2f}")

In [None]:
# Write the combined events to a single CSV inside OUTPUT_DIR.
# final_file is only bound when there is data to save.
if not df_all_events.empty:
    final_name = f"barcelona_first14_{SEASON}_FINAL.csv"
    final_file = OUTPUT_DIR + final_name
    df_all_events.to_csv(final_file, encoding='utf-8', index=False)
    print(f"Guardado: {final_file}\n{len(df_all_events):,} eventos")

In [None]:
# Closing summary. final_file is only bound by the save cell when there were
# events to write, so do not touch the name when the combined frame is empty
# (doing so raised NameError on a run where every extraction failed).
saved_file_label = final_file if not df_all_events.empty else "(no generado)"

print("=" * 70)
print("EXTRACCIÓN COMPLETADA")
print("=" * 70)
print(f"Partidos: {len(all_events)}/{len(match_data)}")
print(f"Archivo: {saved_file_label}")
print("=" * 70)