In [None]:
import sys
import os
# Prepend the directory two levels above the current working directory to the
# import path. NOTE(review): presumably this is the repository root that holds
# the `scrappers` package imported below -- it depends on the kernel's cwd, so
# confirm the notebook is always launched from its own folder.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../..')))

from scrappers import WhoScored
import pandas as pd
import warnings
# Silences every warning category for the rest of the kernel session, which
# also hides deprecation notices from pandas and the scraper.
warnings.filterwarnings('ignore')

# Ensure the output directory used for the cached event CSVs exists
# (no error if it is already there).
os.makedirs('../data', exist_ok=True)

In [None]:
# Season 24/25: download the Premier League schedule and keep Arsenal's
# first 11 fixtures in chronological order.
separator = "=" * 60
print(separator)
print("ARSENAL 24/25 - FIRST 11 PREMIER LEAGUE MATCHES")
print(separator)

ws_2425 = WhoScored(leagues=["ENG-Premier League"], seasons=["24-25"])
schedule_2425 = ws_2425.read_schedule()

# Flatten the index so its levels (including the game id) become columns.
schedule_2425_reset = schedule_2425.reset_index()

# Case-insensitive match on either side of the fixture; missing names count as "no".
arsenal_at_home = schedule_2425_reset['home_team'].str.contains('Arsenal', case=False, na=False)
arsenal_away = schedule_2425_reset['away_team'].str.contains('Arsenal', case=False, na=False)

arsenal_2425 = (
    schedule_2425_reset
    .loc[arsenal_at_home | arsenal_away, ['date', 'home_team', 'away_team', 'game_id']]
    .rename(columns={'game_id': 'match_id'})
    .sort_values('date')
    .reset_index(drop=True)
)

# Only the opening 11 fixtures are analysed.
arsenal_2425_first11 = arsenal_2425.head(11)

print(f"Total partidos Arsenal en PL 24/25: {len(arsenal_2425)}")
print("Primeros 11 partidos:")
arsenal_2425_first11

In [None]:
# Scrape Arsenal's events for 24/25, using a CSV on disk as a cache so that
# re-running the notebook does not hit the scraper again.
csv_path_2425 = '../data/arsenal_events_2425_first11.csv'

if os.path.exists(csv_path_2425):
    # Cache hit: reuse the events written by a previous complete scrape.
    print(f"Cargando desde CSV: {csv_path_2425}")
    df_2425_events = pd.read_csv(csv_path_2425)
    print(f"Cargados: {len(df_2425_events)} eventos")
else:
    print("Scrapeando eventos 24/25...")
    all_events_2425 = []
    failed_matches_2425 = []  # positions (1-based) of fixtures whose scrape raised
    n_matches_2425 = len(arsenal_2425_first11)

    # enumerate() gives the fixture position directly instead of relying on
    # the frame's index labels being 0..n-1.
    for match_number, (_, match) in enumerate(arsenal_2425_first11.iterrows(), start=1):
        print(f"\n[{match_number}/{n_matches_2425}] {match['home_team']} vs {match['away_team']}")

        try:
            events = ws_2425.read_events(match_id=int(match['match_id']))

            # Filter Arsenal events only
            arsenal_events = events[events['team'].str.contains('Arsenal', case=False, na=False)].copy()

            if len(arsenal_events) > 0:
                # Same case-insensitive test as the team filter above; str()
                # keeps a missing team name from raising.
                arsenal_is_home = 'arsenal' in str(match['home_team']).lower()

                arsenal_events['match_date'] = match['date']
                arsenal_events['opponent'] = match['away_team'] if arsenal_is_home else match['home_team']
                arsenal_events['venue'] = 'Home' if arsenal_is_home else 'Away'
                arsenal_events['match_id'] = match['match_id']
                arsenal_events['match_number'] = match_number
                all_events_2425.append(arsenal_events)
                print(f"  {len(arsenal_events)} eventos de Arsenal")
            else:
                print(f"  Sin eventos de Arsenal")

        except Exception as e:
            # Best effort: keep going with the remaining fixtures, but remember
            # the failure so an incomplete result is never cached.
            print(f"  Error: {e}")
            failed_matches_2425.append(match_number)

    if len(all_events_2425) > 0:
        df_2425_events = pd.concat(all_events_2425, ignore_index=True)
        if failed_matches_2425:
            # Writing the CSV now would make every later run load the partial
            # data from cache without retrying the failed fixtures.
            print(f"\nAviso: fallaron los partidos {failed_matches_2425}; "
                  f"CSV no guardado para no cachear datos incompletos")
        else:
            df_2425_events.to_csv(csv_path_2425, index=False)
            print(f"\nGuardado: {csv_path_2425}")
        print(f"Total eventos: {len(df_2425_events)}")
    else:
        df_2425_events = pd.DataFrame()

print(f"\nResumen 24/25:")
print(f"Total eventos: {len(df_2425_events)}")
if len(df_2425_events) > 0:
    print(f"\nTipos de eventos:")
    print(df_2425_events['type'].value_counts().head(10))

In [None]:
# Season 25/26: download the Premier League schedule and keep Arsenal's
# first 11 fixtures in chronological order.
separator = "=" * 60
print("\n" + separator)
print("ARSENAL 25/26 - FIRST 11 PREMIER LEAGUE MATCHES")
print(separator)

ws_2526 = WhoScored(leagues=["ENG-Premier League"], seasons=["25-26"])
schedule_2526 = ws_2526.read_schedule()

# Flatten the index so its levels (including the game id) become columns.
schedule_2526_reset = schedule_2526.reset_index()

# Case-insensitive match on either side of the fixture; missing names count as "no".
arsenal_at_home = schedule_2526_reset['home_team'].str.contains('Arsenal', case=False, na=False)
arsenal_away = schedule_2526_reset['away_team'].str.contains('Arsenal', case=False, na=False)

arsenal_2526 = (
    schedule_2526_reset
    .loc[arsenal_at_home | arsenal_away, ['date', 'home_team', 'away_team', 'game_id']]
    .rename(columns={'game_id': 'match_id'})
    .sort_values('date')
    .reset_index(drop=True)
)

# Only the opening 11 fixtures are analysed.
arsenal_2526_first11 = arsenal_2526.head(11)

print(f"Total partidos Arsenal en PL 25/26: {len(arsenal_2526)}")
print("Primeros 11 partidos:")
arsenal_2526_first11

In [None]:
# Scrape Arsenal's events for 25/26, using a CSV on disk as a cache so that
# re-running the notebook does not hit the scraper again.
csv_path_2526 = '../data/arsenal_events_2526_first11.csv'

if os.path.exists(csv_path_2526):
    # Cache hit: reuse the events written by a previous complete scrape.
    print(f"Cargando desde CSV: {csv_path_2526}")
    df_2526_events = pd.read_csv(csv_path_2526)
    print(f"Cargados: {len(df_2526_events)} eventos")
else:
    print("Scrapeando eventos 25/26...")
    all_events_2526 = []
    failed_matches_2526 = []  # positions (1-based) of fixtures whose scrape raised
    n_matches_2526 = len(arsenal_2526_first11)

    # enumerate() gives the fixture position directly instead of relying on
    # the frame's index labels being 0..n-1.
    for match_number, (_, match) in enumerate(arsenal_2526_first11.iterrows(), start=1):
        print(f"\n[{match_number}/{n_matches_2526}] {match['home_team']} vs {match['away_team']}")

        try:
            events = ws_2526.read_events(match_id=int(match['match_id']))

            # Filter Arsenal events only
            arsenal_events = events[events['team'].str.contains('Arsenal', case=False, na=False)].copy()

            if len(arsenal_events) > 0:
                # Same case-insensitive test as the team filter above; str()
                # keeps a missing team name from raising.
                arsenal_is_home = 'arsenal' in str(match['home_team']).lower()

                arsenal_events['match_date'] = match['date']
                arsenal_events['opponent'] = match['away_team'] if arsenal_is_home else match['home_team']
                arsenal_events['venue'] = 'Home' if arsenal_is_home else 'Away'
                arsenal_events['match_id'] = match['match_id']
                arsenal_events['match_number'] = match_number
                all_events_2526.append(arsenal_events)
                print(f"  {len(arsenal_events)} eventos de Arsenal")
            else:
                print(f"  Sin eventos de Arsenal")

        except Exception as e:
            # Best effort: keep going with the remaining fixtures, but remember
            # the failure so an incomplete result is never cached.
            print(f"  Error: {e}")
            failed_matches_2526.append(match_number)

    if len(all_events_2526) > 0:
        df_2526_events = pd.concat(all_events_2526, ignore_index=True)
        if failed_matches_2526:
            # Writing the CSV now would make every later run load the partial
            # data from cache without retrying the failed fixtures.
            print(f"\nAviso: fallaron los partidos {failed_matches_2526}; "
                  f"CSV no guardado para no cachear datos incompletos")
        else:
            df_2526_events.to_csv(csv_path_2526, index=False)
            print(f"\nGuardado: {csv_path_2526}")
        print(f"Total eventos: {len(df_2526_events)}")
    else:
        df_2526_events = pd.DataFrame()

print(f"\nResumen 25/26:")
print(f"Total eventos: {len(df_2526_events)}")
if len(df_2526_events) > 0:
    print(f"\nTipos de eventos:")
    print(df_2526_events['type'].value_counts().head(10))

In [None]:
# Final summary: list each season's cache file with its in-memory event count.
print("\n" + "="*60)
print("EXTRACTION COMPLETE")
print("="*60)
print(f"\nArchivos generados:")

# (CSV path, name of the notebook variable holding that season's events)
summary_targets = [
    (csv_path_2425, 'df_2425_events'),
    (csv_path_2526, 'df_2526_events'),
]

for position, (csv_path, events_var) in enumerate(summary_targets, start=1):
    # Look the frame up by name so a skipped scrape cell reports 'N/A'
    # instead of raising NameError.
    events_frame = globals().get(events_var)
    n_events = len(events_frame) if events_frame is not None else 'N/A'

    # A scrape that produced no events never writes the CSV; say so rather
    # than listing a file that does not exist.
    missing_note = '' if os.path.exists(csv_path) else ' (no existe en disco)'

    print(f"  {position}. {csv_path}{missing_note}")
    print(f"     Eventos: {n_events}")