# Arsenal 25/26 - Missing Matches Extraction

This notebook extracts the two matches missing from Arsenal's 25/26 season dataset and merges them into the existing data file (`../data/arsenal_matches_2526_complete.csv`), which is overwritten in place.

**Missing Matches:**
1. Arsenal vs Crystal Palace (2025-10-26) - WS: 1903198, US: 28859
2. Sunderland vs Arsenal (2025-11-08) - WS: 1903224, US: 28885

In [1]:
# Make the project importable: '../..' relative to the current working directory
# is put at the front of sys.path (assumes the notebook is run from a folder two
# levels below the repository root — TODO confirm). This must happen before the
# `viz` import below.
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Project-local extraction entry point, resolved through the sys.path entry above.
from viz.match_data import extract_match_complete
import pandas as pd
import warnings
# NOTE(review): this silences every warning category for the rest of the kernel
# session, including pandas deprecation/dtype warnings — consider narrowing it.
warnings.filterwarnings('ignore')

print("Setup complete")

Setup complete


In [2]:
# Matches that are absent from the existing dataset.
# Each row is (ws_id, us_id, home_team, away_team, date, match_number); the rows
# are expanded into dicts so later cells can look fields up by name.
# NOTE(review): match_number values are assigned by hand — verify they do not
# collide with, or contradict the ordering of, numbers already in the dataset.
_MATCH_FIELDS = ('ws_id', 'us_id', 'home_team', 'away_team', 'date', 'match_number')
_MATCH_ROWS = [
    (1903198, 28859, 'Arsenal', 'Crystal Palace', '2025-10-26 14:00:00', 9),
    (1903224, 28885, 'Sunderland', 'Arsenal', '2025-11-08 15:00:00', 10),
]
missing_matches = [dict(zip(_MATCH_FIELDS, row)) for row in _MATCH_ROWS]

print("Missing matches defined:")
for fixture in missing_matches:
    teams = f"{fixture['home_team']} vs {fixture['away_team']}"
    print(f"  {teams} (WS: {fixture['ws_id']}, US: {fixture['us_id']})")

Missing matches defined:
  Arsenal vs Crystal Palace (WS: 1903198, US: 28859)
  Sunderland vs Arsenal (WS: 1903224, US: 28885)


In [3]:
# Extract missing matches
#
# For every entry in `missing_matches`: run the project extractor, read back the
# events CSV, tag the rows with match context, and collect the per-match frames.
# A failure in one match is reported and skipped so the remaining matches are
# still processed (deliberate best-effort).
print("="*60)
print("EXTRACTING MISSING MATCHES")
print("="*60)

# NOTE(review): presumably extract_match_complete() rewrites this file on every
# call; if a call ever returned without rewriting it, the previous match's events
# would be read again under the new match's labels — confirm against viz.match_data.
events_csv = '../../viz/data/match_events.csv'

# Progress denominator comes from the list itself so the "[i/N]" label stays
# correct when matches are added to or removed from `missing_matches`.
total_matches = len(missing_matches)

all_missing_events = []

for idx, match in enumerate(missing_matches, 1):
    print(f"\n[{idx}/{total_matches}] {match['home_team']} vs {match['away_team']}")

    try:
        extract_match_complete(
            ws_id=match['ws_id'],
            us_id=match['us_id'],
            league='ENG-Premier League',
            season='25-26',
            home_team=match['home_team'],
            away_team=match['away_team'],
            match_date=match['date']
        )

        events = pd.read_csv(events_csv)

        # Add match context: opponent/venue are expressed from Arsenal's point of view.
        is_home = 'Arsenal' in match['home_team']
        events['match_date'] = match['date']
        events['opponent'] = match['away_team'] if is_home else match['home_team']
        events['arsenal_venue'] = 'Home' if is_home else 'Away'
        events['match_id_ws'] = match['ws_id']
        events['match_id_us'] = match['us_id']
        events['match_number'] = match['match_number']

        all_missing_events.append(events)
        print(f"  ✓ {len(events)} eventos extraídos")

    except Exception as e:
        # Report and move on to the next match; nothing is appended for this one.
        print(f"  ✗ Error: {e}")

if all_missing_events:
    df_missing = pd.concat(all_missing_events, ignore_index=True)
    print(f"\nTotal eventos de partidos faltantes: {len(df_missing)}")
else:
    # Keep `df_missing` defined so downstream cells can test its length.
    df_missing = pd.DataFrame()
    print("\n⚠ No se extrajeron eventos")

EXTRACTING MISSING MATCHES

[1/2] Arsenal vs Crystal Palace

Extracting: Arsenal vs Crystal Palace (2025-10-26 14:00:00)
--------------------------------------------------


update.go:85: cannot change mount namespace according to change mount (/var/lib/snapd/hostfs/usr/local/share/doc /usr/local/share/doc none bind,ro 0 0): cannot write to "/var/lib/snapd/hostfs/usr/local/share/doc" because it would affect the host in "/var/lib/snapd"
update.go:85: cannot change mount namespace according to change mount (/var/lib/snapd/hostfs/usr/share/cups/doc-root /usr/share/cups/doc-root none bind,ro 0 0): cannot write to "/var/lib/snapd/hostfs/usr/share/cups/doc-root" because it would affect the host in "/var/lib/snapd"
update.go:85: cannot change mount namespace according to change mount (/var/lib/snapd/hostfs/usr/share/gimp/2.0/help /usr/share/gimp/2.0/help none bind,ro 0 0): cannot write to "/var/lib/snapd/hostfs/usr/share/gimp/2.0/help" because it would affect the host in "/var/lib/snapd"
update.go:85: cannot change mount namespace according to change mount (/var/lib/snapd/hostfs/usr/share/gtk-doc /usr/share/gtk-doc none bind,ro 0 0): cannot write to "/var/lib/sna

1. match_events.csv: 1810 events
2. player_network.csv: 254 records
3. match_aggregates.csv: 68 records
4. spatial_analysis.csv: 50 spatial records
5. match_info.csv: 87 info records

Shots: 17 | Goals: 1 | xG: 1.33
  ✓ 1810 eventos extraídos

[2/2] Sunderland vs Arsenal

Extracting: Sunderland vs Arsenal (2025-11-08 15:00:00)
--------------------------------------------------


1. match_events.csv: 1671 events
2. player_network.csv: 223 records
3. match_aggregates.csv: 63 records
4. spatial_analysis.csv: 50 spatial records
5. match_info.csv: 78 info records

Shots: 23 | Goals: 4 | xG: 3.80
  ✓ 1671 eventos extraídos

Total eventos de partidos faltantes: 3481


In [4]:
# Load the previously extracted matches from disk (empty frame if the file is absent).
existing_csv = '../data/arsenal_matches_2526_complete.csv'

banner = "=" * 60
print(banner)
print("CARGANDO DATOS EXISTENTES")
print(banner)

if not os.path.exists(existing_csv):
    print(f"⚠ Archivo no encontrado: {existing_csv}")
    df_existing = pd.DataFrame()
else:
    df_existing = pd.read_csv(existing_csv)
    match_numbers = df_existing['match_number']
    print(f"✓ Cargados {len(df_existing)} eventos de {match_numbers.nunique()} partidos")
    print(f"  Partidos: {sorted(match_numbers.unique())}")

CARGANDO DATOS EXISTENTES
✓ Cargados 16056 eventos de 9 partidos
  Partidos: [1, 2, 3, 4, 5, 6, 7, 8, 11]


In [5]:
# Merge all matches
#
# Combines the rows loaded from `existing_csv` with the newly extracted rows and
# writes the result back to the same file.
print("="*60)
print("FUSIONANDO DATOS")
print("="*60)

if len(df_existing) > 0 and len(df_missing) > 0:
    # The merged frame overwrites the file `df_existing` was loaded from, so on a
    # second run the "existing" data already contains the matches being added.
    # Drop those rows first so re-running the notebook replaces them instead of
    # appending duplicate events.
    new_match_numbers = df_missing['match_number'].unique()
    already_present = df_existing['match_number'].isin(new_match_numbers)
    if already_present.any():
        print(f"  ↻ Reemplazando {int(already_present.sum())} eventos ya existentes "
              f"de partidos: {sorted(new_match_numbers)}")

    # Combine
    df_complete = pd.concat([df_existing[~already_present], df_missing], ignore_index=True)

    # Sort by match_number, then by minute within each match
    df_complete = df_complete.sort_values(['match_number', 'minute']).reset_index(drop=True)

    # Save (overwrites the input file in place)
    df_complete.to_csv(existing_csv, index=False)

    print(f"✓ Guardado: {existing_csv}")
    print(f"  Total eventos: {len(df_complete)}")
    print(f"  Total partidos: {df_complete['match_number'].nunique()}")
    print(f"  Partidos: {sorted(df_complete['match_number'].unique())}")
else:
    # One side is empty: nothing is written; fall back to whichever frame has rows
    # so the validation cell still has a `df_complete` to inspect.
    print("⚠ No se pudo fusionar: datos faltantes")
    df_complete = df_existing if len(df_existing) > 0 else df_missing

FUSIONANDO DATOS
✓ Guardado: ../data/arsenal_matches_2526_complete.csv
  Total eventos: 19537
  Total partidos: 11
  Partidos: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [6]:
# Final validation: summary totals, a per-match listing, and a check that the
# merged dataset contains the expected 11 distinct match numbers.
rule = "=" * 60
print(rule)
print("VALIDACIÓN FINAL")
print(rule)

if len(df_complete) == 0:
    print("⚠ No hay datos para validar")
else:
    # Rows whose event_type mentions "Shot" or "Goal" (case-insensitive; NaN excluded).
    shot_mask = df_complete['event_type'].str.contains('Shot|Goal', case=False, na=False)
    shots = df_complete[shot_mask]
    n_matches = df_complete['match_number'].nunique()

    print(f"\nArsenal 25/26 - Datos completos:")
    print(f"  Total eventos: {len(df_complete)}")
    print(f"  Total partidos: {n_matches}")
    print(f"  Tiros totales: {len(shots)}")
    print(f"  xG total: {shots['xg'].sum():.2f}")
    print(f"  Columnas: {len(df_complete.columns)}")

    print(f"\nPartidos por número:")
    for num in sorted(df_complete['match_number'].unique()):
        rows = df_complete[df_complete['match_number'] == num]
        # Opponent and venue are constant per match as tagged upstream, so the
        # first row is used as the representative.
        rival = rows['opponent'].iloc[0]
        where = rows['arsenal_venue'].iloc[0]
        print(f"  {num}: vs {rival} ({where}) - {len(rows)} eventos")

    if n_matches != 11:
        print(f"\n⚠ ADVERTENCIA: Solo {n_matches} partidos (esperados 11)")
    else:
        print("\n✓ ÉXITO: Los 11 partidos están completos")

VALIDACIÓN FINAL

Arsenal 25/26 - Datos completos:
  Total eventos: 19537
  Total partidos: 11
  Tiros totales: 243
  xG total: 26.29
  Columnas: 61

Partidos por número:
  1: vs Manchester United (Away) - 1751 eventos
  2: vs Leeds (Home) - 1727 eventos
  3: vs Liverpool (Away) - 1713 eventos
  4: vs Nottingham Forest (Home) - 1800 eventos
  5: vs Manchester City (Home) - 1894 eventos
  6: vs Newcastle (Away) - 1735 eventos
  7: vs West Ham (Home) - 1755 eventos
  8: vs Fulham (Away) - 1838 eventos
  9: vs Crystal Palace (Home) - 1810 eventos
  10: vs Sunderland (Away) - 1671 eventos
  11: vs Burnley (Away) - 1843 eventos

✓ ÉXITO: Los 11 partidos están completos
