# Messi Season Analysis - Data Loading

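Carga y consolidación de los datos de todos los partidos de la temporada 2025: cada carpeta de partido aporta cinco CSV (eventos, red de jugadores, agregados, análisis espacial e información del partido), que se concatenan en un DataFrame por tipo.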

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings raised while reading the CSVs;
# consider narrowing the filter once the notebook is stable.
warnings.simplefilter('ignore')

# Display settings: never truncate columns, show at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 2. Configuración

In [2]:
# Folder that holds one sub-folder per match. The path is relative, so it is
# resolved against the notebook's current working directory.
DATA_DIR = './data'

# CSV files read from every match folder.
CSV_FILES = [
    'match_events.csv', 'player_network.csv', 'match_aggregates.csv',
    'spatial_analysis.csv', 'match_info.csv',
]

# Show the resolved location so a wrong working directory is spotted early.
print(f"Data directory: {os.path.abspath(DATA_DIR)}")

Data directory: /home/jaime/FD/data/blog/notebooks/data


## 3. Escanear Carpetas

In [3]:
# Collect the match folders: every sub-directory of DATA_DIR, sorted by name.
# Hidden directories (e.g. the `.ipynb_checkpoints` folder Jupyter may create)
# are skipped so they are neither counted as matches nor passed to the loader.
match_folders = sorted(
    name
    for name in os.listdir(DATA_DIR)
    if not name.startswith('.') and os.path.isdir(os.path.join(DATA_DIR, name))
)

print(f"Total partidos: {len(match_folders)}")
print("\nPrimeros 5:")
for folder in match_folders[:5]:
    print(f"  - {folder}")

Total partidos: 38

Primeros 5:
  - 2025-02-23_InterMiamiCF_NewYorkCityFC
  - 2025-03-03_HoustonDynamoFC_InterMiamiCF
  - 2025-03-09_InterMiamiCF_CharlotteFC
  - 2025-03-16_AtlantaUnited_InterMiamiCF
  - 2025-03-29_InterMiamiCF_PhiladelphiaUnion


## 4. Cargar Todos los Datos

In [4]:
def load_all_matches(data_dir=None, folders=None):
    """Load the five per-match CSV files of every match folder and concatenate them.

    Each match folder must contain ``match_events.csv``, ``player_network.csv``,
    ``match_aggregates.csv``, ``spatial_analysis.csv`` and ``match_info.csv``.
    Every frame read is tagged with two extra columns: ``match_folder`` (the
    folder name) and ``match_date`` (the folder-name text before the first
    underscore).

    A match is loaded atomically: if any of its five files cannot be read, the
    error is printed and the whole match is skipped, so the five consolidated
    tables always describe the same set of matches.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains the match folders. Defaults to the
        module-level ``DATA_DIR``.
    folders : iterable of str, optional
        Names of the match folders to load. Defaults to the module-level
        ``match_folders``.

    Returns
    -------
    dict of str to pandas.DataFrame
        Keys ``'events'``, ``'network'``, ``'aggregates'``, ``'spatial'`` and
        ``'info'``, each mapped to the concatenation of that table across all
        loaded matches (with a fresh integer index).

    Raises
    ------
    ValueError
        If no match could be loaded at all.
    """
    # Resolve defaults at call time so a plain `load_all_matches()` keeps
    # working with the notebook-level configuration.
    if data_dir is None:
        data_dir = DATA_DIR
    folders = list(match_folders if folders is None else folders)

    # Result key -> CSV file name inside each match folder.
    table_files = {
        'events': 'match_events.csv',
        'network': 'player_network.csv',
        'aggregates': 'match_aggregates.csv',
        'spatial': 'spatial_analysis.csv',
        'info': 'match_info.csv',
    }
    frames_by_table = {table: [] for table in table_files}
    skipped = []

    for idx, folder in enumerate(folders, 1):
        folder_path = os.path.join(data_dir, folder)
        match_date = folder.split('_')[0]

        try:
            # Read all five files first; nothing is committed until every
            # read has succeeded.
            match_frames = {}
            for table, filename in table_files.items():
                df = pd.read_csv(os.path.join(folder_path, filename))
                df['match_folder'] = folder
                df['match_date'] = match_date
                match_frames[table] = df
        except Exception as e:
            # Deliberately best-effort: one broken match must not abort the
            # whole season load.
            print(f"Error en {folder}: {e}")
            skipped.append(folder)
            continue

        for table, df in match_frames.items():
            frames_by_table[table].append(df)

        if idx % 10 == 0:
            print(f"Cargados: {idx}/{len(folders)}")

    if skipped:
        print(f"\nPartidos omitidos: {len(skipped)}/{len(folders)}")

    # pd.concat rejects an empty list with an opaque message; fail with the
    # same exception type but say what actually went wrong.
    if not frames_by_table['events']:
        raise ValueError(
            f"No se pudo cargar ningún partido desde {os.path.abspath(data_dir)}"
        )

    print(f"\nConsolidando...")

    dfs = {
        table: pd.concat(frames, ignore_index=True)
        for table, frames in frames_by_table.items()
    }

    print(f"\nDataFrames consolidados:")
    for name, df in dfs.items():
        print(f"  {name:12s}: {len(df):,} rows")

    return dfs

# Load every match folder; `data` is a dict of consolidated DataFrames keyed by
# 'events', 'network', 'aggregates', 'spatial' and 'info'.
data = load_all_matches()

Cargados: 10/38
Cargados: 20/38
Cargados: 30/38

Consolidando...

DataFrames consolidados:
  events      : 72,761 rows
  network     : 10,264 rows
  aggregates  : 2,529 rows
  spatial     : 1,881 rows
  info        : 3,130 rows


## 5. Asignar a Variables

In [5]:
# Give each consolidated table its own notebook-level name.
df_events, df_network, df_aggregates, df_spatial, df_info = (
    data[table]
    for table in ('events', 'network', 'aggregates', 'spatial', 'info')
)

# Row counts as a quick sanity check against the loader's summary.
print(f"df_events: {len(df_events):,} rows")
print(f"df_network: {len(df_network):,} rows")
print(f"df_aggregates: {len(df_aggregates):,} rows")
print(f"df_spatial: {len(df_spatial):,} rows")
print(f"df_info: {len(df_info):,} rows")

df_events: 72,761 rows
df_network: 10,264 rows
df_aggregates: 2,529 rows
df_spatial: 1,881 rows
df_info: 3,130 rows


## 6. Filtrar Solo Messi

In [6]:
# Keep only the events whose `player` is Lionel Messi. The copy makes df_messi
# independent of df_events, so later edits to it cannot touch the full table.
df_messi = df_events[df_events['player'] == 'Lionel Messi'].copy()

# Number of distinct matches in which Messi has at least one event.
n_messi_matches = df_messi['match_folder'].nunique()

print(f"Eventos de Messi: {len(df_messi):,}")
print(f"Partidos: {n_messi_matches}")
if n_messi_matches:
    print(f"Promedio eventos/partido: {len(df_messi) / n_messi_matches:.1f}")
else:
    # No row matched the player name: the average is undefined, and dividing
    # would raise ZeroDivisionError.
    print("Promedio eventos/partido: n/a")
print("\nTipos de eventos:")
print(df_messi['event_type'].value_counts())

Eventos de Messi: 2,814
Partidos: 32
Promedio eventos/partido: 87.9

Tipos de eventos:
event_type
Pass              1555
Carry              460
TakeOn             209
BallTouch          101
SavedShot           98
BallRecovery        70
Foul                59
Dispossessed        55
CornerAwarded       46
MissedShots         33
Goal                32
Tackle              23
OffsidePass         12
Aerial              12
OffsideGiven        10
Challenge            9
BlockedPass          6
Card                 4
Error                4
ShotOnPost           4
Interception         4
GoodSkill            4
SubstitutionOn       2
Clearance            2
Name: count, dtype: int64


## 7. Validar Columnas Disponibles

In [7]:
# List every column of the consolidated events table.
print("Columnas en df_events:", df_events.columns.tolist(), sep="\n")

Columnas en df_events:
['game_id', 'period', 'minute', 'second', 'expanded_minute', 'type', 'outcome_type', 'team_id', 'team', 'player_id', 'player', 'x', 'y', 'end_x', 'end_y', 'goal_mouth_y', 'goal_mouth_z', 'blocked_x', 'blocked_y', 'qualifiers', 'is_touch', 'is_shot', 'is_goal', 'card_type', 'related_event_id', 'related_player_id', 'match_id', 'data_source', 'pass_length', 'is_longball', 'is_header', 'is_cross', 'is_through_ball', 'shot_body_part', 'is_assist', 'field_zone', 'is_successful', 'event_type', 'possession_sequence', 'next_player', 'distance_to_goal', 'pass_distance', 'event_id', 'take_ons_in_carry', 'xthreat', 'xthreat_gen', 'is_pre_assist', 'possession_id', 'possession_team', 'is_progressive', 'is_box_entry', 'pass_outcome', 'action_type', 'zone_id', 'xg', 'match_folder', 'match_date']


In [8]:
# List every column of the Messi-only events table.
print("Columnas en df_messi:", df_messi.columns.tolist(), sep="\n")

Columnas en df_messi:
['game_id', 'period', 'minute', 'second', 'expanded_minute', 'type', 'outcome_type', 'team_id', 'team', 'player_id', 'player', 'x', 'y', 'end_x', 'end_y', 'goal_mouth_y', 'goal_mouth_z', 'blocked_x', 'blocked_y', 'qualifiers', 'is_touch', 'is_shot', 'is_goal', 'card_type', 'related_event_id', 'related_player_id', 'match_id', 'data_source', 'pass_length', 'is_longball', 'is_header', 'is_cross', 'is_through_ball', 'shot_body_part', 'is_assist', 'field_zone', 'is_successful', 'event_type', 'possession_sequence', 'next_player', 'distance_to_goal', 'pass_distance', 'event_id', 'take_ons_in_carry', 'xthreat', 'xthreat_gen', 'is_pre_assist', 'possession_id', 'possession_team', 'is_progressive', 'is_box_entry', 'pass_outcome', 'action_type', 'zone_id', 'xg', 'match_folder', 'match_date']


## 8. Muestra de Datos de Messi

In [9]:
# Preview the first 20 Messi events, restricted to a handful of columns.
df_messi.head(20)[['match_date', 'minute', 'event_type', 'outcome_type',
                   'x', 'y', 'xthreat_gen', 'is_progressive']]

Unnamed: 0,match_date,minute,event_type,outcome_type,x,y,xthreat_gen,is_progressive
77,2025-02-23,2.0,Pass,Unsuccessful,70.0,46.0,0.0,False
94,2025-02-23,4.0,Pass,Unsuccessful,70.1,51.8,0.0,False
96,2025-02-23,4.0,CornerAwarded,Successful,98.9,29.6,0.0,False
98,2025-02-23,4.0,Pass,Successful,99.5,0.5,0.014093,False
100,2025-02-23,4.0,Pass,Successful,79.7,14.6,0.049859,False
102,2025-02-23,4.0,Pass,Successful,94.0,33.9,0.020413,False
139,2025-02-23,9.0,Pass,Unsuccessful,70.0,17.1,0.0,False
147,2025-02-23,10.0,Pass,Successful,72.0,22.1,0.057518,False
148,2025-02-23,10.0,Carry,Successful,58.1,37.4,0.045799,False
164,2025-02-23,11.0,SavedShot,Successful,92.8,51.0,0.0,False
