In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

from database.connection import get_db_manager
from tfm.query_helpers import (
    query_player_pool,
    add_exogenous_player,
    validate_required_metrics,
    get_positions,
    POSITIONS
)
from tfm.algorithms import find_similar_players_kmeans, find_similar_players_cosine

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Query pool con filtros Transfermarkt - CASO: Baena vs Moleiro
# Temporada 24/25 - BIG 5 LEAGUES

big5_leagues = ['ENG-Premier League', 'ESP-La Liga', 'ITA-Serie A', 'GER-Bundesliga', 'FRA-Ligue 1']

# Query cada liga y concatenar
pools = []
for league in big5_leagues:
    try:
        pool = query_player_pool(
            league=league,
            season='2425',
            positions=['CAM', 'LM', 'LW'],      # Mediapunta, Medio Izq, Extremo Izq
            max_market_value=25_000_000,        # Máximo 25M EUR
            min_minutes=1250,                   # Mínimo 1250 minutos jugados
            max_age=28                          # Máximo 28 años
        )
        pools.append(pool)
        print(f"{league}: {len(pool)} jugadores")
    except Exception as e:
        print(f"Error en {league}: {e}")

pool_df = pd.concat(pools, ignore_index=True)

print(f"\nPool total: {len(pool_df)} jugadores de las Big 5")
print(f"\nDistribución por liga:")
print(pool_df['league'].value_counts())
print(f"\nPosiciones en pool:")
print(pool_df.groupby(
    pool_df['transfermarkt_metrics'].apply(
        lambda x: x.get('transfermarkt_position_specific') if x else None
    )
).size())

2025-12-03 13:52:47,760 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-03 13:52:47,920 - database.connection - INFO - Database connection successful
2025-12-03 13:52:48,012 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-03 13:52:48,036 - database.connection - INFO - Database connection successful
2025-12-03 13:52:48,058 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-03 13:52:48,076 - database.connection - INFO - Database connection successful
2025-12-03 13:52:48,098 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-03 13:52:48,116 - database.connection - INFO - Database connection successful
2025-12-03 13:52:48,137 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-03 13:52:48,156 - database.connection - INFO - Database connection successful


ENG-Premier League: 11 jugadores
ESP-La Liga: 20 jugadores
ITA-Serie A: 10 jugadores
GER-Bundesliga: 12 jugadores
FRA-Ligue 1: 15 jugadores

Pool total: 68 jugadores de las Big 5

Distribución por liga:
league
ESP-La Liga           20
FRA-Ligue 1           15
GER-Bundesliga        12
ENG-Premier League    11
ITA-Serie A           10
Name: count, dtype: int64

Posiciones en pool:
transfermarkt_metrics
CAM    32
LM      4
LW     32
dtype: int64


In [3]:
# Target exógeno: Baena tiene 55M EUR, no está en pool (max 25M)
full_df = add_exogenous_player(
    pool_df=pool_df,
    player_name='Alex Baena',
    league='ESP-La Liga',
    season='2425',
    team='Villarreal'
)
print(f"DataFrame con target exógeno: {len(full_df)} jugadores")

# NOTA: Solo usar si target YA está en pool
# full_df = pool_df.copy()
# print(f"Usando pool completo: {len(full_df)} jugadores")

2025-12-03 13:52:48,195 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-03 13:52:48,220 - database.connection - INFO - Database connection successful


Added exogenous player: Alex Baena (Villarreal, ESP-La Liga 2425)
Total players in DataFrame: 69
DataFrame con target exógeno: 69 jugadores


In [4]:
# Función extract_metrics (del notebook Etta_Eyong_radars.ipynb)
def extract_metrics(df, col_name):
    result = pd.DataFrame(index=df.index)
    
    all_keys = set()
    for _, row in df.iterrows():
        if isinstance(row[col_name], dict):
            all_keys.update(row[col_name].keys())
    
    for key in all_keys:
        values = []
        for _, row in df.iterrows():
            if isinstance(row[col_name], dict) and key in row[col_name]:
                raw_value = row[col_name][key]
                converted_value = _convert_to_float(raw_value)
                values.append(converted_value)
            else:
                values.append(np.nan)
        
        valid_count = pd.Series(values).notna().sum()
        if valid_count >= 5:
            result[key] = values
    
    return result

def _convert_to_float(value):
    if isinstance(value, (int, float)):
        return float(value)
    if value is None or pd.isna(value):
        return np.nan
    if isinstance(value, str):
        if value.strip() == '' or value.lower().strip() in ['nan', 'none', 'null', '-']:
            return np.nan
        try:
            return float(value)
        except (ValueError, TypeError):
            return np.nan
    return np.nan

# Extraer JSONB → columnas
fbref_nums = extract_metrics(full_df, 'fbref_metrics')
understat_nums = extract_metrics(full_df, 'understat_metrics')
transfermarkt_nums = extract_metrics(full_df, 'transfermarkt_metrics')  # NUEVO

print(f"FBref: {fbref_nums.shape[1]} métricas")
print(f"Understat: {understat_nums.shape[1]} métricas")
print(f"Transfermarkt: {transfermarkt_nums.shape[1]} campos")

FBref: 185 métricas
Understat: 10 métricas
Transfermarkt: 2 campos


In [5]:
# Métricas a EXCLUIR de normalización per90
exclude_per90 = {
    # Porcentajes
    'pass_completion_pct', 'shots_on_target_pct', 'Take-Ons_Succ%', 'Take-Ons_Tkld%',
    'Aerial Duels_Won%', 'Challenges_Tkl%', 'Save%', 'Launched_Cmp%', 'Crosses_Stp%',
    # Ya per 90
    'shots_per_90', 'GA90', 'GCA_GCA90', 'SCA_SCA90', 'Team Success_+/-90', 'SoT/90', 'Sweeper_#OPA/90',
    # Ratios
    'npxG/Sh', 'xG+xAG', 'non_penalty_xG_plus_xAG', 'avg_shot_distance', 'Passes_AvgLen', 'Goal Kicks_AvgLen',
    # Metadata
    'minutes_per_match', 'matches_played', 'matches_started', 'minutes_played',
    'wins', 'draws', 'losses', 'Min%', 'Starts_Mn/Start', 'Subs_Mn/Sub',
    # Understat
    'understat_buildup_involvement_pct', 'understat_player_id', 'understat_team_id',
    # NUEVO: Transfermarkt (NO normalizar, no son rates)
    'transfermarkt_transfermarkt_player_id', 'transfermarkt_market_value_eur',
    'transfermarkt_birth_date', 'transfermarkt_club',
    'transfermarkt_contract_start_date', 'transfermarkt_contract_end_date',
    'transfermarkt_contract_is_current', 'transfermarkt_position_specific',
    'transfermarkt_primary_foot'
}

# Calcular per90 FBref
fbref_per90 = fbref_nums.loc[:, ~fbref_nums.columns.isin(exclude_per90)]
fbref_per90 = (fbref_per90.div(fbref_nums['minutes_played'], axis=0) * 90).round(3)
fbref_per90.columns = [f'{col}_per90' for col in fbref_per90.columns]

# Calcular per90 Understat
understat_per90 = understat_nums.loc[:, ~understat_nums.columns.isin(exclude_per90)]
understat_per90 = (understat_per90.div(fbref_nums['minutes_played'], axis=0) * 90).round(3)
understat_per90.columns = [f'{col}_per90' for col in understat_per90.columns]

print(f"Per90 FBref: {fbref_per90.shape[1]}")
print(f"Per90 Understat: {understat_per90.shape[1]}")

Per90 FBref: 153
Per90 Understat: 7


In [6]:
# DataFrame final consolidado
base_cols = ['unique_player_id', 'player_name', 'team', 'league', 'season', 'position']

df_final = pd.concat([
    full_df[base_cols],
    fbref_nums,
    understat_nums,
    transfermarkt_nums,  # NUEVO
    fbref_per90,
    understat_per90
], axis=1)

print(f"DataFrame final: {df_final.shape[0]} jugadores × {df_final.shape[1]} columnas")
print(f"\nPrimeras columnas per90:")
print([c for c in df_final.columns if c.endswith('_per90')][:5])

DataFrame final: 69 jugadores × 363 columnas

Primeras columnas per90:
['Goal Kicks_Att_per90', 'Team Success_+/-_per90', 'Passes_Thr_per90', 'Starts_Starts_per90', 'SCA Types_Def_per90']


In [7]:
# Buscar target por nombre
target_name = 'Baena'  # Cambiar según caso
player = df_final[df_final['player_name'].str.contains(target_name, case=False, na=False)]

if len(player) == 0:
    print(f"ERROR: No se encontró jugador con '{target_name}'")
    target_id = None
elif len(player) > 1:
    print(f"ADVERTENCIA: {len(player)} jugadores encontrados:")
    print(player[['player_name', 'team', 'league', 'season']])
    print("\nUsando el primero. Especifica más si es incorrecto.")
    target_id = player.iloc[0]['unique_player_id']
else:
    target_id = player.iloc[0]['unique_player_id']
    print(f"Target identificado: {player.iloc[0]['player_name']} ({player.iloc[0]['team']})")
    print(f"ID: {target_id}")

Target identificado: Alex Baena (Villarreal)
ID: eb4c447d1a00eb39


In [8]:
# Validación pre-ejecución (solo FBref, Understat es opcional)
if target_id:
    try:
        missing = validate_required_metrics(
            df_final,
            target_id,
            required_fbref=['goals', 'assists', 'shots', 'minutes_played'],
            required_understat=None,  # Understat es opcional, FBref tiene xG
            raise_on_missing=True
        )
        print("Target válido: Todas las métricas requeridas presentes")
    except ValueError as e:
        print(f"ERROR DE VALIDACIÓN:\n{e}")
        target_id = None

Target válido: Todas las métricas requeridas presentes


In [9]:
if target_id:
    result_kmeans = find_similar_players_kmeans(
        df=df_final,
        target_player_id=target_id,
        n_similar=10,
        k_clusters=None,  # Método del codo
        use_pca=False
    )

    print("\n" + "="*60)
    print("RESULTADOS K-MEANS")
    print("="*60)
    print("\nCLUSTER INFO:")
    print(result_kmeans['cluster_info'])
    print("\nDISTRIBUCIÓN DISTANCIAS:")
    print(result_kmeans['distances_distribution'])
    print("\nTOP 10 SIMILARES:")
    print(result_kmeans['similar_players'][['player_name', 'team', 'euclidean_distance', 'distance_percentile']].to_string(index=False))
else:
    print("ERROR: No se puede ejecutar sin target válido")

Target: Alex Baena (Villarreal, ESP-La Liga)
Features seleccionados automáticamente: 141 (solo _per90, excl. GK)
Jugadores tras limpieza: 69 (eliminados 0 con NaNs en métricas CORE)
Clusters óptimos (método del codo): 8
Target asignado a cluster: 7
Jugadores en cluster 7: 10
Top 10 similares encontrados

RESULTADOS K-MEANS

CLUSTER INFO:
{'cluster_id': 7, 'n_players': 10, 'avg_distance': 14.345748726682336, 'std_distance': 5.129826031647118, 'silhouette_score': 0.05707175065515459}

DISTRIBUCIÓN DISTANCIAS:
{'min': 0.0, 'q25': 13.887018913475458, 'median': 15.804561868060008, 'q75': 16.43661506037087, 'max': 19.791215384083635}

TOP 10 SIMILARES:
           player_name            team  euclidean_distance  distance_percentile
         Julian Brandt        Dortmund           13.348729                 20.0
      Marcus Tavernier     Bournemouth           13.848504                 30.0
         Romano Schmid   Werder Bremen           14.002564                 40.0
Hákon Arnar Haraldsson   

In [10]:
# OPCIONAL: Ejecutar también PCA + Cosine
if target_id:
    result_cosine = find_similar_players_cosine(
        df=df_final,
        target_player_id=target_id,
        n_similar=10,
        pca_variance=0.85
    )

    print("\n" + "="*60)
    print("RESULTADOS PCA + COSINE")
    print("="*60)
    print("\nPCA INFO:")
    print(result_cosine['pca_info'])
    print("\nSCORE DISTRIBUTION:")
    print(result_cosine['score_distribution'])
    print("\nTOP 10 SIMILARES:")
    print(result_cosine['similar_players'][['player_name', 'team', 'cosine_similarity', 'similarity_percentile']].to_string(index=False))

Target: Alex Baena (Villarreal, ESP-La Liga)
Features seleccionados automáticamente: 141 (solo _per90, excl. GK)
Jugadores tras limpieza: 69 (eliminados 0 con NaNs en métricas CORE)
PCA: 20 componentes (varianza explicada: 85.7%)
Reducción: 141 → 20 dimensiones
Top 10 similares encontrados
Rango similitud: [0.4302, 0.7084]

RESULTADOS PCA + COSINE

PCA INFO:
{'n_components': 20, 'explained_variance_ratio': 0.8566041619926739, 'original_dimensions': 141, 'reduced_dimensions': 20, 'compression_ratio': 0.14184397163120568, 'top_5_components_variance': [0.17008229506717865, 0.11662387240379278, 0.086435340025953, 0.0708999099275149, 0.061350768300368556]}

SCORE DISTRIBUTION:
{'min': -0.6985201953188143, 'q5': -0.4974702865261674, 'q25': -0.2953860785554157, 'median': -0.11842156385655214, 'q75': 0.1818566478550583, 'q95': 0.6056499220095458, 'max': 0.7083639571989272, 'mean': -0.0505176562877865, 'std': 0.3456100429367862}

TOP 10 SIMILARES:
           player_name          team  cosine_si

In [11]:
# Análisis adicional (si hay otro jugador de interés)
# Ejemplo: Verificar si un reemplazo específico aparece en el top-10

# Buscar reemplazo
replacement_name = 'Moleiro'  # Cambiar según caso
replacement = df_final[df_final['player_name'].str.contains(replacement_name, case=False, na=False)]

if len(replacement) > 0 and target_id:
    replacement_id = replacement.iloc[0]['unique_player_id']
    
    # Verificar en resultados K-Means
    if 'result_kmeans' in locals():
        in_kmeans = replacement_id in result_kmeans['similar_players']['unique_player_id'].values
        if in_kmeans:
            row = result_kmeans['similar_players'][result_kmeans['similar_players']['unique_player_id'] == replacement_id].iloc[0]
            print(f"\n[KMEANS] {replacement_name} EN TOP-10:")
            print(f"  Posición: {list(result_kmeans['similar_players']['unique_player_id']).index(replacement_id) + 1}")
            print(f"  Distancia: {row['euclidean_distance']:.4f}")
            print(f"  Percentil: {row['distance_percentile']:.1f}")
        else:
            print(f"\n[KMEANS] {replacement_name} NO está en top-10")
    
    # Verificar en resultados Cosine
    if 'result_cosine' in locals():
        in_cosine = replacement_id in result_cosine['similar_players']['unique_player_id'].values
        if in_cosine:
            row = result_cosine['similar_players'][result_cosine['similar_players']['unique_player_id'] == replacement_id].iloc[0]
            print(f"\n[COSINE] {replacement_name} EN TOP-10:")
            print(f"  Posición: {list(result_cosine['similar_players']['unique_player_id']).index(replacement_id) + 1}")
            print(f"  Similitud: {row['cosine_similarity']:.4f}")
            print(f"  Percentil: {row['similarity_percentile']:.1f}")
        else:
            print(f"\n[COSINE] {replacement_name} NO está en top-10")
else:
    print("\nSkip: No hay reemplazo especificado o target inválido")


[KMEANS] Moleiro NO está en top-10

[COSINE] Moleiro NO está en top-10


In [12]:
# ============================================================
# OPCIÓN EXTRA: ANÁLISIS MULTI-TEMPORADA DE BAENA
# ============================================================
# Esta celda es OPCIONAL - permite añadir temporadas previas de Baena
# al dataset y generar listas de similares para cada versión

USE_MULTI_SEASON = True  # Cambiar a True para activar análisis multi-temporada

if USE_MULTI_SEASON:
    print("="*60)
    print("ANÁLISIS MULTI-TEMPORADA: BAENA 23/24 + 24/25")
    print("="*60)
    
    # Añadir Baena 2324 al pool
    pool_with_baena_2324 = add_exogenous_player(
        pool_df=pool_df,
        player_name='Alex Baena',
        league='ESP-La Liga',
        season='2324',
        team='Villarreal'
    )
    
    # Reconstruir full_df con ambas temporadas (pool + Baena 2425 + Baena 2324)
    # Ya tenemos pool_df (68 jugadores) + Baena 2425 en full_df
    # Ahora añadimos Baena 2324
    full_df_multi = pool_with_baena_2324.copy()
    
    print(f"\nDataFrame multi-temporada: {len(full_df_multi)} jugadores")
    
    # Extraer métricas
    fbref_nums_multi = extract_metrics(full_df_multi, 'fbref_metrics')
    understat_nums_multi = extract_metrics(full_df_multi, 'understat_metrics')
    transfermarkt_nums_multi = extract_metrics(full_df_multi, 'transfermarkt_metrics')
    
    # Calcular per90
    fbref_per90_multi = fbref_nums_multi.loc[:, ~fbref_nums_multi.columns.isin(exclude_per90)]
    fbref_per90_multi = (fbref_per90_multi.div(fbref_nums_multi['minutes_played'], axis=0) * 90).round(3)
    fbref_per90_multi.columns = [f'{col}_per90' for col in fbref_per90_multi.columns]
    
    understat_per90_multi = understat_nums_multi.loc[:, ~understat_nums_multi.columns.isin(exclude_per90)]
    understat_per90_multi = (understat_per90_multi.div(fbref_nums_multi['minutes_played'], axis=0) * 90).round(3)
    understat_per90_multi.columns = [f'{col}_per90' for col in understat_per90_multi.columns]
    
    # DataFrame final multi-temporada
    df_final_multi = pd.concat([
        full_df_multi[base_cols],
        fbref_nums_multi,
        understat_nums_multi,
        transfermarkt_nums_multi,
        fbref_per90_multi,
        understat_per90_multi
    ], axis=1)
    
    print(f"DataFrame final multi: {df_final_multi.shape[0]} jugadores × {df_final_multi.shape[1]} columnas")
    
    # Identificar Baenas
    baenas = df_final_multi[df_final_multi['player_name'].str.contains('Baena', case=False, na=False)]
    print(f"\nBaenas encontrados: {len(baenas)}")
    print(baenas[['player_name', 'team', 'season', 'unique_player_id']].to_string(index=False))
    
    # Análisis para cada Baena
    for idx, baena_row in baenas.iterrows():
        baena_season = baena_row['season']
        baena_id_temp = baena_row['unique_player_id']
        
        print(f"\n{'='*60}")
        print(f"TARGET: Baena temporada {baena_season}")
        print(f"{'='*60}")
        
        try:
            # K-Means
            result_kmeans_temp = find_similar_players_kmeans(
                df=df_final_multi,
                target_player_id=baena_id_temp,
                n_similar=10,
                k_clusters=None,
                use_pca=False
            )
            
            print(f"\n[K-MEANS] Top 10 similares a Baena {baena_season}:")
            print(result_kmeans_temp['similar_players'][['player_name', 'team', 'season', 'euclidean_distance']].to_string(index=False))
            
            # Cosine
            result_cosine_temp = find_similar_players_cosine(
                df=df_final_multi,
                target_player_id=baena_id_temp,
                n_similar=10,
                pca_variance=0.85
            )
            
            print(f"\n[COSINE] Top 10 similares a Baena {baena_season}:")
            print(result_cosine_temp['similar_players'][['player_name', 'team', 'season', 'cosine_similarity']].to_string(index=False))
            
        except Exception as e:
            print(f"ERROR procesando Baena {baena_season}: {e}")
    
    print("\n" + "="*60)
    print("Análisis multi-temporada completado")
    print("="*60)
else:
    print("\n" + "="*60)
    print("Análisis multi-temporada desactivado")
    print("="*60)
    print("Cambiar USE_MULTI_SEASON=True en la celda para activar")
    print("Esto añadirá Baena 23/24 al dataset y generará")
    print("listas de top-10 similares para cada temporada")

2025-12-03 13:52:49,517 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-03 13:52:49,546 - database.connection - INFO - Database connection successful


ANÁLISIS MULTI-TEMPORADA: BAENA 23/24 + 24/25
Added exogenous player: Alex Baena (Villarreal, ESP-La Liga 2324)
Total players in DataFrame: 69

DataFrame multi-temporada: 69 jugadores
DataFrame final multi: 69 jugadores × 363 columnas

Baenas encontrados: 1
player_name       team season unique_player_id
 Alex Baena Villarreal   2324 eb4c447d1a00eb39

TARGET: Baena temporada 2324
Target: Alex Baena (Villarreal, ESP-La Liga)
Features seleccionados automáticamente: 141 (solo _per90, excl. GK)
Jugadores tras limpieza: 69 (eliminados 0 con NaNs en métricas CORE)
Clusters óptimos (método del codo): 8
Target asignado a cluster: 4
Jugadores en cluster 4: 13
Top 10 similares encontrados

[K-MEANS] Top 10 similares a Baena 2324:
           player_name          team season  euclidean_distance
      Marcus Tavernier   Bournemouth   2425           14.215754
         Julian Brandt      Dortmund   2425           15.483203
Hákon Arnar Haraldsson         Lille   2425           15.591533
      Sebastian