In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

from database.connection import get_db_manager
from tfm.query_helpers import query_player_pool, add_exogenous_player
from tfm.algorithms import find_similar_players_cosine

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Build the candidate pool: one query per Big-5 league, all with the same
# season / position / market-value / minutes / age filter arguments.
big5_leagues = ['ENG-Premier League', 'ESP-La Liga', 'ITA-Serie A', 'GER-Bundesliga', 'FRA-Ligue 1']

pools = []
failed_leagues = []  # leagues whose query raised, reported after the loop
for league in big5_leagues:
    try:
        pool = query_player_pool(
            league=league,
            season='2425',
            positions=['LM', 'LW'],
            max_market_value=25_000_000,
            min_minutes=1250,
            max_age=28
        )
        pools.append(pool)
        print(f"{league}: {len(pool)} jugadores")
    except Exception as e:
        # Best effort: a single failing league must not abort the whole load,
        # but it is recorded so the gap stays visible in the summary.
        failed_leagues.append(league)
        print(f"Error en {league}: {e}")

# pd.concat([]) raises an opaque "No objects to concatenate"; fail with an
# actionable message instead when every query failed.
if not pools:
    raise ValueError(
        "No se pudo cargar ningún jugador: fallaron todas las consultas "
        f"({', '.join(failed_leagues)})"
    )

pool_df = pd.concat(pools, ignore_index=True)

if failed_leagues:
    # Downstream similarity results are computed on an incomplete pool.
    print(f"\nAVISO: pool incompleto, ligas sin datos: {', '.join(failed_leagues)}")

print(f"\nPool total: {len(pool_df)} jugadores")
print("\nDistribución por liga:")
print(pool_df['league'].value_counts())

2025-12-05 15:36:28,511 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-05 15:36:28,646 - database.connection - INFO - Database connection successful
2025-12-05 15:36:28,671 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-05 15:36:28,713 - database.connection - INFO - Database connection successful
2025-12-05 15:36:28,732 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-05 15:36:28,770 - database.connection - INFO - Database connection successful
2025-12-05 15:36:28,788 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-05 15:36:28,820 - database.connection - INFO - Database connection successful
2025-12-05 15:36:28,839 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev


ENG-Premier League: 6 jugadores
ESP-La Liga: 13 jugadores
ITA-Serie A: 4 jugadores
GER-Bundesliga: 2 jugadores


2025-12-05 15:36:28,878 - database.connection - INFO - Database connection successful


FRA-Ligue 1: 11 jugadores

Pool total: 36 jugadores

Distribución por liga:
league
ESP-La Liga           13
FRA-Ligue 1           11
ENG-Premier League     6
ITA-Serie A            4
GER-Bundesliga         2
Name: count, dtype: int64


In [3]:
# Add Alex Baena (Villarreal, 2024/25 La Liga) to the pool as the exogenous
# player that later cells use as the comparison target.
full_df = add_exogenous_player(
    pool_df=pool_df,
    player_name='Alex Baena',
    team='Villarreal',
    league='ESP-La Liga',
    season='2425'
)

print(f"DataFrame con Baena: {full_df.shape[0]} jugadores")

2025-12-05 15:36:28,930 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-05 15:36:28,964 - database.connection - INFO - Database connection successful


Added exogenous player: Alex Baena (Villarreal, ESP-La Liga 2425)
Total players in DataFrame: 37
DataFrame con Baena: 37 jugadores


In [4]:
def extract_metrics(df, col_name, min_valid=5):
    """Expand a column of per-row metric dicts into one float column per metric.

    Args:
        df: DataFrame whose ``col_name`` column holds a dict of
            ``{metric_name: raw_value}`` per row. Cells that are not dicts
            are treated as "no metrics for this row".
        col_name: Name of the dict-valued column to expand.
        min_valid: Minimum number of rows with a usable (non-NaN) value a
            metric needs in order to be kept. Defaults to 5.

    Returns:
        DataFrame indexed like ``df`` with one float column per retained
        metric. Columns appear in the order the metrics are first seen while
        scanning rows top to bottom, so the layout is reproducible across
        kernel restarts (iterating a ``set`` of strings is not).
    """
    # Read the column once; anything that is not a dict contributes nothing.
    cells = [cell if isinstance(cell, dict) else None for cell in df[col_name]]

    # dict keys keep insertion order -> deterministic, first-seen metric order.
    metric_names = {}
    for cell in cells:
        if cell is not None:
            metric_names.update(dict.fromkeys(cell))

    columns = {}
    for name in metric_names:
        values = [
            _convert_to_float(cell[name]) if cell is not None and name in cell else np.nan
            for cell in cells
        ]
        # Drop metrics that too few rows actually provide.
        if pd.Series(values, dtype=float).notna().sum() >= min_valid:
            columns[name] = values

    # Build the frame in one go; inserting columns one at a time fragments it.
    return pd.DataFrame(columns, index=df.index)


def _convert_to_float(value):
    """Coerce one raw metric value to ``float``; return ``np.nan`` if unusable.

    Accepts Python and NumPy numeric scalars, numeric strings (surrounding
    whitespace ignored; '', 'nan', 'none', 'null' and '-' mean missing) and
    other scalar objects that support ``float()`` such as ``decimal.Decimal``.
    Missing markers, containers and unparseable values all map to ``np.nan``;
    this function never raises for them.
    """
    # NumPy integers are not ``int`` subclasses, so they are listed explicitly.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if value is None or value is pd.NA or value is pd.NaT:
        return np.nan
    if isinstance(value, str):
        text = value.strip()
        if text == '' or text.lower() in ('nan', 'none', 'null', '-'):
            return np.nan
        try:
            return float(text)
        except ValueError:
            return np.nan
    # float() would also accept bytes and size-1 arrays; neither is a metric
    # scalar, so they stay "missing".
    if isinstance(value, (bytes, bytearray, np.ndarray)):
        return np.nan
    try:
        # Remaining scalar types with a float conversion (e.g. Decimal).
        return float(value)
    except (TypeError, ValueError):
        # Lists, dicts and other non-numeric objects.
        return np.nan

# One numeric frame per data source, expanded from the dict-valued columns
# of full_df (evaluated in the order listed).
fbref_nums, understat_nums, transfermarkt_nums = (
    extract_metrics(full_df, metrics_col)
    for metrics_col in ('fbref_metrics', 'understat_metrics', 'transfermarkt_metrics')
)

print(f"FBref: {len(fbref_nums.columns)} métricas")
print(f"Understat: {len(understat_nums.columns)} métricas")
print(f"Transfermarkt: {len(transfermarkt_nums.columns)} campos")

FBref: 145 métricas
Understat: 10 métricas
Transfermarkt: 2 campos


In [5]:
# Columns left out of the per-90 conversion.
exclude_per90 = {
    'minutes_played', 'age', 'birth_year', 'games_started', 'minutes_per_game',
    'minutes_per_start', 'games', 'games_subs', 'unused_sub', 'points_per_game',
    'on_goals_for', 'on_goals_against', 'plus_minus', 'plus_minus_per90',
    'plus_minus_wowy', 'on_xg_for', 'on_xg_against', 'xg_plus_minus',
    'xg_plus_minus_per90', 'xg_plus_minus_wowy'
}

# Shared denominator for both sources (Understat rates are also normalised by
# the FBref minutes). Non-positive minutes are masked to NaN so the division
# yields NaN instead of +/-inf; the exogenous player is not subject to the
# pool's min_minutes filter, so a zero here is possible.
minutes_played = fbref_nums['minutes_played'].where(fbref_nums['minutes_played'] > 0)

fbref_per90 = fbref_nums.loc[:, ~fbref_nums.columns.isin(exclude_per90)]
fbref_per90 = (fbref_per90.div(minutes_played, axis=0) * 90).round(3)
fbref_per90.columns = [f'{col}_per90' for col in fbref_per90.columns]

understat_per90 = understat_nums.loc[:, ~understat_nums.columns.isin(exclude_per90)]
understat_per90 = (understat_per90.div(minutes_played, axis=0) * 90).round(3)
understat_per90.columns = [f'{col}_per90' for col in understat_per90.columns]

print(f"Per90 FBref: {fbref_per90.shape[1]} columnas")
print(f"Per90 Understat: {understat_per90.shape[1]} columnas")

Per90 FBref: 144 columnas
Per90 Understat: 10 columnas


In [6]:
# Identity columns carried next to the numeric features.
base_cols = ['unique_player_id', 'player_name', 'team', 'league', 'season', 'position']

# Column-wise join on the shared row index: identity columns, raw metrics per
# source, then the per-90 versions.
df_final = pd.concat(
    objs=[
        full_df.loc[:, base_cols],
        fbref_nums,
        understat_nums,
        transfermarkt_nums,
        fbref_per90,
        understat_per90,
    ],
    axis='columns',
)

n_players, n_columns = df_final.shape
print(f"DataFrame final: {n_players} jugadores × {n_columns} columnas")

DataFrame final: 37 jugadores × 317 columnas


In [7]:
# Case-insensitive name lookup; rows with a missing player_name never match.
target = df_final.loc[df_final['player_name'].str.contains('Baena', case=False, na=False)]
replacement = df_final.loc[df_final['player_name'].str.contains('Moleiro', case=False, na=False)]

# Stop here if either player is absent from the frame.
if target.empty:
    raise ValueError("Baena no encontrado")
if replacement.empty:
    raise ValueError("Moleiro no encontrado")

# If several rows match, the first one is used.
target_row = target.iloc[0]
replacement_row = replacement.iloc[0]
target_id = target_row['unique_player_id']
replacement_id = replacement_row['unique_player_id']

print(f"Target: {target_row['player_name']} ({target_row['team']})")
print(f"  ID: {target_id}")
print(f"\nReemplazo: {replacement_row['player_name']} ({replacement_row['team']})")
print(f"  ID: {replacement_id}")

Target: Alex Baena (Villarreal)
  ID: eb4c447d1a00eb39

Reemplazo: Alberto Moleiro (Las Palmas)
  ID: e3039aff904c9c54


In [8]:
# Run the cosine-similarity search around the target player and evaluate the
# proposed replacement in the same call. The result is a mapping that the
# following cells read by key.
result = find_similar_players_cosine(
    df=df_final,
    target_player_id=target_id,
    replacement_id=replacement_id,
    n_similar=30,
    pca_variance=0.85,
    robust_scaling=False
)

Target: Alex Baena (Villarreal, ESP-La Liga)
Features: 154 (solo _per90, excl. GK)
Jugadores: 37 (eliminados 0 con NaNs CORE)
PCA: 15 componentes (varianza: 85.4%)
Reducción: 154 → 15 dimensiones
Top-30 encontrados
Rango similitud: [-0.2478, 0.5084]


In [9]:
# Columns shown in the ranking table, in display order.
report_columns = [
    'rank', 'player_name', 'team', 'league',
    'cosine_similarity', 'validation_status'
]

# .assign returns a new frame, so the table stored in `result` is not modified.
top30 = (
    result['similar_players']
    .loc[:, report_columns]
    .assign(cosine_similarity=lambda frame: frame['cosine_similarity'].round(4))
)

print("TOP-30 SIMILARES A BAENA")
print("="*80)
print(top30.to_string(index=False))

TOP-30 SIMILARES A BAENA
 rank          player_name            team             league  cosine_similarity validation_status
    1     Sebastian Nanasi      Strasbourg        FRA-Ligue 1             0.5084          VALIDADO
    2        Dwight McNeil         Everton ENG-Premier League             0.4875          VALIDADO
    3           Robin Hack        Gladbach     GER-Bundesliga             0.4681          VALIDADO
    4        Chris Führich       Stuttgart     GER-Bundesliga             0.4057          VALIDADO
    5        Anass Zaroury            Lens        FRA-Ligue 1             0.3837          VALIDADO
    6          Aron Dønnum        Toulouse        FRA-Ligue 1             0.2581          VALIDADO
    7       Osame Sahraoui           Lille        FRA-Ligue 1             0.2453          VALIDADO
    8            Raúl Moro      Valladolid        ESP-La Liga             0.1634          VALIDADO
    9          Samuel Lino Atlético Madrid        ESP-La Liga             0.1584    

In [10]:
repl_info = result['replacement_info']
rule = "="*80  # horizontal separator reused around the report

# Header: blank line, rule, title, rule.
print("\n" + rule, "VALIDACIÓN: MOLEIRO COMO REEMPLAZO DE BAENA", rule, sep="\n")

# Body: one line per field of the replacement summary.
print(
    f"Jugador: {repl_info['player_name']}",
    f"Equipo: {repl_info['team']} ({repl_info['league']})",
    f"Posición en ranking: #{repl_info['rank']}",
    f"Similitud coseno: {repl_info['cosine_similarity']:.4f}",
    f"Percentil: {repl_info['similarity_percentile']:.1f}%",
    f"Status: {repl_info['validation_status']}",
    sep="\n",
)
print(rule)


VALIDACIÓN: MOLEIRO COMO REEMPLAZO DE BAENA
Jugador: Alberto Moleiro
Equipo: Las Palmas (ESP-La Liga)
Posición en ranking: #17
Similitud coseno: -0.1063
Percentil: 52.8%
Status: PARCIAL


In [11]:
pca_info = result['pca_info']

# Title and separator.
print("\nINFORMACIÓN PCA", "="*80, sep="\n")

# Summary of the dimensionality reduction, followed by the per-component header.
print(
    f"Dimensiones originales: {pca_info['original_dimensions']}",
    f"Componentes retenidos: {pca_info['n_components']}",
    f"Varianza explicada: {pca_info['explained_variance_ratio']:.1%}",
    f"Ratio de compresión: {pca_info['compression_ratio']:.1%}",
    "\nVarianza por componente (top-5):",
    sep="\n",
)

# Components are numbered from 1 in the printout.
for i, var in enumerate(pca_info['top_5_components_variance'], start=1):
    print(f"  PC{i}: {var:.1%}")


INFORMACIÓN PCA
Dimensiones originales: 154
Componentes retenidos: 15
Varianza explicada: 85.4%
Ratio de compresión: 9.7%

Varianza por componente (top-5):
  PC1: 17.4%
  PC2: 12.5%
  PC3: 10.1%
  PC4: 9.3%
  PC5: 7.5%


In [12]:
dist = result['score_distribution']

print("\nDISTRIBUCIÓN DE SIMILITUDES")
print("="*80)

# (label padded to a common width, key in the distribution mapping), in print order.
summary_rows = (
    ("Mínimo:   ", 'min'),
    ("Q5:       ", 'q5'),
    ("Q25:      ", 'q25'),
    ("Mediana:  ", 'median'),
    ("Q75:      ", 'q75'),
    ("Q95:      ", 'q95'),
    ("Máximo:   ", 'max'),
    ("Media:    ", 'mean'),
    ("Desv.Est: ", 'std'),
)
for label, stat_key in summary_rows:
    print(f"{label}{dist[stat_key]:.4f}")


DISTRIBUCIÓN DE SIMILITUDES
Mínimo:   -0.7906
Q5:       -0.4478
Q25:      -0.2245
Mediana:  -0.1319
Q75:      0.1506
Q95:      0.4729
Máximo:   0.5084
Media:    -0.0485
Desv.Est: 0.2952
