In [2]:
import pandas as pd
import numpy as np
import sys
import os

# Go up three levels from tfm/teams/villarreal/ to data/ and put that directory on
# sys.path so the project imports below resolve. This is derived from the current
# working directory, so it depends on where the kernel was started.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..')))

from database.connection import get_db_manager
from tfm.helpers.query_helpers import query_player_pool, add_exogenous_player
from tfm.helpers.algorithms import find_similar_players_cosine

import warnings
# Suppresses every warning category for the rest of the session (no category or
# module filter is given), so deprecation and performance warnings raised by later
# cells will not be shown.
warnings.filterwarnings('ignore')

In [3]:
# Build the candidate pool: one query per league with identical filters
# (season code '2425', position 'LW', caps on market value and age, minimum minutes).
big5_leagues = ['ENG-Premier League', 'ESP-La Liga', 'ITA-Serie A', 'GER-Bundesliga', 'FRA-Ligue 1']

pools = []
failed_leagues = []  # leagues whose query raised, reported once the loop is done
for league in big5_leagues:
    try:
        pool = query_player_pool(
            league=league,
            season='2425',
            positions=['LW'],
            max_market_value=30_000_000,
            min_minutes=900,
            max_age=26
        )
        pools.append(pool)
        print(f"{league}: {len(pool)} jugadores")
    except Exception as e:
        # Best effort: a failing league must not abort the others, but it is
        # recorded so an incomplete pool cannot go unnoticed.
        failed_leagues.append(league)
        print(f"Error en {league}: {e}")

# pd.concat([]) would fail with an opaque "No objects to concatenate"; raise the
# same exception type with a message that names the real cause.
if not pools:
    raise ValueError(
        "No se pudo cargar ningún pool de jugadores; "
        f"ligas con error: {', '.join(failed_leagues)}"
    )
if failed_leagues:
    print(f"\nAviso: pool incompleto, ligas con error: {', '.join(failed_leagues)}")

pool_df = pd.concat(pools, ignore_index=True)

print(f"\nPool total: {len(pool_df)} jugadores")
print(f"\nDistribución por liga:")
print(pool_df['league'].value_counts())

2025-12-08 16:55:24,012 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-08 16:55:24,479 - database.connection - INFO - Database connection successful
2025-12-08 16:55:24,583 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-08 16:55:24,627 - database.connection - INFO - Database connection successful
2025-12-08 16:55:24,711 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-08 16:55:24,765 - database.connection - INFO - Database connection successful


ENG-Premier League: 9 jugadores
ESP-La Liga: 15 jugadores


2025-12-08 16:55:24,837 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-08 16:55:24,868 - database.connection - INFO - Database connection successful
2025-12-08 16:55:24,910 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-08 16:55:24,953 - database.connection - INFO - Database connection successful


ITA-Serie A: 6 jugadores
GER-Bundesliga: 7 jugadores
FRA-Ligue 1: 13 jugadores

Pool total: 50 jugadores

Distribución por liga:
league
ESP-La Liga           15
FRA-Ligue 1           13
ENG-Premier League     9
GER-Bundesliga         7
ITA-Serie A            6
Name: count, dtype: int64


In [5]:
# Add the reference player to the candidate pool. The helper returns the frame
# used from here on as `full_df`.
target_spec = {
    'player_name': 'Alex Baena',
    'league': 'ESP-La Liga',
    'season': '2425',
    'team': 'Villarreal',
}
full_df = add_exogenous_player(pool_df=pool_df, **target_spec)

print(f"DataFrame con Target: {len(full_df)} jugadores")

2025-12-08 16:55:50,918 - database.connection - INFO - Connecting to database: localhost:5432/footballdecoded_dev
2025-12-08 16:55:50,963 - database.connection - INFO - Database connection successful


Added exogenous player: Alex Baena (Villarreal, ESP-La Liga 2425)
Total players in DataFrame: 51
DataFrame con Target: 51 jugadores


In [6]:
def extract_metrics(df, col_name, min_valid=5):
    """Expand a column of per-player metric dicts into a numeric DataFrame.

    Each cell of ``df[col_name]`` is expected to be a ``dict`` mapping a metric
    name to a raw value; cells that are not dicts contribute no values. Every
    raw value is coerced with ``_convert_to_float``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; its index is reused for the result.
    col_name : str
        Name of the column holding the metric dicts.
    min_valid : int, default 5
        A metric is kept only if at least this many rows have a non-NaN value.

    Returns
    -------
    pandas.DataFrame
        One column per retained metric, indexed like ``df``. Columns appear in
        the order the metric names are first encountered while scanning the rows
        top to bottom, so the layout is identical on every run.
    """
    # Read the column once; non-dict cells behave like an empty mapping.
    payloads = [cell if isinstance(cell, dict) else {} for cell in df[col_name]]

    # A dict keeps insertion order, unlike a set whose iteration order for string
    # keys changes between interpreter sessions.
    ordered_keys = {}
    for payload in payloads:
        for key in payload:
            ordered_keys.setdefault(key, None)

    columns = {}
    for key in ordered_keys:
        values = [
            _convert_to_float(payload[key]) if key in payload else np.nan
            for payload in payloads
        ]
        if pd.Series(values).notna().sum() >= min_valid:
            columns[key] = values

    # Build the frame in one step instead of inserting column by column.
    return pd.DataFrame(columns, index=df.index)


def _convert_to_float(value):
    """Coerce a single raw metric value to ``float``; unusable values become NaN.

    Handles Python and numpy real numbers, ``Decimal`` and numeric strings.
    Empty strings, the placeholders ``'nan'``, ``'none'``, ``'null'`` and ``'-'``
    (case-insensitive, surrounding whitespace ignored), ``None`` and any other
    type (lists, dicts, arrays, missing-value sentinels, ...) map to NaN. This
    function never raises.
    """
    from decimal import Decimal  # local import keeps the cell self-contained

    if value is None:
        return np.nan

    # bool is a subclass of int, so True/False become 1.0/0.0.
    if isinstance(value, (int, float, np.integer, np.floating, Decimal)):
        try:
            return float(value)
        except (ValueError, TypeError, OverflowError):
            # e.g. an int too large for a float, or a signalling Decimal NaN
            return np.nan

    if isinstance(value, str):
        text = value.strip()
        if text == '' or text.lower() in ('nan', 'none', 'null', '-'):
            return np.nan
        try:
            return float(text)
        except (ValueError, TypeError):
            return np.nan

    # Containers and other objects are not scalar metrics. Returning NaN directly
    # avoids evaluating the truth value of an array-valued pd.isna() result.
    return np.nan

# Flatten the three metric-dict columns of full_df into numeric frames, one per source.
fbref_nums, understat_nums, transfermarkt_nums = (
    extract_metrics(full_df, source_col)
    for source_col in ('fbref_metrics', 'understat_metrics', 'transfermarkt_metrics')
)

# Column counts per source, as a quick check on how many metrics were retained.
n_fbref = fbref_nums.shape[1]
n_understat = understat_nums.shape[1]
n_transfermarkt = transfermarkt_nums.shape[1]
print(f"FBref: {n_fbref} métricas")
print(f"Understat: {n_understat} métricas")
print(f"Transfermarkt: {n_transfermarkt} campos")

FBref: 185 métricas
Understat: 10 métricas
Transfermarkt: 2 campos


In [7]:
# Columns that are never rescaled to a per-90 rate.
# NOTE(review): every other column is rescaled, including any that may already be
# ratios or percentages — confirm that is intended.
exclude_per90 = {
    'minutes_played', 'age', 'birth_year', 'games_started', 'minutes_per_game',
    'minutes_per_start', 'games', 'games_subs', 'unused_sub', 'points_per_game',
    'on_goals_for', 'on_goals_against', 'plus_minus', 'plus_minus_per90',
    'plus_minus_wowy', 'on_xg_for', 'on_xg_against', 'xg_plus_minus',
    'xg_plus_minus_per90', 'xg_plus_minus_wowy'
}


def _to_per90(metrics, minutes, excluded):
    """Scale the columns of ``metrics`` to per-90-minute rates.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Numeric metrics, one row per player.
    minutes : pandas.Series
        Minutes played, aligned with ``metrics`` on the index.
    excluded : set
        Column names to leave out of the result.

    Returns
    -------
    pandas.DataFrame
        ``value / minutes * 90`` rounded to 3 decimals, with ``_per90`` appended
        to each column name. Rows whose minutes are zero, negative or NaN yield
        NaN rather than +/-inf.
    """
    # Mask non-positive minutes so the division cannot produce infinities.
    safe_minutes = minutes.where(minutes > 0)
    kept = metrics.loc[:, ~metrics.columns.isin(excluded)]
    per90 = (kept.div(safe_minutes, axis=0) * 90).round(3)
    per90.columns = [f'{col}_per90' for col in per90.columns]
    return per90


if 'minutes_played' not in fbref_nums.columns:
    raise KeyError("'minutes_played' no está disponible en las métricas de FBref")

# FBref minutes are the denominator for both sources; both frames were built on
# the index of full_df, so the row-wise division aligns.
minutes_played = fbref_nums['minutes_played']
fbref_per90 = _to_per90(fbref_nums, minutes_played, exclude_per90)
understat_per90 = _to_per90(understat_nums, minutes_played, exclude_per90)

print(f"Per90 FBref: {fbref_per90.shape[1]} columnas")
print(f"Per90 Understat: {understat_per90.shape[1]} columnas")

Per90 FBref: 184 columnas
Per90 Understat: 10 columnas


In [8]:
# Identity columns carried over from full_df next to the numeric blocks.
base_cols = ['unique_player_id', 'player_name', 'team', 'league', 'season', 'position']

# Side-by-side join: identity, raw metrics per source, then the per-90 versions.
column_groups = [
    full_df[base_cols],
    fbref_nums,
    understat_nums,
    transfermarkt_nums,
    fbref_per90,
    understat_per90,
]
df_final = pd.concat(column_groups, axis='columns')

print(f"DataFrame final: {df_final.shape[0]} jugadores × {df_final.shape[1]} columnas")

DataFrame final: 51 jugadores × 397 columnas


In [9]:
def _find_player_rows(df, name_fragment, not_found_message):
    """Return the rows of ``df`` whose ``player_name`` contains ``name_fragment``.

    Matching is case-insensitive and literal (the fragment is not interpreted as
    a regular expression); rows with a missing name never match.

    Raises
    ------
    ValueError
        With ``not_found_message`` when no row matches.

    When several rows match, a notice listing them is printed, because callers
    use the first row only.
    """
    name_mask = df['player_name'].str.contains(
        name_fragment, case=False, na=False, regex=False
    )
    matches = df[name_mask]
    if len(matches) == 0:
        raise ValueError(not_found_message)
    if len(matches) > 1:
        listed = ', '.join(
            f"{name} ({team})"
            for name, team in zip(matches['player_name'], matches['team'])
        )
        print(
            f"Aviso: '{name_fragment}' coincide con {len(matches)} jugadores "
            f"({listed}); se usa el primero"
        )
    return matches


target = _find_player_rows(df_final, 'Baena', "Target no encontrado")
replacement = _find_player_rows(df_final, 'Moleiro', "Replacement no encontrado")

# The first matching row of each lookup provides the IDs passed to the similarity search.
target_id = target.iloc[0]['unique_player_id']
replacement_id = replacement.iloc[0]['unique_player_id']

print(f"Target: {target.iloc[0]['player_name']} ({target.iloc[0]['team']})")
print(f"  ID: {target_id}")
print(f"\nReemplazo: {replacement.iloc[0]['player_name']} ({replacement.iloc[0]['team']})")
print(f"  ID: {replacement_id}")

Target: Alex Baena (Villarreal)
  ID: eb4c447d1a00eb39

Reemplazo: Alberto Moleiro (Las Palmas)
  ID: e3039aff904c9c54


In [10]:
# Settings for the similarity search; they are forwarded unchanged to the helper
# in tfm.helpers.algorithms, which defines their exact meaning.
similarity_params = {
    'n_similar': 30,
    'pca_variance': 0.85,
    'robust_scaling': False,
}

# Later cells read result['pca_info'] and result['score_distribution'].
result = find_similar_players_cosine(
    df=df_final,
    target_player_id=target_id,
    replacement_id=replacement_id,
    **similarity_params
)

Target: Alex Baena (Villarreal, ESP-La Liga)
Features: 170 (solo _per90, excl. GK)
Jugadores: 51 (eliminados 0 con NaNs CORE)
PCA: 19 componentes (varianza: 85.9%)
Reducción: 170 → 19 dimensiones
Top-30 encontrados
Rango similitud: [-0.1840, 0.5391]


In [None]:
from IPython.display import Image, display
from tfm.helpers.viz_helpers import plot_top10_ranking

# Resolve the face image from the project root instead of a user-specific absolute
# path, using the same "three levels up from the working directory" rule as the
# sys.path setup in the first cell.
# NOTE(review): assumes the kernel runs from tfm/teams/villarreal/ and that the
# image lives under <root>/blog/caras/tfm/ — confirm on a second machine.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
target_face_path = os.path.join(project_root, 'blog', 'caras', 'tfm', 'Baena.png')

# Render the ranking figure; the returned path is printed and then displayed inline.
output_path = plot_top10_ranking(
    result=result,
    df_data=full_df,
    save_path='baena_moleiro_top10_ranking.png',
    target_face_path=target_face_path,
    highlight_target=True,
    dpi=300
)

print(f"Visualización generada: {output_path}")

display(Image(filename=output_path))

In [12]:
# Text report of the PCA step stored in the similarity result.
pca_info = result['pca_info']

# (label, key in pca_info, format spec) — an empty spec prints the value as-is.
pca_report_rows = [
    ("Dimensiones originales", 'original_dimensions', ''),
    ("Componentes retenidos", 'n_components', ''),
    ("Varianza explicada", 'explained_variance_ratio', '.1%'),
    ("Ratio de compresión", 'compression_ratio', '.1%'),
]

print("\nINFORMACIÓN PCA")
print("=" * 80)
for label, key, spec in pca_report_rows:
    print(f"{label}: {format(pca_info[key], spec)}")

print("\nVarianza por componente (top-5):")
for rank, share in enumerate(pca_info['top_5_components_variance'], start=1):
    print(f"  PC{rank}: {share:.1%}")


INFORMACIÓN PCA
Dimensiones originales: 170
Componentes retenidos: 19
Varianza explicada: 85.9%
Ratio de compresión: 11.2%

Varianza por componente (top-5):
  PC1: 17.7%
  PC2: 10.2%
  PC3: 9.3%
  PC4: 7.8%
  PC5: 6.4%


In [13]:
# Text report of the similarity-score distribution stored in the result.
dist = result['score_distribution']

# (padded label, key in dist); the labels are padded so the values line up.
dist_report_rows = [
    ("Mínimo:   ", 'min'),
    ("Q5:       ", 'q5'),
    ("Q25:      ", 'q25'),
    ("Mediana:  ", 'median'),
    ("Q75:      ", 'q75'),
    ("Q95:      ", 'q95'),
    ("Máximo:   ", 'max'),
    ("Media:    ", 'mean'),
    ("Desv.Est: ", 'std'),
]

print("\nDISTRIBUCIÓN DE SIMILITUDES")
print("=" * 80)
for label, key in dist_report_rows:
    print(f"{label}{dist[key]:.4f}")


DISTRIBUCIÓN DE SIMILITUDES
Mínimo:   -0.6660
Q5:       -0.5060
Q25:      -0.2606
Mediana:  -0.0947
Q75:      0.1490
Q95:      0.4239
Máximo:   0.5391
Media:    -0.0553
Desv.Est: 0.2940
