# Ejemplo: K-Means Similarity - Jackson → Sørloth

Validación de la cadena de sustitución: Nicolas Jackson (vendido 37M€) → Alexander Sørloth (fichado 10M€)

**Temporada de análisis**: 22/23 (X-1 rule)

In [None]:
# Data libraries used throughout the notebook.
import pandas as pd
import numpy as np
import sys
import os
# Put the parent of the current working directory on the import path. This
# must run before the project imports below.
# NOTE(review): presumably the notebook is launched from a subdirectory of
# the project root so that `database` and `tfm` resolve — confirm.
sys.path.append(os.path.dirname(os.getcwd()))

from database.connection import get_db_manager
from tfm.algorithms import find_similar_players_kmeans

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any raised by the libraries imported above.
warnings.filterwarnings('ignore')

## 1. Query DB con Filtros

In [None]:
# Open the project database connection.
db = get_db_manager()

# Query: every La Liga 22/23 player row. The SQL does not filter by position;
# the forward / minimum-minutes filter is applied afterwards in pandas.
query = """
SELECT 
    unique_player_id,
    player_name,
    team,
    league,
    season,
    position,
    fbref_metrics,
    understat_metrics
FROM footballdecoded.players_domestic
WHERE league = 'ESP-La Liga'
AND season = '2223'
ORDER BY team, player_name
"""

# Release the connection even when the query raises, so a failed read does
# not leave it open for the rest of the notebook session.
try:
    df_raw = pd.read_sql(query, db.engine)
finally:
    db.close()

print(f"Total jugadores: {len(df_raw)}")
print(f"Columnas: {df_raw.columns.tolist()}")

In [None]:
# Filter: forwards only, with at least 200 minutes played.
# Minutes live inside the per-row `fbref_metrics` dict. Coerce them to numbers
# first so that a missing dict, a None value or a numeric string cannot raise
# during the comparison; anything non-numeric becomes NaN and is excluded.
minutes_played = pd.to_numeric(
    df_raw['fbref_metrics'].apply(
        lambda metrics: metrics.get('minutes_played') if isinstance(metrics, dict) else None
    ),
    errors='coerce',
)
is_forward = df_raw['position'].str.contains('FW', na=False)

# .copy() so later cells can modify df_filtered without touching df_raw.
df_filtered = df_raw[is_forward & (minutes_played >= 200)].copy()

print(f"Delanteros con 200+ minutos: {len(df_filtered)}")
df_filtered[['player_name', 'team', 'position']].head(10)

## 2. Extracción JSONB

In [None]:
# extract_metrics helper (taken from the Etta_Eyong_radars.ipynb notebook).

# String spellings (compared lower-cased and stripped) treated as "missing".
_MISSING_TOKENS = frozenset({'nan', 'none', 'null', '-'})


def extract_metrics(df, col_name, min_valid=5):
    """Expand a column of metric dicts into one float column per key.

    Args:
        df: DataFrame whose ``col_name`` column holds a dict per row; rows
            holding anything else contribute NaN to every metric.
        col_name: Name of the dict-valued column to expand.
        min_valid: Minimum number of non-NaN values a metric needs in order
            to be kept. Defaults to 5.

    Returns:
        DataFrame indexed like ``df`` with one float column per retained key.
        Columns appear in the order the keys are first seen, so the layout
        is reproducible from run to run.
    """
    cells = df[col_name].tolist()

    # A dict keeps insertion order, unlike a set, whose order varies by session.
    ordered_keys = {}
    for cell in cells:
        if isinstance(cell, dict):
            ordered_keys.update(dict.fromkeys(cell))

    columns = {}
    for key in ordered_keys:
        values = [
            _convert_to_float(cell[key])
            if isinstance(cell, dict) and key in cell
            else np.nan
            for cell in cells
        ]
        valid_count = int(np.count_nonzero(~np.isnan(values)))
        if valid_count >= min_valid:
            columns[key] = values

    # Build the frame in one go instead of inserting columns one at a time.
    return pd.DataFrame(columns, index=df.index)


def _convert_to_float(value):
    """Coerce one raw metric value to ``float``, or NaN when it is not numeric.

    Numbers (bools and NumPy scalars included) are cast with ``float``.
    Strings are stripped; blank strings and the tokens in ``_MISSING_TOKENS``
    map to NaN, anything else is parsed with ``float`` and maps to NaN when
    parsing fails. Every other type (None, list, dict, ...) maps to NaN
    rather than raising.
    """
    if isinstance(value, str):
        text = value.strip()
        if not text or text.lower() in _MISSING_TOKENS:
            return np.nan
        try:
            return float(text)
        except ValueError:
            return np.nan
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    return np.nan

# Flatten both JSONB metric columns into numeric tables.
fbref_nums = extract_metrics(df=df_filtered, col_name='fbref_metrics')
understat_nums = extract_metrics(df=df_filtered, col_name='understat_metrics')

# Report how many metrics each source yielded.
n_fbref_metrics = fbref_nums.shape[1]
n_understat_metrics = understat_nums.shape[1]
print(f"FBref: {n_fbref_metrics} métricas")
print(f"Understat: {n_understat_metrics} métricas")

## 3. Normalización per90 (Exacto como Etta_Eyong_radars.ipynb)

In [None]:
# Metrics that must NOT be rescaled to per-90 values.
exclude_per90 = {
    # Percentages
    'pass_completion_pct', 'shots_on_target_pct', 'Take-Ons_Succ%', 'Take-Ons_Tkld%',
    'Aerial Duels_Won%', 'Challenges_Tkl%', 'Save%', 'Launched_Cmp%', 'Crosses_Stp%',
    # Already expressed per 90
    'shots_per_90', 'GA90', 'GCA_GCA90', 'SCA_SCA90', 'Team Success_+/-90', 'SoT/90', 'Sweeper_#OPA/90',
    # Ratios
    'npxG/Sh', 'xG+xAG', 'non_penalty_xG_plus_xAG', 'avg_shot_distance', 'Passes_AvgLen', 'Goal Kicks_AvgLen',
    # Metadata
    'minutes_per_match', 'matches_played', 'matches_started', 'minutes_played',
    'wins', 'draws', 'losses', 'Min%', 'Starts_Mn/Start', 'Subs_Mn/Sub',
    # Understat
    'understat_buildup_involvement_pct', 'understat_player_id', 'understat_team_id'
}

# Both sources are scaled by the FBref minutes of the same row.
minutes = fbref_nums['minutes_played']

# FBref: value / minutes * 90, rounded to 3 decimals, columns suffixed "_per90".
fbref_scalable = ~fbref_nums.columns.isin(exclude_per90)
fbref_per90 = (
    fbref_nums.loc[:, fbref_scalable]
    .div(minutes, axis=0)
    .mul(90)
    .round(3)
    .add_suffix('_per90')
)

# Understat: same transformation.
understat_scalable = ~understat_nums.columns.isin(exclude_per90)
understat_per90 = (
    understat_nums.loc[:, understat_scalable]
    .div(minutes, axis=0)
    .mul(90)
    .round(3)
    .add_suffix('_per90')
)

n_fbref_per90 = fbref_per90.shape[1]
n_understat_per90 = understat_per90.shape[1]
print(f"Per90 FBref: {n_fbref_per90}")
print(f"Per90 Understat: {n_understat_per90}")

## 4. Combinar Todas las Métricas

In [None]:
# Identity columns carried alongside the metrics.
base_cols = ['unique_player_id', 'player_name', 'team', 'league', 'season', 'position']

# Column-wise join on the shared row index: identity columns, raw metrics,
# then per-90 metrics.
frames = [
    df_filtered[base_cols],
    fbref_nums,
    understat_nums,
    fbref_per90,
    understat_per90,
]
df_final = pd.concat(frames, axis=1)

# Summary: overall shape plus a preview of the first five per-90 columns.
n_players, n_columns = df_final.shape
per90_preview = [name for name in df_final.columns if name.endswith('_per90')][:5]
print(f"DataFrame final: {n_players} jugadores × {n_columns} columnas")
print(f"\nColumnas básicas: {base_cols}")
print(f"Métricas per90: {per90_preview}...")

## 5. Buscar Jackson y Sørloth

In [None]:
# Columns shown for each name search.
id_cols = ['unique_player_id', 'player_name', 'team']
names = df_final['player_name']

# Jackson: case-insensitive substring match on the player name.
jackson = df_final[names.str.contains('Jackson', case=False, na=False)]
print("JACKSON:")
print(jackson[id_cols])

# The first match wins; None marks "not found".
jackson_id = None if len(jackson) == 0 else jackson.iloc[0]['unique_player_id']

# Sørloth: accept both the accented and the ASCII spelling.
sorloth = df_final[names.str.contains('Sørloth|Sorloth', case=False, na=False)]
print("\nSØRLOTH:")
print(sorloth[id_cols])

sorloth_id = None if len(sorloth) == 0 else sorloth.iloc[0]['unique_player_id']

## 6. Ejecutar Algoritmo K-Means

In [None]:
# Run the similarity search only when the target player was found.
# The "not found" sentinel is None, so compare against None explicitly;
# a truthiness test would also reject a legitimate falsy id such as 0.
if jackson_id is not None:
    result = find_similar_players_kmeans(
        df=df_final,
        target_player_id=jackson_id,
        n_similar=10,
        k_clusters=None,  # Elbow method (per the original author's note)
        use_pca=False
    )

    print("\n" + "="*60)
    print("RESULTADOS K-MEANS")
    print("="*60)
else:
    # NOTE(review): `result` stays undefined on this path, so the analysis
    # cells that follow will fail with NameError.
    print("ERROR: Jackson no encontrado en dataset")

## 7. Analizar Resultados

In [None]:
# Print each summary section of the K-Means result under its heading.
for heading, key in (
    ("\nCLUSTER INFO:", 'cluster_info'),
    ("\nDISTRIBUCIÓN DISTANCIAS:", 'distances_distribution'),
    ("\nMETADATA:", 'metadata'),
):
    print(heading)
    print(result[key])

In [None]:
# Table of the players returned as most similar to the target.
print("\nTOP 10 JUGADORES MÁS SIMILARES A JACKSON:")
print("=" * 80)
similar = result['similar_players']
display_cols = ['player_name', 'team', 'euclidean_distance', 'distance_percentile']
print(similar.loc[:, display_cols].to_string(index=False))

In [None]:
# Does Sørloth appear among the similar players returned for Jackson?
# The "not found" sentinel is None, so compare against None explicitly.
if sorloth_id is not None:
    # Positional row numbers of the matches. The DataFrame index labels are
    # not used because they only equal the rank when the index is 0..n-1.
    match_positions = np.flatnonzero(
        (similar['unique_player_id'] == sorloth_id).to_numpy()
    )

    if match_positions.size > 0:
        rank = int(match_positions[0])
        sorloth_row = similar.iloc[rank]
        print("\n✅ SØRLOTH ENCONTRADO EN TOP-10:")
        # NOTE(review): assumes `similar` is ordered from most to least
        # similar, so row position equals rank — confirm.
        print(f"   Posición: {rank + 1}")
        print(f"   Distancia euclidiana: {sorloth_row['euclidean_distance']:.4f}")
        print(f"   Percentil: {sorloth_row['distance_percentile']:.1f}")

        # Check the TFM validation criteria.
        q25 = result['distances_distribution']['q25']
        silhouette = result['cluster_info']['silhouette_score']

        print("\n   VALIDACIÓN TFM:")
        # NOTE(review): co-membership is reported as passed without being
        # checked here; presumably `similar` only contains players from the
        # target's cluster — confirm.
        print(f"   - Co-pertenencia cluster: ✅ (ambos en cluster {result['target_cluster']})")
        print(f"   - Distancia ≤ Q25 ({q25:.4f}): {'✅' if sorloth_row['euclidean_distance'] <= q25 else '❌'}")
        print(f"   - Silhouette > 0.3: {'✅' if silhouette > 0.3 else '❌'} ({silhouette:.3f})")
    else:
        print("\n❌ SØRLOTH NO ESTÁ EN TOP-10")
        print("   Revisar si está en el mismo cluster o si distancia es alta")
else:
    print("\n⚠️ Sørloth no encontrado en dataset (puede estar en otra liga en 22/23)")