# Gerónimo Rulli vs Luiz Júnior - PCA Similarity
## Villarreal - Portero

**Context:**
- Gerónimo Rulli: Cantera → vendido 8M€
- Luiz Júnior: Fichado 12M€ desde Liga Portuguesa
- Season 23/24: Rulli (Ajax), Luiz Júnior (Famalicão)

In [None]:
import pandas as pd
import numpy as np
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..')))

from database.connection import get_db_manager
from tfm.helpers.query_helpers import query_player_pool, add_exogenous_player
from tfm.helpers.algorithms import find_similar_players_cosine

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Candidate pool: goalkeepers from the Big-5 leagues plus the Portuguese league.
big5_leagues = ['ENG-Premier League', 'ESP-La Liga', 'ITA-Serie A', 'GER-Bundesliga', 'FRA-Ligue 1']

# Filters shared by every league query; defined once so the Big-5 and
# Portuguese pools cannot silently drift apart.
pool_filters = {
    'season': '2324',
    'positions': ['GK'],
    'max_market_value': 30_000_000,
    'min_minutes': 900,
    'max_age': 32,
}

pools = []
for league in big5_leagues:
    # Best-effort on purpose: one failing league must not abort the whole pool.
    try:
        pool = query_player_pool(league=league, **pool_filters)
        pools.append(pool)
        print(f"{league}: {len(pool)} jugadores")
    except Exception as e:
        # Name the league so the failing query can be identified from the output.
        print(f"Error {league}: {e}")

# Portugal is queried separately because it needs table_type='extras'.
try:
    pool_pt = query_player_pool(league='POR-Primeira Liga', table_type='extras', **pool_filters)
    pools.append(pool_pt)
    print(f"POR-Primeira Liga: {len(pool_pt)} jugadores")
except Exception as e:
    print(f"Error Portugal: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the real cause instead.
if not pools:
    raise ValueError("No se pudo cargar ningún pool de porteros: todas las consultas fallaron")

pool_df = pd.concat(pools, ignore_index=True)
print(f"\nPool: {len(pool_df)} porteros")

In [None]:
# Players outside the queried pool that must take part in the comparison:
# the departing goalkeeper and his replacement, appended in this order.
exogenous_players = (
    {'player_name': 'Gerónimo Rulli', 'league': 'NED-Eredivisie', 'season': '2324', 'team': 'Ajax'},
    {'player_name': 'Luiz Júnior', 'league': 'POR-Primeira Liga', 'season': '2324', 'team': 'Famalicão'},
)

# Each call receives the frame returned by the previous one.
augmented_df = pool_df
for player_spec in exogenous_players:
    augmented_df = add_exogenous_player(pool_df=augmented_df, **player_spec)

full_df = augmented_df
print(f"DataFrame: {len(full_df)} jugadores")

In [None]:
def extract_metrics(df: pd.DataFrame, col_name: str, min_valid: int = 5) -> pd.DataFrame:
    """Expand a column of metric dicts into a numeric DataFrame.

    Each cell of ``df[col_name]`` that is a ``dict`` contributes its keys as
    candidate columns; cells that are not dicts contribute nothing. Values
    are converted with ``_convert_to_float`` and a column is kept only when
    at least ``min_valid`` rows hold a non-NaN value.

    Args:
        df: Source frame; its index is reused for the result.
        col_name: Name of the column holding the metric dicts.
        min_valid: Minimum number of non-NaN values a metric needs to be kept.

    Returns:
        A float DataFrame indexed like ``df`` with one column per retained
        metric, ordered by first appearance of the key.

    Raises:
        KeyError: If ``col_name`` is not a column of ``df``.
    """
    # Read the column once instead of re-running iterrows() for every key.
    cells = df[col_name].tolist()

    # A dict keeps insertion order, so columns come out in first-seen order.
    # Iterating a set of strings would reorder them from one run to the next.
    ordered_keys = {}
    for cell in cells:
        if isinstance(cell, dict):
            ordered_keys.update(dict.fromkeys(cell))

    columns = {}
    for key in ordered_keys:
        values = np.array(
            [
                _convert_to_float(cell[key]) if isinstance(cell, dict) and key in cell else np.nan
                for cell in cells
            ],
            dtype=float,
        )
        if np.count_nonzero(~np.isnan(values)) >= min_valid:
            columns[key] = values

    # Build the frame in one go; inserting columns one by one fragments it.
    return pd.DataFrame(columns, index=df.index)


def _convert_to_float(value):
    """Convert a raw metric value to ``float``, returning NaN when impossible.

    Python and NumPy real numbers are converted directly. Strings are parsed
    with ``float`` unless they are blank or one of the placeholders
    ``nan``/``none``/``null``/``-`` (case-insensitive). Everything else,
    including ``None`` and containers, yields NaN.
    """
    # np.integer / np.floating are listed explicitly: np.int64 is not a
    # subclass of int and would otherwise fall through to NaN.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if isinstance(value, str):
        text = value.strip()
        if text == '' or text.lower() in ('nan', 'none', 'null', '-'):
            return np.nan
        try:
            return float(text)
        except ValueError:
            return np.nan
    # None, pd.NA, lists, dicts, ...: not a usable number. No pd.isna() call
    # here because it returns an array for list-likes and breaks the `if`.
    return np.nan

# One numeric frame per metrics source column of full_df.
metric_source_columns = ('fbref_metrics', 'understat_metrics', 'transfermarkt_metrics')
fbref_nums, understat_nums, transfermarkt_nums = [
    extract_metrics(full_df, source_column) for source_column in metric_source_columns
]
print(f"Métricas: {fbref_nums.shape[1]} FBref")

In [None]:
# Columns that are left out of the per-100-touches conversion; this includes
# the 'Touches_Touches' denominator itself.
exclude_normalization = {
    'minutes_played', 'age', 'birth_year', 'games_started', 'minutes_per_game',
    'minutes_per_start', 'games', 'games_subs', 'unused_sub', 'points_per_game',
    'on_goals_for', 'on_goals_against', 'plus_minus', 'plus_minus_per90',
    'plus_minus_wowy', 'on_xg_for', 'on_xg_against', 'xg_plus_minus',
    'xg_plus_minus_per90', 'xg_plus_minus_wowy', 'Touches_Touches',
}


def _to_per100_touches(metrics, touches, excluded):
    """Return the non-excluded columns of ``metrics`` as per-100-touches rates.

    Args:
        metrics: Numeric metrics frame, one row per player.
        touches: Series of touch counts aligned with ``metrics`` on the index.
        excluded: Column names that must not be converted.

    Returns:
        A new frame, rounded to 3 decimals, with every column name suffixed
        by ``_per100touches``.
    """
    rate_columns = metrics.loc[:, ~metrics.columns.isin(excluded)]
    # A zero touch count would turn the division into +/-inf; mask it to NaN
    # so the rate is reported as missing rather than infinite.
    safe_touches = touches.replace(0, np.nan)
    return rate_columns.div(safe_touches, axis=0).mul(100).round(3).add_suffix('_per100touches')


# Both sources are divided by the FBref touch count.
fbref_per100 = _to_per100_touches(fbref_nums, fbref_nums['Touches_Touches'], exclude_normalization)
understat_per100 = _to_per100_touches(understat_nums, fbref_nums['Touches_Touches'], exclude_normalization)
print(f"Per100: {fbref_per100.shape[1]} + {understat_per100.shape[1]}")

In [None]:
# Identity columns first, then every raw and per-100-touches metric block,
# joined side by side on the shared index.
base_cols = ['unique_player_id', 'player_name', 'team', 'league', 'season', 'position']
metric_blocks = [fbref_nums, understat_nums, transfermarkt_nums, fbref_per100, understat_per100]
df_final = pd.concat([full_df[base_cols], *metric_blocks], axis=1)

n_players, n_columns = df_final.shape
print(f"DataFrame final: {n_players} × {n_columns}")

In [None]:
# Locate both players with a case-insensitive name match combined with a
# team match; missing values in either column never match.
player_names = df_final['player_name']
team_names = df_final['team']

rulli_at_ajax = (
    player_names.str.contains('Rulli', case=False, na=False)
    & team_names.str.contains('Ajax', case=False, na=False)
)
luiz_at_famalicao = (
    player_names.str.contains('Luiz', case=False, na=False)
    & team_names.str.contains('Famalicão', case=False, na=False)
)

target = df_final[rulli_at_ajax]
replacement = df_final[luiz_at_famalicao]

if target.empty or replacement.empty:
    raise ValueError("Jugador no encontrado")

# When several rows match, the first one is used.
target_id = target.iloc[0]['unique_player_id']
replacement_id = replacement.iloc[0]['unique_player_id']
print(f"Target: {target.iloc[0]['player_name']} | Replacement: {replacement.iloc[0]['player_name']}")

In [None]:
# Settings for the similarity search, spelled out one per line.
similarity_settings = {
    'df': df_final,
    'target_player_id': target_id,
    'n_similar': 30,
    'pca_variance': 0.85,
    'replacement_id': replacement_id,
    'robust_scaling': False,
}
result = find_similar_players_cosine(**similarity_settings)

In [None]:
from tfm.helpers.viz_helpers import plot_top10_ranking
from IPython.display import Image, display

# Render the top-10 ranking, then show the image file whose path the helper returns.
output_path = plot_top10_ranking(
    result=result,
    df_data=full_df,
    save_path='rulli_luiz_top10_ranking.png',
    target_face_path=None,
    highlight_target=True,
    dpi=300,
)
print(f"Visualización: {output_path}")

ranking_image = Image(filename=output_path)
display(ranking_image)

In [None]:
# Report the PCA summary stored in the similarity result.
pca_info = result['pca_info']
n_components = pca_info['n_components']
explained_variance = pca_info['explained_variance_ratio']
print(f"\nPCA: {n_components} componentes ({explained_variance:.1%} varianza)")

In [None]:
# Report the min / median / max of the score distribution stored in the result.
dist = result['score_distribution']
score_min, score_median, score_max = dist['min'], dist['median'], dist['max']
print(f"\nSimilitud - Min: {score_min:.4f} | Median: {score_median:.4f} | Max: {score_max:.4f}")