# Samuel Chukwueze vs Tajon Buchanan - PCA\n## Villarreal - Extremo\n\n**Context:**\n- Chukwueze vendido 21M→Milan | Buchanan cedido+opción 9M (23/24)

In [None]:
import pandas as pd\nimport numpy as np\nimport sys\nimport os\n\nsys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..')))\n\nfrom database.connection import get_db_manager\nfrom tfm.helpers.query_helpers import query_player_pool, add_exogenous_player\nfrom tfm.helpers.algorithms import find_similar_players_cosine\n\nimport warnings\nwarnings.filterwarnings('ignore')

In [None]:
# Build the candidate pool: forwards from the five listed leagues, season
# '2324', filtered by market value, minutes and age via query_player_pool.
big5_leagues = ['ENG-Premier League', 'ESP-La Liga', 'ITA-Serie A', 'GER-Bundesliga', 'FRA-Ligue 1']

pools = []
for league in big5_leagues:
    # Best-effort per league: a failing query is reported and skipped so the
    # remaining leagues can still be loaded.
    try:
        pool = query_player_pool(
            league=league,
            season='2324',
            positions=['FW'],
            max_market_value=30_000_000,
            min_minutes=900,
            max_age=28,
        )
        pools.append(pool)
        print(f"{league}: {len(pool)} jugadores")
    except Exception as e:
        print(f"Error en {league}: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with an explicit message (same exception type) when every query failed.
if not pools:
    raise ValueError("No se pudo cargar ningún pool de jugadores")

pool_df = pd.concat(pools, ignore_index=True)
print(f"\nPool total: {len(pool_df)} jugadores")

In [None]:
# Players appended to the pool through add_exogenous_player, as
# (player_name, team) pairs. Both are requested with the same league/season.
# NOTE(review): league='ESP-La Liga' is passed for teams 'AC Milan' and
# 'Brugge' -- confirm this is what add_exogenous_player expects.
exogenous_players = [
    ('Samuel Chukwueze', 'AC Milan'),
    ('Tajon Buchanan', 'Brugge'),
]

full_df = pool_df
for exo_name, exo_team in exogenous_players:
    # Each call receives the frame returned by the previous one.
    full_df = add_exogenous_player(
        pool_df=full_df,
        player_name=exo_name,
        league='ESP-La Liga',
        season='2324',
        team=exo_team,
    )
print(f"DataFrame: {len(full_df)} jugadores")

In [None]:
# Placeholder strings (compared stripped and lower-cased) that mean "no value".
_MISSING_TOKENS = frozenset({'nan', 'none', 'null', '-'})


def _convert_to_float(value):
    """Convert a raw metric value to ``float``; return ``np.nan`` if it cannot be.

    Accepted inputs are Python ``int``/``float`` (``bool`` included, since it
    is an ``int`` subclass), NumPy integer/floating scalars, and strings that
    ``float()`` can parse. Empty strings, the placeholders in
    ``_MISSING_TOKENS``, ``None`` and any other type yield ``np.nan``.
    """
    # NumPy scalars such as np.int64 / np.float32 are not instances of
    # int / float, so they are listed explicitly to avoid turning them into NaN.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if isinstance(value, str):
        text = value.strip()
        if not text or text.lower() in _MISSING_TOKENS:
            return np.nan
        try:
            return float(text)
        except ValueError:
            return np.nan
    # None, pd.NA, lists, arrays, dicts, ...: nothing usable as one number.
    return np.nan


def extract_metrics(df, col_name, min_valid=5):
    """Expand a column of per-row dicts into a numeric DataFrame.

    Args:
        df: DataFrame whose ``col_name`` column holds one dict per row;
            non-dict cells are treated as having no metrics.
        col_name: Name of the column holding the dicts.
        min_valid: Minimum number of non-NaN values a key needs across the
            rows to be kept as a column.

    Returns:
        A DataFrame indexed like ``df`` with one float column per retained
        key, in the order the keys are first encountered.
    """
    records = [cell if isinstance(cell, dict) else {} for cell in df[col_name]]

    # A dict keeps insertion order, so the resulting column order is stable
    # across runs (iterating a set of strings is not).
    ordered_keys = {}
    for record in records:
        for key in record:
            ordered_keys.setdefault(key, None)

    columns = {}
    for key in ordered_keys:
        values = [
            _convert_to_float(record[key]) if key in record else np.nan
            for record in records
        ]
        if pd.Series(values, dtype=float).notna().sum() >= min_valid:
            columns[key] = values

    # Build the frame in one go instead of inserting columns one by one.
    return pd.DataFrame(columns, index=df.index)


fbref_nums = extract_metrics(full_df, 'fbref_metrics')
understat_nums = extract_metrics(full_df, 'understat_metrics')
transfermarkt_nums = extract_metrics(full_df, 'transfermarkt_metrics')
print(f"Métricas: {fbref_nums.shape[1]} FBref, {understat_nums.shape[1]} Understat")

In [None]:
# Columns that are left out of the per-100-touches rescaling (this includes
# the denominator column 'Touches_Touches' itself).
exclude_normalization = {
    'minutes_played', 'age', 'birth_year', 'games_started', 'minutes_per_game',
    'minutes_per_start', 'games', 'games_subs', 'unused_sub', 'points_per_game',
    'on_goals_for', 'on_goals_against', 'plus_minus', 'plus_minus_per90',
    'plus_minus_wowy', 'on_xg_for', 'on_xg_against', 'xg_plus_minus',
    'xg_plus_minus_per90', 'xg_plus_minus_wowy', 'Touches_Touches',
}


def _per_100_touches(metrics, touches):
    """Return the non-excluded columns of ``metrics`` divided row-wise by
    ``touches``, multiplied by 100, rounded to 3 decimals and renamed with a
    ``_per100touches`` suffix."""
    keep_mask = ~metrics.columns.isin(exclude_normalization)
    scaled = metrics.loc[:, keep_mask].div(touches, axis=0).mul(100).round(3)
    return scaled.add_suffix('_per100touches')


# Both sources use the FBref 'Touches_Touches' column as denominator.
fbref_per100 = _per_100_touches(fbref_nums, fbref_nums['Touches_Touches'])
understat_per100 = _per_100_touches(understat_nums, fbref_nums['Touches_Touches'])
print(f"Per100: {fbref_per100.shape[1]} + {understat_per100.shape[1]}")

In [None]:
# Identification columns taken from full_df, followed by every metric frame
# (raw values first, then the per-100-touches versions), joined column-wise.
base_cols = ['unique_player_id', 'player_name', 'team', 'league', 'season', 'position']
feature_frames = [
    fbref_nums,
    understat_nums,
    transfermarkt_nums,
    fbref_per100,
    understat_per100,
]
df_final = pd.concat([full_df[base_cols], *feature_frames], axis=1)

n_rows, n_cols = df_final.shape
print(f"DataFrame final: {n_rows} × {n_cols}")

In [None]:
def _select_player(name_fragment, team_fragment):
    """Return the rows of ``df_final`` whose player_name and team both contain
    the given fragments (case-insensitive; missing values never match)."""
    name_hit = df_final['player_name'].str.contains(name_fragment, case=False, na=False)
    team_hit = df_final['team'].str.contains(team_fragment, case=False, na=False)
    return df_final[name_hit & team_hit]


# 'AC' is matched case-insensitively, so any team name containing "ac"
# satisfies the team filter; the player-name filter narrows it down.
target = _select_player('Chukwueze', 'AC')
replacement = _select_player('Buchanan', 'Brugge')

if not len(target) or not len(replacement):
    raise ValueError("Jugador no encontrado")

# When several rows match, the first one is used.
target_row = target.iloc[0]
replacement_row = replacement.iloc[0]
target_id = target_row['unique_player_id']
replacement_id = replacement_row['unique_player_id']
print(f"Target: {target_row['player_name']} | Replacement: {replacement_row['player_name']}")

In [None]:
# Settings forwarded as keyword arguments to find_similar_players_cosine
# (imported from tfm.helpers.algorithms, where their semantics are defined).
similarity_settings = {
    'df': df_final,
    'target_player_id': target_id,
    'n_similar': 30,
    'pca_variance': 0.85,
    'replacement_id': replacement_id,
    'robust_scaling': False,
}
result = find_similar_players_cosine(**similarity_settings)

In [None]:
from IPython.display import Image, display
from tfm.helpers.viz_helpers import plot_top10_ranking

# Draw the ranking figure, then show the file path returned by the helper
# inline in the notebook.
ranking_png = 'chukwueze_buchanan_top10_ranking.png'
output_path = plot_top10_ranking(
    result=result,
    df_data=full_df,
    save_path=ranking_png,
    target_face_path=None,
    highlight_target=True,
    dpi=300,
)
print(f"Visualización: {output_path}")
display(Image(filename=output_path))

In [None]:
# Report the PCA summary stored under result['pca_info'].
pca_info = result['pca_info']
n_components = pca_info['n_components']
explained_ratio = pca_info['explained_variance_ratio']
print(f"\nPCA: {n_components} componentes ({explained_ratio:.1%} varianza)")

In [None]:
# Report min / median / max of the scores stored under
# result['score_distribution'].
dist = result['score_distribution']
score_min, score_median, score_max = (dist[stat] for stat in ('min', 'median', 'max'))
print(f"\nSimilitud - Min: {score_min:.4f} | Median: {score_median:.4f} | Max: {score_max:.4f}")