In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides warnings that
# could signal real problems — consider narrowing it by category/module.
warnings.filterwarnings('ignore')

import sys
import os
# Put the parent of the current working directory on the import path so the
# `database` and `similarity` packages imported below can be resolved.
# This depends on the directory the kernel was started from.
_parent_dir = os.path.dirname(os.getcwd())
# Guarded so that re-running this cell does not append duplicate entries.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from database.connection import get_db_manager
from similarity.data_preparation import DataPreparator
from similarity.feature_engineering import FeatureEngineer
from similarity.umap_reducer import UMAPReducer
from similarity.gmm_clustering import GMMClusterer
from similarity.player_similarity import PlayerSimilarity
from similarity.visualization import SimilarityVisualizer

In [None]:
# ---------------------------------------------------------------------------
# Analysis configuration: every value a reader may want to tune lives here.
# ---------------------------------------------------------------------------
TABLE_TYPE = 'domestic'  # forwarded to DataPreparator as `table_type`
LEAGUES = [
    'ENG-Premier League',
    'ESP-La Liga',
    'ITA-Serie A',
    'GER-Bundesliga',
    'FRA-Ligue 1',
]
SEASONS = ['2425', '2526']  # season codes, loaded one at a time
POSITION_FILTER = 'FW'
MIN_MINUTES = 450
MAX_AGE = 35

# Echo the active configuration so the rendered notebook records it.
print(
    f"Configuracion: {LEAGUES}",
    f"Temporadas: {SEASONS}",
    f"Posicion: {POSITION_FILTER}, Min minutos: {MIN_MINUTES}",
    sep="\n",
)

In [None]:
db = get_db_manager()
data_prep = DataPreparator(db_manager=db, table_type=TABLE_TYPE)


def _load_season(season):
    """Load the filtered players of one season and report how many rows came back."""
    print(f"\nCargando temporada {season}...")
    season_players = data_prep.load_players(
        leagues=LEAGUES,
        season=season,
        position_filter=POSITION_FILTER,
        min_minutes=MIN_MINUTES,
        max_age=MAX_AGE,
    )
    print(f"  Jugadores cargados temporada {season}: {len(season_players)}")
    return season_players


# One frame per configured season, stacked into a single table.
# NOTE(review): df_raw is only used for the report below; later cells call
# data_prep methods with no arguments, so whether they see every season or
# only the last one loaded depends on DataPreparator internals — confirm.
all_seasons_data = [_load_season(season) for season in SEASONS]
df_raw = pd.concat(all_seasons_data, ignore_index=True)

print(f"\nTotal jugadores ambas temporadas: {len(df_raw)}")
print("Distribucion por temporada:")
print(df_raw['season'].value_counts())

In [None]:
# Pull the metric columns out of whatever data_prep currently holds.
df_metrics = data_prep.extract_all_metrics()

metrics_shape = df_metrics.shape
print(f"Metricas extraidas: {metrics_shape[1]} columnas")
print(f"Shape total: {metrics_shape}")

In [None]:
# Missing-value handling options, collected in one place for readability.
# NOTE(review): max_missing_pct presumably caps the tolerated share of
# missing values — confirm against DataPreparator.handle_missing_values.
missing_value_options = {
    'strategy': 'median_by_position',
    'max_missing_pct': 0.4,
}
df_clean = data_prep.handle_missing_values(**missing_value_options)
print(f"Datos limpios: {df_clean.shape}")

In [None]:
# Flag outliers; the result carries an `is_outlier` column that is summed below.
df_outliers = data_prep.detect_outliers(method='isolation_forest', contamination=0.05)

# Compute the count once and derive the percentage from it.
outlier_count = df_outliers['is_outlier'].sum()
outlier_pct = outlier_count / len(df_outliers) * 100
print(f"Outliers detectados: {outlier_count}")
print(f"Outliers %: {outlier_pct:.1f}%")

In [None]:
# Build the feature engineer for the same position the data was filtered on.
# The position comes from the POSITION_FILTER config constant instead of a
# second hard-coded 'FW', so changing the config cell cannot leave feature
# engineering silently tuned for a different position than the loaded data.
feature_eng = FeatureEngineer(position_type=POSITION_FILTER)

# Select features from the outlier-annotated frame; goalkeeper metrics are
# excluded and a minimum-variance threshold of 0.01 is requested.
df_selected = feature_eng.select_relevant_features(
    df_outliers,
    exclude_gk_metrics=True,
    min_variance=0.01
)
print(f"Features seleccionadas: {len(feature_eng.selected_features)}")

In [None]:
# Remove redundant features using a 0.95 correlation threshold.
df_uncorrelated = feature_eng.remove_correlated_features(df_selected, threshold=0.95)

remaining_feature_count = len(feature_eng.selected_features)
print(f"Features no redundantes: {remaining_feature_count}")

In [None]:
# Standard-scale the remaining features, requesting a per-position fit.
normalization_options = {'method': 'standard', 'fit_per_position': True}
df_normalized = feature_eng.normalize_by_position(df_uncorrelated, **normalization_options)

# Methodological note printed for the reader of the rendered notebook.
print(
    "Normalizacion completada por posicion",
    "Nota: Normalizacion valida incluso con temporada 25/26 incompleta",
    "Razon: Metricas per90 ya normalizadas por minutos jugados",
    sep="\n",
)

In [None]:
# prepare_for_umap returns a pair: the feature matrix and the matching
# metadata frame (requested via return_dataframe=True).
umap_inputs = feature_eng.prepare_for_umap(df_normalized, return_dataframe=True)
X, metadata_df = umap_inputs

print(f"Matriz features para UMAP: {X.shape}")
print(f"Metadata shape: {metadata_df.shape}")

In [None]:
# UMAP hyper-parameters gathered in one mapping; random_state is fixed so
# the reducer is seeded identically on every run.
umap_params = {
    'n_components': 5,
    'n_neighbors': 20,
    'min_dist': 0.0,
    'metric': 'euclidean',
    'random_state': 42,
}
umap_reducer = UMAPReducer(**umap_params)

print("Ejecutando UMAP (esto puede tardar 30-60 segundos)...")
X_umap = umap_reducer.fit_transform(X, verbose=True)
print(f"\nUMAP completado: {X_umap.shape}")

In [None]:
# Attach the embedding coordinates to the player metadata.
embedding_df = umap_reducer.get_embedding_dataframe(metadata_df)
print(f"Embedding DataFrame: {embedding_df.shape}")

# Preview only the identifying columns plus the first two UMAP axes.
preview_columns = ['player_name', 'team', 'season', 'league', 'umap_1', 'umap_2']
print(embedding_df[preview_columns].head())

In [None]:
# Gaussian mixture clusterer with full covariance matrices and a fixed seed.
gmm_clusterer = GMMClusterer(covariance_type='full', max_iter=200, random_state=42)

# Search 4..12 components on the UMAP embedding, scored by BIC.
print("Buscando numero optimo de clusters...")
cluster_search = {'min_clusters': 4, 'max_clusters': 12, 'criterion': 'bic'}
optimal_results = gmm_clusterer.find_optimal_clusters(X_umap, **cluster_search)

# The result is read as a mapping with 'optimal_n' and 'scores' entries.
print(f"\nNumero optimo clusters: {optimal_results['optimal_n']}")
print(f"BIC scores: {optimal_results['scores']}")

In [None]:
# Fit the final mixture with the component count chosen by the BIC search.
best_n_components = optimal_results['optimal_n']
gmm_clusterer.fit(X_umap, n_components=best_n_components)

print(f"GMM fitted con {gmm_clusterer.n_components} clusters")
print(f"Converged: {gmm_clusterer.model.converged_}")

# Number of players per hard cluster label, ordered by label.
cluster_sizes = pd.Series(gmm_clusterer.labels_hard).value_counts().sort_index()
print("\nTamanos clusters:")
print(cluster_sizes)

In [None]:
# Columns this cell contributes to embedding_df.
cluster_columns = ['cluster_id', 'cluster_confidence']

cluster_assignments = gmm_clusterer.get_cluster_assignments_df(metadata_df)

# This cell overwrites embedding_df with the merge result, so on a re-run
# the frame already carries the cluster columns. Merging again would then
# produce suffixed duplicates (cluster_id_x / cluster_id_y) and break every
# later lookup of 'cluster_id'. Dropping stale copies first makes the cell
# idempotent; errors='ignore' keeps the first run (no such columns) working.
embedding_df = (
    embedding_df
    .drop(columns=cluster_columns, errors='ignore')
    .merge(
        cluster_assignments[['unique_player_id'] + cluster_columns],
        on='unique_player_id',
        how='left',  # keep every embedded player, even without an assignment
    )
)
print(f"Embedding con clusters: {embedding_df.shape}")

In [None]:
# Relative weight of each component in the combined similarity score
# (the three values sum to 1.0).
similarity_weights = {
    'umap_distance': 0.50,
    'gmm_probability': 0.30,
    'feature_similarity': 0.20,
}

# The engine is given the embedding (with cluster columns), the GMM
# membership probabilities and the normalized feature table.
similarity_engine = PlayerSimilarity(
    embedding_df=embedding_df,
    gmm_proba=gmm_clusterer.labels_proba,
    feature_df=df_normalized,
    weights=similarity_weights,
)
print("Motor similitud inicializado")

In [None]:
def _find_player_rows(name_fragment, season):
    """Return the embedding_df rows whose player_name contains `name_fragment`
    (case-insensitive) for the given season code.

    na=False makes rows with a missing player_name count as "no match".
    Without it the mask contains NaN and pandas refuses to use it for
    boolean indexing.
    """
    name_matches = embedding_df['player_name'].str.contains(
        name_fragment, case=False, na=False
    )
    return embedding_df[name_matches & (embedding_df['season'] == season)]


lewandowski_2425 = _find_player_rows('Lewandowski', '2425')
lewandowski_2526 = _find_player_rows('Lewandowski', '2526')

print("Lewandowski encontrado:")
# One report per season; the second heading carries the blank separator line.
for heading, season_rows in (
    ("  Temporada 24/25", lewandowski_2425),
    ("\n  Temporada 25/26", lewandowski_2526),
):
    print(f"{heading}: {len(season_rows)} registros")
    if len(season_rows) > 0:
        # Only the first match is reported (and used by later cells).
        for label, column in (
            ("ID", 'unique_player_id'),
            ("Team", 'team'),
            ("League", 'league'),
        ):
            print(f"    {label}: {season_rows[column].iloc[0]}")

In [None]:
if len(lewandowski_2425) > 0:
    # The first matching row is the reference player for season 24/25.
    lewa_2425_id = lewandowski_2425['unique_player_id'].iloc[0]

    banner = "=" * 80
    print(banner)
    print("REEMPLAZOS LEWANDOWSKI TEMPORADA 2024/25")
    print(banner)

    # Ten closest candidates, excluding team-mates and capped at age 32.
    similar_2425 = similarity_engine.find_similar_players(
        player_identifier=lewa_2425_id,
        top_n=10,
        filters={'exclude_same_team': True, 'max_age': 32},
        return_scores=True,
    )
    # 1-based rank following the order in which the engine returned the rows.
    similar_2425['rank'] = range(1, len(similar_2425) + 1)

    ranking_columns = [
        'rank', 'player_name', 'team', 'league', 'season',
        'similarity_score', 'umap_distance', 'gmm_similarity',
    ]
    print("\nTop 10 jugadores similares a Lewandowski (temporada 24/25):")
    print("\n" + similar_2425[ranking_columns].to_string(index=False))

    print("\n\nDETALLE TOP 3:")
    for position in range(1, min(3, len(similar_2425)) + 1):
        candidate = similar_2425.iloc[position - 1]
        print(f"\n{position}. {candidate['player_name']} ({candidate['team']}, {candidate['league']})")
        print(f"   Temporada: {candidate['season']}")
        print(f"   Similarity score: {candidate['similarity_score']:.3f}")
        print(f"   UMAP distance: {candidate['umap_distance']:.3f}")
        print(f"   GMM similarity: {candidate['gmm_similarity']:.3f}")
else:
    # Reference player missing: list the Barcelona players that do exist for
    # this season to help diagnose a name mismatch.
    print("ERROR: Lewandowski temporada 24/25 no encontrado")
    print("\nJugadores Barcelona 24/25 disponibles:")
    is_barcelona = embedding_df['team'].str.contains('Barcelona', case=False)
    is_season_2425 = embedding_df['season'] == '2425'
    barca_2425 = embedding_df[is_barcelona & is_season_2425]
    print(barca_2425[['player_name', 'team', 'season']].sort_values('player_name'))

In [None]:
if len(lewandowski_2526) > 0:
    # The first matching row is the reference player for season 25/26.
    lewa_2526_id = lewandowski_2526['unique_player_id'].iloc[0]

    banner = "=" * 80
    print(banner)
    print("REEMPLAZOS LEWANDOWSKI TEMPORADA 2025/26")
    print(banner)

    # Ten closest candidates, excluding team-mates and capped at age 32.
    similar_2526 = similarity_engine.find_similar_players(
        player_identifier=lewa_2526_id,
        top_n=10,
        filters={'exclude_same_team': True, 'max_age': 32},
        return_scores=True,
    )
    # 1-based rank following the order in which the engine returned the rows.
    similar_2526['rank'] = range(1, len(similar_2526) + 1)

    ranking_columns = [
        'rank', 'player_name', 'team', 'league', 'season',
        'similarity_score', 'umap_distance', 'gmm_similarity',
    ]
    print("\nTop 10 jugadores similares a Lewandowski (temporada 25/26):")
    print("\n" + similar_2526[ranking_columns].to_string(index=False))

    print("\n\nDETALLE TOP 3:")
    for position in range(1, min(3, len(similar_2526)) + 1):
        candidate = similar_2526.iloc[position - 1]
        print(f"\n{position}. {candidate['player_name']} ({candidate['team']}, {candidate['league']})")
        print(f"   Temporada: {candidate['season']}")
        print(f"   Similarity score: {candidate['similarity_score']:.3f}")
        print(f"   UMAP distance: {candidate['umap_distance']:.3f}")
        print(f"   GMM similarity: {candidate['gmm_similarity']:.3f}")
else:
    # Reference player missing: list the Barcelona players that do exist for
    # this season to help diagnose a name mismatch.
    print("ERROR: Lewandowski temporada 25/26 no encontrado")
    print("\nJugadores Barcelona 25/26 disponibles:")
    is_barcelona = embedding_df['team'].str.contains('Barcelona', case=False)
    is_season_2526 = embedding_df['season'] == '2526'
    barca_2526 = embedding_df[is_barcelona & is_season_2526]
    print(barca_2526[['player_name', 'team', 'season']].sort_values('player_name'))

In [None]:
# Compare the player's two season profiles; only possible when both season
# rows were found (which is also when both ids were assigned).
found_both_seasons = len(lewandowski_2425) > 0 and len(lewandowski_2526) > 0
if found_both_seasons:
    banner = "=" * 80
    print(banner)
    print("COMPARACION ENTRE TEMPORADAS")
    print(banner)

    explanation = similarity_engine.explain_similarity(
        player1_identifier=lewa_2425_id,
        player2_identifier=lewa_2526_id,
        top_features=10,
    )
    component_scores = explanation['component_scores']
    # A non-None 'common_cluster' is reported as "same archetype".
    same_archetype = 'NO' if explanation['common_cluster'] is None else 'SI'

    print("\nSimilitud Lewandowski 24/25 vs 25/26:")
    print(f"  Score total: {explanation['overall_score']:.3f}")
    print(f"  UMAP distance score: {component_scores['umap_distance_score']:.3f}")
    print(f"  GMM similarity score: {component_scores['gmm_similarity_score']:.3f}")
    print(f"  Feature similarity score: {component_scores['feature_similarity_score']:.3f}")
    print(f"\n  Cluster 24/25: {explanation['cluster_1']}")
    print(f"  Cluster 25/26: {explanation['cluster_2']}")
    print(f"  Mismo arquetipo: {same_archetype}")

In [None]:
# Plot the embedding coloured by cluster, highlighting the reference player
# (and the 25/26 row as well when it exists).
if len(lewandowski_2425) > 0:
    visualizer = SimilarityVisualizer(figsize=(16, 10), dpi=100)

    season_2526_ids = [lewa_2526_id] if len(lewandowski_2526) > 0 else []
    highlight_ids = [lewa_2425_id] + season_2526_ids

    plot_options = {
        'color_by': 'cluster_id',
        'highlight_players': highlight_ids,
        'save_path': 'lewandowski_replacement_umap.png',
        'show': True,
    }
    fig = visualizer.plot_umap_embedding(embedding_df, **plot_options)
    print("\nVisualizacion guardada: lewandowski_replacement_umap.png")

In [None]:
def _print_best_match(best):
    """Print the identifying fields and similarity score of one candidate row."""
    print(f"   Jugador: {best['player_name']}")
    print(f"   Equipo: {best['team']}")
    print(f"   Liga: {best['league']}")
    print(f"   Temporada datos: {best['season']}")
    print(f"   Similarity score: {best['similarity_score']:.3f}")


# The summary is only produced when both season analyses ran.
if len(lewandowski_2425) > 0 and len(lewandowski_2526) > 0:
    banner = "=" * 80
    print(banner)
    print("RESUMEN FINAL")
    print(banner)

    print("\nMEJOR REEMPLAZO POR TEMPORADA:")

    print("\n1. TEMPORADA 2024/25")
    if len(similar_2425) > 0:
        best_2425 = similar_2425.iloc[0]  # first row of the result table
        _print_best_match(best_2425)

    print("\n2. TEMPORADA 2025/26")
    if len(similar_2526) > 0:
        best_2526 = similar_2526.iloc[0]  # first row of the result table
        _print_best_match(best_2526)

    # Static methodology and filter notes, emitted one per line.
    closing_notes = (
        "\n\nNOTA METODOLOGICA:",
        "- Metricas per90 permiten comparar temporadas con diferente progreso",
        "- Normalizacion StandardScaler ajusta distribucion estadistica",
        "- UMAP + GMM descubren patrones sin sesgo temporal",
        "- Similarity score combina: distancia UMAP (50%) + GMM (30%) + features (20%)",
        "\nFiltros aplicados:",
        "- Excluido Barcelona (mismo equipo)",
        "- Edad maxima: 32 anos",
        "- Minutos minimos: 450 (equivalente ~5 partidos)",
        "- Posicion: FW (delanteros)",
        "- Ligas: Big 5 European",
    )
    print(*closing_notes, sep="\n")

In [None]:
# Collect the result table of every season that was actually analysed.
# similar_2425 / similar_2526 only exist when the corresponding Lewandowski
# row was found, so each one is guarded by the same condition that created
# it. The previous single guard on the 24/25 row raised NameError on
# similar_2526 whenever the 25/26 row was missing.
export_frames = []
if len(lewandowski_2425) > 0:
    export_frames.append(similar_2425.assign(lewandowski_season='2024/25'))
if len(lewandowski_2526) > 0:
    export_frames.append(similar_2526.assign(lewandowski_season='2025/26'))

if export_frames:
    results_export = pd.concat(export_frames, ignore_index=True)

    # Fixed column order for the CSV.
    # NOTE(review): 'feature_similarity' is not shown by the earlier cells;
    # this assumes find_similar_players returns it — confirm.
    export_columns = [
        'lewandowski_season', 'rank', 'player_name', 'team', 'league', 'season',
        'similarity_score', 'umap_distance', 'gmm_similarity', 'feature_similarity',
        'unique_player_id'
    ]
    results_export = results_export[export_columns]

    results_export.to_csv('lewandowski_replacement_results.csv', index=False)
    print("\nResultados exportados a: lewandowski_replacement_results.csv")
    print(f"Total filas exportadas: {len(results_export)}")
else:
    # Nothing was analysed for either season, so there is nothing to write.
    print("\nSin resultados para exportar: Lewandowski no encontrado en ninguna temporada")

In [None]:
# Final cell: release the database connection held through data_prep.
# NOTE(review): this only runs if every cell above succeeded; after a failed
# run the connection stays open until the kernel is restarted or this cell
# is executed manually.
data_prep.close_connection()
print("Analisis completado. Conexion BD cerrada.")