In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

from database.connection import get_db_manager
from similarity.data_preparation import DataPreparator
from similarity.feature_engineering import FeatureEngineer
from similarity.umap_reducer import UMAPReducer
from similarity.gmm_clustering import GMMClusterer
from similarity.player_similarity import PlayerSimilarity
from similarity.validation import PipelineValidator
from similarity.visualization import SimilarityVisualizer

In [None]:
# Configuration: data-source selection and player filters consumed by the
# loading and feature-engineering cells below.
TABLE_TYPE = 'domestic'
LEAGUES = [
    'ENG-Premier League',
    'ESP-La Liga',
    'ITA-Serie A',
    'GER-Bundesliga',
    'FRA-Ligue 1',
]
SEASON = '2526'  # presumably the 2025/26 season code — TODO confirm expected format
POSITION_FILTER = 'FW'
MIN_MINUTES = 600
MAX_AGE = None  # forwarded as max_age; None presumably disables the age filter — verify

In [None]:
# Step 1: data preparation — obtain the DB manager and load the player set
# restricted by the configuration constants.
db = get_db_manager()
data_prep = DataPreparator(db_manager=db, table_type=TABLE_TYPE)

# Collect the filters in one mapping so the call site stays short.
player_filters = {
    'leagues': LEAGUES,
    'season': SEASON,
    'position_filter': POSITION_FILTER,
    'min_minutes': MIN_MINUTES,
    'max_age': MAX_AGE,
}
df_raw = data_prep.load_players(**player_filters)

print(f"Jugadores cargados: {len(df_raw)}")

In [None]:
# Step 2: metric extraction for the players loaded by the preparator.
df_metrics = data_prep.extract_all_metrics()
metric_column_count = df_metrics.shape[1]
print(f"Metricas extraidas: {metric_column_count} columnas")

In [None]:
# Step 3: missing-value handling.
# NOTE(review): max_missing_pct=0.4 presumably caps the tolerated share of
# missing values before something is discarded — confirm in DataPreparator.
missing_value_options = {
    'strategy': 'median_by_position',
    'max_missing_pct': 0.4,
}
df_clean = data_prep.handle_missing_values(**missing_value_options)
print(f"Datos limpios: {df_clean.shape}")

In [None]:
# Step 4: outlier detection; the result carries an 'is_outlier' column that
# is summed below to report how many rows were flagged.
df_outliers = data_prep.detect_outliers(method='isolation_forest', contamination=0.05)
outlier_count = df_outliers['is_outlier'].sum()
print(f"Outliers detectados: {outlier_count}")

In [None]:
# Step 5: feature engineering — build the engineer for the configured
# position and select the relevant features from the outlier-annotated frame.
feature_eng = FeatureEngineer(position_type=POSITION_FILTER)

selection_options = {'exclude_gk_metrics': True, 'min_variance': 0.01}
df_selected = feature_eng.select_relevant_features(df_outliers, **selection_options)

selected_feature_count = len(feature_eng.selected_features)
print(f"Features seleccionadas: {selected_feature_count}")

In [None]:
# Step 6: remove correlated (redundant) features using a 0.95 threshold.
df_uncorrelated = feature_eng.remove_correlated_features(df_selected, threshold=0.95)

# selected_features is read again after the call to report the remaining count.
remaining_feature_count = len(feature_eng.selected_features)
print(f"Features no redundantes: {remaining_feature_count}")

In [None]:
# Step 7: per-position normalization of the de-correlated feature frame.
normalization_options = {'method': 'standard', 'fit_per_position': True}
df_normalized = feature_eng.normalize_by_position(df_uncorrelated, **normalization_options)
print('Normalizacion completada')

In [None]:
# Step 8: feature importance.
# NOTE(review): head(10) shows the first ten entries as returned; this is a
# true "top 10" only if get_feature_importance already returns them sorted —
# confirm against FeatureEngineer.
feature_importance = feature_eng.get_feature_importance(df_normalized)
importance_series = pd.Series(feature_importance)
print("Top 10 features:")
print(importance_series.head(10))

In [None]:
# Step 9: build the feature matrix for UMAP together with its metadata frame.
X, metadata_df = feature_eng.prepare_for_umap(df_normalized, return_dataframe=True)

# Report the shape of both outputs with their original labels.
for shape_label, shaped_obj in (("Matriz features", X), ("Metadata", metadata_df)):
    print(f"{shape_label}: {shaped_obj.shape}")

In [None]:
# Step 10: UMAP dimensionality reduction of the feature matrix.
umap_params = {
    'n_components': 5,
    'n_neighbors': 20,
    'min_dist': 0.0,
    'metric': 'euclidean',
    'random_state': 42,  # fixed seed passed to the reducer
}
umap_reducer = UMAPReducer(**umap_params)

X_umap = umap_reducer.fit_transform(X, verbose=True)
print(f"UMAP embedding: {X_umap.shape}")

In [None]:
# Step 11 (optional): validate UMAP embedding stability over 5 runs.
stability_results = umap_reducer.validate_embedding_stability(X, n_runs=5, metric='correlation')
mean_embedding_similarity = stability_results['mean_similarity']
print(f"Estabilidad UMAP: {mean_embedding_similarity:.3f}")

In [None]:
# Step 12: build the embedding dataframe from the reducer and the metadata.
embedding_df = umap_reducer.get_embedding_dataframe(metadata_df)
embedding_preview = embedding_df.head()
print(embedding_preview)

In [None]:
# Step 13: GMM clustering — search for the best number of clusters.
gmm_clusterer = GMMClusterer(covariance_type='full', max_iter=200, random_state=42)

# Search range and selection criterion for the cluster count.
cluster_search_options = {
    'min_clusters': 3,
    'max_clusters': 12,
    'criterion': 'bic',
}
optimal_results = gmm_clusterer.find_optimal_clusters(X_umap, **cluster_search_options)

print(f"Numero optimo de clusters: {optimal_results['optimal_n']}")

In [None]:
# Step 14: fit the GMM using the cluster count chosen in the previous step.
optimal_cluster_count = optimal_results['optimal_n']
gmm_clusterer.fit(X_umap, n_components=optimal_cluster_count)

print(f"GMM fitted con {gmm_clusterer.n_components} clusters")
print(f"Converged: {gmm_clusterer.model.converged_}")

In [None]:
# Step 15: clustering quality metrics, printed one per line.
cluster_metrics = gmm_clusterer.get_cluster_metrics(X_umap)
print("Metricas clustering:")
for metric_name, metric_value in cluster_metrics.items():
    print(f"  {metric_name}: {metric_value}")

In [None]:
# Step 16 (optional): clustering stability over 30 runs on 80% subsamples.
stability_options = {'n_runs': 30, 'subsample_frac': 0.8}
stability_clustering = gmm_clusterer.cluster_stability(X_umap, **stability_options)

mean_ari = stability_clustering['mean_ari']
print(f"Estabilidad clustering (ARI): {mean_ari:.3f}")

In [None]:
# Step 17: per-player cluster assignments; preview the first ten rows.
cluster_assignments = gmm_clusterer.get_cluster_assignments_df(metadata_df)
assignment_preview_cols = ['player_name', 'team', 'cluster_id', 'cluster_confidence']
print(cluster_assignments[assignment_preview_cols].head(10))

In [None]:
# Step 18: characterize clusters (archetypes) on the pre-UMAP feature matrix,
# labelling columns with the engineer's selected feature names.
cluster_profiles = gmm_clusterer.get_cluster_profiles(
    X,
    feature_names=feature_eng.selected_features,
    top_n_features=10,
)

# Show the profile rows that belong to cluster 0 only.
print("\nPerfil Cluster 0:")
in_cluster_zero = cluster_profiles['cluster_id'] == 0
print(cluster_profiles.loc[in_cluster_zero, ['feature', 'mean_value', 'z_score']])

In [None]:
# Step 19: attach cluster id and confidence to the embedding dataframe.
#
# This cell reassigns `embedding_df`, so it must be safe to run more than
# once: if the cluster columns are already present (from a previous run of
# this cell), a plain merge would produce `cluster_id_x` / `cluster_id_y`
# suffixed duplicates and later cells that read `cluster_id` would break.
# Dropping any existing copies first keeps the cell idempotent.
cluster_columns = ['cluster_id', 'cluster_confidence']
embedding_df = pd.merge(
    embedding_df.drop(columns=cluster_columns, errors='ignore'),
    cluster_assignments[['unique_player_id'] + cluster_columns],
    on='unique_player_id',
    how='left'  # keep every embedded player even if it has no assignment row
)
print(embedding_df.head())

In [None]:
# Step 20: initialize the similarity engine from the embedding, the GMM
# membership probabilities and the normalized features.
similarity_weights = {
    'umap_distance': 0.50,
    'gmm_probability': 0.30,
    'feature_similarity': 0.20,
}
similarity_engine = PlayerSimilarity(
    embedding_df=embedding_df,
    gmm_proba=gmm_clusterer.labels_proba,
    feature_df=df_normalized,
    weights=similarity_weights,
)
print('Motor similitud inicializado')

In [None]:
# Step 21: find similar players (example).
#
# The query player is resolved by case-insensitive substring match on
# `player_name`; the first match is used. `na=False` treats missing names as
# non-matches (otherwise the boolean mask contains NaN and indexing fails),
# and an explicit error replaces the opaque IndexError raised by `.iloc[0]`
# when the player is absent (e.g. removed by the position/minutes filters).
query_player_name = 'Lewandowski'
query_name_mask = embedding_df['player_name'].str.contains(query_player_name, case=False, na=False)
if not query_name_mask.any():
    raise ValueError(
        f"No se encontro ningun jugador cuyo nombre contenga '{query_player_name}' en embedding_df"
    )
query_player_id = embedding_df.loc[query_name_mask, 'unique_player_id'].iloc[0]

similar_players = similarity_engine.find_similar_players(
    player_identifier=query_player_id,
    top_n=10,
    filters={'exclude_same_team': True},
    return_scores=True
)

print(f"\nJugadores similares a {query_player_name}:")
print(similar_players[['player_name', 'team', 'league', 'similarity_score', 'umap_distance', 'gmm_similarity']])

In [None]:
# Step 22: validate a replacement chain (example).
#
# The names below are placeholders — replace them with real players. Each name
# is resolved by case-insensitive substring match (first match wins). If any
# name cannot be resolved, the validation is skipped with a message instead of
# raising IndexError, so the notebook still completes under "Run All".
chain_player_names = ['Player1', 'Player2']

player_chain = []
missing_chain_names = []
for chain_name in chain_player_names:
    chain_matches = embedding_df.loc[
        embedding_df['player_name'].str.contains(chain_name, case=False, na=False),
        'unique_player_id',
    ]
    if chain_matches.empty:
        missing_chain_names.append(chain_name)
    else:
        player_chain.append(chain_matches.iloc[0])

if missing_chain_names:
    # Nothing to validate: leave an explicit None so the name is always defined.
    chain_validation = None
    print(f"\nValidacion cadena omitida; jugadores no encontrados: {missing_chain_names}")
else:
    chain_validation = similarity_engine.validate_replacement_chain(
        player_identifiers=player_chain,
        min_similarity=0.6
    )
    print("\nValidacion cadena:")
    print(chain_validation)

In [None]:
# Step 23: similarity matrix across several players.
# Each name fragment is resolved to the id of its first case-insensitive
# substring match in embedding_df.
comparison_names = ('Lewandowski', 'Kane', 'Haaland')
players_to_compare = [
    embedding_df[embedding_df['player_name'].str.contains(name_fragment, case=False)]['unique_player_id'].iloc[0]
    for name_fragment in comparison_names
]

sim_matrix, player_names = similarity_engine.get_similarity_matrix(
    player_identifiers=players_to_compare,
    normalize=True,
)

# Label rows and columns with the names returned by the engine.
sim_matrix_df = pd.DataFrame(sim_matrix, index=player_names, columns=player_names)
print("\nMatriz similitud:")
print(sim_matrix_df)

In [None]:
# Step 24: find representative players for one cluster.
cluster_id_to_explore = 0
representatives = similarity_engine.find_cluster_representatives(
    cluster_id=cluster_id_to_explore, n_representatives=5
)

representative_cols = ['player_name', 'team', 'cluster_probability', 'distance_to_center']
print(f"\nRepresentantes Cluster {cluster_id_to_explore}:")
print(representatives[representative_cols])

In [None]:
# Step 25: explain the similarity between two players — the first
# 'Lewandowski' match and the top row of `similar_players`.
lewandowski_mask = embedding_df['player_name'].str.contains('Lewandowski', case=False)
player1_id = embedding_df.loc[lewandowski_mask, 'unique_player_id'].iloc[0]
player2_id = similar_players.iloc[0]['unique_player_id']

explanation = similarity_engine.explain_similarity(
    player1_identifier=player1_id,
    player2_identifier=player2_id,
    top_features=10,
)

# Only the first five of the returned similar features are printed.
leading_similar_features = explanation['top_similar_features'][:5]

print("\nExplicacion similitud:")
print(f"Score total: {explanation['overall_score']:.3f}")
print(f"Scores componentes: {explanation['component_scores']}")
print(f"Cluster comun: {explanation['common_cluster']}")
print(f"Top features similares: {leading_similar_features}")

In [None]:
# Step 26: visualization — UMAP embedding colored by cluster, with the query
# player highlighted; the figure is also written to umap_embedding.png.
visualizer = SimilarityVisualizer(figsize=(14, 10), dpi=100)

umap_plot_options = {
    'color_by': 'cluster_id',
    'highlight_players': [query_player_id],
    'save_path': 'umap_embedding.png',
    'show': True,
}
fig = visualizer.plot_umap_embedding(embedding_df, **umap_plot_options)

In [None]:
# Step 27: visualization — similarity heatmap for the compared players,
# saved to similarity_heatmap.png.
heatmap_options = {'save_path': 'similarity_heatmap.png', 'show': True}
fig = visualizer.plot_similarity_heatmap(sim_matrix, player_names, **heatmap_options)

In [None]:
# Step 28: visualization — cluster profiles, saved to cluster_profiles.png.
profile_plot_options = {'save_path': 'cluster_profiles.png', 'show': True}
fig = visualizer.plot_cluster_profiles(cluster_profiles, **profile_plot_options)

In [None]:
# Step 29: visualization — feature importance (top_n=20), saved to
# feature_importance.png.
importance_plot_options = {
    'top_n': 20,
    'save_path': 'feature_importance.png',
    'show': True,
}
fig = visualizer.plot_feature_importance(feature_importance, **importance_plot_options)

In [None]:
# Step 30: validation — register a known-similar pair as ground truth.
validator = PipelineValidator(similarity_engine)

# Resolve both players by case-insensitive substring match (first match wins).
# NOTE(review): this assumes player_name stores the ASCII spelling 'Sorloth';
# a name stored with 'ø' would not match — confirm against the data.
ground_truth_ids = [
    embedding_df[embedding_df['player_name'].str.contains(name_fragment, case=False)]['unique_player_id'].iloc[0]
    for name_fragment in ('Sorloth', 'Jackson')
]
validator.add_ground_truth_pair(*ground_truth_ids)

print('Ground truth pairs configurados')

In [None]:
# Step 31: validation — evaluate the registered known pairs with top_k=10.
validation_results = validator.validate_known_pairs(top_k=10)
print('\nResultados validacion:')
print(validation_results)

In [None]:
# Step 32: validation — generate and print the full validation report.
report = validator.generate_validation_report()
print('\nReporte validacion:')
print(report)

In [None]:
# Step 33 (optional): persist the fitted UMAP and GMM models under models/.
#
# Create the target directory first so the cell does not depend on it already
# existing on a fresh checkout (whether save_model creates it cannot be told
# from this notebook). exist_ok=True makes this a no-op when it is present.
os.makedirs('models', exist_ok=True)
umap_reducer.save_model('models/umap_model.pkl')
gmm_clusterer.save_model('models/gmm_model.pkl')
print("Modelos guardados")

In [None]:
# Step 34: export results to CSV.
#
# `embedding_df` already carries `cluster_id` / `cluster_confidence` once the
# earlier merge cell has run, so merging the same columns again would write
# `cluster_id_x` / `cluster_id_y` (and the confidence equivalents) to the
# file. Any existing copies are dropped first so each column appears exactly
# once, whether or not the earlier merge was executed.
export_cluster_columns = ['cluster_id', 'cluster_confidence']
similarity_results_export = pd.merge(
    embedding_df.drop(columns=export_cluster_columns, errors='ignore'),
    cluster_assignments[['unique_player_id'] + export_cluster_columns],
    on='unique_player_id'  # default inner join, as before
)

similarity_results_export.to_csv('similarity_results.csv', index=False)
print("Resultados exportados a similarity_results.csv")

In [None]:
# Step 35: close the database connection through the data preparator.
data_prep.close_connection()
print('Conexion BD cerrada')