# Clustering - Phân cụm cầu thủ và đội bóng

Notebook này thực hiện phân cụm cầu thủ và đội bóng theo phong cách chơi sử dụng K-Means và Hierarchical Clustering.

## Mục tiêu:
1. Chọn features phù hợp
2. Scale dữ liệu
3. Tìm số cụm tối ưu
4. Áp dụng K-Means và Hierarchical Clustering
5. Phân tích và visualize các cụm


In [None]:
# Notebook setup: third-party analysis/plotting stack, project helpers, and
# global plotting defaults used by every cell below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this hides all library warnings too, not just cosmetic ones.
warnings.filterwarnings('ignore')

# Extend the import path BEFORE importing `clustering`. The path is relative,
# so it resolves against the current working directory of the kernel.
sys.path.append('../src')
# Helpers from the `clustering` module — presumably located in ../src; confirm.
from clustering import (
    select_features_for_clustering, find_optimal_clusters,
    perform_kmeans_clustering, perform_hierarchical_clustering,
    analyze_clusters, reduce_dimensions_for_visualization
)
from sklearn.preprocessing import StandardScaler

# Global plot appearance: seaborn grid theme and a 14x8 default figure size
# (individual cells override the size via `figsize=`).
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

# Confirmation message that the setup cell finished.
print("‚úÖ ƒê√£ import c√°c modules c·∫ßn thi·∫øt")


## 1. Load và chuẩn bị dữ liệu


In [None]:
# Load the pre-processed player table. If that fails for any ordinary reason
# (file missing, Excel engine unavailable, unreadable workbook, ...), rebuild
# the table from the raw data with the project's preprocessing helpers.
try:
    players_df = pd.read_excel('../data/players_processed.xlsx')
except Exception as load_error:
    # `except Exception` keeps the best-effort fallback of the original bare
    # `except:` but no longer swallows KeyboardInterrupt / SystemExit, and the
    # reason for falling back is reported instead of being silently dropped.
    print(f"Could not read '../data/players_processed.xlsx' ({load_error!r}); "
          "rebuilding from raw data")
    from data_preprocessing import load_data, feature_engineering_players, prepare_data_for_analysis
    data = load_data()
    players_df = feature_engineering_players(data['players'])
    players_df = prepare_data_for_analysis(players_df)
    print(f"‚úÖ ƒê√£ load v√† x·ª≠ l√Ω: {players_df.shape}")
else:
    # Success path: report the shape of the table that was read from disk.
    print(f"‚úÖ ƒê√£ load d·ªØ li·ªáu c·∫ßu th·ªß: {players_df.shape}")

# Ask the project helper which columns to cluster on, then show how many were
# chosen and the first ten names.
feature_cols = select_features_for_clustering(players_df)
print(f"\nüìä ƒê√£ ch·ªçn {len(feature_cols)} features cho clustering")
print(f"Features: {feature_cols[:10]}...")

# Build the feature matrix: missing values become 0, then every column is
# standardised with StandardScaler. `X`, `scaler` and `X_scaled` stay in the
# notebook namespace for the following cells.
X = players_df[feature_cols].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"\n‚úÖ ƒê√£ scale d·ªØ li·ªáu: {X_scaled.shape}")


## 2. Tìm số cụm tối ưu


In [None]:
# Evaluate candidate cluster counts for K-Means (up to max_k=10) and plot the
# two diagnostics returned by the helper: inertia and silhouette score per k.
import os  # local import: only needed to make sure the output folder exists

optimal_results = find_optimal_clusters(X_scaled, max_k=10, method='kmeans')
best_k = optimal_results['optimal_k']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# One row per panel: (result key, line format, y-axis label, title).
# Left panel is the elbow curve, right panel the silhouette curve.
panels = (
    ('inertias', 'bo-', 'Inertia', 'Elbow Method'),
    ('silhouette_scores', 'go-', 'Silhouette Score', 'Silhouette Score'),
)
for panel_ax, (metric_key, line_fmt, y_label, panel_title) in zip((ax1, ax2), panels):
    panel_ax.plot(optimal_results['k_range'], optimal_results[metric_key], line_fmt)
    # Red dashed marker at the k reported as optimal by the helper.
    panel_ax.axvline(x=best_k, color='r', linestyle='--', label=f'Optimal k={best_k}')
    panel_ax.set_xlabel('Number of Clusters (k)', fontweight='bold')
    panel_ax.set_ylabel(y_label, fontweight='bold')
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing folders; without this the cell fails with
# FileNotFoundError on a fresh checkout that has no results directory yet.
os.makedirs('../results/clustering', exist_ok=True)
plt.savefig('../results/clustering/optimal_clusters.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úÖ S·ªë c·ª•m t·ªëi ∆∞u: {optimal_results['optimal_k']}")


## 3. K-Means Clustering


In [None]:
# Run K-Means on the scaled features and report its quality metrics.
# NOTE(review): with find_optimal=True the helper presumably selects k itself
# (searching up to max_k) — confirm against clustering.py.
kmeans_results = perform_kmeans_clustering(X_scaled, max_k=10, find_optimal=True)

# Summary block: number of clusters plus the two scores, one line each.
print(
    f"\nüìä K·∫øt qu·∫£ K-Means:",
    f"  - S·ªë c·ª•m: {kmeans_results['n_clusters']}",
    f"  - Silhouette Score: {kmeans_results['silhouette_score']:.3f}",
    f"  - Davies-Bouldin Score: {kmeans_results['davies_bouldin_score']:.3f}",
    sep="\n",
)

# Per-cluster statistics are computed before the label column is attached, so
# the helper receives the dataframe without 'Cluster_KMeans'.
kmeans_labels = kmeans_results['labels']
cluster_stats = analyze_clusters(players_df, kmeans_labels, feature_cols)
print("\nüìà Th·ªëng k√™ c√°c c·ª•m:", cluster_stats, sep="\n")

# Attach the assignment to each player and show how many players fall into
# each cluster, ordered by cluster id.
players_df['Cluster_KMeans'] = kmeans_labels
kmeans_cluster_sizes = players_df['Cluster_KMeans'].value_counts().sort_index()
print(f"\nüìä Ph√¢n b·ªë c·∫ßu th·ªß theo c·ª•m:", kmeans_cluster_sizes, sep="\n")


In [None]:
# Project the scaled features onto two components and draw the K-Means result:
# left panel = scatter coloured by cluster, right panel = positions per cluster.
# `X_pca` and `pca` are reused by the hierarchical-clustering cell further down.
import os  # local import: only needed to make sure the output folder exists

X_pca, pca = reduce_dimensions_for_visualization(X_scaled, n_components=2)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Scatter plot coloured by K-Means cluster label; axis labels show the share
# of variance read from `pca.explained_variance_ratio_`.
ax1 = axes[0]
scatter = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_results['labels'],
                      cmap='viridis', alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', fontweight='bold')
ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)', fontweight='bold')
ax1.set_title('K-Means Clustering (PCA Visualization)', fontweight='bold')
plt.colorbar(scatter, ax=ax1, label='Cluster')
ax1.grid(True, alpha=0.3)

# Stacked bars: number of players of each position inside each cluster.
ax2 = axes[1]
if 'Pos' in players_df.columns:
    cluster_pos = pd.crosstab(players_df['Cluster_KMeans'], players_df['Pos'])
    cluster_pos.plot(kind='bar', ax=ax2, stacked=True, colormap='Set3')
    ax2.set_xlabel('Cluster', fontweight='bold')
    ax2.set_ylabel('Number of Players', fontweight='bold')
    ax2.set_title('Distribution of Positions in Each Cluster', fontweight='bold')
    ax2.legend(title='Position', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)
else:
    # No position column: hide the panel rather than saving an empty frame.
    ax2.set_axis_off()

plt.tight_layout()
# savefig does not create missing folders; without this the cell fails with
# FileNotFoundError on a fresh checkout that has no results directory yet.
os.makedirs('../results/clustering', exist_ok=True)
plt.savefig('../results/clustering/kmeans_visualization.png', dpi=300, bbox_inches='tight')
plt.show()


## 4. Hierarchical Clustering


In [None]:
# Run hierarchical clustering with Ward linkage, report and plot the result,
# and export the per-player cluster assignments.
# Relies on `X_pca` / `pca` and the 'Cluster_KMeans' column created by the
# earlier K-Means cells.
import os  # local import: only needed to make sure the output folder exists

hierarchical_results = perform_hierarchical_clustering(X_scaled, find_optimal=True, max_k=10, linkage='ward')

print(f"\nüìä K·∫øt qu·∫£ Hierarchical Clustering:")
print(f"  - S·ªë c·ª•m: {hierarchical_results['n_clusters']}")
print(f"  - Silhouette Score: {hierarchical_results['silhouette_score']:.3f}")
print(f"  - Davies-Bouldin Score: {hierarchical_results['davies_bouldin_score']:.3f}")

# Per-cluster statistics for the hierarchical assignment.
cluster_stats_hier = analyze_clusters(players_df, hierarchical_results['labels'], feature_cols)
print("\nüìà Th·ªëng k√™ c√°c c·ª•m (Hierarchical):")
print(cluster_stats_hier)

# Attach the labels and show cluster sizes ordered by cluster id.
players_df['Cluster_Hierarchical'] = hierarchical_results['labels']
print(f"\nüìä Ph√¢n b·ªë c·∫ßu th·ªß theo c·ª•m (Hierarchical):")
print(players_df['Cluster_Hierarchical'].value_counts().sort_index())

# Neither savefig nor to_csv creates missing folders; make sure the output
# directory exists before anything is written to it.
os.makedirs('../results/clustering', exist_ok=True)

# Scatter of the same 2-D projection, coloured by hierarchical cluster label.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=hierarchical_results['labels'],
                     cmap='plasma', alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', fontweight='bold')
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)', fontweight='bold')
ax.set_title('Hierarchical Clustering (PCA Visualization)', fontweight='bold')
plt.colorbar(scatter, ax=ax, label='Cluster')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('../results/clustering/hierarchical_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

# Export identifying columns plus both cluster assignments. Only columns that
# actually exist are selected, so a table without e.g. 'Pos' (which the
# visualisation cell already tolerates) no longer aborts with a KeyError.
export_candidates = ('Player', 'Pos', 'Squad', 'Cluster_KMeans', 'Cluster_Hierarchical')
export_cols = [col for col in export_candidates if col in players_df.columns]
players_df[export_cols].to_csv(
    '../results/clustering/player_clusters.csv', index=False)
print("\n‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ clustering")
