# Classification - Phân loại cầu thủ và đội bóng

Notebook này thực hiện phân loại sử dụng Random Forest và Decision Tree.

## Mục tiêu:
1. Phân loại vị trí cầu thủ
2. Phân loại đội bóng vào Top 4
3. Phân loại hiệu suất cầu thủ
4. Đánh giá và so sánh models


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append('../src')
from classification import (
    classify_player_position, classify_team_top4, classify_player_performance,
    evaluate_classification, get_feature_importance
)
from sklearn.metrics import confusion_matrix

# Default seaborn theme and default figure size for plots that do not
# pass their own figsize.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})

# Confirmation message once the setup above has run.
print("‚úÖ ƒê√£ import c√°c modules c·∫ßn thi·∫øt")


## 1. Phân loại vị trí cầu thủ


In [None]:
# Player-position classification: load the player table, train/evaluate the
# Random Forest and Decision Tree models, then plot feature importance and
# the Random Forest confusion matrix.
from pathlib import Path

# Load the processed player data; if that fails, rebuild it from the raw data.
try:
    players_df = pd.read_excel('../data/players_processed.xlsx')
except Exception:
    # Best-effort fallback on any load error. `Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate.
    from data_preprocessing import load_data, feature_engineering_players, prepare_data_for_analysis
    data = load_data()
    players_df = feature_engineering_players(data['players'])
    players_df = prepare_data_for_analysis(players_df)

# Classify player positions
print("="*70)
print("PH√ÇN LO·∫†I V·ªä TR√ç C·∫¶U TH·ª¶")
print("="*70)

position_results = classify_player_position(players_df, min_samples_per_class=10)

if position_results:
    # Evaluate Random Forest
    rf_metrics = evaluate_classification(position_results['random_forest'], 'Random Forest')
    print(f"\nüìä Random Forest Metrics:")
    for metric, value in rf_metrics.items():
        print(f"  {metric}: {value:.3f}")

    # Evaluate Decision Tree
    dt_metrics = evaluate_classification(position_results['decision_tree'], 'Decision Tree')
    print(f"\nüìä Decision Tree Metrics:")
    for metric, value in dt_metrics.items():
        print(f"  {metric}: {value:.3f}")

    # Feature importance of the Random Forest model
    rf_importance = get_feature_importance(
        position_results['random_forest']['model'],
        position_results['random_forest']['feature_names'],
        top_n=15
    )

    if rf_importance is not None:
        print("\nüìà Top 15 Features (Random Forest):")
        print(rf_importance)

        # Visualize: feature importance (left) and confusion matrix (right)
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Feature importance: horizontal bars for the first 10 rows
        ax1 = axes[0]
        top_features = rf_importance.head(10)
        ax1.barh(range(len(top_features)), top_features['importance'], color='steelblue')
        ax1.set_yticks(range(len(top_features)))
        ax1.set_yticklabels(top_features['feature'], fontsize=9)
        ax1.set_xlabel('Importance', fontweight='bold')
        ax1.set_title('Top 10 Feature Importance (Random Forest)', fontweight='bold')
        # Put the first row at the top of the chart
        ax1.invert_yaxis()

        # Confusion Matrix
        ax2 = axes[1]
        cm = confusion_matrix(position_results['random_forest']['y_test'],
                              position_results['random_forest']['predictions'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2)
        ax2.set_xlabel('Predicted', fontweight='bold')
        ax2.set_ylabel('Actual', fontweight='bold')
        ax2.set_title('Confusion Matrix (Random Forest)', fontweight='bold')

        plt.tight_layout()
        # savefig does not create missing directories, so make sure the
        # output folder exists before writing the figure.
        figure_path = Path('../results/classification/position_classification.png')
        figure_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.show()


## 2. Phân loại đội bóng Top 4


In [None]:
# Top-4 team classification: load the team table, train/evaluate the
# Random Forest and Decision Tree models, and plot both confusion matrices.
from pathlib import Path

# Load the processed team data; if that fails, rebuild it from the raw data.
try:
    teams_df = pd.read_excel('../data/teams_processed.xlsx')
except Exception:
    # Best-effort fallback on any load error. `Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate.
    from data_preprocessing import load_data, feature_engineering_teams, prepare_data_for_analysis
    data = load_data()
    teams_merged = feature_engineering_teams(data['teams_for'], data['teams_vs'])
    if teams_merged is not None:
        teams_df = prepare_data_for_analysis(teams_merged, target_cols=['Squad'])
    else:
        # No merged team table available; the section below is skipped.
        teams_df = None

if teams_df is not None:
    print("="*70)
    print("PH√ÇN LO·∫†I ƒê·ªòI B√ìNG TOP 4")
    print("="*70)

    top4_results = classify_team_top4(teams_df)

    if top4_results:
        # Evaluate both models before printing their metrics
        rf_metrics = evaluate_classification(top4_results['random_forest'], 'Random Forest')
        dt_metrics = evaluate_classification(top4_results['decision_tree'], 'Decision Tree')

        print(f"\nüìä Random Forest Metrics:")
        for metric, value in rf_metrics.items():
            print(f"  {metric}: {value:.3f}")

        print(f"\nüìä Decision Tree Metrics:")
        for metric, value in dt_metrics.items():
            print(f"  {metric}: {value:.3f}")

        # Visualize: one confusion matrix per model, side by side
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # NOTE(review): the fixed two-entry tick labels assume both classes
        # appear in y_test/predictions so each matrix is 2x2 - confirm.

        # Confusion Matrix RF
        ax1 = axes[0]
        cm_rf = confusion_matrix(top4_results['random_forest']['y_test'],
                                 top4_results['random_forest']['predictions'])
        sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens', ax=ax1,
                    xticklabels=['Not Top 4', 'Top 4'], yticklabels=['Not Top 4', 'Top 4'])
        ax1.set_title('Confusion Matrix - Random Forest', fontweight='bold')

        # Confusion Matrix DT
        ax2 = axes[1]
        cm_dt = confusion_matrix(top4_results['decision_tree']['y_test'],
                                 top4_results['decision_tree']['predictions'])
        sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Oranges', ax=ax2,
                    xticklabels=['Not Top 4', 'Top 4'], yticklabels=['Not Top 4', 'Top 4'])
        ax2.set_title('Confusion Matrix - Decision Tree', fontweight='bold')

        plt.tight_layout()
        # savefig does not create missing directories, so make sure the
        # output folder exists before writing the figure.
        figure_path = Path('../results/classification/top4_classification.png')
        figure_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.show()
else:
    print("‚ö†Ô∏è Kh√¥ng c√≥ d·ªØ li·ªáu ƒë·ªôi b√≥ng")


In [None]:
# Player-performance classification: train/evaluate the Random Forest and
# Decision Tree models and plot the Random Forest confusion matrix.
# `players_df` is not loaded here; it must already be defined by an earlier
# cell of this notebook.
from pathlib import Path

print("="*70)
print("PH√ÇN LO·∫†I HI·ªÜU SU·∫§T C·∫¶U TH·ª¶")
print("="*70)

performance_results = classify_player_performance(players_df)

if performance_results:
    # Evaluate both models before printing their metrics
    rf_metrics = evaluate_classification(performance_results['random_forest'], 'Random Forest')
    dt_metrics = evaluate_classification(performance_results['decision_tree'], 'Decision Tree')

    print(f"\nüìä Random Forest Metrics:")
    for metric, value in rf_metrics.items():
        print(f"  {metric}: {value:.3f}")

    print(f"\nüìä Decision Tree Metrics:")
    for metric, value in dt_metrics.items():
        print(f"  {metric}: {value:.3f}")

    # Visualize the Random Forest confusion matrix
    fig, ax = plt.subplots(figsize=(10, 8))
    cm = confusion_matrix(performance_results['random_forest']['y_test'],
                          performance_results['random_forest']['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis', ax=ax)
    # Label the axes with the class names stored on the label encoder.
    # NOTE(review): assumes the matrix has one row/column per entry of
    # le.classes_; a class missing from both y_test and predictions would
    # make the tick-label count differ - confirm.
    le = performance_results['random_forest']['label_encoder']
    class_names = le.classes_
    ax.set_xticklabels(class_names, rotation=45, ha='right')
    ax.set_yticklabels(class_names, rotation=0)
    ax.set_xlabel('Predicted', fontweight='bold')
    ax.set_ylabel('Actual', fontweight='bold')
    ax.set_title('Confusion Matrix - Performance Classification', fontweight='bold')
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the
    # output folder exists before writing the figure.
    figure_path = Path('../results/classification/performance_classification.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("\n‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ classification")
