In [None]:
# Data Exploration - Premier League 2024-2025

Notebook này thực hiện khám phá và tiền xử lý dữ liệu Premier League 2024-2025.

## Mục tiêu:
1. Load và kiểm tra dữ liệu
2. Phân tích missing values
3. Thống kê mô tả
4. Feature engineering
5. Chuẩn bị dữ liệu cho các phân tích tiếp theo


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os

# Make the project's src directory importable. The path is relative to the
# notebook's working directory. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if '../src' not in sys.path:
    sys.path.append('../src')

from data_preprocessing import (
    load_data, explore_data, get_numeric_columns, 
    get_categorical_columns, feature_engineering_players,
    feature_engineering_teams, prepare_data_for_analysis
)

# Suppress every warning for the rest of the session so that library notices
# do not clutter cell output.
warnings.filterwarnings(action='ignore')

# Display configuration: show all DataFrame columns, cap printed rows at 100,
# use seaborn's "whitegrid" style and a 12x6 default figure size.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úÖ ƒê√£ import c√°c th∆∞ vi·ªán v√† modules c·∫ßn thi·∫øt")


## 1. Load dữ liệu


In [None]:
# Load every dataset. The result is used below as a mapping from dataset name
# to an object exposing `.shape` (presumably pandas DataFrames — confirm in
# data_preprocessing.load_data).
data = load_data()

# One summary line per dataset: its name followed by row and column counts.
print("üìä D·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c load:")
for name, df in data.items():
    n_rows, n_cols = df.shape[0], df.shape[1]
    print(f"  - {name}: {n_rows} rows, {n_cols} columns")

# Bind each dataset to its own variable for convenient access in later cells.
players_df, keepers_df, teams_for_df, teams_vs_df = (
    data[key] for key in ('players', 'keepers', 'teams_for', 'teams_vs')
)


## 2. Khám phá dữ liệu cầu thủ (Players)


In [None]:
# Run the shared exploration helper on the players table and keep its return
# value (the name suggests missing-value information — TODO confirm against
# explore_data).
missing_players = explore_data(players_df, "Players Dataset")

# Show the first three records under a 60-character banner.
print("\n" + "="*60, "SAMPLE DATA - First 3 rows:", "="*60, sep="\n")
print(players_df.head(3))

# Report the total column count and list the first twenty column names.
print("\n" + "="*60, "COLUMN NAMES:", "="*60, sep="\n")
print(f"Total columns: {len(players_df.columns)}")
print("\nFirst 20 columns:")
print(players_df.columns[:20].tolist())


In [None]:
# Numeric columns of the players table, leaving out 'Age' and 'Born'.
numeric_cols = get_numeric_columns(players_df, exclude_cols=['Age', 'Born'])

# Keep only the columns whose lower-cased name contains one of these keywords.
# This is a substring match, so one keyword may select several related columns.
stat_keywords = ('gls', 'ast', 'xg', 'xa', 'sh', 'sot', 'pass', 'tkl', 'touches')
important_cols = [
    column for column in numeric_cols
    if any(keyword in column.lower() for keyword in stat_keywords)
]

# Summarise at most the first 15 matching columns; print nothing if none matched.
if important_cols:
    print("üìà Th·ªëng k√™ m√¥ t·∫£ - C√°c ch·ªâ s·ªë quan tr·ªçng:")
    print(players_df[important_cols[:15]].describe())


In [None]:
# Player position breakdown: how many rows carry each value of the 'Pos' column.
# The whole step is skipped when the column is absent.
if 'Pos' in players_df.columns:
    # Count once and reuse the result for both the printed table and the chart.
    position_counts = players_df['Pos'].value_counts()

    print("üìä Ph√¢n b·ªë v·ªã tr√≠ c·∫ßu th·ªß:")
    print(position_counts)

    # Explicit figure/axes so every styling call targets this chart only.
    fig, ax = plt.subplots(figsize=(10, 6))
    position_counts.plot(kind='bar', color='steelblue', ax=ax)
    ax.set_title('Ph√¢n b·ªë v·ªã tr√≠ c·∫ßu th·ªß', fontsize=14, fontweight='bold')
    ax.set_xlabel('V·ªã tr√≠', fontweight='bold')
    ax.set_ylabel('S·ªë l∆∞·ª£ng', fontweight='bold')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    # savefig does not create missing directories; on a fresh checkout without
    # ../results the cell would stop with FileNotFoundError before showing the plot.
    os.makedirs('../results', exist_ok=True)
    fig.savefig('../results/position_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()


## 3. Khám phá dữ liệu đội bóng (Teams)


In [None]:
# Run the shared exploration helper on both team tables; the return values are
# not kept here.
explore_data(teams_for_df, "Teams For Dataset")
explore_data(teams_vs_df, "Teams VS Dataset")

# First rows of the "for" table under a 60-character banner.
print("\n" + "="*60, "TEAMS FOR - Sample:", "="*60, sep="\n")
print(teams_for_df.head())

# First rows of the "vs" table, same layout.
print("\n" + "="*60, "TEAMS VS - Sample:", "="*60, sep="\n")
print(teams_vs_df.head())


## 4. Feature Engineering


In [None]:
# Build additional player-level features. The result is bound to a separate
# name so the column counts before and after can be compared below.
players_df_enhanced = feature_engineering_players(players_df)

print("‚úÖ ƒê√£ t·∫°o c√°c features m·ªõi cho c·∫ßu th·ªß")
print(f"\nS·ªë c·ªôt tr∆∞·ªõc: {len(players_df.columns)}")
print(f"S·ªë c·ªôt sau: {len(players_df_enhanced.columns)}")
print("\nC√°c c·ªôt m·ªõi ƒë∆∞·ª£c th√™m:")
new_cols = set(players_df_enhanced.columns) - set(players_df.columns)
# Print the added columns in the enhanced frame's own column order. Iterating
# the set directly gives an arbitrary order that can change between kernel
# restarts, which makes the cell output non-reproducible.
print([col for col in players_df_enhanced.columns if col in new_cols])


In [None]:
# Build team-level features from the "for" and "vs" tables.
teams_merged = feature_engineering_teams(teams_for_df, teams_vs_df)

# The result is checked against None before use; nothing is printed in that case.
if teams_merged is not None:
    print(
        "‚úÖ ƒê√£ merge v√† t·∫°o features m·ªõi cho ƒë·ªôi b√≥ng",
        f"\nShape: {teams_merged.shape}",
        "\nSample data:",
        sep="\n",
    )
    print(teams_merged.head())


## 5. Lưu dữ liệu đã xử lý


In [None]:
# Final preparation of the enhanced players table, then write it to Excel
# without the index column.
players_processed = prepare_data_for_analysis(players_df_enhanced)
players_processed.to_excel(excel_writer='../data/players_processed.xlsx', index=False)
print("‚úÖ ƒê√£ l∆∞u players_processed.xlsx")

# The team table is prepared and written only when the feature-engineering
# step returned something other than None.
if teams_merged is not None:
    teams_processed = prepare_data_for_analysis(teams_merged, target_cols=['Squad'])
    teams_processed.to_excel(excel_writer='../data/teams_processed.xlsx', index=False)
    print("‚úÖ ƒê√£ l∆∞u teams_processed.xlsx")

print("\nüéâ Ho√†n t·∫•t kh√°m ph√° d·ªØ li·ªáu!")
