In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from matplotlib.ticker import FuncFormatter

In [4]:
# Plot style: reset matplotlib to its 'default' style sheet (this replaced the
# earlier 'seaborn' style name), then apply seaborn's theme on top of it.
# This only needs to be done once per session.
plt.style.use('default')
sns.set_theme()

# Directory that holds the CSV files: "data" under the current working directory
csv_dir = os.path.join(os.getcwd(), "data")

def load_csv(filename, data_dir=None):
    """Load a CSV file from the data directory and return it as a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, relative to the data directory.
    data_dir : str or path-like, optional
        Directory to read from. When omitted, the notebook-level ``csv_dir``
        is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        Re-raised after printing diagnostics when the file cannot be found.
    """
    base_dir = csv_dir if data_dir is None else data_dir
    file_path = os.path.join(base_dir, filename)
    print(f"Loading {filename}...")
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Could not find {file_path}")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Files in current directory: {os.listdir('.')}")
        # The file is looked up in the data directory, not the working
        # directory, so show what is actually there (if it exists at all).
        if os.path.isdir(base_dir):
            print(f"Files in data directory {base_dir}: {os.listdir(base_dir)}")
        else:
            print(f"Data directory does not exist: {base_dir}")
        raise

# Read the four source tables used by the rest of the notebook
players_df = load_csv("players.csv")
clubs_df = load_csv("clubs.csv")
player_valuations_df = load_csv("player_valuations.csv")
appearances_df = load_csv("appearances.csv")

# Quick sanity check: report the (rows, columns) shape of every table
print(
    "\nDataset shapes:",
    f"Players: {players_df.shape}",
    f"Clubs: {clubs_df.shape}",
    f"Player Valuations: {player_valuations_df.shape}",
    f"Appearances: {appearances_df.shape}",
    sep="\n",
)

Loading players.csv...
Loading clubs.csv...
Loading player_valuations.csv...
Loading appearances.csv...

Dataset shapes:
Players: (32601, 23)
Clubs: (439, 17)
Player Valuations: (496606, 5)
Appearances: (1706806, 13)


In [8]:
def analyze_actual_vs_earned_market_values():
    """
    Prepare the comparison of actual market values against "earned" market values.

    The "earned" market value is meant to be what a player should be worth given
    their performance metrics and other relevant factors.

    As written, this function only performs the preparation steps: it merges
    ``players_df`` with ``player_valuations_df``, reports which market-value
    column it would use, and checks ``appearances_df`` for the performance
    columns it expects. It prints its progress and returns ``None``.
    """
    print("\n=== ANALYZING ACTUAL VS 'EARNED' MARKET VALUES ===")

    # Step 1: inner-join players with their valuation records on player_id
    print("Merging players with their valuations...")
    player_values = pd.merge(
        left=players_df,
        right=player_valuations_df,
        on='player_id',
        how='inner',
    )
    merged_columns = player_values.columns

    # Show what the merge produced
    print("\nMerged player_values columns:")
    print(merged_columns.tolist())

    # Any column whose name mentions 'market_value' (case-insensitive)
    market_value_cols = [name for name in merged_columns if 'market_value' in name.lower()]
    print(f"\nMarket value columns: {market_value_cols}")

    # Prefer the '_y'-suffixed column when the merge created one (pandas adds
    # '_x'/'_y' suffixes to overlapping column names); otherwise fall back to
    # the unsuffixed name.
    if 'market_value_in_eur_y' in merged_columns:
        market_value_col = 'market_value_in_eur_y'
    else:
        market_value_col = 'market_value_in_eur'
    print(f"Using market value column: {market_value_col}")

    # Step 2: make sure the appearances table has the columns needed for aggregation
    print("Adding player performance metrics...")

    expected_columns = ['player_id', 'goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards']
    available = appearances_df.columns
    missing_columns = [col for col in expected_columns if col not in available]

    # Nothing missing: no further work is done here
    if not missing_columns:
        return

    print(f"Warning: Missing columns in appearances_df: {missing_columns}")
    print("Please check the appearances data structure")

    # Fill absent metric columns with zeros (for demonstration purposes).
    # NOTE(review): this writes into the module-level appearances_df in place,
    # so the placeholder columns remain visible to later cells.
    for col in missing_columns:
        if col == 'player_id':
            # player_id is expected to exist already; never fabricate it
            continue
        appearances_df[col] = 0
    


In [9]:

# NOTE(review): X and y are not defined in the cells visible here, and this cell
# relies on names from the imports cell — run that cell first in a fresh kernel.

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply them
# to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest regressor (chosen for robustness to outliers and complex
# relationships), fitted on the scaled training data
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Predicted ("earned") market values for the held-out samples
y_pred = model.predict(X_test_scaled)

# Score the predictions: coefficient of determination and root-mean-squared error
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Model performance: R² = {r2:.4f}, RMSE = {rmse:.4f}")
 

NameError: name 'train_test_split' is not defined