In [32]:
# General libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch
import seaborn as sns

# Machine Learning and Forecasting
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from statsmodels.tsa.seasonal import seasonal_decompose

# Time Series Forecasting
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima

# Clustering and Dimensionality Reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Miscellaneous
import warnings

In [37]:
import os
import pandas as pd

# Path to the directory containing the player stats CSV files
data_dir = "FIFA_datasets/laliga2023_34/player_stats"

# List all CSV files in the directory
csv_files = [file for file in os.listdir(data_dir) if file.endswith(".csv")]

# Categorize the files into groups based on their metrics
stat_groups = {
    "defensive_stats": [
        "player_clean_sheets.csv", "player_interceptions.csv", "player_effective_clearances.csv",
        "player_tackles_won.csv", "player_outfielder_blocks.csv", "player_saves_made.csv",
        "player_goals_conceded.csv", "player_penalties_conceded.csv"
    ],
    "attacking_stats": [
        "player_top_scorers.csv", "player_shots_per_90.csv", "player_on_target_scoring_attempts.csv",
        "player_shots_on_target_per_90.csv", "player_big_chances_created.csv",
        "player_big_chances_missed.csv"
    ],
    "expected_stats": [
        "player_expected_goals.csv", "player_expected_goals_per_90.csv", "player_expected_assists.csv",
        "player_expected_assists_per_90.csv", "player_expected_goals_on_target.csv"
    ],
    "misc_stats": [
        "player_yellow_cards.csv", "player_red_cards.csv", "player_player_ratings.csv",
        "player_penalties_won.csv", "player_possessions_won_attacking_third.csv"
    ]
}

# Function to load and combine CSV files for each stat group
def combine_stat_group(stat_files, group_name, suffix):
    group_dfs = []
    for file in stat_files:
        file_path = os.path.join(data_dir, file)
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            group_dfs.append(df)
        else:
            print(f"Warning: File '{file}' not found in directory '{data_dir}'")
    combined_df = pd.concat(group_dfs, ignore_index=True)
    combined_df = combined_df.add_suffix(suffix).rename(
        columns={f"Player{suffix}": "Player", f"Team{suffix}": "Team"}
    )
    print(f"Combined {group_name} DataFrame created with shape: {combined_df.shape}")
    return combined_df

# Combine stats for each group with appropriate suffixes
defensive_df = combine_stat_group(stat_groups["defensive_stats"], "Defensive Stats", "_defensive")
attacking_df = combine_stat_group(stat_groups["attacking_stats"], "Attacking Stats", "_attacking")
expected_df = combine_stat_group(stat_groups["expected_stats"], "Expected Stats", "_expected")
misc_df = combine_stat_group(stat_groups["misc_stats"], "Miscellaneous Stats", "_misc")

# Merge all groups into a master DataFrame
master_df = (
    defensive_df.merge(attacking_df, on=["Player", "Team"], how="outer", suffixes=("_defensive", "_attacking"))
    .merge(expected_df, on=["Player", "Team"], how="outer", suffixes=("", "_expected"))
    .merge(misc_df, on=["Player", "Team"], how="outer", suffixes=("", "_misc"))
)

# Save the merged DataFrame to a CSV file
output_file_path = "master_player_stats_full_outer.csv"
master_df.to_csv(output_file_path, index=False)
print(f"\nMaster dataset saved to '{output_file_path}'")

# Display the final DataFrame shape
print(f"\nFinal Master DataFrame Shape: {master_df.shape}")


Combined Defensive Stats DataFrame created with shape: (1404, 22)
Combined Attacking Stats DataFrame created with shape: (1200, 14)
Combined Expected Stats DataFrame created with shape: (2134, 15)
Combined Miscellaneous Stats DataFrame created with shape: (1203, 14)

Master dataset saved to 'master_player_stats_full_outer.csv'

Final Master DataFrame Shape: (65815, 59)


In [None]:
overall_df.head(100)

Unnamed: 0,Rank,Player,Team,Possessions Won in Final 3rd per 90,Possessions Won Midfield per 90,Minutes,Matches,Country,Rank_overall_1,FotMob Rating,Player of the Match Awards,Minutes_overall_1,Matches_overall_1,Country_overall_1
0,273.0,Abdelkabir Abqar,Alavés,0.1,1.5,2314.0,27.0,MAR,224.0,6.69,0.0,2314.0,27.0,MAR
1,104.0,Abderrahman Rebbach,Alavés,0.6,1.5,713.0,23.0,ALG,305.0,6.32,0.0,713.0,23.0,ALG
2,200.0,Abdessamad Ezzalzouli,Real Betis,0.3,2.0,921.0,28.0,MAR,293.0,6.42,0.0,921.0,28.0,MAR
3,182.0,Abdon Prats,Mallorca,0.4,1.4,1258.0,34.0,ESP,288.0,6.45,0.0,1258.0,34.0,ESP
4,279.0,Abdul Mumin,Rayo Vallecano,0.1,1.2,1695.0,20.0,GHA,190.0,6.76,0.0,1695.0,20.0,GHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,64.0,Fermin Lopez,Barcelona,0.7,2.4,1444.0,31.0,ESP,140.0,6.87,1.0,1444.0,31.0,ESP
96,78.0,Ferran Torres,Barcelona,0.7,1.5,1204.0,29.0,ESP,187.0,6.78,1.0,1204.0,29.0,ESP
97,,Filip Jörgensen,Villarreal,,,,,,234.0,6.65,2.0,3240.0,36.0,DEN
98,244.0,Florian Lejeune,Rayo Vallecano,0.2,1.6,3328.0,37.0,FRA,117.0,6.92,3.0,3328.0,37.0,FRA


In [None]:
passing_df.head(50)



Unnamed: 0,Rank_passing_x,Player,Team,Accurate Passes per 90_passing,Pass Success (%)_passing,Minutes_passing_x,Matches_passing_x,Country_passing_x,Rank_passing_y,Accurate Long Balls per 90_passing,Successful Long Balls (%)_passing,Minutes_passing_y,Matches_passing_y,Country_passing_y
0,248,Abdelkabir Abqar,Alavés,21.4,72.9,2314,27,MAR,46.0,3.1,36.9,2314.0,27.0,MAR
1,314,Abderrahman Rebbach,Alavés,12.5,66.9,713,23,ALG,264.0,0.6,41.7,713.0,23.0,ALG
2,230,Abdessamad Ezzalzouli,Real Betis,23.9,72.3,921,28,MAR,274.0,0.5,38.5,921.0,28.0,MAR
3,329,Abdon Prats,Mallorca,9.4,56.7,1258,34,ESP,270.0,0.5,53.8,1258.0,34.0,ESP
4,150,Abdul Mumin,Rayo Vallecano,32.7,80.2,1695,20,GHA,54.0,2.9,37.4,1695.0,20.0,GHA
5,160,Abner,Real Betis,31.8,81.0,1397,23,BRA,190.0,1.3,37.0,1397.0,23.0,BRA
6,200,Adrià Pedrosa,Sevilla,26.9,78.6,1749,31,ESP,218.0,1.1,29.3,1749.0,31.0,ESP
7,262,Adrián Embarba,Almería,20.1,75.6,2485,36,ESP,198.0,1.2,47.8,2485.0,36.0,ESP
8,162,Aihen Munoz,Real Sociedad,31.4,77.3,1120,21,ESP,153.0,1.7,36.2,1120.0,21.0,ESP
9,159,Aimar Oroz,Osasuna,31.8,82.2,2350,33,ESP,225.0,1.0,57.4,2350.0,33.0,ESP


In [None]:

scoring_df.head(50)

Unnamed: 0,Rank,Player,Team,Goals per 90,Total Goals,Minutes,Matches,Country,Expected Goals (xG),Actual Goals,Shots on Target per 90,Shot Accuracy (%),Goals,Penalties,Goals Conceded per 90,Total Goals Conceded,Expected Goals on Target (xGOT)
0,1,Cristhian Stuani,Girona,1.04,9.0,782,31,URU,,,,,,,,,
1,2,Bebé,Rayo Vallecano,0.93,2.0,193,19,CPV,,,,,,,,,
2,3,Artem Dovbyk,Girona,0.83,24.0,2605,36,UKR,,,,,,,,,
3,4,Alexander Sørloth,Villarreal,0.83,23.0,2491,34,NOR,,,,,,,,,
4,5,Jude Bellingham,Real Madrid,0.74,19.0,2323,28,ENG,,,,,,,,,
5,6,Vinicius Junior,Real Madrid,0.72,15.0,1872,26,BRA,,,,,,,,,
6,7,Álvaro Morata,Atlético de Madrid,0.71,15.0,1909,32,ESP,,,,,,,,,
7,8,Anastasios Douvikas,Celta de Vigo,0.71,7.0,886,32,GRE,,,,,,,,,
8,9,Robert Lewandowski,Barcelona,0.62,19.0,2758,35,POL,,,,,,,,,
9,10,Ante Budimir,Osasuna,0.62,17.0,2449,33,CRO,,,,,,,,,


In [None]:

defensive_df.head()

Unnamed: 0,Rank,Player,Team,Clean Sheets,Goals Conceded,Minutes,Matches,Country,Rank_defensive_1,Successful Dribbles per 90,...,Yellow Cards_defensive_8,Minutes_defensive_8,Matches_defensive_8,Country_defensive_8,Rank_defensive_9,Yellow Cards_defensive_9,Red Cards_defensive_9,Minutes_defensive_9,Matches_defensive_9,Country_defensive_9
0,,Aaron Escandell,Las Palmas,,,,,,,,...,0.0,152.0,2.0,ESP,,,,,,
1,,Abdelkabir Abqar,Alavés,,,,,,289.0,0.1,...,,,,,10.0,10.0,0.0,2314.0,27.0,MAR
2,,Abderrahman Rebbach,Alavés,,,,,,72.0,1.3,...,,,,,378.0,1.0,0.0,713.0,23.0,ALG
3,,Abdessamad Ezzalzouli,Real Betis,,,,,,21.0,2.2,...,,,,,378.0,1.0,0.0,921.0,28.0,MAR
4,,Abdon Prats,Mallorca,,,,,,170.0,0.6,...,,,,,229.0,3.0,0.0,1258.0,34.0,ESP


In [None]:

assists_df.head()

Unnamed: 0,Rank,Player,Team,Assists,Secondary Assists,Minutes,Matches,Country,Rank_assists,Expected Assists (xA),Actual Assists,Minutes_assists,Matches_assists,Country_assists,Rank_assists.1,Big Chances Created,Total Assists,Minutes_assists.1,Matches_assists.1,Country_assists.1
0,,Aaron Escandell,Las Palmas,,,,,,478.0,0.0,0.0,152.0,2.0,ESP,,,,,,
1,,Abdelkabir Abqar,Alavés,,,,,,389.0,0.2,0.0,2314.0,27.0,MAR,,,,,,
2,,Abderrahman Rebbach,Alavés,,,,,,152.0,1.7,0.0,713.0,23.0,ALG,226.0,2.0,0.0,713.0,23.0,ALG
3,,Abdessamad Ezzalzouli,Real Betis,,,,,,316.0,0.5,0.0,921.0,28.0,MAR,,,,,,
4,199.0,Abdon Prats,Mallorca,1.0,1.2,1258.0,34.0,ESP,190.0,1.2,1.0,1258.0,34.0,ESP,120.0,4.0,1.0,1258.0,34.0,ESP


In [None]:

misc_df.head()

Unnamed: 0,Rank,Player,Team,Red Cards,Yellow Cards,Minutes,Matches,Country,Rank_misc,Yellow Cards_misc,Red Cards_misc,Minutes_misc,Matches_misc,Country_misc,Rank_misc.1,FotMob Rating,Player of the Match Awards,Minutes_misc.1,Matches_misc.1,Country_misc.1
0,72.0,Aaron Escandell,Las Palmas,1.0,0.0,152.0,2.0,ESP,,,,,,,,,,,,
1,,Abdelkabir Abqar,Alavés,,,,,,10.0,10.0,0.0,2314.0,27.0,MAR,224.0,6.69,0.0,2314.0,27.0,MAR
2,,Abderrahman Rebbach,Alavés,,,,,,378.0,1.0,0.0,713.0,23.0,ALG,305.0,6.32,0.0,713.0,23.0,ALG
3,,Abdessamad Ezzalzouli,Real Betis,,,,,,378.0,1.0,0.0,921.0,28.0,MAR,293.0,6.42,0.0,921.0,28.0,MAR
4,,Abdon Prats,Mallorca,,,,,,229.0,3.0,0.0,1258.0,34.0,ESP,288.0,6.45,0.0,1258.0,34.0,ESP


In [None]:
# Function to calculate the percentage of null values for a DataFrame
def calculate_null_percentage(df, df_name):
    null_percentage = df.isnull().mean() * 100
    print(f"\n--- Null Percentage for {df_name} ---")
    print(null_percentage[null_percentage > 0])  # Only display columns with nulls

# Calculate null percentages for each DataFrame
calculate_null_percentage(passing_df, "Passing Statistics")
calculate_null_percentage(scoring_df, "Scoring Statistics")
calculate_null_percentage(defensive_df, "Defensive Statistics")
calculate_null_percentage(assists_and_creativity_df, "Assists and Creativity")
calculate_null_percentage(misc_performance_df, "Miscellaneous Performance Metrics")



--- Null Percentage for Passing Statistics ---
Rank_passing_y                        5.688623
Accurate Long Balls per 90_passing    5.688623
Successful Long Balls (%)_passing     5.688623
Minutes_passing_y                     5.688623
Matches_passing_y                     5.688623
Country_passing_y                     5.688623
dtype: float64

--- Null Percentage for Scoring Statistics ---
Goals per 90                       87.104338
Total Goals                        87.104338
Expected Goals (xG)                71.512309
Actual Goals                       47.596717
Shots on Target per 90             82.590856
Shot Accuracy (%)                  82.590856
Goals                              83.821805
Penalties                          83.821805
Goals Conceded per 90              98.886284
Total Goals Conceded               98.886284
Expected Goals on Target (xGOT)    76.084408
dtype: float64

--- Null Percentage for Defensive Statistics ---
Rank                          93.762994
Clean S