In [8]:
# General libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning and Forecasting
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Time Series Forecasting
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima

# Clustering and Dimensionality Reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Miscellaneous
import warnings
# NOTE(review): with no category or module argument this filter ignores every
# warning raised from this point on, including deprecation notices from the
# libraries imported above. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")




# Exploratory Data Analysis: Team Stats

In this section, we analyze team-level data from the La Liga 2023/24 season. The goal is to uncover trends, patterns, and relationships in the data that can guide feature engineering and model building. Key areas of focus include:

- **Performance Metrics:** Goals scored, goals conceded, clean sheets, possession percentages, and more.
- **Offensive Metrics:** Expected goals (xG), shots on target, and big chances created.
- **Defensive Metrics:** Tackles, interceptions, and goals conceded.
- **Discipline:** Fouls committed, yellow cards, and red cards.
- **Overall Ratings:** FotMob team ratings and standings.

Below, we explore the distribution of each metric, compare team performance, and identify potential outliers or anomalies.


In [9]:
# Define the directory containing the datasets
# NOTE(review): the merge cell further down reassigns dataset_dir to
# "FIFA_datasets/laliga2023_34/team_stats" before any file is read, so this
# value appears to be unused — confirm and keep only one definition.
dataset_dir = "team_stats"

# Setting Up Initial Dataframes for La Liga Team Analysis

## Introduction
To effectively analyze La Liga team performance for the 2023/24 season, the data will be grouped into three key categories: **Offensive Stats**, **Defensive Stats**, and **Overall Team Rankings**. These categories will allow for a focused investigation into team strengths and weaknesses, as well as comprehensive comparisons across various performance metrics.

---

## Offensive Stats
The following files are associated with offensive performance metrics:
- **`team_goals_per_match.csv`**: Goals scored per match.
- **`big_chance_team.csv`**: Big chances created.
- **`accurate_cross_team.csv`**: Accuracy of crosses.
- **`accurate_pass_team.csv`**: Passing accuracy.
- **`possession_percentage_team.csv`**: Possession statistics.
- **`touches_in_opp_box_team.csv`**: Touches in the opponent's box.
- **`ontarget_scoring_att_team.csv`**: Shots on target and shot conversion rate.
- **`corner_taken_team.csv`**: Corners taken.
- **`penalty_won_team.csv`**: Penalties won.

### Plan:
These files will be merged into a single dataframe using the `Team` column to create a comprehensive offensive stats dataset.

---


In [10]:
# CSV files holding the attacking metrics, one entry per line for easy diffing
offensive_files = [
    'team_goals_per_match.csv',
    'big_chance_team.csv',
    'accurate_cross_team.csv',
    'accurate_pass_team.csv',
    'possession_percentage_team.csv',
    'touches_in_opp_box_team.csv',
    'ontarget_scoring_att_team.csv',
    'corner_taken_team.csv',
    'penalty_won_team.csv',
]



## Defensive Stats
The following files are associated with defensive performance metrics:
- **`clean_sheet_team.csv`**: Clean sheets by teams.
- **`effective_clearance_team.csv`**: Clearances made.
- **`interception_team.csv`**: Interceptions per match.
- **`won_tackle_team.csv`**: Successful tackles.
- **`goals_conceded_team_match.csv`**: Goals conceded per match.
- **`expected_goals_conceded_team.csv`**: Expected goals conceded.
- **`penalty_conceded_team.csv`**: Penalties conceded.
- **`total_red_card_team.csv`**: Red card counts.
- **`total_yel_card_team.csv`**: Yellow card counts.

### Plan:
These files will be merged into a single dataframe using the `Team` column to create a comprehensive defensive stats dataset.

---


In [11]:
# CSV files holding the defensive and discipline metrics, one entry per line
defensive_files = [
    'clean_sheet_team.csv',
    'effective_clearance_team.csv',
    'interception_team.csv',
    'won_tackle_team.csv',
    'goals_conceded_team_match.csv',
    'expected_goals_conceded_team.csv',
    'penalty_conceded_team.csv',
    'total_red_card_team.csv',
    'total_yel_card_team.csv',
]


## Overall Team Rankings
The following files provide team-level rankings and performance metrics:
- **`Laliga_table_2023_24.csv`**: Overall rankings, matches played, wins, losses, points.
- **`Laliga_table_home_2023_24.csv`**: Home performance.
- **`Laliga_table_away_2023_24.csv`**: Away performance.
- **`Laliga_table_xg_2023_24.csv`**: Expected goals (xG) and expected points.

### Plan:
These files will be merged into a single dataframe keyed on `Team` (a `name` column, where present, is renamed to `Team` first) to facilitate overall team performance comparisons.

---

In [12]:
# CSV files holding the league tables (overall, home, away, xG), one entry per line
ranking_files = [
    'Laliga_table_2023_24.csv',
    'Laliga_table_home_2023_24.csv',
    'Laliga_table_away_2023_24.csv',
    'Laliga_table_xg_2023_24.csv',
]

---

## Next Steps
The three resulting dataframes (**Offensive Stats**, **Defensive Stats**, and **Overall Team Rankings**) will provide a structured basis for exploratory data analysis (EDA), feature engineering, and machine learning applications. The `Team` column will serve as the key for merging these datasets.

---

In [13]:
# Update dataset directory to match the actual structure
# NOTE(review): the folder is spelled "laliga2023_34" (not "2023_24"); it is kept
# as-is because it has to match the directory name on disk — confirm.
dataset_dir = "FIFA_datasets/laliga2023_34/team_stats"

# Offensive files
offensive_files = [
    'team_goals_per_match.csv', 'ontarget_scoring_att_team.csv', 'accurate_pass_team.csv',
    'accurate_cross_team.csv', 'accurate_long_balls_team.csv', 'big_chance_team.csv',
    'touches_in_opp_box_team.csv', 'possession_percentage_team.csv', 'possession_won_att_3rd_team.csv'
]

# Defensive files
defensive_files = [
    'clean_sheet_team.csv', 'effective_clearance_team.csv', 'interception_team.csv',
    'won_tackle_team.csv', 'penalty_conceded_team.csv', 'goals_conceded_team_match.csv',
    'expected_goals_conceded_team.csv', 'total_red_card_team.csv', 'total_yel_card_team.csv'
]

# Ranking files
ranking_files = [
    'Laliga_table_2023_24.csv', 'Laliga_table_home_2023_24.csv', 'Laliga_table_away_2023_24.csv',
    'Laliga_table_xg_2023_24.csv', 'team_ratings.csv'
]


def merge_stat_files(file_names, data_dir, key='Team', rename_columns=None):
    """Read a list of CSV files and left-merge them into one dataframe.

    The first readable file that contains `key` becomes the base frame; every
    later file is left-merged onto it on `key`. Columns of a later file that
    clash with existing ones get the suffix ``_<file name up to the first dot>``,
    while the columns already in the merged frame keep their names.

    Parameters
    ----------
    file_names : iterable of str
        CSV file names, resolved relative to `data_dir`, merged in this order.
    data_dir : str
        Directory that contains the CSV files.
    key : str, default 'Team'
        Column to merge on.
    rename_columns : dict or None, default None
        Optional ``{old: new}`` mapping applied to each file right after it is
        read (e.g. ``{'name': 'Team'}``). Labels missing from a file are ignored.

    Returns
    -------
    pandas.DataFrame
        The merged frame, or an empty dataframe when no file could be used.

    Notes
    -----
    Missing files and files without the `key` column are reported with a
    printed message and skipped, so one bad file does not abort the whole load.
    """
    merged = pd.DataFrame()
    for file_name in file_names:
        file_path = os.path.join(data_dir, file_name)
        try:
            frame = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            continue

        if rename_columns:
            frame = frame.rename(columns=rename_columns)

        # Without the key column the merge below would raise a KeyError.
        if key not in frame.columns:
            print(f"Skipping {file_path}: no '{key}' column")
            continue

        if merged.empty:
            merged = frame
        else:
            merged = merged.merge(
                frame,
                on=key,
                how='left',
                suffixes=('', f'_{file_name.split(".")[0]}'),
            )
    return merged


# Build the three dataframes; the league tables name the team column `name`,
# so it is renamed to `Team` before merging.
offensive_df = merge_stat_files(offensive_files, dataset_dir)
defensive_df = merge_stat_files(defensive_files, dataset_dir)
ranking_df = merge_stat_files(ranking_files, dataset_dir, rename_columns={'name': 'Team'})

# Drop duplicate columns after merging
def drop_duplicate_columns(df):
    """Return `df` with repeated column labels reduced to their first occurrence.

    The previous implementation called ``df.drop(columns=[dup])``, which removes
    every column carrying the duplicated label, including the first one, so the
    data was lost entirely instead of being de-duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels may contain repeats.

    Returns
    -------
    pandas.DataFrame
        `df` itself when all labels are already unique; otherwise a new frame
        that keeps, for each label, only the left-most column. The input frame
        is not modified, so callers must use the return value.
    """
    if not df.columns.has_duplicates:
        return df
    # duplicated() flags the second and later occurrences; keep everything else.
    return df.loc[:, ~df.columns.duplicated()]

# Run the duplicate-column cleanup over the three merged dataframes in one pass
offensive_df, defensive_df, ranking_df = [
    drop_duplicate_columns(frame)
    for frame in (offensive_df, defensive_df, ranking_df)
]

In [17]:
# Preview up to the first 21 rows of the merged offensive dataframe
offensive_df.head(21)

Unnamed: 0,Rank,Team,Goals per Match,Total Goals Scored,Matches,Country,Rank_ontarget_scoring_att_team,Shots on Target per Match,Shot Conversion Rate (%),Matches_ontarget_scoring_att_team,...,Country_touches_in_opp_box_team,Rank_possession_percentage_team,Possession (%),Matches_possession_percentage_team,Country_possession_percentage_team,Rank_possession_won_att_3rd_team,Possession Won Final 3rd per Match,Total Possessions Won,Matches_possession_won_att_3rd_team,Country_possession_won_att_3rd_team
0,1,Real Madrid,2.3,87.0,38,ESP,1,6.6,14.6,38,...,ESP,3,59.4,38,ESP,2,4.7,781.0,38,ESP
1,2,Girona,2.2,85.0,38,ESP,4,5.1,17.6,38,...,ESP,4,57.2,38,ESP,17,3.6,746.0,38,ESP
2,3,Barcelona,2.1,79.0,38,ESP,2,6.1,13.3,38,...,ESP,1,64.7,38,ESP,5,4.5,811.0,38,ESP
3,4,Atletico Madrid,1.8,70.0,38,ESP,3,5.4,14.6,38,...,ESP,6,50.7,38,ESP,6,4.5,801.0,38,ESP
4,5,Villarreal,1.7,65.0,38,ESP,5,4.5,13.9,38,...,ESP,10,49.6,38,ESP,12,4.0,711.0,38,ESP
5,6,Athletic Club,1.6,61.0,38,ESP,6,4.5,13.2,38,...,ESP,9,49.7,38,ESP,1,5.8,945.0,38,ESP
6,7,Real Sociedad,1.3,51.0,38,ESP,12,4.1,11.1,38,...,ESP,5,55.6,38,ESP,3,4.6,856.0,38,ESP
7,8,Real Betis,1.3,48.0,38,ESP,13,4.1,9.8,38,...,ESP,7,50.5,38,ESP,13,3.9,833.0,38,ESP
8,8,Sevilla,1.3,48.0,38,ESP,9,4.3,9.8,38,...,ESP,8,50.3,38,ESP,9,4.2,782.0,38,ESP
9,10,Celta Vigo,1.2,46.0,38,ESP,10,4.2,9.8,38,...,ESP,13,46.2,38,ESP,14,3.9,807.0,38,ESP


In [15]:
# Preview the first rows of the merged defensive dataframe
defensive_df.head()

Unnamed: 0,Rank,Team,Clean Sheets,Matches,Country,Rank_effective_clearance_team,Clearances per Match,Total Clearances,Matches_effective_clearance_team,Country_effective_clearance_team,...,Rank_total_red_card_team,Red Cards,Yellow Cards,Matches_total_red_card_team,Country_total_red_card_team,Rank_total_yel_card_team,Yellow Cards_total_yel_card_team,Red Cards_total_yel_card_team,Matches_total_yel_card_team,Country_total_yel_card_team
0,1,Real Madrid,21.0,38,ESP,20,12.8,486.0,38,ESP,...,14,4.0,67.0,38,ESP,18,67.0,4.0,38,ESP
1,2,Athletic Club,18.0,38,ESP,18,15.7,596.0,38,ESP,...,9,5.0,75.0,38,ESP,16,75.0,5.0,38,ESP
2,3,Barcelona,17.0,38,ESP,19,14.4,547.0,38,ESP,...,18,2.0,85.0,38,ESP,11,85.0,2.0,38,ESP
3,4,Real Sociedad,15.0,38,ESP,16,17.6,670.0,38,ESP,...,17,2.0,98.0,38,ESP,7,98.0,2.0,38,ESP
4,5,Atletico Madrid,13.0,38,ESP,10,19.1,726.0,38,ESP,...,6,5.0,82.0,38,ESP,13,82.0,5.0,38,ESP


In [16]:
# Preview the first rows of the merged ranking dataframe
ranking_df.head()

Unnamed: 0,idx,Team,played,wins,draws,losses,scoresStr,goalConDiff,pts,idx_Laliga_table_home_2023_24,...,xg,xgConceded,xPoints,xgDiff,xgConcededDiff,pts_Laliga_table_xg_2023_24,Rank,FotMob Team Rating,Matches,Country
0,1,Real Madrid,38,29,8,1,87-26,61,95,1,...,69.7642,35.8447,74.50569,17.2358,-9.8447,95,1,7.26,38,ESP
1,2,Barcelona,38,26,7,5,79-44,35,85,4,...,78.4744,41.95,76.865361,0.5256,2.05,85,2,7.1,38,ESP
2,3,Girona,38,25,6,7,85-46,39,81,3,...,71.815,55.3311,63.560373,13.185,-9.3311,81,3,7.03,38,ESP
3,4,Atletico Madrid,38,24,4,10,70-43,27,76,2,...,63.0209,39.8447,67.758065,6.9791,3.1553,76,4,6.97,38,ESP
4,5,Athletic Club,38,19,11,8,61-37,24,68,5,...,53.236,41.9482,58.832578,7.764,-4.9482,68,5,6.9,38,ESP
