In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import math

## Data Cleaning and Preprocessing

In [2]:
spanish_squads = ['Sevilla', 'Sporting Huelva', 'Athletic Club', 'Levante Planas',
       'UDG Tenerife', 'Villarreal', 'Madrid CFF', 'Barcelona',
       'Atlético Madrid', 'Real Madrid', 'Alhama', 'Alavés',
       'Real Sociedad', 'Levante', 'Real Betis', 'Valencia']

explanable_cols = ['Player','Nation','Pos','Squad','Age','Born','Starts','Min','Gls','Total_Att','Blocks_Blocks','Blocks_Sh','Blocks_Pass','Clr','Err','Touches_Touches','Touches_DefPen','Dribbles_Succ','Dribbles_Att','Dribbles_Mis','AerialDuels_Won','AerialDuels_Lost']

spanish_players = pd.read_csv('assets/all_players.csv')
spanish_players = spanish_players[spanish_players['Squad'].isin(spanish_squads)]
spanish_players = spanish_players[explanable_cols]

# Update the Age column by substracting the Born column from the current year if Born is not null
spanish_players['Age'] = spanish_players['Born'].apply(lambda x: (2023- x) if x != np.nan else np.nan)

# If the Nation is NaN, replace it with Spain
spanish_players['Nation'] = spanish_players['Nation'].fillna('es ESP')

# In the spanish_players dataframe, if the datatype is float64, change the datatype to int64
for col in spanish_players.columns:
    if spanish_players[col].dtype == 'float64':
        spanish_players[col] = spanish_players[col].astype('int64')

cols = spanish_players.columns.to_list()
display(spanish_players.sample(5), spanish_players.shape)

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,Starts,Min,Gls,Total_Att,...,Blocks_Pass,Clr,Err,Touches_Touches,Touches_DefPen,Dribbles_Succ,Dribbles_Att,Dribbles_Mis,AerialDuels_Won,AerialDuels_Lost
48,Berta Bou,es ESP,DF,Levante Planas,24,1999,9,638,0,219,...,3,32,0,307,63,1,2,4,4,7
53,Nagore Calderón,es ESP,MF,Sevilla,30,1993,7,643,0,236,...,7,2,0,283,9,1,4,4,2,0
245,Olivia Oprea,ro ROU,DF,Alhama,36,1987,4,335,0,110,...,3,9,1,153,24,0,1,1,4,2
35,Laia Ballesté,es ESP,"DF,FW",Sporting Huelva,24,1999,5,450,1,123,...,3,12,0,158,22,0,0,3,2,2
137,Yenifer Giménez,ve VEN,DF,Villarreal,27,1996,10,862,0,367,...,12,21,1,467,26,3,7,7,6,3


(358, 22)

In [3]:
matches = pd.read_csv('assets/matches-checkpoint.csv')
# Make sure 'Home' or 'Away' is in the spanish_squads
matches = matches[(matches['Home'].isin(spanish_squads)) | (matches['Away'].isin(spanish_squads))]

useless_ids = ['Away_id','Home_id','Match_id','League_id']
matches = matches.drop(useless_ids, axis=1).reset_index(drop=True)

matches['Date'] = pd.to_datetime(matches['Date'])
display(matches.sample(5), matches.shape)

Unnamed: 0,Wk,Day,Date,Time,Home,xGHome,Score,xGAway,Away,xPHome,xPAway,ScoreHome,ScoreAway
50,1,Wed,2022-11-02,19:00,Sporting Huelva,0.7,1–1,2.0,Sevilla,0.57,2.23,1,1
7,2,Sun,2022-09-18,18:00,Real Betis,0.3,1–2,1.3,Levante Planas,0.54,2.17,1,2
21,4,Sun,2022-10-02,16:00,Sevilla,1.7,2–2,1.3,Real Sociedad,1.64,1.12,2,2
36,6,Sun,2022-10-23,11:00,Levante,1.3,2–2,0.9,Real Madrid,1.65,1.06,2,2
76,10,Sun,2022-11-27,12:00,Alavés,2.6,3–1,1.3,Real Betis,2.14,0.69,3,1


(87, 13)

## Data Transformation

In [4]:
display(matches.sample(5), matches.info(), matches.columns.to_list())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Wk         87 non-null     int64         
 1   Day        87 non-null     object        
 2   Date       87 non-null     datetime64[ns]
 3   Time       87 non-null     object        
 4   Home       87 non-null     object        
 5   xGHome     87 non-null     float64       
 6   Score      87 non-null     object        
 7   xGAway     87 non-null     float64       
 8   Away       87 non-null     object        
 9   xPHome     87 non-null     float64       
 10  xPAway     87 non-null     float64       
 11  ScoreHome  87 non-null     int64         
 12  ScoreAway  87 non-null     int64         
dtypes: datetime64[ns](1), float64(4), int64(3), object(5)
memory usage: 9.0+ KB


Unnamed: 0,Wk,Day,Date,Time,Home,xGHome,Score,xGAway,Away,xPHome,xPAway,ScoreHome,ScoreAway
34,6,Sat,2022-10-22,18:15,Real Betis,0.0,0–3,2.8,Barcelona,0.06,2.88,0,3
30,5,Sun,2022-10-16,18:00,Madrid CFF,2.1,4–0,0.2,Real Betis,2.62,0.23,4,0
35,6,Sun,2022-10-23,11:00,UDG Tenerife,1.3,2–0,0.6,Villarreal,1.9,0.81,2,0
23,5,Sat,2022-10-15,12:00,Levante Planas,0.3,1–1,2.2,Levante,0.26,2.59,1,1
64,9,Sat,2022-11-19,16:00,Sporting Huelva,0.9,0–1,1.6,Real Madrid,0.89,1.86,0,1


None

['Wk',
 'Day',
 'Date',
 'Time',
 'Home',
 'xGHome',
 'Score',
 'xGAway',
 'Away',
 'xPHome',
 'xPAway',
 'ScoreHome',
 'ScoreAway']

### Feature Engineering
Calculate derived metrics

In [9]:
# Goal Difference: You can calculate the goal difference by subtracting the "ScoreAway" from the "ScoreHome" column. This metric gives you the difference in goals scored between the home and away teams in each match.
matches['GoalDifference'] = matches['ScoreHome'] - matches['ScoreAway']
# Expected Goals Difference: Similar to the goal difference, you can calculate the expected goals difference by subtracting the "xGAway" from the "xGHome" column. This metric represents the difference in expected goals between the home and away teams in each match.
matches['ExpectedGoalDifference'] = matches['xGHome'] - matches['xGAway']
# Points: You can calculate the points earned by each team using a scoring system (e.g., 3 points for a win, 1 point for a draw, and 0 points for a loss). You can create a new column called "Points" and assign the corresponding points based on the match result in the "Score" column.
matches['Points'] = matches['Score'].apply(lambda x: 3 if x[0] > x[2] else 1 if x[0] == x[2] else 0)
# Expected Points: Similar to the points metric, you can calculate the expected points earned by each team using a similar scoring system but based on the expected goals (e.g., 3 points for xGHome > xGAway, 1 point for xGHome = xGAway, and 0 points for xGHome < xGAway). You can create a new column called "ExpectedPoints" and assign the corresponding expected points based on the expected goals in the "xGHome" and "xGAway" columns.
matches['ExpectedPoints'] = matches['Score'].apply(lambda x: 3 if x[0] > x[2] else 1 if x[0] == x[2] else 0)
# Win Percentage: You can calculate the win percentage for each team by dividing the number of wins (based on the "Score" column) by the total number of matches played.
# matches['WinPercentage'] = matches['Score'].apply(lambda x: 1 if x[0] > x[2] else 0)
# matches['WinPercentage'] = matches.groupby('Home')['Score'].apply(lambda x: (pd.to_numeric(x.str.replace('–', '-')) > 0).sum()) / matches['Home'].value_counts() * 100

display(matches.sample(15), matches.shape)

ValueError: Unable to parse string "1-2" at position 0

3. **Feature Engineering**: Create new features or derive additional information from existing columns. For example, you can calculate derived metrics like goals per minute, successful dribble percentage, or aerial duel success rate based on the existing columns.

4. **Data Aggregation**: Aggregate the data to a higher level of granularity. For example, you can calculate summary statistics like the total number of goals scored by each player, the average age of players in each squad, or the total number of blocks made by each nation.

5. **Data Filtering**: Filter the dataframe based on specific conditions or criteria. For example, you can filter the dataframe to include only players above a certain age, players from a specific nation, or players who have scored a certain number of goals.

6. **Data Transformation**: Apply mathematical or statistical transformations to the data. For example, you can normalize numeric columns, apply logarithmic transformations, or standardize the data using z-scores.

7. **Feature Scaling**: Scale the numeric features to a common range to avoid bias in the analysis. For example, you can use techniques like min-max scaling or standardization to scale numeric columns like 'Starts', 'Min', 'Gls', etc., to a common range.

8. **Data Encoding**: Encode categorical variables into numerical representations. For example, you can use techniques like one-hot encoding or label encoding to represent categorical columns like 'Pos', 'Squad', etc., as numerical values.