# Imports & Global Settings (Top of Notebook)

In [1]:
import pandas as pd 
import numpy as np

# Notebook-wide display settings: never truncate columns when rendering a frame
# (max_columns=None), and cap rendered rows at 100.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# Load Raw Data

In [2]:
# Read the raw match and roster CSV exports from the working directory.
match_df = pd.read_csv("understat_match_1524.csv")
roster_df = pd.read_csv("understat_roster_1524.csv")

# Report each frame's (rows, columns) in a single call; sep="\n" keeps one line per frame.
print(
    f"Match Data Shape:  {match_df.shape}",
    f"Roster Data Shape:  {roster_df.shape}",
    sep="\n",
)

# Keep the preview as the cell's last expression so it renders as a table.
match_df.head()

Match Data Shape:  (3420, 25)
Roster Data Shape:  (96091, 22)


Unnamed: 0,id,fid,h,a,date,league_id,season,h_goals,a_goals,team_h,team_a,h_xg,a_xg,h_w,h_d,h_l,league,h_shot,a_shot,h_shotOnTarget,a_shotOnTarget,h_deep,a_deep,a_ppda,h_ppda
0,81,958431,89,82,2015-08-08 15:45:00,1,2015,1,0,Manchester United,Tottenham,0.627539,0.6746,0.2859,0.3905,0.3236,EPL,9,9,1,4,4,10,8.2188,13.8261
1,82,958427,73,71,2015-08-08 18:00:00,1,2015,0,1,Bournemouth,Aston Villa,0.876106,0.782253,0.3435,0.3607,0.2958,EPL,11,7,2,3,11,2,11.8462,6.9
2,83,958429,72,90,2015-08-08 18:00:00,1,2015,2,2,Everton,Watford,0.604226,0.557892,0.2988,0.4337,0.2675,EPL,10,11,5,5,5,4,17.1579,6.65
3,84,958430,75,77,2015-08-08 18:00:00,1,2015,4,2,Leicester,Sunderland,2.56803,1.45946,0.6422,0.2057,0.1521,EPL,19,11,8,5,5,6,9.5556,10.88
4,85,958433,79,78,2015-08-08 18:00:00,1,2015,1,3,Norwich,Crystal Palace,1.13076,2.10975,0.1461,0.2159,0.638,EPL,17,11,6,7,5,10,10.625,5.7368


In [3]:
# Preview the first rows of the raw roster table.
roster_df.head()

Unnamed: 0,id,goals,own_goals,shots,xG,time,player_id,team_id,position,player,h_a,yellow_card,red_card,roster_in,roster_out,key_passes,assists,xA,xGChain,xGBuildup,positionOrder,match_link
0,620382,0,0,0,0.0,90,560,89,GK,Sergio Romero,h,0,0,0,0,0,0,0.0,0.0,0.0,1,https://understat.com/match/81
1,620383,0,0,0,0.0,82,557,89,DR,Matteo Darmian,h,0,0,620393,0,1,0,0.106513,0.106513,0.106513,2,https://understat.com/match/81
2,620385,0,0,0,0.0,90,628,89,DC,Chris Smalling,h,0,0,0,0,0,0,0.0,0.106513,0.106513,3,https://understat.com/match/81
3,620384,0,0,0,0.0,90,548,89,DC,Daley Blind,h,0,0,0,0,0,0,0.0,0.127738,0.127738,3,https://understat.com/match/81
4,620386,0,0,0,0.0,90,1006,89,DL,Luke Shaw,h,0,0,0,0,0,0,0.0,0.021225,0.021225,4,https://understat.com/match/81


# Initial Data Audit

In [4]:
def data_audit(df, name):
    """Print a quick quality report for a DataFrame.

    The report contains, in order: a banner with ``name``, a rich preview of
    the first three rows, the ``df.info()`` summary, per-column missing-value
    counts, and the number of fully duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    name : str
        Label interpolated into the banner line.

    Returns
    -------
    None
        Everything is emitted via ``print`` / ``display``.

    Notes
    -----
    ``display`` is not imported here; it is expected to be supplied by the
    notebook (IPython) environment.
    """
    banner = f"\n===== {name} Data Audit ====="
    print(banner)

    # Rich preview of the leading rows.
    preview = df.head(3)
    display(preview)

    # Dtypes and non-null counts; info() writes straight to stdout.
    print("\nInfo:")
    df.info()

    # Missing values per column.
    null_counts = df.isna().sum()
    print("\nMissing Values:")
    print(null_counts)

    # Rows that exactly repeat an earlier row across all columns.
    duplicate_total = df.duplicated().sum()
    print("\nDuplicate Rows:", duplicate_total)

# Audit the match table as loaded, before any cleaning step runs.
data_audit(match_df, "Match")



===== Match Data Audit =====


Unnamed: 0,id,fid,h,a,date,league_id,season,h_goals,a_goals,team_h,team_a,h_xg,a_xg,h_w,h_d,h_l,league,h_shot,a_shot,h_shotOnTarget,a_shotOnTarget,h_deep,a_deep,a_ppda,h_ppda
0,81,958431,89,82,2015-08-08 15:45:00,1,2015,1,0,Manchester United,Tottenham,0.627539,0.6746,0.2859,0.3905,0.3236,EPL,9,9,1,4,4,10,8.2188,13.8261
1,82,958427,73,71,2015-08-08 18:00:00,1,2015,0,1,Bournemouth,Aston Villa,0.876106,0.782253,0.3435,0.3607,0.2958,EPL,11,7,2,3,11,2,11.8462,6.9
2,83,958429,72,90,2015-08-08 18:00:00,1,2015,2,2,Everton,Watford,0.604226,0.557892,0.2988,0.4337,0.2675,EPL,10,11,5,5,5,4,17.1579,6.65



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3420 entries, 0 to 3419
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              3420 non-null   int64  
 1   fid             3420 non-null   int64  
 2   h               3420 non-null   int64  
 3   a               3420 non-null   int64  
 4   date            3420 non-null   object 
 5   league_id       3420 non-null   int64  
 6   season          3420 non-null   int64  
 7   h_goals         3420 non-null   int64  
 8   a_goals         3420 non-null   int64  
 9   team_h          3420 non-null   object 
 10  team_a          3420 non-null   object 
 11  h_xg            3420 non-null   float64
 12  a_xg            3420 non-null   float64
 13  h_w             3420 non-null   float64
 14  h_d             3420 non-null   float64
 15  h_l             3420 non-null   float64
 16  league          3420 non-null   object 
 17  h_shot          3420 non-n

In [5]:
# Audit the roster table as loaded, before any cleaning step runs.
data_audit(roster_df, "Roster")


===== Roster Data Audit =====


Unnamed: 0,id,goals,own_goals,shots,xG,time,player_id,team_id,position,player,h_a,yellow_card,red_card,roster_in,roster_out,key_passes,assists,xA,xGChain,xGBuildup,positionOrder,match_link
0,620382,0,0,0,0.0,90,560,89,GK,Sergio Romero,h,0,0,0,0,0,0,0.0,0.0,0.0,1,https://understat.com/match/81
1,620383,0,0,0,0.0,82,557,89,DR,Matteo Darmian,h,0,0,620393,0,1,0,0.106513,0.106513,0.106513,2,https://understat.com/match/81
2,620385,0,0,0,0.0,90,628,89,DC,Chris Smalling,h,0,0,0,0,0,0,0.0,0.106513,0.106513,3,https://understat.com/match/81



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96091 entries, 0 to 96090
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             96091 non-null  int64  
 1   goals          96091 non-null  int64  
 2   own_goals      96091 non-null  int64  
 3   shots          96091 non-null  int64  
 4   xG             96091 non-null  float64
 5   time           96091 non-null  int64  
 6   player_id      96091 non-null  int64  
 7   team_id        96091 non-null  int64  
 8   position       96091 non-null  object 
 9   player         96091 non-null  object 
 10  h_a            96091 non-null  object 
 11  yellow_card    96091 non-null  int64  
 12  red_card       96091 non-null  int64  
 13  roster_in      96091 non-null  int64  
 14  roster_out     96091 non-null  int64  
 15  key_passes     96091 non-null  int64  
 16  assists        96091 non-null  int64  
 17  xA             96091 non-null  float64
 18 

# Cleaning Match Dataset

In [None]:
# Normalize column names: trim surrounding whitespace, lower-case,
# and turn embedded spaces into underscores.
match_df.columns = [
    label.strip().lower().replace(" ", "_") for label in match_df.columns
]

match_df.head()

In [6]:
# Remove rows that exactly repeat an earlier row, and report how many were dropped.
before = len(match_df.index)
match_df = match_df.drop_duplicates()
after = len(match_df.index)

print(f"Removed {before - after} duplicate rows from match data")

Removed 0 duplicate rows from match data


In [7]:
# Re-check dtypes and non-null counts after de-duplication.
match_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3420 entries, 0 to 3419
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              3420 non-null   int64  
 1   fid             3420 non-null   int64  
 2   h               3420 non-null   int64  
 3   a               3420 non-null   int64  
 4   date            3420 non-null   object 
 5   league_id       3420 non-null   int64  
 6   season          3420 non-null   int64  
 7   h_goals         3420 non-null   int64  
 8   a_goals         3420 non-null   int64  
 9   team_h          3420 non-null   object 
 10  team_a          3420 non-null   object 
 11  h_xg            3420 non-null   float64
 12  a_xg            3420 non-null   float64
 13  h_w             3420 non-null   float64
 14  h_d             3420 non-null   float64
 15  h_l             3420 non-null   float64
 16  league          3420 non-null   object 
 17  h_shot          3420 non-null   i

In [8]:
# Rename abbreviated columns, then convert the score/xG columns explicitly.
# rename() is re-assigned rather than run with inplace=True, so the cell builds
# a new frame instead of mutating one that earlier cells already displayed.
match_df = match_df.rename(columns={
    "h_goals": "home_goals",
    "a_goals": "away_goals",
    "h_xg": "home_xg",
    "a_xg": "away_xg",
    "team_h": "home_team",
    "team_a": "away_team",
})

numeric_cols = [
    "home_goals",
    "away_goals",
    "home_xg",
    "away_xg",
]

# errors="coerce" turns any value that cannot be parsed as a number into NaN
# instead of raising.
match_df[numeric_cols] = match_df[numeric_cols].apply(pd.to_numeric, errors="coerce")

match_df.head()

Unnamed: 0,id,fid,h,a,date,league_id,season,home_goals,away_goals,home_team,away_team,home_xg,away_xg,h_w,h_d,h_l,league,h_shot,a_shot,h_shotOnTarget,a_shotOnTarget,h_deep,a_deep,a_ppda,h_ppda
0,81,958431,89,82,2015-08-08 15:45:00,1,2015,1,0,Manchester United,Tottenham,0.627539,0.6746,0.2859,0.3905,0.3236,EPL,9,9,1,4,4,10,8.2188,13.8261
1,82,958427,73,71,2015-08-08 18:00:00,1,2015,0,1,Bournemouth,Aston Villa,0.876106,0.782253,0.3435,0.3607,0.2958,EPL,11,7,2,3,11,2,11.8462,6.9
2,83,958429,72,90,2015-08-08 18:00:00,1,2015,2,2,Everton,Watford,0.604226,0.557892,0.2988,0.4337,0.2675,EPL,10,11,5,5,5,4,17.1579,6.65
3,84,958430,75,77,2015-08-08 18:00:00,1,2015,4,2,Leicester,Sunderland,2.56803,1.45946,0.6422,0.2057,0.1521,EPL,19,11,8,5,5,6,9.5556,10.88
4,85,958433,79,78,2015-08-08 18:00:00,1,2015,1,3,Norwich,Crystal Palace,1.13076,2.10975,0.1461,0.2159,0.638,EPL,17,11,6,7,5,10,10.625,5.7368


In [9]:
print(list(match_df.columns))

# Handle missing values
#  A row is discarded when ANY of home team, away team or date is missing.
match_df = match_df.dropna(subset=["home_team", "away_team", "date"], how="any")

#  Missing xG is imputed as 0 for both sides; the goal columns are not touched here.
#  NOTE(review): an imputed 0 cannot be told apart from a genuine xG of 0 - confirm this is intended.
match_df = match_df.fillna({"home_xg": 0, "away_xg": 0})

['id', 'fid', 'h', 'a', 'date', 'league_id', 'season', 'home_goals', 'away_goals', 'home_team', 'away_team', 'home_xg', 'away_xg', 'h_w', 'h_d', 'h_l', 'league', 'h_shot', 'a_shot', 'h_shotOnTarget', 'a_shotOnTarget', 'h_deep', 'a_deep', 'a_ppda', 'h_ppda']


In [None]:
# Convert date column
# Values that cannot be parsed become NaT (errors="coerce"); rows left without
# a valid date are then removed.
match_df = (
    match_df
    .assign(date=pd.to_datetime(match_df["date"], errors="coerce"))
    .dropna(subset=["date"])
)

0      2015-08-08 15:45:00
1      2015-08-08 18:00:00
2      2015-08-08 18:00:00
3      2015-08-08 18:00:00
4      2015-08-08 18:00:00
               ...        
3415   2024-05-19 15:00:00
3416   2024-05-19 15:00:00
3417   2024-05-19 15:00:00
3418   2024-05-19 15:00:00
3419   2024-05-19 15:00:00
Name: date, Length: 3420, dtype: datetime64[ns]

In [None]:
# Feature Engineering
# Home-minus-away differentials for goals and for xG.
match_df = match_df.assign(
    goal_diff=match_df["home_goals"].sub(match_df["away_goals"]),
    xg_diff=match_df["home_xg"].sub(match_df["away_xg"]),
)

In [None]:
# Final Validation
# Re-run the audit on the cleaned match frame to review dtypes, missing values and duplicates.

data_audit(match_df, "Cleaned Match")

# Cleaning Roster Dataset

## Normalize Column Names

In [None]:
# Trim surrounding whitespace, lower-case, and turn embedded spaces into
# underscores (so e.g. "xG" becomes "xg").
roster_df.columns = [
    label.strip().lower().replace(" ", "_") for label in roster_df.columns
]

## Remove Duplicates

In [None]:
# Remove rows that exactly repeat an earlier row, and report how many were dropped.
before = len(roster_df.index)
roster_df = roster_df.drop_duplicates()
after = len(roster_df.index)

print(f"Removed {before - after} duplicate rows from roster data")

## Convert Numeric Columns

In [None]:
# List the current roster column names for reference.
roster_df.columns.tolist()

In [None]:
# Roster columns that must be numeric. The lower-case "xg"/"xa" names rely on
# the column-normalization cell having run first.
numeric_cols = ["time", "goals", "xg", "assists", "xa"]

# Convert all of them in one pass; errors="coerce" maps unparseable values to NaN.
roster_df[numeric_cols] = roster_df[numeric_cols].apply(pd.to_numeric, errors="coerce")

## Handle Missing Values

In [None]:
# Drop rows lacking a player_id or team_id, then fill gaps in the numeric
# performance columns with 0.
# NOTE(review): rows with a missing `player` name are NOT dropped by this step - confirm intent.
roster_df = (
    roster_df
    .dropna(subset=["player_id", "team_id"])
    .fillna(dict.fromkeys(numeric_cols, 0))
)

## Standardize player names

In [None]:
# Trim surrounding whitespace and lower-case every player name.
roster_df["player"] = roster_df["player"].str.strip().str.lower()

## Remove players who never played

In [None]:
# Keep only rows whose `time` is strictly positive; this also protects the
# per-90 divisions that follow from dividing by zero.
roster_df = roster_df.loc[roster_df["time"].gt(0)]

## Per-90 Feature Engineering

In [None]:
# Per-90 rates: each stat divided by `time`, then multiplied by 90.
roster_df = roster_df.assign(
    goals_per_90=roster_df["goals"].div(roster_df["time"]).mul(90),
    xg_per_90=roster_df["xg"].div(roster_df["time"]).mul(90),
    xa_per_90=roster_df["xa"].div(roster_df["time"]).mul(90),
)

In [None]:
# Handling infinities: any +inf / -inf anywhere in the frame is replaced with 0.
roster_df = roster_df.replace(to_replace=[np.inf, -np.inf], value=0)

## Final Validation

In [None]:
# Re-run the audit on the cleaned roster frame to review dtypes, missing values and duplicates.
data_audit(roster_df, "Cleaned Roster")

# Save Cleaned Data

In [None]:
# Write both cleaned frames to CSV in the working directory; index=False keeps
# the row index out of the files.
match_df.to_csv(path_or_buf="cleaned_understat_match_1524.csv", index=False)
roster_df.to_csv(path_or_buf="cleaned_understat_roster_1524.csv", index=False)