# EPL 2025 Analysis - Transformation Pipeline

This notebook prepares English Premier League 2024–25 data for analysis and modeling by:
- Connecting to Azure Data Lake
- Loading and cleaning raw match data
- Performing feature engineering
- Saving processed data

# Access to Data Lake

## 🔐 Step 1: Authentication & Configuration

Authenticate with Azure Data Lake using Service Principal to access the raw dataset.

In [0]:
# Replace the following values with your own Azure credentials
client_id = "client_id"         # aka Service Principal App ID
tenant_id = "tenant_id"           # Azure AD tenant
client_secret = "secret_value"      # secret value of the service principal
storage_account_name = "dl_storage"  # Azure Storage account name
container_name = "raw"          # Azure Storage container name

# Set Spark configs to use OAuth with Service Principal
spark.conf.set(f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{storage_account_name}.dfs.core.windows.net", 
               "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{storage_account_name}.dfs.core.windows.net", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{storage_account_name}.dfs.core.windows.net", client_secret)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{storage_account_name}.dfs.core.windows.net", 
               f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")


# Load Data

## 📥 Step 2: Load Raw EPL Dataset

We load the raw CSV file from the data lake and preview the first few rows to understand the structure.

In [0]:
# Load CSV from ADLS Gen2

# Set the file path
file_name = "epl_raw_2024_25.csv" # replace with your file name
file_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{file_name}" # replace with your file path

# Read the CSV file
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \ 
    .load(file_path)


# View the schema
df.printSchema()

# Display the first 5 rows
display(df.limit(5))

root
 |-- Div: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- FTHG: integer (nullable = true)
 |-- FTAG: integer (nullable = true)
 |-- FTR: string (nullable = true)
 |-- HTHG: integer (nullable = true)
 |-- HTAG: integer (nullable = true)
 |-- HTR: string (nullable = true)
 |-- Referee: string (nullable = true)
 |-- HS: integer (nullable = true)
 |-- AS: integer (nullable = true)
 |-- HST: integer (nullable = true)
 |-- AST: integer (nullable = true)
 |-- HF: integer (nullable = true)
 |-- AF: integer (nullable = true)
 |-- HC: integer (nullable = true)
 |-- AC: integer (nullable = true)
 |-- HY: integer (nullable = true)
 |-- AY: integer (nullable = true)
 |-- HR: integer (nullable = true)
 |-- AR: integer (nullable = true)
 |-- B365H: double (nullable = true)
 |-- B365D: double (nullable = true)
 |-- B365A: double (nullable = true)
 |-- BWH: double 

Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,BFH,BFD,BFA,PSH,PSD,PSA,WHH,WHD,WHA,1XBH,1XBD,1XBA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,BFEH,BFED,BFEA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,BFE>2.5,BFE<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,BFEAHH,BFEAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,BFCH,BFCD,BFCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,1XBCH,1XBCD,1XBCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,BFECH,BFECD,BFECA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,BFEC>2.5,BFEC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA
E0,2024-08-16,2025-05-29T20:00:00Z,Man United,Fulham,1,0,H,0,0,D,R Jones,14,10,5,2,12,10,7,8,2,3,0,0,1.6,4.2,5.25,1.6,4.4,5.25,1.6,4.33,5.0,1.63,4.38,5.3,1.65,4.2,5.0,1.68,4.32,5.03,1.68,4.5,5.5,1.62,4.36,5.15,1.66,4.5,5.6,1.53,2.5,1.56,2.56,1.57,2.6,1.53,2.52,1.59,2.64,-1.0,2.05,1.88,2.07,1.86,2.07,1.89,2.03,1.85,2.1,1.88,1.67,4.1,5.0,1.65,4.2,4.8,1.62,4.0,5.0,1.65,4.23,5.28,1.6,4.2,5.5,1.66,4.15,5.33,1.7,4.33,5.5,1.66,4.2,5.02,1.72,4.2,5.4,1.62,2.3,1.63,2.38,1.66,2.45,1.61,2.37,1.68,2.46,-0.75,1.86,2.07,1.83,2.11,1.88,2.11,1.82,2.05,1.9,2.08
E0,2024-08-17,2025-05-29T12:30:00Z,Ipswich,Liverpool,0,2,A,0,0,D,T Robinson,7,18,2,5,9,18,2,10,3,1,0,0,8.5,5.5,1.33,7.5,5.5,1.36,8.5,5.5,1.33,8.18,5.84,1.34,8.5,5.5,1.33,8.6,5.85,1.35,9.0,6.1,1.37,8.28,5.76,1.34,9.4,6.2,1.36,1.4,3.0,1.41,3.0,1.43,3.07,1.41,2.94,1.45,3.15,1.5,2.02,1.91,1.99,1.92,2.02,1.95,1.97,1.9,2.04,1.93,8.0,5.75,1.33,8.0,5.75,1.34,7.5,5.5,1.33,8.14,6.09,1.34,8.0,5.5,1.35,8.57,5.85,1.34,8.57,6.25,1.39,7.87,5.81,1.35,8.6,6.2,1.37,1.36,3.2,1.37,3.3,1.4,3.38,1.37,3.18,1.4,3.4,1.5,2.05,1.88,2.04,1.9,2.2,2.0,1.99,1.88,2.04,1.93
E0,2024-08-17,2025-05-29T15:00:00Z,Arsenal,Wolves,2,0,H,1,0,H,J Gillett,18,9,6,3,17,14,8,2,2,2,0,0,1.18,7.5,13.0,1.2,7.5,13.5,1.15,8.0,16.0,1.16,8.56,16.22,1.18,7.0,17.0,1.2,7.65,16.0,1.2,9.1,18.0,1.18,7.86,15.87,1.19,9.0,18.0,1.44,2.75,1.46,2.79,1.5,2.82,1.46,2.7,1.53,2.84,-2.0,1.93,2.0,1.88,2.0,1.97,2.0,1.9,1.96,1.94,2.0,1.14,8.5,15.0,1.16,8.5,18.0,1.13,8.5,17.0,1.15,9.05,18.76,1.15,8.0,19.0,1.16,9.39,16.6,1.17,9.4,21.0,1.15,8.62,18.11,1.17,9.4,21.0,1.4,3.0,1.41,2.98,1.45,3.0,1.42,2.93,1.44,3.2,-2.25,2.02,1.91,2.0,1.9,2.05,1.93,1.99,1.87,2.02,1.96
E0,2024-08-17,2025-05-29T15:00:00Z,Everton,Brighton,0,3,A,0,1,A,S Hooper,9,10,1,5,8,8,1,5,1,1,1,0,2.63,3.3,2.63,2.65,3.4,2.6,2.7,3.4,2.63,2.73,3.36,2.71,2.6,3.5,2.7,2.68,3.66,2.63,2.76,3.66,2.78,2.67,3.41,2.68,2.78,3.5,2.78,1.8,2.0,1.83,2.05,1.85,2.08,1.81,2.04,1.88,2.08,0.0,1.96,1.97,1.96,1.94,1.97,1.97,1.94,1.94,1.99,1.99,3.1,3.4,2.3,3.0,3.4,2.37,3.0,3.3,2.3,3.15,3.41,2.4,3.1,3.3,2.4,3.16,3.47,2.34,3.16,3.5,2.45,3.06,3.4,2.38,3.15,3.55,2.46,1.93,1.97,1.93,1.97,1.95,2.0,1.89,1.96,1.94,2.04,0.25,1.87,2.06,1.86,2.07,1.92,2.1,1.83,2.04,1.88,2.11
E0,2024-08-17,2025-05-29T15:00:00Z,Newcastle,Southampton,1,0,H,1,0,H,C Pawson,3,19,1,4,15,16,3,12,2,4,1,0,1.36,5.25,8.0,1.35,5.5,7.75,1.33,5.5,8.5,1.35,5.7,8.25,1.35,5.5,8.0,1.37,5.74,8.1,1.37,5.9,8.6,1.35,5.62,8.1,1.37,6.0,9.2,1.4,3.0,1.4,3.09,1.42,3.12,1.4,3.01,1.43,3.15,-1.5,1.98,1.95,1.94,1.96,1.99,1.98,1.93,1.93,1.94,2.01,1.4,5.0,7.0,1.39,5.25,7.25,1.36,5.0,7.5,1.42,5.3,7.26,1.4,5.0,7.5,1.39,5.34,7.9,1.44,5.75,8.0,1.39,5.27,7.33,1.43,5.5,8.2,1.44,2.75,1.46,2.85,1.46,3.05,1.43,2.84,1.49,2.98,-1.25,1.87,2.06,1.88,2.06,1.89,2.1,1.82,2.05,1.89,2.1


# Transformation

## 🧹 Step 3: Remove Betting Odds Columns

These columns are excluded as they won't be used in our analysis and modeling. We filter them out based on known prefixes.

In [0]:
# List prefixes for betting odds columns
betting_prefixes = [
    "B365", "BF", "PS", "WH", "1XB", "Max", "Avg", "BFE", "P", "GB", "Bb",
    "SO", "SB", "SJ", "SY", "VC", "BW", "PA", "PC", "AH", "AHh", "AHCh", "CA"
]

# Get columns to drop based on prefix matching
cols_to_drop = [col for col in df.columns if any(col.startswith(prefix) for prefix in betting_prefixes)]

# Drop these columns
df_clean = df.drop(*cols_to_drop)

# Display the first 5 rows
display(df_clean.limit(5))

Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
E0,2024-08-16,2025-05-29T20:00:00Z,Man United,Fulham,1,0,H,0,0,D,R Jones,14,10,5,2,12,10,7,8,2,3,0,0
E0,2024-08-17,2025-05-29T12:30:00Z,Ipswich,Liverpool,0,2,A,0,0,D,T Robinson,7,18,2,5,9,18,2,10,3,1,0,0
E0,2024-08-17,2025-05-29T15:00:00Z,Arsenal,Wolves,2,0,H,1,0,H,J Gillett,18,9,6,3,17,14,8,2,2,2,0,0
E0,2024-08-17,2025-05-29T15:00:00Z,Everton,Brighton,0,3,A,0,1,A,S Hooper,9,10,1,5,8,8,1,5,1,1,1,0
E0,2024-08-17,2025-05-29T15:00:00Z,Newcastle,Southampton,1,0,H,1,0,H,C Pawson,3,19,1,4,15,16,3,12,2,4,1,0


## 🏷️ Step 4: Rename Columns for Clarity

We rename cryptic columns (e.g., `FTHG`, `HTR`, etc.) to human-readable names like `full_time_home_goals` and `half_time_result`.

In [0]:
# Rename columns using a dictionary
rename_map = {
    "Div": "league_division",
    "Date": "match_date",
    "Time": "kickoff_time",
    "HomeTeam": "home_team",
    "AwayTeam": "away_team",
    "FTHG": "full_time_home_goals",
    "FTAG": "full_time_away_goals",
    "FTR": "full_time_result",
    "HTHG": "half_time_home_goals",
    "HTAG": "half_time_away_goals",
    "HTR": "half_time_result",
    "Referee": "referee",
    "HS": "home_shots",
    "AS": "away_shots",
    "HST": "home_shots_on_target",
    "AST": "away_shots_on_target",
    "HF": "home_fouls",
    "AF": "away_fouls",
    "HC": "home_corners",
    "AC": "away_corners",
    "HY": "home_yellow_cards",
    "AY": "away_yellow_cards",
    "HR": "home_red_cards",
    "AR": "away_red_cards",
    "Attendance": "attendance",
    "HBP": "home_booking_points",
    "ABP": "away_booking_points"
}

# Use a loop to rename columns
for old_name, new_name in rename_map.items():
    if old_name in df_clean.columns:
        df_clean = df_clean.withColumnRenamed(old_name, new_name) # iterate through key-value pairs in dictionary

# Check schema and preview
df_clean.printSchema()
display(df_clean.limit(5))

root
 |-- league_division: string (nullable = true)
 |-- match_date: date (nullable = true)
 |-- kickoff_time: timestamp (nullable = true)
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- full_time_home_goals: integer (nullable = true)
 |-- full_time_away_goals: integer (nullable = true)
 |-- full_time_result: string (nullable = true)
 |-- half_time_home_goals: integer (nullable = true)
 |-- half_time_away_goals: integer (nullable = true)
 |-- half_time_result: string (nullable = true)
 |-- referee: string (nullable = true)
 |-- home_shots: integer (nullable = true)
 |-- away_shots: integer (nullable = true)
 |-- home_shots_on_target: integer (nullable = true)
 |-- away_shots_on_target: integer (nullable = true)
 |-- home_fouls: integer (nullable = true)
 |-- away_fouls: integer (nullable = true)
 |-- home_corners: integer (nullable = true)
 |-- away_corners: integer (nullable = true)
 |-- home_yellow_cards: integer (nullable = true)
 |-- away_yell

league_division,match_date,kickoff_time,home_team,away_team,full_time_home_goals,full_time_away_goals,full_time_result,half_time_home_goals,half_time_away_goals,half_time_result,referee,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_fouls,away_fouls,home_corners,away_corners,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards
E0,2024-08-16,2025-05-29T20:00:00Z,Man United,Fulham,1,0,H,0,0,D,R Jones,14,10,5,2,12,10,7,8,2,3,0,0
E0,2024-08-17,2025-05-29T12:30:00Z,Ipswich,Liverpool,0,2,A,0,0,D,T Robinson,7,18,2,5,9,18,2,10,3,1,0,0
E0,2024-08-17,2025-05-29T15:00:00Z,Arsenal,Wolves,2,0,H,1,0,H,J Gillett,18,9,6,3,17,14,8,2,2,2,0,0
E0,2024-08-17,2025-05-29T15:00:00Z,Everton,Brighton,0,3,A,0,1,A,S Hooper,9,10,1,5,8,8,1,5,1,1,1,0
E0,2024-08-17,2025-05-29T15:00:00Z,Newcastle,Southampton,1,0,H,1,0,H,C Pawson,3,19,1,4,15,16,3,12,2,4,1,0


In [0]:
from pyspark.sql.functions import col, sum

df_clean.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_clean.columns]).show()

+---------------+----------+------------+---------+---------+--------------------+--------------------+----------------+--------------------+--------------------+----------------+-------+----------+----------+--------------------+--------------------+----------+----------+------------+------------+-----------------+-----------------+--------------+--------------+
|league_division|match_date|kickoff_time|home_team|away_team|full_time_home_goals|full_time_away_goals|full_time_result|half_time_home_goals|half_time_away_goals|half_time_result|referee|home_shots|away_shots|home_shots_on_target|away_shots_on_target|home_fouls|away_fouls|home_corners|away_corners|home_yellow_cards|away_yellow_cards|home_red_cards|away_red_cards|
+---------------+----------+------------+---------+---------+--------------------+--------------------+----------------+--------------------+--------------------+----------------+-------+----------+----------+--------------------+--------------------+----------+------

In [0]:
df_clean.count(), df_clean.dropDuplicates().count()
# df_clean = df_clean.dropDuplicates()

(380, 380)

In [0]:
df_clean.select("full_time_result").distinct().show()
df_clean.filter("full_time_home_goals < 0 or full_time_away_goals < 0").show()

+----------------+
|full_time_result|
+----------------+
|               D|
|               A|
|               H|
+----------------+

+---------------+----------+------------+---------+---------+--------------------+--------------------+----------------+--------------------+--------------------+----------------+-------+----------+----------+--------------------+--------------------+----------+----------+------------+------------+-----------------+-----------------+--------------+--------------+
|league_division|match_date|kickoff_time|home_team|away_team|full_time_home_goals|full_time_away_goals|full_time_result|half_time_home_goals|half_time_away_goals|half_time_result|referee|home_shots|away_shots|home_shots_on_target|away_shots_on_target|home_fouls|away_fouls|home_corners|away_corners|home_yellow_cards|away_yellow_cards|home_red_cards|away_red_cards|
+---------------+----------+------------+---------+---------+--------------------+--------------------+----------------+--------------

In [0]:
from pyspark.sql.functions import lower, trim

df_clean = df_clean.withColumn("home_team", lower(col("home_team"))) \
                   .withColumn("away_team", lower(col("away_team"))) \
                   .withColumn("referee", trim(col("referee")))

## Load Transformed Data to Data Lake Storage

- Save to Parquet in ADLS

In [0]:
# Saving cleaned or transformed version of entire dataset
df_clean.write.format("parquet").mode("overwrite").save("abfss://transformed@epl2025dl.dfs.core.windows.net/epl_2025_clean")


## 3. Business Questions

We aim to explore:

1. What predicts a home win?
2. Which teams improve after half-time?
3. How do cards impact match outcomes?
4. What drives high-scoring matches?
5. Do certain referees influence match behavior?

## Feature Engineering

We create features such as goal difference, card impact, match outcome indicators, and more.

In [0]:
from pyspark.sql.functions import col, concat_ws, lit, when

df_gold = df_clean.withColumn(
    "match_id", concat_ws("_vs_", "home_team", "away_team", col("match_date").cast("string")) 
).withColumn(
    "goal_diff", col("full_time_home_goals") - col("full_time_away_goals")
).withColumn(
    "total_goals", col("full_time_home_goals") + col("full_time_away_goals") 
).withColumn(
    "is_draw", col("full_time_result") == lit("D")
).withColumn(
    "home_win", col("full_time_result") == lit("H")
).withColumn(
    "away_win", col("full_time_result") == lit("A")
).withColumn(
    "home_team_won", when(col("home_win"), lit(1)).otherwise(lit(0))
).withColumn(
    "away_team_won", when(col("away_win"), lit(1)).otherwise(lit(0))
).withColumn(
    "ht_score_diff", col("half_time_home_goals") - col("half_time_away_goals")
).withColumn(
    "ft_score_diff", col("full_time_home_goals") - col("full_time_away_goals")
).withColumn(
    "second_half_impact", col("ft_score_diff") - col("ht_score_diff")
)

df_gold.display()
df_gold.printSchema()


league_division,match_date,kickoff_time,home_team,away_team,full_time_home_goals,full_time_away_goals,full_time_result,half_time_home_goals,half_time_away_goals,half_time_result,referee,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_fouls,away_fouls,home_corners,away_corners,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,match_id,goal_diff,total_goals,is_draw,home_win,away_win,home_team_won,away_team_won,ht_score_diff,ft_score_diff,second_half_impact
E0,2024-08-16,2025-05-29T20:00:00Z,man united,fulham,1,0,H,0,0,D,R Jones,14,10,5,2,12,10,7,8,2,3,0,0,man united_vs_fulham_vs_2024-08-16,1,1,False,True,False,1,0,0,1,1
E0,2024-08-17,2025-05-29T12:30:00Z,ipswich,liverpool,0,2,A,0,0,D,T Robinson,7,18,2,5,9,18,2,10,3,1,0,0,ipswich_vs_liverpool_vs_2024-08-17,-2,2,False,False,True,0,1,0,-2,-2
E0,2024-08-17,2025-05-29T15:00:00Z,arsenal,wolves,2,0,H,1,0,H,J Gillett,18,9,6,3,17,14,8,2,2,2,0,0,arsenal_vs_wolves_vs_2024-08-17,2,2,False,True,False,1,0,1,2,1
E0,2024-08-17,2025-05-29T15:00:00Z,everton,brighton,0,3,A,0,1,A,S Hooper,9,10,1,5,8,8,1,5,1,1,1,0,everton_vs_brighton_vs_2024-08-17,-3,3,False,False,True,0,1,-1,-3,-2
E0,2024-08-17,2025-05-29T15:00:00Z,newcastle,southampton,1,0,H,1,0,H,C Pawson,3,19,1,4,15,16,3,12,2,4,1,0,newcastle_vs_southampton_vs_2024-08-17,1,1,False,True,False,1,0,1,1,0
E0,2024-08-17,2025-05-29T15:00:00Z,nott'm forest,bournemouth,1,1,D,1,0,H,M Oliver,14,13,8,4,17,8,2,6,1,3,0,0,nott'm forest_vs_bournemouth_vs_2024-08-17,0,2,True,False,False,0,0,1,0,-1
E0,2024-08-17,2025-05-29T17:30:00Z,west ham,aston villa,1,2,A,1,1,D,T Harrington,14,15,3,3,18,11,5,3,1,2,0,0,west ham_vs_aston villa_vs_2024-08-17,-1,3,False,False,True,0,1,0,-1,-1
E0,2024-08-18,2025-05-29T14:00:00Z,brentford,crystal palace,2,1,H,1,0,H,S Barrott,9,14,5,6,6,15,4,7,1,5,0,0,brentford_vs_crystal palace_vs_2024-08-18,1,3,False,True,False,1,0,1,1,0
E0,2024-08-18,2025-05-29T16:30:00Z,chelsea,man city,0,2,A,0,1,A,A Taylor,10,11,3,5,12,9,4,3,1,1,0,0,chelsea_vs_man city_vs_2024-08-18,-2,2,False,False,True,0,1,-1,-2,-1
E0,2024-08-19,2025-05-29T20:00:00Z,leicester,tottenham,1,1,D,0,1,A,C Kavanagh,7,15,3,7,11,12,2,13,1,1,0,0,leicester_vs_tottenham_vs_2024-08-19,0,2,True,False,False,0,0,-1,0,1


root
 |-- league_division: string (nullable = true)
 |-- match_date: date (nullable = true)
 |-- kickoff_time: timestamp (nullable = true)
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- full_time_home_goals: integer (nullable = true)
 |-- full_time_away_goals: integer (nullable = true)
 |-- full_time_result: string (nullable = true)
 |-- half_time_home_goals: integer (nullable = true)
 |-- half_time_away_goals: integer (nullable = true)
 |-- half_time_result: string (nullable = true)
 |-- referee: string (nullable = true)
 |-- home_shots: integer (nullable = true)
 |-- away_shots: integer (nullable = true)
 |-- home_shots_on_target: integer (nullable = true)
 |-- away_shots_on_target: integer (nullable = true)
 |-- home_fouls: integer (nullable = true)
 |-- away_fouls: integer (nullable = true)
 |-- home_corners: integer (nullable = true)
 |-- away_corners: integer (nullable = true)
 |-- home_yellow_cards: integer (nullable = true)
 |-- away_yell

In [0]:
# Saving enriched version to ADLS Gen2
df_gold.write \
    .format("parquet") \
    .mode("overwrite") \
    .save("abfss://enriched@epl2025dl.dfs.core.windows.net/epl_2025_enriched")