In [1]:
from pyspark.sql import SparkSession

spark: SparkSession = SparkSession.builder \
                        .appName('Test Silver') \
                        .config("spark.hadoop.fs.defaultFS", "file:///") \
                        .config("spark.driver.memory", "4g") \
                        .config("spark.sql.shuffle.partitions", "400") \
                        .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.3.0") \
                        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                        .config("spark.sql.debug.maxToStringFields", 50) \
                        .getOrCreate()

25/08/26 09:40:00 WARN Utils: Your hostname, DESKTOP-9VM3SA1 resolves to a loopback address: 127.0.1.1; using 172.28.82.250 instead (on interface eth0)
25/08/26 09:40:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/dottier/big_data/venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/dottier/.ivy2/cache
The jars for the packages stored in: /home/dottier/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c5547d03-d164-42a9-b464-7d61340c2ef0;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 137ms :: artifacts dl 6ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.3.0 from central in [default]
	io.delta#delta-storage;3.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   

In [3]:
SILVER_PATH = "/home/dottier/big_data/silver"

from delta.tables import DeltaTable

def read_from_silver(table_name: str):
    table_path = f"{SILVER_PATH}/{table_name}"
    df = spark.read.format("delta").load(table_path)
    return df

In [54]:
from delta.tables import DeltaTable
from pyspark.sql.functions import DataFrame

GOLD_PATH = "/home/dottier/big_data/gold"

def write_table_to_gold(
    df: DataFrame,
    table_name: str,
    primary_keys: list[str], 
    partition_cols: list[str] = None
):
    table_path = f"{GOLD_PATH}/{table_name}"

    print(f"--- Writing gold table: {table_name} ---")
    print(f"  - Primary Keys: {primary_keys}")
    print(f"  - Partition Columns: {partition_cols}")

    if not DeltaTable.isDeltaTable(spark, table_path):
        initial_writer = df.write.format("delta")
        if partition_cols:
            initial_writer = initial_writer.partitionBy(*partition_cols)
        initial_writer.save(table_path)

        print(f"  - Successfully created and wrote data to {table_name}.")
        return


    delta_table = DeltaTable.forPath(spark, table_path)
    merge_condition = " AND ".join([f"target.{key} = source.{key}" for key in primary_keys])

    (
        delta_table.alias("target")
        .merge(
            source=df.alias("source"),
            condition=merge_condition
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    print(f"  - Merge complete for {table_name}.")

In [4]:
event = read_from_silver("fct_match_events")

+-------+-----------------------+------+--------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
event.filter()

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col

event.filter(
    (col("match_id") == 1729454)
    # (col("player_id").isNull()) 
    # (col("type_display_name") == "pass")
    # (col("period_value") > 2)
).groupBy("player_id").agg(
    sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), "touches"), 1)).alias("total_touches"),
    sf.sum(
        sf.when(
            (sf.array_contains(col("satisfied_events_types"), "touches")) &
            (col("type_display_name") != "foul"), 1
        )
    ).alias("possession")
).select(
    "_event_id",
    "minute",
    "second",
    "period_value",
    "type_display_name",
    "is_successful",
    "qualifiers_list",
    "satisfied_events_types"
).show(n=300, truncate=False)

                                                                                

+--------+---------+--------+-------+---------+------+------+------------+-------------------+-----------------+-------------+----+-----+-----+-----+---------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------+------------------------------------------------------+----------------------+---------+--------+
|match_id|_event_id|event_id|team_id|player_id|minute|second|period_value|period_display_name|type_display_name|is_successful|x   |y    |end_x|end_y|qualifiers_display_names                                                                                 |satisfied_events_types_names                                                                                                                                     

In [5]:
player = read_from_silver("dim_players")
team = read_from_silver("dim_teams")
stage = read_from_silver("dim_stages")
match = read_from_silver("fct_match_summary")

In [5]:
stage.show(10)

                                                                                

+---------+-------------+-------------+----------------+---------+-----------+--------------------+--------------------+---------+--------+
|region_id|  region_name|tournament_id| tournament_name|season_id|season_name|          stage_name|              league|   season|stage_id|
+---------+-------------+-------------+----------------+---------+-----------+--------------------+--------------------+---------+--------+
|      250|       Europe|           12|Champions League|     8177|  2020/2021|Champions League ...|europe-champions-...|2020-2021|   18978|
|      250|       Europe|           12|Champions League|     7804|  2019/2020|Champions League ...|europe-champions-...|2019-2020|   17942|
|      250|       Europe|           12|Champions League|     8177|  2020/2021|Champions League ...|europe-champions-...|2020-2021|   18975|
|      247|International|           36|  FIFA World Cup|     3768|       2014|    World Cup grp. G|international-fif...|     2014|    7563|
|      250|       Eu

In [6]:
qualifier_mapping = read_from_silver("qualifier_mapping")
event_type_mapping = read_from_silver("event_type_mapping")

In [7]:
# qualifier_mapping.show(5)
# event_type_mapping.show(5)

qualifier_rows = qualifier_mapping.collect()
qualifier_map = {row.qualifier_name: row.qualifier_id for row in qualifier_rows}

event_type_rows = event_type_mapping.collect()
event_type_map = {row.event_type_value: row.event_type_id for row in event_type_rows}

print(event_type_map["goal_head"])

                                                                                

27


In [17]:
qualifier_mapping.show(1000)

+--------------------+------------+
|      qualifier_name|qualifier_id|
+--------------------+------------+
|        parried_safe|         173|
|          one_on_one|          89|
|           high_left|          77|
|      freekick_taken|           5|
|           box_right|          63|
|  leading_to_attempt|         169|
|     other_body_part|          21|
|   throwin_set_piece|         160|
|               cross|           2|
|          high_claim|          88|
|       second_yellow|          32|
|              yellow|          31|
|         from_corner|          25|
|     individual_play|         215|
|           offensive|         286|
|            own_goal|          28|
|        corner_taken|           6|
|             blocked|          82|
|              volley|         108|
|            last_man|          14|
|       keeper_missed|         186|
|out_of_box_deep_r...|          66|
|        keeper_throw|         123|
|                foul|          13|
|keeper_save_in_th...|      

In [9]:
participation = read_from_silver("fct_player_match_participation")
# participation.show(10)

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col
from pyspark.sql.window import Window

match_window = Window.partitionBy("match_id")

events_with_period_ends = event.withColumn(
    "p1_end", sf.coalesce(sf.max(sf.when(col("period_value") == 1, col("minute"))).over(match_window), sf.lit(45))
).withColumn(
    "p2_end", sf.coalesce(sf.max(sf.when(col("period_value") == 2, col("minute"))).over(match_window), sf.lit(90))
).withColumn(
    "p3_end", sf.coalesce(sf.max(sf.when(col("period_value") == 3, col("minute"))).over(match_window), sf.lit(105))
).withColumn(
    "p4_end", sf.coalesce(sf.max(sf.when(col("period_value") == 4, col("minute"))).over(match_window), sf.lit(105))
)

events_with_extended_minute = events_with_period_ends.withColumn(
    "extended_minute",
    sf.when(
        col("period_value") == 1,
        col("minute")
    ).when(
        col("period_value") == 2,
        col("minute") + (col("p1_end") - 44)
    ).when(
        col("period_value") == 3,
        col("minute") + (col("p1_end") - 44) + (col("p2_end") - 90)
    ).when(
        col("period_value") == 4,
        col("minute") + (col("p1_end") - 44) + (col("p2_end") - 90) + (col("p3_end") - 105)
    ).when(
        col("period_value") == 5,
        col("minute") + (col("p1_end") - 44) + (col("p2_end") - 90) + (col("p3_end") - 105) + (col("p4_end") - 120)
    ).otherwise(
        col("minute")
    )
)

In [177]:
event.filter(
    # (col("match_id") == 1775621) &
    (col("player_id") == 406347) &
    (col("type_display_name").isin(["substitution_on", "substitution_off"])) &
    # (col("minute") > 90) & 
    (col("stage_id") == 22686) 
).show(1000)

+--------+---------+--------+-------+---------+------+------+------------+-------------------+-----------------+-------------+---+---+-----+-----+---------------+----------------------+--------------------+---------+--------+
|match_id|_event_id|event_id|team_id|player_id|minute|second|period_value|period_display_name|type_display_name|is_successful|  x|  y|end_x|end_y|qualifiers_list|satisfied_events_types|              league|   season|stage_id|
+--------+---------+--------+-------+---------+------+------+------------+-------------------+-----------------+-------------+---+---+-----+-----+---------------+----------------------+--------------------+---------+--------+
| 1789436|     1294|     723|    297|   406347|    96|    29|           2|        second_half| substitution_off|         true|0.0|0.0| NULL| NULL|             []|             [sub_off]|europe-champions-...|2023-2024|   22686|
+--------+---------+--------+-------+---------+------+------+------------+-------------------+--

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col, lit, when

# --- Steps 1 & 2: No changes. You already have these DataFrames ---
# `events_with_extended_minute`
# `participation` (with `is_first_eleven` column)

# --- Step 3: Create START and END event information for each player ---
# We need to know WHEN and in WHICH PERIOD each player started and stopped playing.
sub_on_events = events_with_extended_minute.filter(col("type_display_name") == "substitution_on").select(
    "match_id",
    col("player_id"),
    col("extended_minute").alias("start_minute"),
    col("period_value").alias("start_period")
)

sub_off_events = events_with_extended_minute.filter(col("type_display_name") == "substitution_off").select(
    "match_id",
    col("player_id"),
    col("extended_minute").alias("end_minute"),
    col("period_value").alias("end_period")
)

# --- Step 4: Get Match End Times for each period ---
# We need to know the official end time of each half.
match_period_ends = events_with_extended_minute.groupBy("match_id").agg(
    sf.max(when(col("period_value") == 1, col("extended_minute"))).alias("p1_end_minute"),
    sf.max(when(col("period_value") == 2, col("extended_minute"))).alias("p2_end_minute"),
    sf.max(when(col("period_value") == 3, col("extended_minute"))).alias("p3_end_minute"),
    sf.max(when(col("period_value") == 4, col("extended_minute"))).alias("p4_end_minute")
)

# --- Step 5: Combine all information into one DataFrame ---
player_status = participation.join(sub_on_events, ["match_id", "player_id"], "left")
player_status = player_status.join(sub_off_events, ["match_id", "player_id"], "left")
player_status = player_status.join(match_period_ends, "match_id", "left")

participants_only_df = player_status.filter(
    col("is_first_eleven") | col("start_minute").isNotNull()
)

final_minutes_df = participants_only_df.withColumn(
    # Official end of the match (90 for normal, 120 for ET)
    "match_end_minute",
    when(col("p4_end_minute").isNotNull(), col("p4_end_minute"))
    .otherwise(col("p2_end_minute"))
).withColumn(
    # Player's entry minute (0 for starters)
    "player_start_minute",
    when(col("is_first_eleven"), lit(0)).otherwise(col("start_minute"))
).withColumn(
    # Player's exit minute (official end if not subbed off)
    "player_end_minute",
    sf.coalesce(col("end_minute"), col("match_end_minute"))
)

# --- NEW Step 7: Calculate Overlap for Each Period and Sum Them ---
# This is the core of the fix.
participation_with_minutes = final_minutes_df.withColumn(
    # Minutes in Period 1 (0-45)
    "mins_p1",
    sf.greatest(lit(0), sf.least(lit(45),
        sf.least(col("player_end_minute"), col("p1_end_minute")) - col("player_start_minute")
    ))
).withColumn(
    # Minutes in Period 2 (45-90)
    "mins_p2",
    sf.greatest(lit(0), sf.least(lit(45),
        sf.least(col("player_end_minute"), col("p2_end_minute")) - sf.greatest(col("player_start_minute"), col("p1_end_minute"))
    ))
).withColumn(
    # Minutes in Period 3 (ET First Half, 90-105)
    "mins_p3",
    sf.greatest(lit(0), sf.least(lit(15),
        sf.least(col("player_end_minute"), col("p3_end_minute")) - sf.greatest(col("player_start_minute"), col("p2_end_minute"))
    ))
).withColumn(
    # Minutes in Period 4 (ET Second Half, 105-120)
    "mins_p4",
    sf.greatest(lit(0), sf.least(lit(15),
        sf.least(col("player_end_minute"), col("p4_end_minute")) - sf.greatest(col("player_start_minute"), sf.coalesce(col("p3_end_minute"), col("p2_end_minute")))
    ))
).withColumn(
    "minutes_played",
    # Sum the calculated, capped minutes for each period
    (
        sf.coalesce(col("mins_p1"), lit(0)) + 
        sf.coalesce(col("mins_p2"), lit(0)) + 
        sf.coalesce(col("mins_p3"), lit(0)) + 
        sf.coalesce(col("mins_p4"), lit(0))
    )
).select(
    "match_id",
    "player_id",
    "team_id",
    "shirt_no",
    "is_first_eleven",
    "minutes_played"
)

In [163]:
participation_with_minutes.printSchema()
participation_with_minutes.filter(
    (col("match_id") == 1866112) 
    #& col("start_minute").isNotNull()
).show(500)

root
 |-- match_id: integer (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- team_id: integer (nullable = true)
 |-- shirt_no: integer (nullable = true)
 |-- is_first_eleven: boolean (nullable = true)
 |-- start_minute: integer (nullable = true)
 |-- start_period: integer (nullable = true)
 |-- end_minute: integer (nullable = true)
 |-- end_period: integer (nullable = true)
 |-- p1_end_minute: integer (nullable = true)
 |-- p2_end_minute: integer (nullable = true)
 |-- p3_end_minute: integer (nullable = true)
 |-- p4_end_minute: integer (nullable = true)
 |-- match_end_minute: integer (nullable = true)
 |-- player_start_minute: integer (nullable = true)
 |-- player_end_minute: integer (nullable = true)
 |-- mins_p1: integer (nullable = false)
 |-- mins_p2: integer (nullable = false)
 |-- mins_p3: integer (nullable = false)
 |-- mins_p4: integer (nullable = false)
 |-- minutes_played: integer (nullable = false)

+--------+---------+-------+--------+---------------+------

In [10]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col, lit, when
from pyspark.sql.window import Window

event_with_subs = (
    event
    .withColumn(
        "start_minute",
        sf.when(col("type_display_name") == "substitution_on", col("minute") + lit(1))
    ).withColumn(
        "end_minute",
        sf.when(
            (col("type_display_name") == "substitution_off") |
            (sf.array_contains(col("qualifiers_values"), qualifier_map["second_yellow"])) |
            (sf.array_contains(col("qualifiers_values"), qualifier_map["red"])),
            col("minute") + lit(1)
        )
    )
)

player_match_summary_df = event_with_subs.groupBy("match_id", "player_id").agg(
    sf.min("start_minute").alias("sub_on_minute"),
    sf.max("end_minute").alias("sub_off_minute")
)

match_end_times_df = event.filter(
    col("period_value") < 5
).groupBy("match_id").agg(
    sf.max("minute").alias("actual_match_end_minute"),
    sf.max("period_value").alias("max_period")
).withColumn(
    "official_match_duration",
    when(col("max_period") > 2, lit(120)).otherwise(lit(90))
)

participation_with_minutes = participation.join(player_match_summary_df, ["match_id", "player_id"], "left")
participation_with_minutes = participation_with_minutes.join(match_end_times_df, "match_id", "left")

participation_with_minutes = participation_with_minutes.filter(
    col("is_first_eleven") | col("sub_on_minute").isNotNull()
)

participation_with_minutes = participation_with_minutes.withColumn(
    "player_start_minute",
    when(col("is_first_eleven"), lit(0)).otherwise(col("sub_on_minute"))
).withColumn(
    "player_end_minute",
    sf.coalesce(col("sub_off_minute"), col("actual_match_end_minute"))
).withColumn(
    "minutes_played",
    sf.greatest(lit(1), sf.least(col("player_end_minute") - col("player_start_minute"), col("official_match_duration")))
).select(
    "match_id",
    "player_id",
    "team_id",
    "shirt_no",
    "is_first_eleven",
    "minutes_played"
)



In [9]:
participation_with_minutes.filter(
    (col("match_id") == 1891171) 
    #& col("start_minute").isNotNull()
).show(500)

+--------+---------+--------------+
|match_id|player_id|minutes_played|
+--------+---------+--------------+
| 1891171|    52197|            90|
| 1891171|   459256|            17|
| 1891171|   129832|            90|
| 1891171|   446053|            58|
| 1891171|   392647|            90|
| 1891171|   425338|            90|
| 1891171|   435100|            58|
| 1891171|   476110|            58|
| 1891171|   243567|            90|
| 1891171|   399296|            66|
| 1891171|   396158|            58|
| 1891171|   405314|            24|
| 1891171|   115401|            32|
| 1891171|   401354|            32|
| 1891171|   320609|            32|
| 1891171|   389191|            32|
| 1891171|   102248|            90|
| 1891171|   322043|            73|
| 1891171|   123167|            66|
| 1891171|   301440|            90|
| 1891171|   384134|            90|
| 1891171|   136345|            90|
| 1891171|   331790|            46|
| 1891171|   362824|            66|
| 1891171|   318449|        

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col
from pyspark.sql.window import Window

# Get corner_taken and corners_leading_to_goal data with window function
w = Window.partitionBy("match_id").orderBy("_event_id")

events_with_flags_df = (
    event
    .withColumn("is_pass_attempt", col("type_display_name") == "pass")
    .withColumn("is_cross", sf.array_contains(col("qualifiers_values"), qualifier_map["cross"]))
    .withColumn("is_keeper_throw", sf.array_contains(col("qualifiers_values"), qualifier_map["keeper_throw"]))
    .withColumn("is_throw_in", sf.array_contains(col("qualifiers_values"), qualifier_map["throw_in"]))
    .withColumn("is_long_ball", sf.array_contains(col("qualifiers_values"), qualifier_map["longball"]))
    .withColumn("is_through_ball", sf.array_contains(col("qualifiers_values"), qualifier_map["throughball"]))

    .withColumn("is_goal", col("type_display_name") == "goal")
    .withColumn("is_assist", sf.array_contains(col("qualifiers_values"), qualifier_map["intentional_goal_assist"]))
    .withColumn("is_penalty_taken", sf.array_contains(col("qualifiers_values"), qualifier_map["penalty"]))

    .withColumn("is_goal_six_yard_box", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_six_yard_box"]))
    .withColumn("is_goal_penalty_area", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_penalty_area"]))
    .withColumn("is_goal_out_of_box", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_obox"]))

    .withColumn("is_goal_open_play", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_open_play"]))
    .withColumn("is_goal_counter", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_counter"]))
    .withColumn("is_goal_set_piece", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_set_piece"]))
    .withColumn("is_official_penalty_scored", sf.array_contains(col("satisfied_events_types"), event_type_map["penalty_scored"]))

    .withColumn("is_goal_right_foot", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_right_foot"]))
    .withColumn("is_goal_left_foot", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_left_foot"]))
    .withColumn("is_goal_head", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_head"]))
    .withColumn("is_goal_other_body_parts", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_obp"]))

    .withColumn("is_regulation_time", col("period_display_name").isin(["first_half", "second_half"]))
    .withColumn("is_extra_time", col("period_display_name").isin(["first_period_of_extra_time", "second_period_of_extra_time"]))
    .withColumn("is_penalty_shootout", col("period_display_name") == "penalty_shootout")
    .withColumn("is_saved_shot", col("type_display_name") == "saved_shot")
    .withColumn("is_missed_shot", col("type_display_name") == "missed_shots")

    .withColumn("is_last_man_tackle", sf.array_contains(col("satisfied_events_types"), event_type_map["tackle_last_man"]))
    .withColumn("is_clearance_off_the_line", sf.array_contains(col("satisfied_events_types"), event_type_map["clearance_off_the_line"]))

    .withColumn("is_yellow", sf.array_contains(col("qualifiers_values"), qualifier_map["yellow"]))
    .withColumn("is_second_yellow", sf.array_contains(col("qualifiers_values"), qualifier_map["second_yellow"]))
    .withColumn("is_red", sf.array_contains(col("qualifiers_values"), qualifier_map["red"]))
    
    .withColumn("is_own_goal", sf.array_contains(col("qualifiers_values"), qualifier_map["own_goal"]))

    .withColumn("is_save_attempt", sf.array_contains(col("satisfied_events_types"), event_type_map["keeper_save_total"]))
    .withColumn("is_collected", sf.array_contains(col("satisfied_events_types"), event_type_map["collected"]))
    .withColumn("is_parried_safe", sf.array_contains(col("satisfied_events_types"), event_type_map["parried_safe"]))
    .withColumn("is_parried_danger", sf.array_contains(col("satisfied_events_types"), event_type_map["parried_danger"]))
    .withColumn("is_claim", col("type_display_name") == "claim")

    .withColumn("is_penalty_faced", col("type_display_name") == "penalty_faced")
    .withColumn("is_keeper_saved", sf.array_contains(col("qualifiers_values"), qualifier_map["keeper_saved"]))
    .withColumn("is_keeper_missed", sf.array_contains(col("qualifiers_values"), qualifier_map["keeper_missed"]))
)


# # Define the specific event types we want to turn into columns.
# # This is crucial for the pivot operation to be efficient.
# goal_event_types = [
#     "goal_six_yard_box",
#     "goal_penalty_area",
#     "goal_obox",
#     "goal_open_play",
#     "goal_counter",
#     "goal_set_piece",
#     "goal_penalty",
#     "goal_right_foot",
#     "goal_left_foot",
#     "goal_head",
#     "goal_obp"
# ]

# # Define the final column names you want.
# goal_cols = [
#     "is_goal_six_yard_box",
#     "is_goal_penalty_area",
#     "is_goal_out_of_box",
#     "is_goal_open_play",
#     "is_goal_counter",
#     "is_goal_set_piece",
#     "is_official_penalty_scored", # Note the name change to match your final agg
#     "is_goal_right_foot",
#     "is_goal_left_foot",
#     "is_goal_head",
#     "is_goal_other_body_parts",
# ]

# # Create a mapping from the raw event type to the desired column name
# goal_type_to_col_map = dict(zip(goal_event_types, goal_cols))

# # 1. Explode the array and create a long-format DataFrame
# exploded_satisfied_events_df = (
#     events_with_flags_df
#     .select("match_id", "_event_id", "is_goal", sf.explode("satisfied_events_types").alias("satisfied_event_type"))
# )

# # 2. Filter for only the goal events and types we care about.
# # This dramatically reduces the amount of data to be pivoted.
# pivoted_goal_flags_df = (
#     exploded_satisfied_events_df
#     .filter(
#         (col("is_goal")) &
#         (col("satisfied_event_type").isin(goal_event_types))
#     )
#     # Map the raw type to the final column name
#     .withColumn("flag_col_name", sf.create_map([sf.lit(k) for k in goal_type_to_col_map for _ in range(2)])[col("satisfied_event_type")])
#     # 3. Group by the event and pivot the types into columns
#     .groupBy("match_id", "_event_id")
#     .pivot("flag_col_name", goal_cols) # Use the list of final names for an efficient pivot
#     .agg(sf.lit(True)) # If a row exists, the flag is True
# )

# # 4. Join the pivoted flags back to the main DataFrame.
# # This is a single, efficient join that adds all the flag columns at once.
# events_with_flags_df = events_with_flags_df.join(
#     pivoted_goal_flags_df,
#     on=["match_id", "_event_id"],
#     how="left"
# ).fillna(False, subset=goal_cols)

# Note: The original 'is_official_penalty_scored' was based on 'goal_penalty' event type.
# This is now handled directly by the pivot. The aggregation logic later for
# sf.count(sf.when(col("is_official_penalty_scored"), True)).alias("penalties_scored")
# will now work correctly with the new pivoted column.

# goal_flags_df = events_with_flags_df.filter(col("is_goal"))
# goal_flags_df = (
#     goal_flags_df
#     .withColumn("is_goal_six_yard_box", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_six_yard_box"]))
#     .withColumn("is_goal_penalty_area", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_penalty_area"]))
#     .withColumn("is_goal_out_of_box", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_obox"]))

#     .withColumn("is_goal_open_play", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_open_play"]))
#     .withColumn("is_goal_counter", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_counter"]))
#     .withColumn("is_goal_set_piece", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_set_piece"]))
#     .withColumn("is_official_penalty_scored", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_penalty"]))

#     .withColumn("is_goal_right_foot", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_right_foot"]))
#     .withColumn("is_goal_left_foot", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_left_foot"]))
#     .withColumn("is_goal_head", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_head"]))
#     .withColumn("is_goal_other_body_parts", sf.array_contains(col("satisfied_events_types"), event_type_map["goal_obp"]))
# )

# goal_cols = [
#     "is_goal_six_yard_box",
#     "is_goal_penalty_area",
#     "is_goal_out_of_box",

#     "is_goal_open_play",
#     "is_goal_counter",
#     "is_goal_set_piece",
#     "is_official_penalty_scored",

#     "is_goal_right_foot",
#     "is_goal_left_foot",
#     "is_goal_head",
#     "is_goal_other_body_parts",
# ]

# events_with_flags_df = events_with_flags_df.join(
#     goal_flags_df.select(["match_id", "_event_id"] + goal_cols),
#     on=["match_id", "_event_id"],
#     how="left"
# ).fillna(False, subset=goal_cols)

# Here
pass_definition = (
    col("is_pass_attempt") &
    ~col("is_cross") &
    ~col("is_keeper_throw") &
    ~col("is_throw_in")
)

events_with_corner_flags_df = (
    event
    .withColumn("corner_taker", sf.when(sf.array_contains(col("qualifiers_values"), qualifier_map["corner_taken"]), col("player_id")))
    .withColumn("current_corner_taker", sf.last("corner_taker", ignorenulls=True).over(w))
)

corners_to_goal_stats = (
    events_with_corner_flags_df
    .groupBy("match_id", "current_corner_taker")
    .agg(
        sf.count("corner_taker").alias("corners_taken"),
        sf.count(
            sf.when(
                (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) &
                (col("type_display_name") == "goal"),
                True
            )
        ).alias("corners_leading_to_goal")
    )
    .withColumnRenamed("current_corner_taker", "player_id")
)

events_with_flags_df = (
    events_with_flags_df
    .withColumn(
        "is_official_goal",
        col("is_goal") & ~col("is_penalty_shootout")
    ).withColumn(
        "is_extra_time_goal",
        col("is_goal") & col("is_extra_time")
    )
    
    .withColumn(
        "is_official_penalty_taken",
        col("is_penalty_taken") & ~col("is_penalty_shootout")
    )
    
    .withColumn(
        "is_pso_penalty_taken",
        (col("is_goal") | col("is_saved_shot") | col("is_missed_shot")) & col("is_penalty_shootout")
    ).withColumn(
        "is_pso_penalty_goal",
        col("is_goal") & col("is_penalty_shootout")
    )

    .withColumn(
        "is_valid_long_ball",
        col("is_long_ball") & pass_definition
    ).withColumn(
        "is_valid_through_ball",
        col("is_through_ball") & pass_definition
    )
    
    .withColumn(
        "is_collected_save",
        col("is_save_attempt") & col("is_collected")
    ).withColumn(
        "is_parried_safe_save",
        col("is_save_attempt") & col("is_parried_safe")
    ).withColumn(
        "is_parried_danger_save",
        col("is_save_attempt") & col("is_parried_danger")
    ).withColumn(
        "is_successful_claim",
        col("is_claim") & col("is_successful")
    )

    .withColumn(
        "is_official_penalty_faced",
        col("is_penalty_faced") & ~col("is_penalty_shootout")
    ).withColumn(
        "is_official_penalty_saved",
        col("is_keeper_saved") & col("is_official_penalty_faced")
    ).withColumn(
        "is_official_penalty_missed",
        col("is_keeper_missed") & col("is_official_penalty_faced")
    )

    .withColumn(
        "is_pso_penalty_faced",
        col("is_penalty_faced") & col("is_penalty_shootout")
    ).withColumn(
        "is_pso_penalty_saved",
        col("is_keeper_saved") & col("is_pso_penalty_faced")
    ).withColumn(
        "is_pso_penalty_missed",
        col("is_keeper_missed") & col("is_pso_penalty_faced")
    )
)

# events_with_flags_df.filter(
#     (col("match_id") == 1201831) &
#     (col("type_display_name") == "save"),
# ).show(n=10000, truncate=False)

player_stats = (
    events_with_flags_df
    .groupBy("match_id", "player_id")
    .agg(
        # Goals and Assists
        sf.count(sf.when(col("is_official_goal"), True)).alias("goals_scored"),
        sf.count(sf.when(col("is_extra_time_goal"), True)).alias("extra_time_goals"),
        sf.count(sf.when(col("is_assist"), True)).alias("assists"),

        # By zones
        sf.count(sf.when(col("is_goal_six_yard_box"), True)).alias("goals_six_yard_box"),
        sf.count(sf.when(col("is_goal_penalty_area"), True)).alias("goals_penalty_area"),
        sf.count(sf.when(col("is_goal_out_of_box"), True)).alias("goals_out_of_box"),

        # By body parts
        sf.count(sf.when(col("is_goal_right_foot"), True)).alias("goals_right_foot"),
        sf.count(sf.when(col("is_goal_left_foot"), True)).alias("goals_left_foot"),
        sf.count(sf.when(col("is_goal_head"), True)).alias("goals_head"),
        sf.count(sf.when(col("is_goal_other_body_parts"), True)).alias("goals_other_body_parts"),

        # By situations
        sf.count(sf.when(col("is_goal_open_play"), True)).alias("goals_open_play"),
        sf.count(sf.when(col("is_goal_counter"), True)).alias("goals_counter"),
        sf.count(sf.when(col("is_goal_set_piece"), True)).alias("goals_set_piece"),
        sf.count(sf.when(col("is_own_goal"), True)).alias("own_goals"), 

        # Penalty record (regulation time + ET and PSO)
        sf.count(sf.when(col("is_official_penalty_taken"), True)).alias("penalties_taken"),
        sf.count(sf.when(col("is_official_penalty_scored"), True)).alias("penalties_scored"),
        sf.count(sf.when(col("is_pso_penalty_taken"), True)).alias("pso_penalties_taken"),
        sf.count(sf.when(col("is_pso_penalty_goal"), True)).alias("pso_penalties_scored"),

        # Cards
        sf.count(sf.when(col("is_yellow"), True)).alias("yellow_cards"),
        sf.count(sf.when(col("is_second_yellow"), True)).alias("second_yellow_cards"),
        sf.count(sf.when(col("is_red"), True)).alias("red_cards"),

        # Passes
        sf.count(sf.when(pass_definition, True)).alias("total_passes"),
        sf.count(sf.when(pass_definition & col("is_successful"), True)).alias("accurate_passes"),
        sf.count(sf.when(
            col("is_pass_attempt") &
            sf.array_contains(col("satisfied_events_types"), event_type_map["pass_key"]), True)
        ).alias("key_passes"),
        sf.count(
            sf.when(
                (col("type_display_name") == "ball_touch") &
                ~col("is_successful"),
                True
            )
        ).alias("unsuccessful_touches"),
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["touches"]), 1)).alias("total_touches"),
        sf.count(sf.when(col("is_pass_attempt"), True)).alias("possession"),
        sf.count(sf.when(col("is_cross"), True)).alias("total_crosses"),
        sf.count(sf.when(col("is_cross") & col("is_successful"), True)).alias("accurate_crosses"),
        sf.count(sf.when(col("is_valid_long_ball") & col("is_successful"), True)).alias("accurate_long_balls"),
        sf.count(sf.when(col("is_valid_long_ball"), True)).alias("total_long_balls"),
        sf.count(sf.when(col("is_valid_through_ball") & col("is_successful"), True)).alias("accurate_through_balls"),
        sf.count(sf.when(col("is_valid_through_ball"), True)).alias("total_through_balls"),

        # Shots
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["shots_total"]), 1).otherwise(0)).alias("total_shots"),
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["shot_on_target"]), 1).otherwise(0)).alias("total_shots_on_target"),
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["shot_off_target"]), 1).otherwise(0)).alias("total_shots_off_target"),
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["shot_on_post"]), 1).otherwise(0)).alias("total_woodwork_shots"),
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["shot_blocked"]), 1).otherwise(0)).alias("total_shots_blocked"),

        # Dribbles
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["dribble_won"]), 1).otherwise(0)).alias("dribbles_won"),
        sf.count(
            sf.when(
                (col("type_display_name") == "take_on") &
                ~(sf.array_contains(col("satisfied_events_types"), event_type_map["overrun"])), True
            )
        ).alias("dribbles_attempted"),

        # Aerial Duels
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["duel_aerial_won"]), 1).otherwise(0)).alias("aerials_won"),
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["offensive_duel"]), 1).otherwise(0)).alias("offensive_aerials"),
        sf.sum(sf.when(sf.array_contains(col("satisfied_events_types"), event_type_map["defensive_duel"]), 1).otherwise(0)).alias("defensive_aerials"),

        # Tackles
        sf.count(sf.when(col("type_display_name").isin(["tackle", "challenge"]), True)).alias("tackles_attempted"),
        sf.count(sf.when(col("type_display_name") == "tackle", True)).alias("successful_tackles"),
        sf.count(sf.when(col("type_display_name") == "challenge", True)).alias("dribbled_past"),
        sf.count(sf.when(col("is_last_man_tackle"), True)).alias("last_man_tackles"),
        sf.count(sf.when(col("type_display_name") == "clearance", True)).alias("clearances"),
        sf.count(sf.when(col("is_clearance_off_the_line"), True)).alias("clearances_off_the_line"),
        sf.count(sf.when(col("type_display_name") == "interception", True)).alias("interception"),

        # Corners
        sf.count(
            sf.when(
                (col("type_display_name") == "corner_awarded") &
                (col("is_successful")),
                True
            )
        ).alias("corners_won"),
        sf.count(
            sf.when(
                (col("type_display_name") == "corner_awarded") &
                ~(col("is_successful")),
                True
            )
        ).alias("corners_conceded"),
        # sf.count(sf.when(sf.array_contains(col("qualifiers_values"), qualifier_map["corner_taken"]), True)).alias("corners_taken"),
        sf.count(
            sf.when(
                (sf.array_contains(col("qualifiers_values"), qualifier_map["corner_taken"])) &
                (col("is_successful")), True
            )
        ).alias("accurate_corners"),

        # Dispossessed
        sf.count(sf.when(col("type_display_name") == "dispossessed", True)).alias("dispossessed"),
        sf.count(
            sf.when(
                (col("type_display_name") == "foul") &
                ~(col("is_successful")), 
                True
            )
        ).alias("fouls"),
        sf.count(
            sf.when(
                (col("type_display_name") == "foul") &
                (col("is_successful")), 
                True
            )
        ).alias("was_fouled"),
        sf.count(sf.when(col("type_display_name") == "error", True)).alias("errors"),
        sf.count(
            sf.when(
                (col("type_display_name") == "error") &
                (sf.array_contains(col("satisfied_events_types"), event_type_map["error_leads_to_goal"])), 
                True
            )
        ).alias("errors_lead_to_goal"),
        sf.count(sf.when(col("type_display_name") == "offside_given", True)).alias("offsides"),

        sf.count(sf.when(col("is_save_attempt"), True)).alias("total_saves"),
        sf.count(sf.when(col("is_collected_save"), True)).alias("collected_saves"),
        sf.count(sf.when(col("is_parried_safe_save"), True)).alias("parried_saves"),
        sf.count(sf.when(col("is_parried_danger_save"), True)).alias("parried_danger_saves"),

        sf.count(sf.when(col("is_successful_claim"), True)).alias("successful_claims"),
        sf.count(sf.when(col("is_claim"), True)).alias("total_claims"),

        sf.count(sf.when(col("is_official_penalty_faced"), True)).alias("official_penalties_faced"),
        sf.count(sf.when(col("is_official_penalty_saved"), True)).alias("official_penalties_saved"),
        sf.count(sf.when(col("is_official_penalty_missed"), True)).alias("official_penalties_missed"),

        sf.count(sf.when(col("is_pso_penalty_faced"), True)).alias("pso_penalties_faced"),
        sf.count(sf.when(col("is_pso_penalty_saved"), True)).alias("pso_penalties_saved"),
        sf.count(sf.when(col("is_pso_penalty_missed"), True)).alias("pso_penalties_missed"),


        # sf.count(
        #     sf.when(
        #         (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) &
        #         (col("type_display_name") == "goal") &
        #         (col("current_corner_taker") == col("player_id")),
        #         True
        #     )
        # ).alias("corners_leading_to_goal")
        # sf.count(
        #     sf.when(
        #         (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) &
        #         (col("type_display_name") == "goal"), True
        #     )
        # ).alias("corners_leading_to_goal")
    )
    # .withColumn(
    #     "pass_success_percentage",
    #     sf.round(col("accurate_passes") * 100 / col("total_passes"), 2)
    # )
)

player_stats = (
    player_stats.alias("ps")
    .filter((col("player_id").isNotNull()))
    .join(player.alias("pl"), on="player_id", how="inner")
    .join(match.alias("m"), on="match_id", how="inner")
    .join(participation_with_minutes.alias("pt"), on=["player_id", "match_id"], how="inner") 
    .join(team.alias("t1"), col("pt.team_id") == col("t1.team_id"), "inner")
    .withColumn(
        "is_home",
        sf.when(
            col("home_team_id") == col("pt.team_id"),
            True
        ).otherwise(False)
    )
    .withColumn(
        "opposing_team_id",
        sf.when(
            col("is_home"),
            col("away_team_id")
        ).otherwise(col("home_team_id"))
    )
    .join(team.alias("t2"), col("opposing_team_id") == col("t2.team_id"), "inner")
    .join(stage, on="stage_id", how="inner")
    .join(corners_to_goal_stats, on=["player_id", "match_id"], how="left")
    
    # .filter(
    #     col("total_saves") > col("collected_saves") + col("parried_saves") + col("parried_danger_saves")
    #     # (col("match_id") == 1201831)
    #     # (col("tournament_name") == "FIFA World Cup") &
    #     # (col("season_name") == 2022) &
    #     # (col("player_name") == "Rodri")
    # )
    .select(
        # PK + player info
        col("match_id"),
        col("player_id"),
        col("player_name"),

        # Player stats
        # Minutes played
        col("minutes_played"),

        # Goals and Assists
        col("goals_scored"),
        col("extra_time_goals"),
        col("assists"),

        col("goals_six_yard_box"),
        col("goals_penalty_area"),
        col("goals_out_of_box"),

        col("goals_right_foot"),
        col("goals_left_foot"),
        col("goals_head"),
        col("goals_other_body_parts"),

        col("goals_open_play"),
        col("goals_counter"),
        col("goals_set_piece"),
        col("own_goals"), 

        col("penalties_taken"),
        col("penalties_scored"),
        col("pso_penalties_taken"),
        col("pso_penalties_scored"),

        # Cards
        col("yellow_cards"),
        col("second_yellow_cards"),
        col("red_cards"),

        # Passes
        col("total_passes"),
        col("accurate_passes"),
        col("key_passes"),
        col("possession"),
        col("unsuccessful_touches"),
        col("total_touches"),
        col("total_crosses"),
        col("accurate_crosses"),
        col("accurate_long_balls"),
        col("total_long_balls"),
        col("accurate_through_balls"),
        col("total_through_balls"),

        # Shots
        col("total_shots"),
        col("total_shots_on_target"),
        col("total_shots_off_target"),
        col("total_woodwork_shots"),
        col("total_shots_blocked"),

        # Dribbles
        col("dribbles_won"),
        col("dribbles_attempted"),

        # Aerials
        col("aerials_won"),
        col("offensive_aerials"),
        col("defensive_aerials"),

        # Tackles
        col("tackles_attempted"),
        col("successful_tackles"),
        col("dribbled_past"),
        col("last_man_tackles"),
        col("clearances"),
        col("clearances_off_the_line"),
        col("interception"),

        # Corners
        col("corners_won"),
        col("corners_conceded"),
        col("corners_taken"),
        col("accurate_corners"),
        col("corners_leading_to_goal"),

        # Dispossessed
        col("dispossessed"),
        col("fouls"),
        col("was_fouled"),
        col("errors"),
        col("errors_lead_to_goal"),
        col("offsides"),

        # Goalkeeping stats
        col("total_saves"),
        col("collected_saves"),
        col("parried_saves"),
        col("parried_danger_saves"),
        col("successful_claims"),
        col("total_claims"),

        col("official_penalties_faced"),
        col("official_penalties_saved"),
        col("official_penalties_missed"),
                
        col("pso_penalties_faced"),
        col("pso_penalties_saved"),
        col("pso_penalties_missed"),

        # Match info
        col("start_time_utc"),
        col("pt.team_id").alias("team_id"),
        col("t1.team_name").alias("team_name"),
        col("opposing_team_id"),
        col("t2.team_name").alias("opposing_team_name"),
        col("is_home"),

        # Stage info
        col("region_name"),
        col("tournament_name"),
        col("season_name"),
        col("stage_name"),
        col("m.stage_id")
    )
    .orderBy(col("goals_scored").desc())
)

columns_to_fill = ["corners_taken", "corners_leading_to_goal"]
player_stats = player_stats.fillna(0, subset=columns_to_fill)

(
    player_stats
    # .filter(
    # #    col("match_id") == 1697443
    #     (col("player_id") == 406347)
    #     & (col("tournament_name") == "Champions League")
    #     & (col("season_name") == "2023/2024")
    # )
    .show(50)
)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `is_goal_obp` cannot be resolved. Did you mean one of the following? [`is_goal`, `is_goal_head`, `is_yellow`, `is_claim`, `is_goal_counter`].;
'Aggregate [match_id#0, player_id#4], [match_id#0, player_id#4, count(CASE WHEN is_official_goal#167938 THEN true END) AS goals#169224L, count(CASE WHEN is_extra_time_goal#168001 THEN true END) AS extra_time_goals#169226L, count(CASE WHEN is_assist#166363 THEN true END) AS assists#169228L, count(CASE WHEN is_goal_six_yard_box#166426 THEN true END) AS goals_six_yard_box#169230L, count(CASE WHEN is_goal_penalty_area#166459 THEN true END) AS goals_penalty_area#169232L, count(CASE WHEN is_goal_out_of_box#166493 THEN true END) AS goals_out_of_box#169234L, count(CASE WHEN is_goal_right_foot#166678 THEN true END) AS goals_right_foot#169236L, count(CASE WHEN is_goal_left_foot#166718 THEN true END) AS goals_left_foot#169238L, count(CASE WHEN is_goal_head#166759 THEN true END) AS goals_head#169240L, count(CASE WHEN 'is_goal_obp THEN true END) AS goals_other_body_parts#169242, count(CASE WHEN is_goal_open_play#166528 THEN true END) AS goals_open_play#169244L, count(CASE WHEN is_goal_counter#166564 THEN true END) AS goals_counter#169246L, count(CASE WHEN is_goal_set_piece#166601 THEN true END) AS goals_set_piece#169248L, count(CASE WHEN is_own_goal#167329 THEN true END) AS own_goals#169250L, count(CASE WHEN is_official_penalty_taken#168065 THEN true END) AS penalties_taken#169252L, count(CASE WHEN is_official_penalty_scored#166639 THEN true END) AS penalties_scored#169254L, count(CASE WHEN is_pso_penalty_taken#168130 THEN true END) AS pso_penalties_taken#169256L, count(CASE WHEN is_pso_penalty_goal#168196 THEN true END) AS pso_penalties_scored#169258L, count(CASE WHEN is_yellow#167173 THEN true END) AS yellow_cards#169260L, count(CASE WHEN is_second_yellow#167224 THEN true END) AS second_yellow_cards#169262L, count(CASE WHEN is_red#167276 THEN true END) AS red_cards#169264L, count(CASE WHEN (((is_pass_attempt#166174 AND NOT is_cross#166198) AND NOT is_keeper_throw#166223) AND NOT is_throw_in#166249) THEN true END) AS total_passes#169266L, ... 49 more fields]
+- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 54 more fields]
   +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 53 more fields]
      +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 52 more fields]
         +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 51 more fields]
            +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 50 more fields]
               +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 49 more fields]
                  +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 48 more fields]
                     +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 47 more fields]
                        +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 46 more fields]
                           +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 45 more fields]
                              +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 44 more fields]
                                 +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 43 more fields]
                                    +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 42 more fields]
                                       +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 41 more fields]
                                          +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 40 more fields]
                                             +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 39 more fields]
                                                +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 38 more fields]
                                                   +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 37 more fields]
                                                      +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 36 more fields]
                                                         +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 35 more fields]
                                                            +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 34 more fields]
                                                               +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 33 more fields]
                                                                  +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 32 more fields]
                                                                     +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 31 more fields]
                                                                        +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 30 more fields]
                                                                           +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 29 more fields]
                                                                              +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 28 more fields]
                                                                                 +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 27 more fields]
                                                                                    +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 26 more fields]
                                                                                       +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 25 more fields]
                                                                                          +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 24 more fields]
                                                                                             +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 23 more fields]
                                                                                                +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 22 more fields]
                                                                                                   +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 21 more fields]
                                                                                                      +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 20 more fields]
                                                                                                         +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 19 more fields]
                                                                                                            +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 18 more fields]
                                                                                                               +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 17 more fields]
                                                                                                                  +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 16 more fields]
                                                                                                                     +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 15 more fields]
                                                                                                                        +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 14 more fields]
                                                                                                                           +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 13 more fields]
                                                                                                                              +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 12 more fields]
                                                                                                                                 +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 11 more fields]
                                                                                                                                    +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 10 more fields]
                                                                                                                                       +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 9 more fields]
                                                                                                                                          +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 8 more fields]
                                                                                                                                             +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 7 more fields]
                                                                                                                                                +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 6 more fields]
                                                                                                                                                   +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 5 more fields]
                                                                                                                                                      +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 4 more fields]
                                                                                                                                                         +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 3 more fields]
                                                                                                                                                            +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, ... 2 more fields]
                                                                                                                                                               +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, is_cross#166198, array_contains(qualifiers_values#17, 123) AS is_keeper_throw#166223]
                                                                                                                                                                  +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, is_pass_attempt#166174, array_contains(qualifiers_values#17, 2) AS is_cross#166198]
                                                                                                                                                                     +- Project [match_id#0, _event_id#1, event_id#2, team_id#3, player_id#4, minute#5, second#6, period_value#7, period_display_name#8, type_display_name#9, is_successful#10, x#11, y#12, end_x#13, end_y#14, qualifiers_display_names#15, satisfied_events_types_names#16, qualifiers_values#17, satisfied_events_types#18, league#19, season#20, stage_id#21, (type_display_name#9 = pass) AS is_pass_attempt#166174]
                                                                                                                                                                        +- Relation [match_id#0,_event_id#1,event_id#2,team_id#3,player_id#4,minute#5,second#6,period_value#7,period_display_name#8,type_display_name#9,is_successful#10,x#11,y#12,end_x#13,end_y#14,qualifiers_display_names#15,satisfied_events_types_names#16,qualifiers_values#17,satisfied_events_types#18,league#19,season#20,stage_id#21] parquet


#### Building flags from all qualifier and event type mapping
Idea: Create all flags ("is_shot_on_target") from mapping ("shot_on_target") at once  
to avoid array_contains overhead

In [15]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col
from pyspark.sql.window import Window

# Get corner_taken and corners_leading_to_goal data with window function
w = Window.partitionBy("match_id").orderBy("_event_id")


# 1/ Identify unique and overlapping mapping
qualifier_names = set(qualifier_map.keys())
event_type_names = set(event_type_map.keys())

overlapping_names = qualifier_names.intersection(event_type_names)
qualifier_only_names = qualifier_names - overlapping_names
event_type_only_names = event_type_names - overlapping_names


# qualifiers only
qual_only_exprs = [
    sf.array_contains(col("qualifiers_values"), qualifier_map[name]).alias(f"is_{name}")
    for name in qualifier_only_names
]

# event types only
event_only_exprs = [
    sf.array_contains(col("satisfied_events_types"), event_type_map[name]).alias(f"is_{name}")
    for name in event_type_only_names
]

# overlapping flag
overlapping_exprs = []
for name in overlapping_names:
    qual_check = sf.array_contains(col("qualifiers_values"), qualifier_map[name])
    event_check = sf.array_contains(col("satisfied_events_types"), event_type_map[name])
    
    merged_expr = (qual_check | event_check).alias(f"is_{name}")
    overlapping_exprs.append(merged_expr)

all_flag_exprs = qual_only_exprs + event_only_exprs + overlapping_exprs

# 2/ create events with flags in 1 single pass
events_with_flags_df = (
    event
    # Simple flags from non-array columns
    .withColumn("is_pass_attempt", col("type_display_name") == "pass")
    .withColumn("is_goal", col("type_display_name") == "goal")
    .withColumn("is_ball_touch", col("type_display_name") == "ball_touch")
    .withColumn("is_take_on", col("type_display_name") == "take_on")

    .withColumn("is_tackle_attempt", col("type_display_name").isin(["tackle", "challenge"]))
    .withColumn("is_successful_tackle", col("type_display_name") == "tackle")
    .withColumn("is_dribbled_past", col("type_display_name") == "challenge")
    .withColumn("is_clearance", col("type_display_name") == "clearance")

    .withColumn("is_error", col("type_display_name") == "error")

    .withColumn("is_regulation_time", col("period_display_name").isin(["first_half", "second_half"]))
    .withColumn("is_extra_time", col("period_display_name").isin(["first_period_of_extra_time", "second_period_of_extra_time"]))
    .withColumn("is_penalty_shootout", col("period_display_name") == "penalty_shootout")
    .withColumn("is_saved_shot", col("type_display_name") == "saved_shot")
    .withColumn("is_missed_shot", col("type_display_name") == "missed_shots")
    .withColumn("is_claim", col("type_display_name") == "claim")
    .withColumn("is_penalty_faced", col("type_display_name") == "penalty_faced")

    # Create a temporary struct with ALL the array-based flags at once
    .withColumn("all_flags", sf.struct(*all_flag_exprs))
    
    # Expand the struct's fields into top-level columns
    .select("*", "all_flags.*")
    
    # Drop the temporary struct column
    .drop("all_flags")
)


# Here
pass_definition = (
    col("is_pass_attempt") &
    ~col("is_cross") &
    ~col("is_keeper_throw") &
    ~col("is_throw_in")
)

events_with_corner_flags_df = (
    event
    .withColumn("corner_taker", sf.when(sf.array_contains(col("qualifiers_values"), qualifier_map["corner_taken"]), col("player_id")))
    .withColumn("current_corner_taker", sf.last("corner_taker", ignorenulls=True).over(w))
)

corners_to_goal_stats = (
    events_with_corner_flags_df
    .groupBy("match_id", "current_corner_taker")
    .agg(
        sf.count("corner_taker").alias("corners_taken"),
        sf.count(
            sf.when(
                (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) &
                (col("type_display_name") == "goal"),
                True
            )
        ).alias("corners_leading_to_goal")
    )
    .withColumnRenamed("current_corner_taker", "player_id")
)

events_with_flags_df = (
    events_with_flags_df
    .withColumn(
        "is_official_goal",
        col("is_goal") & ~col("is_own_goal") & ~col("is_penalty_shootout")
    ).withColumn(
        "is_extra_time_goal",
        col("is_goal") & col("is_extra_time")
    )
    
    .withColumn(
        "is_official_penalty_taken",
        col("is_penalty") & ~col("is_penalty_shootout")
    )
    
    .withColumn(
        "is_pso_penalty_taken",
        col("is_penalty") & col("is_penalty_shootout")
    ).withColumn(
        "is_pso_penalty_goal",
        col("is_goal") & col("is_penalty_shootout")
    )

    .withColumn(
        "is_valid_long_ball",
        col("is_longball") & pass_definition
    ).withColumn(
        "is_valid_through_ball",
        col("is_throughball") & pass_definition
    )

    .withColumn(
        "is_valid_dribble",
        col("is_dribble_won") | col("is_dribble_lost")
    )

    .withColumn(
        "is_duel_aerial",
        col("is_duel_aerial_won") | col("is_duel_aerial_lost")
    )

    .withColumn(
        "is_corner_won",
        col("is_corner_awarded") & col("is_successful")
    ).withColumn(
        "is_corner_conceded",
        col("is_corner_awarded") & ~col("is_successful")
    )

    .withColumn(
        "is_official_penalty_faced",
        col("is_penalty_faced") & ~col("is_penalty_shootout")
    ).withColumn(
        "is_official_penalty_conceded",
        col("is_keeper_missed") & col("is_official_penalty_faced")
    )

    .withColumn(
        "is_pso_penalty_faced",
        col("is_penalty_faced") & col("is_penalty_shootout")
    )
)

# events_with_flags_df.filter(
#     (col("match_id") == 1201831) &
#     (col("type_display_name") == "save"),
# ).show(n=10000, truncate=False)

w_match = Window.partitionBy("match_id")

player_stats = (
    events_with_flags_df
    .groupBy("match_id", "player_id")
    .agg(
        # Goals and Assists
        sf.count(sf.when(col("is_official_goal"), True)).alias("goals_scored"),
        sf.count(sf.when(col("is_extra_time_goal"), True)).alias("extra_time_goals"),
        sf.count(sf.when(col("is_assist"), True)).alias("assists"),

        # By zones
        sf.count(sf.when(col("is_goal_six_yard_box"), True)).alias("goals_six_yard_box"),
        sf.count(sf.when(col("is_goal_penalty_area"), True)).alias("goals_penalty_area"),
        sf.count(sf.when(col("is_goal_obox"), True)).alias("goals_out_of_box"),

        # By body parts
        sf.count(sf.when(col("is_goal_right_foot"), True)).alias("goals_right_foot"),
        sf.count(sf.when(col("is_goal_left_foot"), True)).alias("goals_left_foot"),
        sf.count(sf.when(col("is_goal_head"), True)).alias("goals_head"),
        sf.count(sf.when(col("is_goal_obp"), True)).alias("goals_other_body_parts"),

        # By situations
        sf.count(sf.when(col("is_goal_open_play"), True)).alias("goals_open_play"),
        sf.count(sf.when(col("is_goal_counter"), True)).alias("goals_counter"),
        sf.count(sf.when(col("is_goal_set_piece"), True)).alias("goals_set_piece"),
        sf.count(sf.when(col("is_own_goal"), True)).alias("own_goals"), 

        # Penalty record (regulation time + ET and PSO)
        sf.count(sf.when(col("is_official_penalty_taken"), True)).alias("penalties_taken"),
        sf.count(sf.when(col("is_penalty_scored"), True)).alias("penalties_scored"),
        sf.count(sf.when(col("is_pso_penalty_taken"), True)).alias("pso_penalties_taken"),
        sf.count(sf.when(col("is_pso_penalty_goal"), True)).alias("pso_penalties_scored"),

        # Cards
        sf.count(sf.when(col("is_yellow"), True)).alias("yellow_cards"),
        sf.count(sf.when(col("is_second_yellow"), True)).alias("second_yellow_cards"),
        sf.count(sf.when(col("is_red"), True)).alias("red_cards"),

        # Passes
        sf.count(sf.when(pass_definition, True)).alias("total_passes"),
        sf.count(sf.when(pass_definition & col("is_successful"), True)).alias("accurate_passes"),
        sf.count(sf.when(col("is_key_pass"), True)).alias("key_passes"),
        sf.count(sf.when(col("is_ball_touch") & ~col("is_successful"), True)).alias("unsuccessful_touches"),
        sf.count(sf.when(col("is_touches"), True)).alias("total_touches"),
        sf.count(sf.when(col("is_pass_attempt") & ~col("is_throw_in"), True)).alias("possession"),
        sf.count(sf.when(col("is_cross"), True)).alias("total_crosses"),
        sf.count(sf.when(col("is_cross") & col("is_successful"), True)).alias("accurate_crosses"),
        sf.count(sf.when(col("is_valid_long_ball") & col("is_successful"), True)).alias("accurate_long_balls"),
        sf.count(sf.when(col("is_valid_long_ball"), True)).alias("total_long_balls"),
        sf.count(sf.when(col("is_valid_through_ball") & col("is_successful"), True)).alias("accurate_through_balls"),
        sf.count(sf.when(col("is_valid_through_ball"), True)).alias("total_through_balls"),

        # Shots
        sf.count(sf.when(col("is_shots_total"), True)).alias("total_shots"),
        sf.count(sf.when(col("is_shot_on_target"), True)).alias("total_shots_on_target"),
        sf.count(sf.when(col("is_shot_off_target"), True)).alias("total_shots_off_target"),
        sf.count(sf.when(col("is_shot_on_post"), True)).alias("total_woodwork_shots"),
        sf.count(sf.when(col("is_shot_blocked"), True)).alias("total_shots_blocked"),

        # Dribbles
        sf.count(sf.when(col("is_dribble_won"), True)).alias("dribbles_won"),
        sf.count(sf.when(col("is_valid_dribble"), True)).alias("dribbles_attempted"),

        # Aerial Duels
        sf.count(sf.when(col("is_duel_aerial"), True)).alias("aerials_total"),
        sf.count(sf.when(col("is_duel_aerial_won"), True)).alias("aerials_won"),
        sf.count(sf.when(col("is_offensive_duel"), True)).alias("offensive_aerials"),
        sf.count(sf.when(col("is_defensive_duel"), True)).alias("defensive_aerials"),

        # Tackles
        sf.count(sf.when(col("is_tackle_attempt"), True)).alias("tackles_attempted"),
        sf.count(sf.when(col("is_successful_tackle"), True)).alias("successful_tackles"),
        sf.count(sf.when(col("is_dribbled_past") , True)).alias("dribbled_past"),
        sf.count(sf.when(col("is_tackle_last_man"), True)).alias("last_man_tackles"),
        sf.count(sf.when(col("is_clearance"), True)).alias("clearances"),
        sf.count(sf.when(col("is_clearance_off_the_line"), True)).alias("clearances_off_the_line"),
        sf.count(sf.when(col("is_interception_won"), True)).alias("interceptions_won"),

        # Corners
        sf.count(sf.when(col("is_corner_won"), True)).alias("corners_won"),
        sf.count(sf.when(col("is_corner_conceded"), True)).alias("corners_conceded"),
        # sf.count(sf.when(sf.array_contains(col("qualifiers_values"), qualifier_map["corner_taken"]), True)).alias("corners_taken"),
        sf.count(sf.when(col("is_corner_taken") & col("is_successful"), True)).alias("accurate_corners"),

        # Dispossessed
        sf.count(sf.when(col("is_dispossessed"), True)).alias("dispossessed"),
        sf.count(sf.when(col("is_foul_committed"), True)).alias("fouls_committed"),
        sf.count(sf.when(col("is_foul_given"), True)).alias("was_fouled"),
        sf.count(sf.when(col("is_error"), True)).alias("errors"),
        sf.count(sf.when(col("is_error_leads_to_goal"), True)).alias("errors_lead_to_goal"),
        sf.count(sf.when(col("is_offside_given"), True)).alias("offsides"),

        sf.count(sf.when(col("is_keeper_save_total"), True)).alias("total_saves"),
        sf.count(sf.when(col("is_keeper_save_total") & col("is_collected"), True)).alias("collected_saves"),
        sf.count(sf.when(col("is_keeper_save_total") & col("is_parried_safe"), True)).alias("parried_saves"),
        sf.count(sf.when(col("is_keeper_save_total") & col("is_parried_danger"), True)).alias("parried_danger_saves"),

        sf.count(sf.when(col("is_claim"), True)).alias("total_claims"),
        sf.count(sf.when(col("is_claim") & col("is_successful"), True)).alias("successful_claims"),

        sf.count(sf.when(col("is_official_penalty_faced"), True)).alias("official_penalties_faced"),
        sf.count(sf.when(col("is_keeper_penalty_saved"), True)).alias("official_penalties_saved"),
        sf.count(sf.when(col("is_official_penalty_conceded"), True)).alias("official_penalties_conceded"),

        sf.count(sf.when(col("is_pso_penalty_faced"), True)).alias("pso_penalties_faced"),
        sf.count(sf.when(col("is_penalty_shootout_saved_g_k"), True)).alias("pso_penalties_saved"),
        sf.count(sf.when(col("is_penalty_shootout_conceded_g_k"), True)).alias("pso_penalties_conceded"),


        # sf.count(
        #     sf.when(
        #         (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) &
        #         (col("type_display_name") == "goal") &
        #         (col("current_corner_taker") == col("player_id")),
        #         True
        #     )
        # ).alias("corners_leading_to_goal")
        # sf.count(
        #     sf.when(
        #         (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) &
        #         (col("type_display_name") == "goal"), True
        #     )
        # ).alias("corners_leading_to_goal")
    )
    .withColumn(
        "pass_success_percentage",
        sf.round(col("accurate_passes") * 100 / col("total_passes"), 2)
    )
    .withColumn(
        "total_possession",
        sf.sum("possession").over(w_match)
    )
    .withColumn(
        "possession_percentage",
        sf.round(col("possession") * 100 / col("total_possession"), 2)
    )
    .drop("total_possession")
)

player_stats = (
    player_stats.alias("ps")
    .filter((col("player_id").isNotNull()))
    .join(player.alias("pl"), on="player_id", how="inner")
    .join(match.alias("m"), on="match_id", how="inner")
    .join(participation_with_minutes.alias("pt"), on=["player_id", "match_id"], how="inner") 
    .join(team.alias("t1"), col("pt.team_id") == col("t1.team_id"), "inner")
    .withColumn(
        "is_home",
        sf.when(
            col("home_team_id") == col("pt.team_id"),
            True
        ).otherwise(False)
    )
    .withColumn(
        "opposing_team_id",
        sf.when(
            col("is_home"),
            col("away_team_id")
        ).otherwise(col("home_team_id"))
    )
    .join(team.alias("t2"), col("opposing_team_id") == col("t2.team_id"), "inner")
    .join(stage, on="stage_id", how="inner")
    .join(corners_to_goal_stats, on=["player_id", "match_id"], how="left")
    
    # .filter(
    #     col("total_saves") > col("collected_saves") + col("parried_saves") + col("parried_danger_saves")
    #     # (col("match_id") == 1201831)
    #     # (col("tournament_name") == "FIFA World Cup") &
    #     # (col("season_name") == 2022) &
    #     # (col("player_name") == "Rodri")
    # )
    .select(
        # PK + player info
        col("match_id"),
        col("player_id"),
        col("player_name"),

        # Player stats
        # Minutes played
        col("is_first_eleven"),
        col("minutes_played"),

        # Goals and Assists
        col("goals_scored"),
        col("extra_time_goals"),
        col("assists"),

        col("goals_six_yard_box"),
        col("goals_penalty_area"),
        col("goals_out_of_box"),

        col("goals_right_foot"),
        col("goals_left_foot"),
        col("goals_head"),
        col("goals_other_body_parts"),

        col("goals_open_play"),
        col("goals_counter"),
        col("goals_set_piece"),
        col("own_goals"), 

        col("penalties_taken"),
        col("penalties_scored"),
        col("pso_penalties_taken"),
        col("pso_penalties_scored"),

        # Cards
        col("yellow_cards"),
        col("second_yellow_cards"),
        col("red_cards"),

        # Passes
        col("total_passes"),
        col("accurate_passes"),
        col("key_passes"),

        col("possession"),
        col("possession_percentage"),
        
        col("total_touches"),
        col("unsuccessful_touches"),

        col("total_crosses"),
        col("accurate_crosses"),
        col("accurate_long_balls"),
        col("total_long_balls"),
        col("accurate_through_balls"),
        col("total_through_balls"),

        # Shots
        col("total_shots"),
        col("total_shots_on_target"),
        col("total_shots_off_target"),
        col("total_woodwork_shots"),
        col("total_shots_blocked"),

        # Dribbles
        col("dribbles_won"),
        col("dribbles_attempted"),

        # Aerials
        col("aerials_won"),
        col("offensive_aerials"),
        col("defensive_aerials"),

        # Tackles
        col("tackles_attempted"),
        col("successful_tackles"),
        col("dribbled_past"),
        col("last_man_tackles"),
        col("clearances"),
        col("clearances_off_the_line"),
        col("interceptions_won"),

        # Corners
        col("corners_won"),
        col("corners_conceded"),
        col("corners_taken"),
        col("accurate_corners"),
        col("corners_leading_to_goal"),

        # Dispossessed
        col("dispossessed"),
        col("fouls_committed"),
        col("was_fouled"),
        col("errors"),
        col("errors_lead_to_goal"),
        col("offsides"),

        # Goalkeeping stats
        col("total_saves"),
        col("collected_saves"),
        col("parried_saves"),
        col("parried_danger_saves"),
        col("successful_claims"),
        col("total_claims"),

        col("official_penalties_faced"),
        col("official_penalties_saved"),
        col("official_penalties_conceded"),
                
        col("pso_penalties_faced"),
        col("pso_penalties_saved"),
        col("pso_penalties_conceded"),

        # Match info
        col("start_time_utc"),
        col("pt.team_id").alias("team_id"),
        col("t1.team_name").alias("team_name"),
        col("opposing_team_id"),
        col("t2.team_name").alias("opposing_team_name"),
        col("is_home"),

        # Stage info
        col("region_name"),
        col("region_id"),
        col("tournament_name"),
        col("tournament_id"),
        col("season_name"),
        col("season_id"),
        col("stage_name"),
        col("m.stage_id")
    )
    .orderBy(col("possession_percentage").desc())
)

columns_to_fill = ["corners_taken", "corners_leading_to_goal"]
player_stats = player_stats.fillna(0, subset=columns_to_fill)

# (
#     player_stats
#     .filter(
#        col("match_id") == 1697443
#         # (col("player_id") == 406347)
#         # & (col("tournament_name") == "Champions League")
#         # & (col("season_name") == "2023/2024")
#     )
#     .show(50)
# )

In [11]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col
from pyspark.sql.window import Window
import yaml

with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

rules = CONFIG['gold_layer']['player_match_stats']

# Get corner_taken and corners_leading_to_goal data with window function
w = Window.partitionBy("match_id").orderBy("_event_id")


# 1/ Identify unique and overlapping mapping
qualifier_names = set(qualifier_map.keys())
event_type_names = set(event_type_map.keys())

overlapping_names = qualifier_names.intersection(event_type_names)
qualifier_only_names = qualifier_names - overlapping_names
event_type_only_names = event_type_names - overlapping_names


# qualifiers only
qual_only_exprs = [
    sf.array_contains(col("qualifiers_values"), qualifier_map[name]).alias(f"is_{name}")
    for name in qualifier_only_names
]

# event types only
event_only_exprs = [
    sf.array_contains(col("satisfied_events_types"), event_type_map[name]).alias(f"is_{name}")
    for name in event_type_only_names
]

# overlapping flag
overlapping_exprs = []
for name in overlapping_names:
    qual_check = sf.array_contains(col("qualifiers_values"), qualifier_map[name])
    event_check = sf.array_contains(col("satisfied_events_types"), event_type_map[name])
    
    merged_expr = (qual_check | event_check).alias(f"is_{name}")
    overlapping_exprs.append(merged_expr)

all_flag_exprs = qual_only_exprs + event_only_exprs + overlapping_exprs

# 2/ create events with flags in 1 single pass
events_with_flags_df = (
    event
    # Simple flags from non-array columns
    .withColumn("is_pass_attempt", col("type_display_name") == "pass")
    .withColumn("is_goal", col("type_display_name") == "goal")
    .withColumn("is_ball_touch", col("type_display_name") == "ball_touch")
    .withColumn("is_take_on", col("type_display_name") == "take_on")

    .withColumn("is_tackle_attempt", col("type_display_name").isin(["tackle", "challenge"]))
    .withColumn("is_successful_tackle", col("type_display_name") == "tackle")
    .withColumn("is_dribbled_past", col("type_display_name") == "challenge")
    .withColumn("is_clearance", col("type_display_name") == "clearance")

    .withColumn("is_error", col("type_display_name") == "error")

    .withColumn("is_regulation_time", col("period_display_name").isin(["first_half", "second_half"]))
    .withColumn("is_extra_time", col("period_display_name").isin(["first_period_of_extra_time", "second_period_of_extra_time"]))
    .withColumn("is_penalty_shootout", col("period_display_name") == "penalty_shootout")
    .withColumn("is_saved_shot", col("type_display_name") == "saved_shot")
    .withColumn("is_missed_shot", col("type_display_name") == "missed_shots")
    .withColumn("is_claim", col("type_display_name") == "claim")
    .withColumn("is_penalty_faced", col("type_display_name") == "penalty_faced")

    # Create a temporary struct with ALL the array-based flags at once
    .withColumn("all_flags", sf.struct(*all_flag_exprs))
    
    # Expand the struct's fields into top-level columns
    .select("*", "all_flags.*")
    
    # Drop the temporary struct column
    .drop("all_flags")
)


# Here
pass_definition = (
    col("is_pass_attempt") &
    ~col("is_cross") &
    ~col("is_keeper_throw") &
    ~col("is_throw_in")
)

events_with_corner_flags_df = (
    event
    .withColumn("corner_taker", sf.when(sf.array_contains(col("qualifiers_values"), qualifier_map["corner_taken"]), col("player_id")))
    .withColumn("current_corner_taker", sf.last("corner_taker", ignorenulls=True).over(w))
)

corners_to_goal_stats = (
    events_with_corner_flags_df
    .groupBy("match_id", "current_corner_taker")
    .agg(
        sf.count("corner_taker").alias("corners_taken"),
        sf.count(
            sf.when(
                (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) &
                (col("type_display_name") == "goal"),
                True
            )
        ).alias("corners_leading_to_goal")
    )
    .withColumnRenamed("current_corner_taker", "player_id")
)

events_with_flags_df = (
    events_with_flags_df
    .withColumn(
        "is_official_goal",
        col("is_goal") & ~col("is_own_goal") & ~col("is_penalty_shootout")
    ).withColumn(
        "is_extra_time_goal",
        col("is_goal") & col("is_extra_time")
    )
    
    .withColumn(
        "is_official_penalty_taken",
        col("is_penalty") & ~col("is_penalty_shootout")
    )
    
    .withColumn(
        "is_pso_penalty_taken",
        col("is_penalty") & col("is_penalty_shootout")
    ).withColumn(
        "is_pso_penalty_goal",
        col("is_goal") & col("is_penalty_shootout")
    )

    .withColumn(
        "is_valid_long_ball",
        col("is_longball") & pass_definition
    ).withColumn(
        "is_valid_through_ball",
        col("is_throughball") & pass_definition
    )

    .withColumn(
        "is_valid_dribble",
        col("is_dribble_won") | col("is_dribble_lost")
    )

    .withColumn(
        "is_duel_aerial",
        col("is_duel_aerial_won") | col("is_duel_aerial_lost")
    )

    .withColumn(
        "is_corner_won",
        col("is_corner_awarded") & col("is_successful")
    ).withColumn(
        "is_corner_conceded",
        col("is_corner_awarded") & ~col("is_successful")
    )

    .withColumn(
        "is_official_penalty_faced",
        col("is_penalty_faced") & ~col("is_penalty_shootout")
    ).withColumn(
        "is_official_penalty_conceded",
        col("is_keeper_missed") & col("is_official_penalty_faced")
    )

    .withColumn(
        "is_pso_penalty_faced",
        col("is_penalty_faced") & col("is_penalty_shootout")
    )
)

agg_expressions = []

if 'simple_flag_counts' in rules['aggregations']:
    for alias, source_flag in rules['aggregations']['simple_flag_counts'].items():
        agg_expressions.append(
            sf.count(sf.when(col(source_flag), True)).alias(alias)
        )

if 'custom_expression_counts' in rules['aggregations']:
    for alias, expression_str in rules['aggregations']['custom_expression_counts'].items():
        agg_expressions.append(
            sf.expr(f"COUNT(CASE WHEN {expression_str} THEN true END)").alias(alias)
        )

# Aggregation
player_match_stats = (
    events_with_flags_df
    .groupBy(*rules['group_by_cols'])
    .agg(*agg_expressions)
)

# Add possession % stats
w_match = Window.partitionBy("match_id")

player_match_stats = (
    player_match_stats
    .withColumn(
        "pass_success_percentage",
        sf.round(col("accurate_passes") * 100 / col("total_passes"), 2)
    )
    .withColumn(
        "total_possession",
        sf.sum("possession").over(w_match)
    )
    .withColumn(
        "possession_percentage",
        sf.round(col("possession") * 100 / col("total_possession"), 2)
    )
    .drop("total_possession")
)

select_exprs = []
for c in rules["final_column_order"]:
    if isinstance(c, str):
        select_exprs.append(col(c))
    else:
        select_exprs.append(col(c["expr"]).alias(c["name"]))

player_match_stats = (
    player_match_stats.alias("ps")
    .filter((col("player_id").isNotNull()))
    .join(player.alias("pl"), on="player_id", how="inner")
    .join(match.alias("m"), on="match_id", how="inner")
    .join(participation_with_minutes.alias("pt"), on=["player_id", "match_id"], how="inner") 
    .join(team.alias("t1"), col("pt.team_id") == col("t1.team_id"), "inner")
    .withColumn(
        "is_home",
        sf.when(
            col("home_team_id") == col("pt.team_id"),
            True
        ).otherwise(False)
    )
    .withColumn(
        "opposing_team_id",
        sf.when(
            col("is_home"),
            col("away_team_id")
        ).otherwise(col("home_team_id"))
    )
    .join(team.alias("t2"), col("opposing_team_id") == col("t2.team_id"), "inner")
    .join(stage, on="stage_id", how="inner")
    .join(corners_to_goal_stats, on=["player_id", "match_id"], how="left")
    # .filter(
    #     col("total_saves") > col("collected_saves") + col("parried_saves") + col("parried_danger_saves")
    #     # (col("match_id") == 1201831)
    #     # (col("tournament_name") == "FIFA World Cup") &
    #     # (col("season_name") == 2022) &
    #     # (col("player_name") == "Rodri")
    # )
    .select(*select_exprs)
    .orderBy(col("possession_percentage").desc())
)

# Fill after left join
columns_to_fill = ["corners_taken", "corners_leading_to_goal"]
player_match_stats = player_match_stats.fillna(0, subset=columns_to_fill)

(
    player_match_stats
    .filter(
       col("match_id") == 1697443
        # (col("player_id") == 406347)
        # & (col("tournament_name") == "Champions League")
        # & (col("season_name") == "2023/2024")
    )
    .show(50)
)

25/08/26 09:41:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------+---------+-------------------+---------------+--------------+------------+----------------+-------+------------------+------------------+----------------+----------------+---------------+----------+----------------------+---------------+-------------+---------------+---------+---------------+----------------+-------------------+--------------------+------------+-------------------+---------+------------+---------------+----------+----------+---------------------+-------------+--------------------+-------------+----------------+-------------------+----------------+----------------------+-------------------+-----------+---------------------+----------------------+--------------------+-------------------+------------+------------------+-----------+-----------------+-----------------+-----------------+------------------+-------------+----------------+----------+-----------------------+-----------------+-----------+----------------+-------------+----------------+------------------

In [49]:
print(player_match_stats.count())

                                                                                

222068


In [56]:
write_table_to_gold(
    df=player_match_stats,
    table_name="player_match_stats",
    primary_keys=["match_id", "player_id"]
)

--- Writing gold table: player_match_stats ---
  - Primary Keys: ['match_id', 'player_id']
  - Partition Columns: None




  - Merge complete for player_match_stats.


                                                                                

In [33]:
player_stats.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- player_name: string (nullable = true)
 |-- is_first_eleven: boolean (nullable = true)
 |-- minutes_played: integer (nullable = false)
 |-- goals: long (nullable = false)
 |-- extra_time_goals: long (nullable = false)
 |-- assists: long (nullable = false)
 |-- goals_six_yard_box: long (nullable = false)
 |-- goals_penalty_area: long (nullable = false)
 |-- goals_out_of_box: long (nullable = false)
 |-- goals_right_foot: long (nullable = false)
 |-- goals_left_foot: long (nullable = false)
 |-- goals_head: long (nullable = false)
 |-- goals_other_body_parts: long (nullable = false)
 |-- goals_open_play: long (nullable = false)
 |-- goals_counter: long (nullable = false)
 |-- goals_set_piece: long (nullable = false)
 |-- own_goals: long (nullable = false)
 |-- penalties_taken: long (nullable = false)
 |-- penalties_scored: long (nullable = false)
 |-- pso_penalties_taken: long (nullable = false)
 

In [36]:
import yaml

with open('config.yaml', 'r') as f:
    CONFIG = yaml.safe_load(f)

In [45]:
def get_expressions(layer: str, table_name: str):
    rules = CONFIG[layer][table_name]

    agg_expressions = []

    if 'flags_agg' in rules['aggregations']:
        for col_name, fn in rules['aggregations']['flags_agg'].items():
            if fn == "sum":
                agg_expressions.append(sf.sum(col_name).alias(col_name))
            elif fn == "avg":
                agg_expressions.append(sf.round(sf.avg(col_name), 2).alias(col_name))

    if 'entity_meta' in rules['aggregations']:
        for col_name, fn in rules['aggregations']['entity_meta'].items():
            if fn == "first":
                agg_expressions.append(sf.first(col_name, ignorenulls=True).alias(col_name))

    if 'flag_counts' in rules['aggregations']:
        for alias, source_flag in rules['aggregations']['flag_counts'].items():
            if source_flag == "*":
                agg_expressions.append(
                    sf.count("*").alias(alias)
                )
            else:
                agg_expressions.append(
                    sf.count(sf.when(col(source_flag), True)).alias(alias)
            )

    return agg_expressions


def get_select_expressions(layer: str, table_name: str):
    rules = CONFIG[layer][table_name]

    select_exprs = []
    for c in rules["final_column_order"]:
        if isinstance(c, str):
            select_exprs.append(col(c))
        else:
            select_exprs.append(col(c["expr"]).alias(c["name"]))

    return select_exprs

In [38]:
pts_rules = CONFIG['gold_layer']['player_team_season_stats']
pts_agg_expressions = get_expressions("gold_layer", "player_team_season_stats")
pts_select_expressions = get_select_expressions("gold_layer", "player_team_season_stats")

# Aggregation
player_team_season_stats = (
    player_match_stats
    .groupBy(*pts_rules['group_by_cols'])
    .agg(*pts_agg_expressions)    
).select(*pts_select_expressions)

(
    player_team_season_stats
    .filter(
       # col("match_id") == 1697443
        (col("player_id") == 13754)
        # & (col("tournament_name") == "Champions League")
        # & (col("season_name") == "2023/2024")
    )
    .show(50)
)

                                                                                

+---------+------------+----------------+-------------+-----------+---------+-----------+-------------+--------------+------------+----------------+-------+------------------+------------------+----------------+----------------+---------------+----------+----------------------+---------------+-------------+---------------+---------+---------------+----------------+-------------------+--------------------+------------+-------------------+---------+------------+---------------+----------+----------+-------------+--------------------+-------------+----------------+-------------------+----------------+----------------------+-------------------+-----------+---------------------+----------------------+--------------------+-------------------+------------+------------------+-----------+-----------------+-----------------+-----------------+------------------+-------------+----------------+----------+-----------------------+-----------------+-----------+----------------+-------------+----------

--- Writing gold table: player_team_season_stats ---
  - Primary Keys: ['season_id', 'team_id', 'player_id']
  - Partition Columns: None




  - Successfully created and wrote data to player_team_season_stats.


                                                                                

In [12]:
agg_cols = [
    # Goals and Assists
    "goals_scored",
    "extra_time_goals",
    "assists",
    "goals_six_yard_box",
    "goals_penalty_area",
    "goals_out_of_box",
    "goals_right_foot",
    "goals_left_foot",
    "goals_head",
    "goals_other_body_parts",
    "goals_open_play",
    "goals_counter",
    "goals_set_piece",
    "own_goals",
    "penalties_taken",
    "penalties_scored",
    "pso_penalties_taken",
    "pso_penalties_scored",

    # Cards
    "yellow_cards",
    "second_yellow_cards",
    "red_cards",

    # Passes
    "total_passes",
    "accurate_passes",
    "key_passes",

    "possession",
    "total_touches",
    "unsuccessful_touches",
    
    "total_crosses",
    "accurate_crosses",
    "total_long_balls",
    "accurate_long_balls",
    "total_through_balls",
    "accurate_through_balls",

    # Shots
    "total_shots",
    "total_shots_on_target",
    "total_shots_off_target",
    "total_woodwork_shots",
    "total_shots_blocked",

    # Dribbles
    "dribbles_won",
    "dribbles_attempted",

    # Aerials
    "aerials_won",
    "offensive_aerials",
    "defensive_aerials",

    # Tackles
    "tackles_attempted",
    "successful_tackles",
    "dribbled_past",
    "last_man_tackles",
    "clearances",
    "clearances_off_the_line",
    "interceptions_won",

    # Corners
    "corners_won",
    "corners_conceded",
    "corners_taken",
    "accurate_corners",
    "corners_leading_to_goal",

    # Dispossessed
    "dispossessed",
    "fouls_committed",
    "was_fouled",
    "errors",
    "errors_lead_to_goal",
    "offsides",

    # Goalkeeping stats
    "total_saves",
    "collected_saves",
    "parried_saves",
    "parried_danger_saves",
    "successful_claims",
    "total_claims",
    "official_penalties_faced",
    "official_penalties_saved",
    "official_penalties_conceded",
    "pso_penalties_faced",
    "pso_penalties_saved",
    "pso_penalties_conceded",
]

non_agg_cols = [
    "player_name",
    
    "region_name",
    "region_id",
    "tournament_name",
    "tournament_id",
    "season_name",
]

In [32]:
sum_expressions = [
    sf.sum(col(c)).alias(c) for c in agg_cols
]

keep_expressions = [
    sf.first(col(c)).alias(c) for c in non_agg_cols
]

all_expressions = sum_expressions + keep_expressions

In [None]:
player_season_stats = (
    player_stats
    .groupBy("season_id", "player_id")
    .agg(
        sf.count("*").alias("appearances"),
        sf.count(when(col("is_first_eleven"), True)).alias("games_started"),
        *all_expressions
    ).select(
        "player_id",
        "player_name",
        "tournament_name",
        "season_name",
        "season_id",

        "appearances",
        *agg_cols,

        "region_name",
        "region_id",
        "tournament_id",
    )
)

In [None]:
(
    player_season_stats
    .filter(
        col("player_id") == 11119
    ).orderBy(
        col("goals_scored").desc()
    ).show()
)    

                                                                                

+---------+------------+----------------+-----------+---------+-----------+--------------+-----+----------------+-------+------------------+------------------+----------------+----------------+---------------+----------+----------------------+---------------+-------------+---------------+---------+---------------+----------------+-------------------+--------------------+------------+-------------------+---------+------------+---------------+----------+----------+--------------------+-------------+-------------+----------------+-------------------+----------------+----------------------+-------------------+-----------+---------------------+----------------------+--------------------+-------------------+------------+------------------+-----------+-----------------+-----------------+-----------------+------------------+-------------+----------------+----------+-----------------------+-----------------+-----------+----------------+-------------+----------------+-----------------------+----

In [13]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col
from pyspark.sql import DataFrame
from typing import List

# Aggregation on dataframe
def aggregate_stats(
    df: DataFrame,
    group_by_cols: List[str],
    sum_cols: List[str],
    attribute_cols: List[str],
    other_agg_exprs: List = []
) -> DataFrame:
    sum_expressions = [sf.sum(col(c)).alias(c) for c in sum_cols]
    keep_expressions = [sf.first(col(c), ignorenulls=True).alias(c) for c in attribute_cols]
    
    all_expressions = other_agg_exprs + sum_expressions + keep_expressions
    
    aggregated_df = df.groupBy(*group_by_cols).agg(*all_expressions)
    
    return aggregated_df

In [34]:
match.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- stage_id: integer (nullable = true)
 |-- home_team_id: integer (nullable = true)
 |-- away_team_id: integer (nullable = true)
 |-- start_time_utc: timestamp (nullable = true)
 |-- home_score: integer (nullable = true)
 |-- away_score: integer (nullable = true)
 |-- home_ht_score: integer (nullable = true)
 |-- away_ht_score: integer (nullable = true)
 |-- home_ft_score: integer (nullable = true)
 |-- away_ft_score: integer (nullable = true)
 |-- home_et_score: integer (nullable = true)
 |-- away_et_score: integer (nullable = true)
 |-- home_pk_score: integer (nullable = true)
 |-- away_pk_score: integer (nullable = true)



In [52]:
print(CONFIG['gold_layer'].keys())

dict_keys(['player_match_stats', 'player_team_season_stats', 'team_match_stats'])


In [39]:
tm_rules = CONFIG['gold_layer']['team_match_stats']
tm_agg_expressions = get_expressions("gold_layer", "team_match_stats")
tm_select_expressions = get_select_expressions("gold_layer", "team_match_stats")

# Aggregation
team_match_stats = (
    player_match_stats
    .groupBy(*tm_rules['group_by_cols'])
    .agg(*tm_agg_expressions)    
)

In [40]:
team_match_group_by = ["match_id", "team_id"]
team_match_agg = [c for c in agg_cols if c not in ["minutes_played"]]
team_match_attributes = [
    "start_time_utc",
    "team_name",
    "opposing_team_id",
    "opposing_team_name",
    "is_home",

    "region_name",
    "region_id",
    "tournament_name",
    "tournament_id",
    "season_name",
    "season_id",
    "stage_name",
    "stage_id"
]
team_match_other_agg = [
    sf.sum("possession").over(w_match).alias("total_possession"),
    sf.round(col("possession") * 100 / col("total_possession"), 2)
]

# Recalc possession_% to avoid rounding errors
# and drop + rename for clarity
team_match_stats = aggregate_stats(
    df=player_stats,
    group_by_cols=team_match_group_by,
    sum_cols=team_match_agg,
    attribute_cols=team_match_attributes
).withColumn(
    "total_possession",
    sf.sum("possession").over(w_match)
).withColumn(
    "possession_percentage",
    sf.round(col("possession") * 100 / col("total_possession"), 2)
).drop("total_possession")

NameError: name 'agg_cols' is not defined

In [61]:
match.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- stage_id: integer (nullable = true)
 |-- home_team_id: integer (nullable = true)
 |-- away_team_id: integer (nullable = true)
 |-- start_time_utc: timestamp (nullable = true)
 |-- home_score: integer (nullable = true)
 |-- away_score: integer (nullable = true)
 |-- home_ht_score: integer (nullable = true)
 |-- away_ht_score: integer (nullable = true)
 |-- home_ft_score: integer (nullable = true)
 |-- away_ft_score: integer (nullable = true)
 |-- home_et_score: integer (nullable = true)
 |-- away_et_score: integer (nullable = true)
 |-- home_pk_score: integer (nullable = true)
 |-- away_pk_score: integer (nullable = true)



In [41]:
# Get score data
match_scores_df = match.select(
    "match_id",
    col("home_score"),
    col("away_score"),
    col("home_ht_score"),
    col("away_ht_score"),
    col("home_ft_score"),
    col("away_ft_score"),
    col("home_et_score"),
    col("away_et_score"),
    col("home_pk_score"),
    col("away_pk_score")
)

# Join with agg table
team_match_stats_with_scores = team_match_stats.join(
    match_scores_df,
    on="match_id",
    how="left"
)

# derived columns
team_match_stats = (
    team_match_stats_with_scores
    .withColumn("goals_for", sf.when(col("is_home"), col("home_score")).otherwise(col("away_score")))
    .withColumn("goals_against", sf.when(col("is_home"), col("away_score")).otherwise(col("home_score")))

    .withColumn("goals_for_ht", sf.when(col("is_home"), col("home_ht_score")).otherwise(col("away_ht_score")))
    .withColumn("goals_against_ht", sf.when(col("is_home"), col("away_ht_score")).otherwise(col("home_ht_score")))
    
    .withColumn("goals_for_ft", sf.when(col("is_home"), col("home_ft_score")).otherwise(col("away_ft_score")))
    .withColumn("goals_against_ft", sf.when(col("is_home"), col("away_ft_score")).otherwise(col("home_ft_score")))

    .withColumn("goals_for_et", sf.when(col("is_home"), col("home_et_score")).otherwise(col("away_et_score")))
    .withColumn("goals_against_et", sf.when(col("is_home"), col("away_et_score")).otherwise(col("home_et_score")))
    
    .withColumn("goals_for_pk", sf.when(col("is_home"), col("home_pk_score")).otherwise(col("away_pk_score")))
    .withColumn("goals_against_pk", sf.when(col("is_home"), col("away_pk_score")).otherwise(col("home_pk_score")))
).withColumnRenamed(
    "own_goals", "own_goals_conceded"
)


# Add more derived cols
team_match_stats = (
    # Recalc possession_% to avoid rounding errors
    team_match_stats
    .withColumn(
        "total_possession",
        sf.sum("possession").over(w_match)
    ).withColumn(
        "possession_percentage",
        sf.round(col("possession") * 100 / col("total_possession"), 2)
    ).drop("total_possession")

    .withColumn(
        "is_win",
        sf.when(col("goals_for") > col("goals_against"), True).otherwise(False)
    ).withColumn(
        "is_draw",
        sf.when(col("goals_for") == col("goals_against"), True).otherwise(False)
    ).withColumn(
        "is_loss",
        sf.when(col("goals_for") < col("goals_against"), True).otherwise(False)
    )
)

In [40]:
team_match_cols = [
    "match_id",
    "team_id",
    "team_name",

    # match summary
    "start_time_utc",
    "opposing_team_id",
    "opposing_team_name",
    "is_home",

    "goals_for",
    "goals_against",
    "goals_for_ht",
    "goals_against_ht",
    "goals_for_ft",
    "goals_against_ft",
    "goals_for_et",
    "goals_against_et",
    "goals_for_pk",
    "goals_against_pk",

    # Goals and Assists
    "goals_scored",
    "extra_time_goals",
    "assists",
    "goals_six_yard_box",
    "goals_penalty_area",
    "goals_out_of_box",
    "goals_right_foot",
    "goals_left_foot",
    "goals_head",
    "goals_other_body_parts",
    "goals_open_play",
    "goals_counter",
    "goals_set_piece",
    "own_goals_against",
    
    "penalties_taken",
    "penalties_scored",
    "pso_penalties_taken",
    "pso_penalties_scored",

    # Cards
    "yellow_cards",
    "second_yellow_cards",
    "red_cards",

    # Passes
    "total_passes",
    "accurate_passes",
    "key_passes",

    "possession",
    "possession_percentage",
    "total_touches",
    "unsuccessful_touches",
    
    "total_crosses",
    "accurate_crosses",
    "total_long_balls",
    "accurate_long_balls",
    "total_through_balls",
    "accurate_through_balls",

    # Shots
    "total_shots",
    "total_shots_on_target",
    "total_shots_off_target",
    "total_woodwork_shots",
    "total_shots_blocked",

    # Dribbles
    "dribbles_won",
    "dribbles_attempted",

    # Aerials
    "aerials_won",
    "offensive_aerials",
    "defensive_aerials",

    # Tackles
    "tackles_attempted",
    "successful_tackles",
    "dribbled_past",
    "last_man_tackles",
    "clearances",
    "clearances_off_the_line",
    "interceptions_won",

    # Corners
    "corners_won",
    "corners_conceded",
    "corners_taken",
    "accurate_corners",
    "corners_leading_to_goal",

    # Dispossessed
    "dispossessed",
    "fouls_committed",
    "was_fouled",
    "errors",
    "errors_lead_to_goal",
    "offsides",

    # Goalkeeping stats
    "total_saves",
    "collected_saves",
    "parried_saves",
    "parried_danger_saves",
    "successful_claims",
    "total_claims",
    "official_penalties_faced",
    "official_penalties_saved",
    "official_penalties_conceded",
    "pso_penalties_faced",
    "pso_penalties_saved",
    "pso_penalties_conceded",

    "region_name",
    "region_id",
    "tournament_name",
    "tournament_id",
    "season_name",
    "season_id",
    "stage_name",
    "stage_id"
]

In [41]:
team_match_stats = team_match_stats.select(
    *team_match_cols
)

In [42]:
tm_select_expressions = get_select_expressions("gold_layer", "team_match_stats")

team_match_stats = team_match_stats.select(
    *tm_select_expressions
)

In [59]:
# final_team_match_stats.show()
team_match_stats.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- team_id: integer (nullable = true)
 |-- team_name: string (nullable = true)
 |-- start_time_utc: timestamp (nullable = true)
 |-- opposing_team_id: integer (nullable = true)
 |-- opposing_team_name: string (nullable = true)
 |-- is_home: boolean (nullable = true)
 |-- goals_for: integer (nullable = true)
 |-- goals_against: integer (nullable = true)
 |-- goals_for_ht: integer (nullable = true)
 |-- goals_against_ht: integer (nullable = true)
 |-- goals_for_ft: integer (nullable = true)
 |-- goals_against_ft: integer (nullable = true)
 |-- goals_for_et: integer (nullable = true)
 |-- goals_against_et: integer (nullable = true)
 |-- goals_for_pk: integer (nullable = true)
 |-- goals_against_pk: integer (nullable = true)
 |-- goals_scored: long (nullable = true)
 |-- extra_time_goals: long (nullable = true)
 |-- assists: long (nullable = true)
 |-- goals_six_yard_box: long (nullable = true)
 |-- goals_penalty_area: long (nullable = true)


In [None]:
# validation_df = final_team_match_stats.withColumn(
#     "score_mismatch",
#     ((col("goals_for_ft") != col("goals_scored")) & col("goals_for_et").isNull()) |
#     (col("goals_for_et") != col("goals_scored"))
# )

# mismatched_games = validation_df.filter(col("score_mismatch"))

# if mismatched_games.count() > 0:
#     print("!!! DATA QUALITY WARNING: Found games where calculated goals do not match official score !!!")
#     mismatched_games.select("match_id", "team_name", "goals_for_ft", "goals_scored").show()

                                                                                



                                                                                

+--------+--------------------+------------+-----+
|match_id|           team_name|goals_for_ft|goals|
+--------+--------------------+------------+-----+
|  789629|             Croatia|           1|    0|
|  789642|           Argentina|           2|    1|
|  789643|              France|           3|    2|
|  789643|            Honduras|           0|    1|
|  789654|            Portugal|           2|    1|
|  835257|              France|           2|    1|
|  958428|             Chelsea|           2|    1|
|  958431|   Manchester United|           1|    0|
|  958473|             Arsenal|           2|    1|
|  958810|             Arsenal|           4|    3|
|  958821|         Bournemouth|           1|    0|
|  959594|         Aston Villa|           1|    0|
|  959598|West Bromwich Albion|           1|    0|
|  959608|          Sunderland|           1|    0|
|  959614|          Sunderland|           4|    3|
|  959623|      Crystal Palace|           1|    0|
|  959630|           Liverpool|

In [32]:
team_match_stats.filter(
    col("match_id") == 1481961
).show()

                                                                                

+--------+-------+-------------+-------------------+----------------+------------------+-------+------+-------+-------+---------+-------------+------------+----------------+------------+----------------+------------+----------------+------------+----------------+------------+----------------+-------+------------------+------------------+----------------+----------------+---------------+----------+----------------------+---------------+-------------+---------------+------------------+---------------+----------------+-------------------+--------------------+------------+-------------------+---------+------------+---------------+----------+----------+---------------------+-------------+--------------------+-------------+----------------+----------------+-------------------+-------------------+----------------------+-----------+---------------------+----------------------+--------------------+-------------------+------------+------------------+-----------+-----------------+----------------

In [14]:
match.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- stage_id: integer (nullable = true)
 |-- home_team_id: integer (nullable = true)
 |-- away_team_id: integer (nullable = true)
 |-- start_time_utc: timestamp (nullable = true)
 |-- home_score: integer (nullable = true)
 |-- away_score: integer (nullable = true)
 |-- home_ht_score: integer (nullable = true)
 |-- away_ht_score: integer (nullable = true)
 |-- home_ft_score: integer (nullable = true)
 |-- away_ft_score: integer (nullable = true)
 |-- home_et_score: integer (nullable = true)
 |-- away_et_score: integer (nullable = true)
 |-- home_pk_score: integer (nullable = true)
 |-- away_pk_score: integer (nullable = true)



In [21]:
dim_match = (
    match
    .join(stage, on="stage_id")
    .join(
        team.alias("home_team"), 
        col("home_team_id") == col("home_team.team_id"),
        how="left"
    )
    .join(
        team.alias("away_team"), 
        col("away_team_id") == col("away_team.team_id"),
        how="left"
    )
    .select(
        "match_id",

        "start_time_utc",
        "home_team_id",
        col("home_team.team_name").alias("home_team_name"),
        "away_team_id",
        col("away_team.team_name").alias("away_team_name"),
        
        "home_score",
        "away_score",
        "home_ht_score",
        "away_ht_score",
        "home_ft_score",
        "away_ft_score",
        "home_et_score",
        "away_et_score",
        "home_pk_score",
        "away_pk_score",

        "region_name",
        "region_id",
        "tournament_name",
        "tournament_id",
        "season_name",
        "season_id",
        "stage_name",
        "stage_id"
    )
)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `match`.`home_team_id` cannot be resolved. Did you mean one of the following? [`home_team_id`, `away_team_id`, `match_id`, `tournament_id`, `home_score`].;
'Join LeftOuter, ('match.home_team_id = team_id#48)
:- Project [stage_id#75, match_id#74, home_team_id#76, away_team_id#77, start_time_utc#78, home_score#79, away_score#80, home_ht_score#81, away_ht_score#82, home_ft_score#83, away_ft_score#84, home_et_score#85, away_et_score#86, home_pk_score#87, away_pk_score#88, region_id#54, region_name#55, tournament_id#56, tournament_name#57, season_id#58, season_name#59, stage_name#60, league#61, season#62]
:  +- Join Inner, (stage_id#75 = stage_id#63)
:     :- Relation [match_id#74,stage_id#75,home_team_id#76,away_team_id#77,start_time_utc#78,home_score#79,away_score#80,home_ht_score#81,away_ht_score#82,home_ft_score#83,away_ft_score#84,home_et_score#85,away_et_score#86,home_pk_score#87,away_pk_score#88] parquet
:     +- Relation [region_id#54,region_name#55,tournament_id#56,tournament_name#57,season_id#58,season_name#59,stage_name#60,league#61,season#62,stage_id#63] parquet
+- SubqueryAlias home_team
   +- Relation [team_id#48,team_name#49,country_name#50] parquet


In [27]:
stage.select(
    "region_id",
    "region_name",
    "tournament_id",
    "tournament_name",
).distinct().show()

+---------+-------------+-------------+----------------+
|region_id|  region_name|tournament_id| tournament_name|
+---------+-------------+-------------+----------------+
|      247|International|           36|  FIFA World Cup|
|      250|       Europe|           12|Champions League|
|      252|      England|            2|  Premier League|
|       81|      Germany|            3|      Bundesliga|
+---------+-------------+-------------+----------------+



In [28]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col, when

# ==============================================================================
# STEP 1: DEFINE YOUR HARDCODED CONFIGURATION MAPS
# The keys are now tuples: (region_id, tournament_id)
# ==============================================================================

# Format: {(region_id, tournament_id): format_string}
format_map = {
    (252, 2): "League",           # England, Premier League
    (81, 3): "League",            # Germany, Bundesliga
    (250, 12): "Tournament",   # Europe, Champions League
    (247, 36): "Tournament",   # International, FIFA World Cup
    (247, 67): "Tournament",   # International, FIFA Club World Cup
    # ... add other (region_id, tournament_id) pairs here
}

# Format: {(region_id, tournament_id): is_national_boolean}
is_national_map = {
    (252, 2): False,              # England, Premier League
    (81, 3): False,               # Germany, Bundesliga
    (250, 12): False,              # Europe, Champions League
    (247, 36): True,               # International, FIFA World Cup
    (247, 67): False,              # International, FIFA Club World Cup (The exception)
    # ... add other (region_id, tournament_id) pairs here
}

# Format: {(region_id, tournament_id): is_domestic_boolean}
is_domestic_map = {
    (252, 2): True,               # England, Premier League
    (81, 3): True,                # Germany, Bundesliga
    (250, 12): False,              # Europe, Champions League
    (247, 36): False,              # International, FIFA World Cup
    (247, 67): False,              # International, FIFA Club World Cup
    # ... add other (region_id, tournament_id) pairs here
}

# ==============================================================================
# STEP 2: APPLY THE MAPS TO CREATE THE STAGE DIMENSION
# ==============================================================================

# We will build a series of WHEN/OTHERWISE statements from the maps.
# This is the most robust way to handle composite keys.

# Start with the base stage table
stage_dim = stage

# Build the `competition_format` column
format_expr = None
for (region_id, tour_id), fmt in format_map.items():
    condition = (col("region_id") == region_id) & (col("tournament_id") == tour_id)
    if format_expr is None:
        format_expr = when(condition, fmt)
    else:
        format_expr = format_expr.when(condition, fmt)
stage_dim = stage_dim.withColumn("competition_format", format_expr.otherwise("Unknown"))


# Build the `is_national_team_competition` column
is_national_expr = None
for (region_id, tour_id), is_nat in is_national_map.items():
    condition = (col("region_id") == region_id) & (col("tournament_id") == tour_id)
    if is_national_expr is None:
        is_national_expr = when(condition, is_nat)
    else:
        is_national_expr = is_national_expr.when(condition, is_nat)
stage_dim = stage_dim.withColumn("is_national_team_competition", is_national_expr.otherwise(None)) # Null if not mapped


# Build the `is_domestic_competition` column
is_domestic_expr = None
for (region_id, tour_id), is_dom in is_domestic_map.items():
    condition = (col("region_id") == region_id) & (col("tournament_id") == tour_id)
    if is_domestic_expr is None:
        is_domestic_expr = when(condition, is_dom)
    else:
        is_domestic_expr = is_domestic_expr.when(condition, is_dom)
stage_dim = stage_dim.withColumn("is_domestic_competition", is_domestic_expr.otherwise(None)) # Null if not mapped


# It's good practice to check for any tournaments not in your map
unmapped_tournaments = stage_dim.filter(col("competition_format") == "Unknown")
if unmapped_tournaments.count() > 0:
    print("!!! WARNING: Found tournaments not in the hardcoded maps !!!")
    unmapped_tournaments.select("region_id", "region_name", "tournament_id", "tournament_name").distinct().show()


print("--- Final, Clean Stage Dimension (Hardcoded with Composite Keys) ---")
stage_dim.select(
    "tournament_name", 
    "competition_format", 
    "is_national_team_competition",
    "is_domestic_competition"
).distinct().show(50, truncate=False)

--- Final, Clean Stage Dimension (Hardcoded with Composite Keys) ---
+----------------+------------------+----------------------------+-----------------------+
|tournament_name |competition_format|is_national_team_competition|is_domestic_competition|
+----------------+------------------+----------------------------+-----------------------+
|FIFA World Cup  |Tournament        |true                        |false                  |
|Champions League|Tournament        |false                       |false                  |
|Premier League  |League            |false                       |true                   |
|Bundesliga      |League            |false                       |true                   |
+----------------+------------------+----------------------------+-----------------------+



In [35]:
player_match_stats.filter(
    col("extra_time_goals") > 0
).show()



+--------+---------+-----------------+---------------+--------------+------------+----------------+-------+------------------+------------------+----------------+----------------+---------------+----------+----------------------+---------------+-------------+---------------+---------+---------------+----------------+-------------------+--------------------+------------+-------------------+---------+------------+---------------+----------+----------+---------------------+-------------+--------------------+-------------+----------------+-------------------+----------------+----------------------+-------------------+-----------+---------------------+----------------------+--------------------+-------------------+------------+------------------+-----------+-----------------+-----------------+-----------------+------------------+-------------+----------------+----------+-----------------------+-----------------+-----------+----------------+-------------+----------------+--------------------

                                                                                

In [47]:
ts_rules = CONFIG['gold_layer']['team_season_stats']
ts_agg_expressions = get_expressions("gold_layer", "team_season_stats")
ts_select_expressions = get_select_expressions("gold_layer", "team_season_stats")

team_season_stats = (
    team_match_stats
    .groupBy(*ts_rules['group_by_cols'])
    .agg(*ts_agg_expressions)
    .select(*ts_select_expressions)
)

In [48]:
team_season_stats.filter(
    col("team_id") == 65
).show()



+----------------+-------------+-----------+---------+-------+---------+-------+----+-----+------+---------+-------------+------------+----------------+-------+------------------+------------------+----------------+----------------+---------------+----------+----------------------+---------------+-------------+---------------+------------------+---------------+----------------+-------------------+--------------------+------------+-------------------+---------+------------+---------------+----------+---------------------+-------------+--------------------+-------------+----------------+----------------+-------------------+-------------------+----------------------+-----------+---------------------+----------------------+--------------------+-------------------+------------+------------------+-----------+-----------------+-----------------+-----------------+------------------+-------------+----------------+----------+-----------------------+-----------------+-----------+----------------+

                                                                                

In [62]:
team_season_group_by = [
    "team_id", 
    "season_id"
]
team_season_agg = [c if c != "own_goals" else "own_goals_against" for c in agg_cols]
team_season_attributes = [
    "team_name",
    "region_name",
    "region_id",
    "tournament_name",
    "tournament_id",
    "season_name",
]
team_season_other_agg = [
    sf.avg(col("possession_percentage")).alias("possession_percentage"),
    sf.count("*").alias("matches")]

In [63]:
team_season_stats = aggregate_stats(
    df=team_match_stats,
    group_by_cols=team_season_group_by,
    sum_cols=team_season_agg,
    attribute_cols=team_season_attributes,
    other_agg_exprs=team_season_other_agg
)

final_team_season_cols = [
    "tournament_name",
    "tournament_id",
    "season_name",
    "season_id",
    "team_id",
    "team_name",

    "matches",

    # Goals and Assists
    "goals_scored",
    "extra_time_goals",
    "assists",
    "goals_six_yard_box",
    "goals_penalty_area",
    "goals_out_of_box",
    "goals_right_foot",
    "goals_left_foot",
    "goals_head",
    "goals_other_body_parts",
    "goals_open_play",
    "goals_counter",
    "goals_set_piece",
    "own_goals_against",
    "penalties_taken",
    "penalties_scored",
    "pso_penalties_taken",
    "pso_penalties_scored",

    # Cards
    "yellow_cards",
    "second_yellow_cards",
    "red_cards",

    # Passes
    "total_passes",
    "accurate_passes",
    "key_passes",

    "possession_percentage",
    "total_touches",
    "unsuccessful_touches",
    
    "total_crosses",
    "accurate_crosses",
    "total_long_balls",
    "accurate_long_balls",
    "total_through_balls",
    "accurate_through_balls",

    # Shots
    "total_shots",
    "total_shots_on_target",
    "total_shots_off_target",
    "total_woodwork_shots",
    "total_shots_blocked",

    # Dribbles
    "dribbles_won",
    "dribbles_attempted",

    # Aerials
    "aerials_won",
    "offensive_aerials",
    "defensive_aerials",

    # Tackles
    "tackles_attempted",
    "successful_tackles",
    "dribbled_past",
    "last_man_tackles",
    "clearances",
    "clearances_off_the_line",
    "interceptions_won",

    # Corners
    "corners_won",
    "corners_conceded",
    "corners_taken",
    "accurate_corners",
    "corners_leading_to_goal",

    # Dispossessed
    "dispossessed",
    "fouls_committed",
    "was_fouled",
    "errors",
    "errors_lead_to_goal",
    "offsides",

    # Goalkeeping stats
    "total_saves",
    "collected_saves",
    "parried_saves",
    "parried_danger_saves",
    "successful_claims",
    "total_claims",
    "official_penalties_faced",
    "official_penalties_saved",
    "official_penalties_conceded",
    "pso_penalties_faced",
    "pso_penalties_saved",
    "pso_penalties_conceded",
]

In [64]:
team_season_stats = team_season_stats.select(
    *final_team_season_cols
)

In [66]:
team_season_stats.orderBy(col("goals_scored").desc()).show(50)



+---------------+-------------+-----------+---------+-------+-----------------+-------+------------+----------------+-------+------------------+------------------+----------------+----------------+---------------+----------+----------------------+---------------+-------------+---------------+-----------------+---------------+----------------+-------------------+--------------------+------------+-------------------+---------+------------+---------------+----------+---------------------+-------------+--------------------+-------------+----------------+----------------+-------------------+-------------------+----------------------+-----------+---------------------+----------------------+--------------------+-------------------+------------+------------------+-----------+-----------------+-----------------+-----------------+------------------+-------------+----------------+----------+-----------------------+-----------------+-----------+----------------+-------------+----------------+-----

                                                                                

In [58]:
match.filter(
    col("home_score").isNull()
).show()

+--------+--------+------------+------------+-------------------+----------+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
|match_id|stage_id|home_team_id|away_team_id|     start_time_utc|home_score|away_score|home_ht_score|away_ht_score|home_ft_score|away_ft_score|home_et_score|away_et_score|home_pk_score|away_pk_score|
+--------+--------+------------+------------+-------------------+----------+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
| 1549723|   19793|         167|          14|2021-12-26 15:00:00|      NULL|      NULL|         NULL|         NULL|         NULL|         NULL|         NULL|         NULL|         NULL|         NULL|
| 1643149|   21026|         282|          42|2023-01-21 17:30:00|      NULL|      NULL|         NULL|         NULL|         NULL|         NULL|         NULL|         NULL|         NULL|         NULL|


In [44]:
import pyspark.sql.functions as sf

temp = events_with_flags_df.filter(
    (col("match_id") == 1697443)
    # & (col("type_display_name") == "pass")
        &col("is_pass_attempt") &
    # ~col("is_cross") &
    # ~col("is_keeper_throw") &
    ~col("is_throw_in")
    # (col("team_id") == 341 )
    # (col("type_display_name") == "interception")
    # col("is_interception_won")
    # & ~col("is_parried_safe") & ~col("is_parried_danger") & ~col("is_collected")
    # & ~col("is_outfielder_block")
    # & col("is_keeper_penalty_conceded")
    # (col("type_display_name") == "ball_touch") &
    # ~col("is_successful")
    # & (col("period_value") == 5)
    # & 
    # & (sf.array_contains(col("qualifiers_values"), qualifier_map["second_yellow"]))
    & (col("player_id") == 279423) 
    # & col("is_successful")
    # (col("type_display_name") == "goal")
    # # & (col("second") == 0)
    #sf.array_contains(col("satisfied_events_types_names"), "touches")
    #  (sf.array_contains(col("qualifiers_values"), qualifier_map["pass_end_x"]))
    #  & ~(sf.array_contains(col("satisfied_events_types"), event_type_map["goal_penalty_area"]))
).select(
    "match_id",
    "player_id",
    "_event_id",
    "team_id",
    "minute",
    "second",
    "period_value",
    "type_display_name",
    "is_successful",
    "qualifiers_display_names",
    "satisfied_events_types_names"
)

print(temp.count())
temp.show(n=3000, truncate=False)

93
+--------+---------+---------+-------+------+------+------------+-----------------+-------------+-------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|match_id|player_id|_event_id|team_id|minute|second|period_value|type_display_name|is_successful|qualifiers_display_names                   |satisfied_events_types_names                                                                                                                                          |
+--------+---------+---------+-------+------+------+------------+-----------------+-------------+-------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1697443 |279423   |20       |335    |0     |40    |1           |pass 

In [12]:
player.printSchema()
stage.printSchema()
team.printSchema()

root
 |-- player_id: integer (nullable = true)
 |-- player_name: string (nullable = true)

root
 |-- region_id: integer (nullable = true)
 |-- region_name: string (nullable = true)
 |-- tournament_id: integer (nullable = true)
 |-- tournament_name: string (nullable = true)
 |-- season_id: integer (nullable = true)
 |-- season_name: string (nullable = true)
 |-- stage_name: string (nullable = true)
 |-- league: string (nullable = true)
 |-- season: string (nullable = true)
 |-- stage_id: integer (nullable = true)

root
 |-- team_id: integer (nullable = true)
 |-- team_name: string (nullable = true)
 |-- country_name: string (nullable = true)



In [43]:
event.groupBy("type_display_name").agg(
    sf.count("*").alias("a")
).orderBy(col("a").desc()).show(5000)

+-----------------+-------+
|type_display_name|      a|
+-----------------+-------+
|             pass|7742057|
|    ball_recovery| 780170|
|           aerial| 494113|
|       ball_touch| 465255|
|             foul| 357450|
|        clearance| 339010|
|          take_on| 289991|
|           tackle| 263051|
|     interception| 179783|
|   corner_awarded| 154572|
|     dispossessed| 147466|
|        challenge| 138190|
|     blocked_pass| 123860|
|    keeper_pickup|  99358|
|       saved_shot|  99240|
|             save|  98106|
|     missed_shots|  72997|
|  substitution_on|  52834|
| substitution_off|  52834|
|              end|  46325|
|            start|  30946|
|     offside_pass|  30074|
|    offside_given|  30074|
| offside_provoked|  30074|
|             card|  29678|
|             goal|  22733|
| formation_change|  16152|
|    formation_set|  15386|
|            claim|  12440|
|  shield_ball_opp|   7315|
|            error|   6970|
|   keeper_sweeper|   6890|
|            punch| 

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.window import Window

# Sort events properly within each match
w = Window.partitionBy("match_id").orderBy("_event_id")

# Step 1: Mark corner taken events
events_with_corners = event.withColumn(
    "corner_taker",
    sf.when(sf.array_contains(col("qualifiers_values"), qualifier_map["corner_taken"]), col("player_id"))
)

# Step 2: Forward-fill the corner_taker to next events until next corner
events_with_corners = events_with_corners.withColumn(
    "current_corner_taker",
    sf.last("corner_taker", ignorenulls=True).over(w)
)

# Step 3: Identify goals from corners
events_with_corners = events_with_corners.withColumn(
    "corner_goal",
    sf.when(
        (sf.array_contains(col("qualifiers_values"), qualifier_map["from_corner"])) & 
        (col("type_display_name") == "goal"),
        1
    ).otherwise(0)
)

# Step 4: Aggregate by the actual corner taker
corner_stats = (
    events_with_corners
    .filter(col("match_id") == 1694368)
    # .groupBy("current_corner_taker")
    # .agg(
    #     sf.count("corner_taker").alias("corners_taken"),
    #     sf.sum("corner_goal").alias("corners_leading_to_goal")
    # )
    # .withColumnRenamed("current_corner_taker", "player_id")
)

corner_stats.show(10000)


+--------+---------+--------+-------+---------+------+------+------------+-------------------+-----------------+-------------+-----+-----+-----+-----+--------------------+----------------------+--------------------+---------+--------+------------+--------------------+-----------+
|match_id|_event_id|event_id|team_id|player_id|minute|second|period_value|period_display_name|type_display_name|is_successful|    x|    y|end_x|end_y|     qualifiers_list|satisfied_events_types|              league|   season|stage_id|corner_taker|current_corner_taker|corner_goal|
+--------+---------+--------+-------+---------+------+------+------------+-------------------+-----------------+-------------+-----+-----+-----+-----+--------------------+----------------------+--------------------+---------+--------+------------+--------------------+-----------+
| 1694368|        1|       2|   7614|     NULL|     0|     0|           1|         first_half|            start|         true|  0.0|  0.0| NULL| NULL|       

In [None]:
import pyspark.sql.functions as sf
from pyspark.sql.functions import col

event.filter(
    (col("match_id") == 1729454) 
    # (col("player_id") == 367185) 
    # (col("type_display_name") == "pass") &
    # # ~(sf.array_contains(col("qualifiers_values"), qualifier_map["second_yellow"])) &
    # ~(sf.array_contains(col("qualifiers_values"), qualifier_map["red"])) &
    # ~(sf.array_contains(col("qualifiers_values"), qualifier_map["cross"]))
    # (col("type_display_name") == "pass")
    # (col("period_value") > 2)
).groupBy("player_id").agg(
    sf.count(sf.when(col("type_display_name") == "dispossessed", True)).alias("dispossessed"),
    sf.count(
        sf.when(
            (col("type_display_name") == "foul") &
            ~(col("is_successful")), 
            True
        )
    ).alias("fouls"),
    sf.count(
        sf.when(
            (col("type_display_name") == "foul") &
            (col("is_successful")), 
            True
        )
    ).alias("was_fouled"),
    sf.count(sf.when(col("type_display_name") == "error", True)).alias("errors_lead_to_goal"),
    sf.count(sf.when(col("type_display_name") == "offside_given", True)).alias("offsides"),
    # sf.count(sf.when(sf.array_contains(col("qualifiers_values"), qualifier_map["keeper_throw"]), True)).alias("corners_taken"),
    # sf.count(
    #     sf.when(
    #         (sf.array_contains(col("qualifiers_values"), qualifier_map["throw_in"])) &
    #         (col("is_successful")), True
    #     )
    # ).alias("accurate_corners"),
    # sf.count(
    #     sf.when(
    #         (sf.array_contains(col("qualifiers_values"), qualifier_map["longball"])) &
    #         (col("type_display_name") == "goal"), True
    #     )
    # ).alias("corners_leading_to_goal")
# ).select(
#     "_event_id",
#     "minute",
#     "second",
#     "period_value",
#     "type_display_name",
#     "is_successful",
#     "qualifiers_list",
#     "satisfied_events_types"
).show(n=300, truncate=False)
# ).count()

+---------+------------+-----+----------+-------------------+--------+
|player_id|dispossessed|fouls|was_fouled|errors_lead_to_goal|offsides|
+---------+------------+-----+----------+-------------------+--------+
|247454   |2           |1    |1         |0                  |0       |
|279379   |2           |0    |2         |0                  |2       |
|NULL     |0           |0    |1         |0                  |0       |
|106968   |0           |0    |0         |0                  |0       |
|281049   |0           |0    |2         |0                  |0       |
|276366   |0           |0    |0         |0                  |0       |
|332867   |3           |0    |0         |0                  |0       |
|424462   |0           |0    |0         |0                  |0       |
|113994   |1           |0    |0         |0                  |0       |
|399490   |0           |0    |0         |0                  |0       |
|363686   |0           |0    |0         |0                  |0       |
|32203

In [None]:
from pyspark.sql.window import Window

window_spec = Window.partitionBy("match_id")

(
    player_stats
    .filter(col("match_id") == 1729454)
    # .withColumn(
    #     "total_possession",
    #     sf.sum(col("possession")).over(window_spec)
    # )
    # .withColumn(
    #     "possession_percentage",
    #     sf.round(col("possession") / col("total_possession") * 100, 2)
    # )
    # .filter(col("total_passes") > 10000)
    # .orderBy(col("possession_percentage").desc())
    .show(1000)
)

+--------+---------+-------------------+------------+---------------+----------+----------+-------------+-------------------+-------+-----------------+----------------+------------------+-----------+---------------+-----------+--------------+--------+
|match_id|player_id|        player_name|total_passes|accurate_passes|key_passes|possession|total_touches|     start_time_utc|team_id|        team_name|opposing_team_id|opposing_team_name|region_name|tournament_name|season_name|    stage_name|stage_id|
+--------+---------+-------------------+------------+---------------+----------+----------+-------------+-------------------+-------+-----------------+----------------+------------------+-----------+---------------+-----------+--------------+--------+
| 1729454|   273257|Oleksandr Zinchenko|         107|            100|         4|       111|          123|2024-01-30 19:30:00|    174|          Arsenal|             174| Nottingham Forest|    England| Premier League|  2023/2024|Premier League|  

In [None]:
(
    player_stats
    .groupBy("player_id", "player_name")
    .agg(
        sf.count("*").alias("games"),
        sf.sum("accurate_passes").alias("accurate_passes"),
        sf.sum("total_passes").alias("total_passes"),
        sf.sum("key_passes").alias("key_passes"),
    )
    .withColumn(
        "pass_success_percentage",
        sf.round(col("accurate_passes") / col("total_passes") * 100, 2)
    )
    # .filter(col("total_passes") > 10000)
    .orderBy(col("key_passes").desc())
    .show()
)



+---------+--------------------+-----+---------------+------------+----------+-----------------------+
|player_id|         player_name|games|accurate_passes|total_passes|key_passes|pass_success_percentage|
+---------+--------------------+-----+---------------+------------+----------+-----------------------+
|    73084|     Kevin De Bruyne|  347|          13660|       16703|       955|                  81.78|
|   283323|      Joshua Kimmich|  356|          23475|       25916|       740|                  90.58|
|    71824|         Pascal Groß|  313|          12420|       15363|       731|                  80.84|
|    37099|       Thomas Müller|  370|           9010|       11718|       654|                  76.89|
|   108226|       Mohamed Salah|  342|           7751|        9996|       604|                  77.54|
|    69344|   Christian Eriksen|  280|          10933|       13333|       604|                   82.0|
|   141556|       Julian Brandt|  351|          10352|       12582|      

                                                                                

In [None]:
import pyspark.sql.functions as sf

fct_match_events.select(
    "_event_id", "minute", "second", "qualifiers_list"
).filter(
    (col("match_id") == 1485191) &
    # (col("player_id") == 70483) &
    (col("type_display_name") == "Pass") &
    (sf.size(col("qualifiers_list")) == 5)
).orderBy("_event_id").show(truncate=False, n=70)

+---------+------+------+----------------------------------------------------------+
|_event_id|minute|second|qualifiers_list                                           |
+---------+------+------+----------------------------------------------------------+
|22       |2     |11    |[touches, passForward, passLeft, defensiveThird, pos]     |
|74       |4     |38    |[touches, throwIn, passBack, passLeft, midThird]          |
|114      |7     |33    |[touches, throwIn, passForward, passLeft, finalThird]     |
|159      |10    |58    |[touches, throwIn, passForward, passRight, finalThird]    |
|272      |17    |50    |[touches, throwIn, passForward, passLeft, finalThird]     |
|321      |22    |28    |[touches, passForward, passLeft, defensiveThird, pos]     |
|363      |25    |35    |[touches, throwIn, passBack, passRight, midThird]         |
|391      |26    |50    |[touches, passForward, passRight, midThird, pos]          |
|407      |27    |46    |[touches, throwIn, passBack, passRight, 

In [None]:
import pyspark.sql.functions as sf

fct_match_events.filter(
    (col("match_id") == 1485195) &
    # (col("player_id") == 20973) &
    (col("type_display_name") == "Pass") &
    (sf.size(col("qualifiers_list")) == 5)
).orderBy("_event_id").show(truncate=False, n=70)

+--------+---------+--------+------+------+-------+---------+----+-----+------------+-------------------+----------+-----------------+------------------+-------------------------+----------------------+----------------------+---------+--------+----------------------------------------------------------+
|match_id|_event_id|event_id|minute|second|team_id|player_id|x   |y    |period_value|period_display_name|type_value|type_display_name|outcome_type_value|outcome_type_display_name|satisfied_events_types|league                |season   |stage_id|qualifiers_list                                           |
+--------+---------+--------+------+------+-------+---------+----+-----+------------+-------------------+----------+-----------------+------------------+-------------------------+----------------------+----------------------+---------+--------+----------------------------------------------------------+
|1485195 |9        |6       |0     |41    |24     |136776   |10.5|100.0|1           |Fir

                                                                                

In [63]:
player_match_stats.cache()
player_match_stats.count() 

write_table_to_gold(
    df=player_match_stats,
    table_name="player_match_stats",
    primary_keys=["match_id", "player_id"]
)

write_table_to_gold(
    df=player_team_season_stats,
    table_name="player_team_season_stats",
    primary_keys=["season_id", "team_id", "player_id"]
)

write_table_to_gold(
    df=team_match_stats,
    table_name="team_match_stats",
    primary_keys=["match_id", "team_id"]
)


write_table_to_gold(
    df=team_season_stats,
    table_name="team_season_stats",
    primary_keys=["season_id", "team_id"]
)

player_match_stats.unpersist()

25/08/26 16:05:57 WARN CacheManager: Asked to cache already cached data.
                                                                                

--- Writing gold table: player_match_stats ---
  - Primary Keys: ['match_id', 'player_id']
  - Partition Columns: None


                                                                                

  - Successfully created and wrote data to player_match_stats.
--- Writing gold table: player_team_season_stats ---
  - Primary Keys: ['season_id', 'team_id', 'player_id']
  - Partition Columns: None


                                                                                

  - Successfully created and wrote data to player_team_season_stats.
--- Writing gold table: team_match_stats ---
  - Primary Keys: ['match_id', 'team_id']
  - Partition Columns: None


25/08/26 16:06:49 WARN DAGScheduler: Broadcasting large task binary with size 1024.1 KiB
                                                                                

  - Successfully created and wrote data to team_match_stats.
--- Writing gold table: team_season_stats ---
  - Primary Keys: ['season_id', 'team_id']
  - Partition Columns: None


25/08/26 16:06:56 WARN DAGScheduler: Broadcasting large task binary with size 1071.4 KiB
                                                                                

  - Successfully created and wrote data to team_season_stats.


DataFrame[match_id: int, player_id: int, player_name: string, is_first_eleven: boolean, minutes_played: int, goals_scored: bigint, extra_time_goals: bigint, assists: bigint, goals_six_yard_box: bigint, goals_penalty_area: bigint, goals_out_of_box: bigint, goals_right_foot: bigint, goals_left_foot: bigint, goals_head: bigint, goals_other_body_parts: bigint, goals_open_play: bigint, goals_counter: bigint, goals_set_piece: bigint, own_goals: bigint, penalties_taken: bigint, penalties_scored: bigint, pso_penalties_taken: bigint, pso_penalties_scored: bigint, yellow_cards: bigint, second_yellow_cards: bigint, red_cards: bigint, total_passes: bigint, accurate_passes: bigint, key_passes: bigint, possession: bigint, possession_percentage: double, total_touches: bigint, unsuccessful_touches: bigint, total_crosses: bigint, accurate_crosses: bigint, accurate_long_balls: bigint, total_long_balls: bigint, accurate_through_balls: bigint, total_through_balls: bigint, total_shots: bigint, total_shots_