#### Loading files and creating variables

In [1]:
import os
import json
import jsonlines
import pandas as pd
from tqdm import tqdm
import math
import glob

In [2]:
import json
import math
import os
import pandas as pd

# Function to count defenders inside the shooting cone
def count_defenders_in_cone(shooter_x, shooter_y, defenders, goal_left, goal_right):
    def point_in_triangle(pt, v1, v2, v3):
        def sign(p1, p2, p3):
            return (p1[0] - p3[0]) * (p2[1] - p3[1]) - (p2[0] - p3[0]) * (p1[1] - p3[1])
        b1 = sign(pt, v1, v2) < 0.0
        b2 = sign(pt, v2, v3) < 0.0
        b3 = sign(pt, v3, v1) < 0.0
        return b1 == b2 == b3

    cone_vertices = [(shooter_x, shooter_y), goal_left, goal_right]
    count = 0
    for d in defenders:
       if "x" in d and "y" in d and d["x"] is not None and d["y"] is not None:
        px, py = d["x"], d["y"]
        if all(v[0] is not None and v[1] is not None for v in cone_vertices):
            if point_in_triangle((px, py), *cone_vertices):
                count += 1
    return count

# Function to check if goalkeeper is in the shot path
def is_goalkeeper_in_path(shooter_x, shooter_y, gk_x, gk_y, goal_x, goal_y, tolerance=5):
    if None in (shooter_x, shooter_y, gk_x, gk_y):
        return False
    vec_shot = (goal_x - shooter_x, goal_y - shooter_y)
    vec_gk = (gk_x - shooter_x, gk_y - shooter_y)
    cross = abs(vec_shot[0] * vec_gk[1] - vec_shot[1] * vec_gk[0])
    distance = math.hypot(*vec_shot)
    lateral_offset = cross / distance if distance else float('inf')
    return lateral_offset <= tolerance

# Function to classify field zone in 6 areas
def get_field_zone_6(x, y, attacking_right=True):
    if x is None or y is None:
        return None
    near = x > 75 if attacking_right else x < 25
    if y < 33.3:
        horiz = "left"
    elif y < 66.6:
        horiz = "center"
    else:
        horiz = "right"
    return f"{horiz}_{'near' if near else 'far'}"

# Load mappings
with open(r"C:\Users\isabe\OneDrive\Desktop\Corporate project files\mappings/player_event_id_to_tracking_id.json", "r") as f:
    player_map = json.load(f)

with open(r'C:\Users\isabe\OneDrive\Desktop\Corporate project files\mappings/team_event_id_to_tracking_id.json') as f:
    team_map = json.load(f)

all_shots = []

# Process all matches
for filename in os.listdir(r"C:\Users\isabe\OneDrive\Desktop\Corporate project files\shots"):
    if filename.endswith(".json"):
        match_id = filename.replace(".json", "")
        shots_path = rf"C:\Users\isabe\OneDrive\Desktop\Corporate project files\shots\{match_id}.json"
        tracking_path = rf"C:\Users\isabe\OneDrive\Desktop\Corporate project files\jsonls\{match_id}_tracking_data.jsonl"

        if not os.path.exists(tracking_path):
            print(f"Missing tracking file for match {match_id}")
            continue

        with open(shots_path, "r") as f:
            shots_data = json.load(f)

        with open(tracking_path, "r") as f:
            metadata = json.loads(f.readline())
            players_data = metadata["players_data"]
            teams_data = metadata["teams_data"]

        team_name_to_tracking_id = {}
        for tracking_id, team_role in zip(players_data.keys(), ["home", "away"]):
            team_name = teams_data[team_role]["name"]
            team_name_to_tracking_id[team_name.lower()] = tracking_id

        tracking_frames = []
        with open(tracking_path, "r") as f:
            next(f)
            for line in f:
                obj = json.loads(line)
                if "Videotimestamp" in obj:
                    tracking_frames.append(obj)

        frame_times = [f["Videotimestamp"] for f in tracking_frames]

        for shot in shots_data:
            shot_ts = float(shot["videoTimestamp"])
            idx = min(range(len(frame_times)), key=lambda i: abs(frame_times[i] - shot_ts))
            closest_frame = tracking_frames[idx]

            player_event_id = str(shot["player"]["id"])
            team_event_id = str(shot["team"]["id"])
            opp_team_event_id = str(shot["opponentTeam"]["id"])

            player_tracking_id = player_map.get(player_event_id)
            team_tracking_id = team_map.get(team_event_id)
            opp_team_tracking_id = team_map.get(opp_team_event_id)

            ball_x = ball_y = None
            if player_tracking_id and str(team_tracking_id) in closest_frame["data"]:
                for player in closest_frame["data"][str(team_tracking_id)]:
                    if player["id"] == player_tracking_id:
                        ball_x = player["x"]
                        ball_y = player["y"]
                        break

            goalkeeper_id = None
            team_id_key = str(opp_team_tracking_id)
            for pid, info in players_data[team_id_key].items():
                if info["position"] == "GK":
                    goalkeeper_id = int(pid)
                    break

            goalkeeper_x = goalkeeper_y = None
            if goalkeeper_id:
                for player in closest_frame["data"][team_id_key]:
                    if player["id"] == goalkeeper_id:
                        goalkeeper_x = player["x"]
                        goalkeeper_y = player["y"]
                        break

            if goalkeeper_x is not None and goalkeeper_y is not None and ball_x is not None and ball_y is not None:
                dx = ball_x - goalkeeper_x
                dy = ball_y - goalkeeper_y
                distance_to_goalkeeper = (dx**2 + dy**2)**0.5
            else:
                distance_to_goalkeeper = None

            if ball_x is not None and ball_x > 50:
                x_goal, y_goal = 105, 50
            else:
                x_goal, y_goal = 0, 50

            if ball_x is not None and ball_y is not None:
                distance_to_goal = ((x_goal - ball_x)**2 + (y_goal - ball_y)**2)**0.5
            else:
                distance_to_goal = None

            goal_x = x_goal
            goal_left_y = 30.34
            goal_right_y = 37.66

            if x_goal == 105:
                goal_left = (105, goal_left_y)
                goal_right = (105, goal_right_y)
            else:
                goal_left = (0, goal_right_y)
                goal_right = (0, goal_left_y)

            if ball_x is not None and ball_y is not None:
                a = math.hypot(goal_x - ball_x, goal_left_y - ball_y)
                b = math.hypot(goal_x - ball_x, goal_right_y - ball_y)
                c = goal_right_y - goal_left_y
                try:
                    angle_to_goal = math.acos((a**2 + b**2 - c**2) / (2 * a * b))
                except:
                    angle_to_goal = None
            else:
                angle_to_goal = None

            angle_to_goal_deg = math.degrees(angle_to_goal) if angle_to_goal is not None else None

            goalkeeper_in_shot_path = is_goalkeeper_in_path(
                shooter_x=ball_x,
                shooter_y=ball_y,
                gk_x=goalkeeper_x,
                gk_y=goalkeeper_y,
                goal_x=x_goal,
                goal_y=34,
                tolerance=5
            )

            opponent_tracking_id = str(opp_team_tracking_id)
            num_defenders_nearby = 0
            if opponent_tracking_id in closest_frame["data"]:
                for player in closest_frame["data"][opponent_tracking_id]:
                    if ball_x is not None and ball_y is not None and "x" in player and "y" in player:
                        px, py = player["x"], player["y"]
                        distance = math.hypot(ball_x - px, ball_y - py)
                        if distance <= 5:
                            num_defenders_nearby += 1

            shooter_x, shooter_y = ball_x, ball_y
            opponent_players = [
                p for p in closest_frame["data"].get(opponent_tracking_id, [])
                if p["id"] != goalkeeper_id
            ]

            try:
                defenders_in_cone = count_defenders_in_cone(
                    shooter_x, shooter_y,
                    opponent_players,
                    goal_left=goal_left,
                    goal_right=goal_right
                )
            except:
                defenders_in_cone = None

            defenders_in_box = 0
            for p in opponent_players:
                if "x" in p and "y" in p:
                    px, py = p["x"], p["y"]
                    if x_goal == 105:
                        in_area = px > 83.5 and 18 <= py <= 82
                    else:
                        in_area = px < 16.5 and 18 <= py <= 82
                    if in_area:
                        defenders_in_box += 1

            attacking_right = x_goal == 105
            field_zone_6 = get_field_zone_6(ball_x, ball_y, attacking_right=attacking_right)


            # GoalKeeper in shooting cone
            if goalkeeper_id is not None:
                goalkeeper_player = next(
                    (p for p in closest_frame["data"].get(opponent_tracking_id, []) if p["id"] == goalkeeper_id and "x" in p and "y" in p),
                    None
                )
                if goalkeeper_player:
                    gk_in_cone = count_defenders_in_cone(
                        shooter_x, shooter_y,
                        [goalkeeper_player],  
                        goal_left=goal_left,
                        goal_right=goal_right
                    ) > 0
                else:
                    gk_in_cone = None
            else:
                gk_in_cone = None


            # Count attackers in shooting cone
            shooter_x, shooter_y = ball_x, ball_y
            team_players = closest_frame["data"].get(team_tracking_id, [])
            try:
                attackers_in_cone = count_defenders_in_cone(
                    shooter_x, shooter_y,
                    team_players,
                    goal_left=goal_left,
                    goal_right=goal_right
                )
            except:
                attackers_in_cone = None

            # Distance category
            if distance_to_goal is not None:
                if distance_to_goal < 10:
                    distance_category = "very_close"
                elif distance_to_goal < 20:
                    distance_category = "close"
                elif distance_to_goal < 30:
                    distance_category = "medium"
                else:
                    distance_category = "far"
            else:
                distance_category = None

            # Angle category
            if angle_to_goal_deg is not None:
                if angle_to_goal_deg < 10:
                    angle_category = "narrow"
                elif angle_to_goal_deg < 25:
                    angle_category = "medium"
                else:
                    angle_category = "wide"
            else:
                angle_category = None

            # Pressure score
            pressure_score = (
                (num_defenders_nearby or 0) +
                (defenders_in_cone or 0) +
                (defenders_in_box or 0) +
                (1 if goalkeeper_in_shot_path else 0)
            )

            # Header or not
            bodypart = shot["shot"]['bodyPart']
            if bodypart is not None:
                header_or_not = bodypart.lower() == "head_or_other"
            else:
                header_or_not = None


            # Recoger jugadores de ambos equipos (excluyendo porteros)
            attack_team_id = str(team_tracking_id)
            defense_team_id = str(opp_team_tracking_id)

            # Excluir portero del equipo rival (ya lo tienes como goalkeeper_id)
            attacking_players = [
                p for p in closest_frame["data"].get(attack_team_id, [])
                if p.get("id") != player_tracking_id  # opcional: excluir el propio tirador
            ]

            defending_players = [
                p for p in closest_frame["data"].get(defense_team_id, [])
                if p.get("id") != goalkeeper_id
            ]

            # Combinar todos los jugadores (excepto porteros)
            all_field_players = attacking_players + defending_players

            # Calcular número de jugadores (atacantes + defensores) dentro del cono
            try:
                people_in_cone = count_defenders_in_cone(
                    shooter_x, shooter_y,
                    all_field_players,
                    goal_left=goal_left,
                    goal_right=goal_right
                )
            except:
                people_in_cone = None


            # Distance from goalkeeper to center of goal
            if goalkeeper_x is not None and goalkeeper_y is not None:
                distance_to_center_goal = math.hypot(goal_x - goalkeeper_x, 34 - goalkeeper_y)
            else:
                distance_to_center_goal = None


            # Compute goalkeeper's angle to the goal (angle visible from GK's position)
            if goalkeeper_x is not None and goalkeeper_y is not None:
                a_gk = math.hypot(goal_x - goalkeeper_x, goal_left_y - goalkeeper_y)
                b_gk = math.hypot(goal_x - goalkeeper_x, goal_right_y - goalkeeper_y)
                c_gk = abs(goal_right_y - goal_left_y)  # vertical width of goal

                try:
                    goalkeeper_angle_to_goal = math.acos((a_gk**2 + b_gk**2 - c_gk**2) / (2 * a_gk * b_gk))
                    goalkeeper_angle_to_goal_deg = math.degrees(goalkeeper_angle_to_goal)
                except:
                    goalkeeper_angle_to_goal = None
                    goalkeeper_angle_to_goal_deg = None
            else:
                goalkeeper_angle_to_goal = None
                goalkeeper_angle_to_goal_deg = None

            all_shots.append({
                "period": shot['matchPeriod'],
                "minute": shot['minute'],
                "second": shot['second'],
                "ball_x": ball_x,
                "ball_y": ball_y,
                "videoTimestamp": shot_ts,
                "Frame": closest_frame['frame'],
                "player_id": player_tracking_id,
                "team_id": team_tracking_id,
                "opp_team_id": opp_team_tracking_id,
                "position": shot['player']['position'],
                "bodypart": shot["shot"]['bodyPart'],
                "isGoal": shot["shot"]["isGoal"],
                "on_target": shot["shot"]['onTarget'],
                "xg": shot["shot"]["xg"],
                "xg2": shot["shot"]["xg2"],
                "period_frame": closest_frame["period"],
                "data": closest_frame['data'],
                "goalkeeper_x": goalkeeper_x,
                "goalkeeper_y": goalkeeper_y,
                "distance_to_goalkeeper": distance_to_goalkeeper,
                "distance_to_goal": distance_to_goal,
                "angle_to_goal": angle_to_goal,
                "angle_to_goal_degrees": angle_to_goal_deg,
                "num_defenders_nearby": num_defenders_nearby,
                "poss_duration": shot['possession']['duration'],
                "poss_start_x":shot['possession']['startLocation']['x'],
                "poss_start_y":shot['possession']['startLocation']['y'],
                "goalkeeper_in_shot_path":goalkeeper_in_shot_path,
                "defenders_in_box":defenders_in_box,
                "field_zone_6": field_zone_6,
                "distance_category": distance_category,
                "angle_category": angle_category,
                "pressure_score": pressure_score,
                "header": header_or_not,
                "defenders_in_cone": defenders_in_cone,
                "attackers_in_cone":attackers_in_cone,
                "goalkeeper_in_cone": gk_in_cone,
                "distance_to_center_goal": distance_to_center_goal,
                "goalkeeper_angle_to_goal_degrees": goalkeeper_angle_to_goal_deg,
            })


In [3]:
all_shots_df = pd.DataFrame(all_shots)

#### Imports

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder, PolynomialFeatures, LabelEncoder
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import chi2

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, export_graphviz, plot_tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR

from sklearn.metrics import (
    mean_squared_error, r2_score, brier_score_loss, roc_auc_score,
    log_loss, confusion_matrix, accuracy_score, recall_score
)
from sklearn.calibration import calibration_curve

import statsmodels.api as sm
from xgboost import XGBRegressor
import lightgbm as lgb

from pydotplus import graph_from_dot_data
from IPython.display import Image
import seaborn as sns


#### Data Cleaning & Processing

In [5]:
excluded_features = [
    'Frame','player_id', 'team_id', 'opp_team_id', 'videoTimestamp', 'period_frame',
    'period', 'minute', 'second',    'ball_x', 'ball_y', 'goalkeeper_x', 'goalkeeper_y',
     'poss_start_y','xg', 'xg2', 'position',
    'data','on_target','angle_to_goal',
]

In [6]:
df = all_shots_df.drop(columns=excluded_features)
df_info = df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5668 entries, 0 to 5667
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   bodypart                          5668 non-null   object 
 1   isGoal                            5668 non-null   bool   
 2   distance_to_goalkeeper            5631 non-null   float64
 3   distance_to_goal                  5664 non-null   float64
 4   angle_to_goal_degrees             5664 non-null   float64
 5   num_defenders_nearby              5668 non-null   int64  
 6   poss_duration                     5668 non-null   object 
 7   poss_start_x                      5668 non-null   int64  
 8   goalkeeper_in_shot_path           5668 non-null   bool   
 9   defenders_in_box                  5668 non-null   int64  
 10  field_zone_6                      5664 non-null   object 
 11  distance_category                 5664 non-null   object 
 12  angle_

Unnamed: 0,bodypart,isGoal,distance_to_goalkeeper,distance_to_goal,angle_to_goal_degrees,num_defenders_nearby,poss_duration,poss_start_x,goalkeeper_in_shot_path,defenders_in_box,field_zone_6,distance_category,angle_category,pressure_score,header,defenders_in_cone,attackers_in_cone,goalkeeper_in_cone,distance_to_center_goal,goalkeeper_angle_to_goal_degrees
0,left_foot,False,21.639968,36.878178,13.504951,1,12.364487,60,False,4,left_near,far,medium,5,False,0,0,False,36.050501,11.59394
1,right_foot,False,19.939077,31.733673,17.597509,1,23.977938,55,True,4,left_near,far,medium,6,False,0,0,True,3.337664,95.983338
2,right_foot,False,2.851105,22.838838,27.170209,2,3.3924945,75,True,8,left_near,medium,wide,12,False,1,0,False,17.895723,23.099078
3,left_foot,False,14.677619,27.550717,22.952926,2,0.6914875,81,True,9,left_near,medium,medium,13,False,1,0,False,3.8048,87.264049
4,left_foot,False,20.175431,31.017337,18.332747,1,12.93257,44,True,7,left_near,far,medium,10,False,1,0,True,2.209072,128.168623


In [7]:
# DF copy
df_model = df.copy()

# Check the balance of the target variable
goal_counts = df_model['isGoal'].value_counts(normalize=True)

# Display the percentage of goals vs no goals and the number of nulls per column
missing_data = df_model.isnull().sum().sort_values(ascending=False)

goal_counts, missing_data.head(10)

(isGoal
 False    0.88091
 True     0.11909
 Name: proportion, dtype: float64,
 distance_to_goalkeeper              37
 goalkeeper_angle_to_goal_degrees    33
 distance_to_center_goal             33
 goalkeeper_in_cone                  33
 distance_category                    4
 angle_category                       4
 field_zone_6                         4
 angle_to_goal_degrees                4
 distance_to_goal                     4
 poss_start_x                         0
 dtype: int64)

In [8]:
df= df_model.dropna()

In [9]:
features = [
    'distance_to_goal',
    'angle_to_goal_degrees',
    'distance_to_goalkeeper',
    'goalkeeper_angle_to_goal_degrees',
    'distance_to_center_goal',
    'num_defenders_nearby',
    'defenders_in_box',
    'pressure_score',
    'defenders_in_cone',
    'poss_start_x',
    'poss_duration',
    'attackers_in_cone',

    'distance_category',
    'angle_category',
    'field_zone_6',
    'bodypart',

    'goalkeeper_in_shot_path',
    'goalkeeper_in_cone',
    'header'


    ]

target = 'isGoal'

In [10]:
df['goalkeeper_in_cone'] = df['goalkeeper_in_cone'].astype(bool)

# Recalculate numerical variables based on the currently used features
categorical_features = ['field_zone_6', 'bodypart', 'distance_category', 'angle_category']
boolean_features = ['goalkeeper_in_shot_path', 'header', 'goalkeeper_in_cone']
numerical_features = list(set(features) - set(categorical_features) - set(boolean_features))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['goalkeeper_in_cone'] = df['goalkeeper_in_cone'].astype(bool)


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5631 entries, 0 to 5667
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   bodypart                          5631 non-null   object 
 1   isGoal                            5631 non-null   bool   
 2   distance_to_goalkeeper            5631 non-null   float64
 3   distance_to_goal                  5631 non-null   float64
 4   angle_to_goal_degrees             5631 non-null   float64
 5   num_defenders_nearby              5631 non-null   int64  
 6   poss_duration                     5631 non-null   object 
 7   poss_start_x                      5631 non-null   int64  
 8   goalkeeper_in_shot_path           5631 non-null   bool   
 9   defenders_in_box                  5631 non-null   int64  
 10  field_zone_6                      5631 non-null   object 
 11  distance_category                 5631 non-null   object 
 12  angle_categ

In [12]:
features = [
    'distance_to_goal',
    'angle_to_goal_degrees',
    'distance_to_goalkeeper',
    'goalkeeper_angle_to_goal_degrees',
    'distance_to_center_goal',
    'num_defenders_nearby',
    'defenders_in_box',
    'defenders_in_cone',
    'poss_start_x',
    'poss_duration',
    'attackers_in_cone',

    'distance_category',
    'angle_category',

    'goalkeeper_in_shot_path',
    'goalkeeper_in_cone',



    ]

In [13]:
# Recalculate numerical variables based on the currently used features
categorical_features = [ 'distance_category', 'angle_category']
boolean_features = ['goalkeeper_in_shot_path', 'goalkeeper_in_cone']
numerical_features = list(set(features) - set(categorical_features) - set(boolean_features)- set(['pressure_score']))

In [14]:
# Separate features and target
X = df.drop([target], axis=1)
y = df[target]

# Full list of categorical features (including booleans treated as categorical)
categorical_all = categorical_features + boolean_features

# Remaining features are numerical
numerical_final = [col for col in features if col not in categorical_all]

# Column transformer: one-hot encode categoricals, passthrough numericals
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_final),
        ('cat', OneHotEncoder(drop='first'), categorical_all)
    ]
)

# Apply transformation
X_processed = preprocessor.fit_transform(X)

# Train-test split with stratification to preserve goal ratio
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42, stratify=y
)

X_train.shape, X_test.shape, y_train.mean(), y_test.mean()

((3941, 18), (1690, 18), 0.11849784318700837, 0.11834319526627218)

#### Model Training

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Pipeline for LightGBM Regression
# LightGBM is generally not sensitive to feature scaling.
# However, the preprocessor with OneHotEncoder is still needed for categorical features.
reg_pipeline_lgbm = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', 'passthrough', numerical_features), # Passthrough numerical features for LightGBM
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_all) # Use handle_unknown='ignore' for OneHotEncoder
    ], remainder='passthrough')), # Keep other columns
    ('regressor', lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Train the pipeline with LGBMRegressor
# Using the same X_train, y_train, X_test, y_test split from previous steps
reg_pipeline_lgbm.fit(X_train, y_train)

# Predict on the test set
y_pred_lgbm = np.clip(reg_pipeline_lgbm.predict(X_test), 0, 1)

# Evaluate the LightGBM model
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
rmse_lgbm = np.sqrt(mse_lgbm)
brier_lgbm = brier_score_loss(y_test, y_pred_lgbm)

# Evaluate the model using the metric sum(xG) vs sum(isGoal)
sum_lgbm = y_pred_lgbm.sum()
sum_actual_goals_lgbm = y_test.sum()
abs_error_lgbm = abs(sum_lgbm - sum_actual_goals_lgbm)

{
    "LightGBM RMSE": rmse_lgbm,
    "LightGBM Brier Score": brier_lgbm,
    "LightGBM Predicted sum xG": sum_lgbm,
    "LightGBM Actual sum Goals": sum_actual_goals_lgbm,
    "LightGBM Absolute Error": abs_error_lgbm
}

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1680
[LightGBM] [Info] Number of data points in the train set: 3941, number of used features: 21
[LightGBM] [Info] Start training from score 0.118498


{'LightGBM RMSE': 0.3197993536191779,
 'LightGBM Brier Score': 0.10227162657524398,
 'LightGBM Predicted sum xG': 206.71060943132014,
 'LightGBM Actual sum Goals': 200,
 'LightGBM Absolute Error': 6.710609431320137}

In [33]:
# Pipeline for LightGBM Regression
reg_pipeline_lgbm = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_all)
    ], remainder='passthrough')),
    ('regressor', lgb.LGBMRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        verbose=-1  # Suppress LightGBM output
    ))
])

# Train the pipeline with LGBMRegressor
reg_pipeline_lgbm.fit(X_train, y_train)

# Predict on the test set
y_pred_lgbm = np.clip(reg_pipeline_lgbm.predict(X_test), 0, 1)

# Convert probabilities to binary predictions (threshold=0.5)
y_pred_lgbm_binary = (y_pred_lgbm >= 0.5).astype(int)

# Calculate classification metrics
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm_binary)
precision_lgbm = precision_score(y_test, y_pred_lgbm_binary)
recall_lgbm = recall_score(y_test, y_pred_lgbm_binary)
f1_lgbm = f1_score(y_test, y_pred_lgbm_binary)
auc_lgbm = roc_auc_score(y_test, y_pred_lgbm)

# Evaluate the LightGBM model
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
rmse_lgbm = np.sqrt(mse_lgbm)
brier_lgbm = brier_score_loss(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)

# Evaluate the model using the metric sum(xG) vs sum(isGoal)
sum_lgbm = y_pred_lgbm.sum()
sum_actual_goals_lgbm = y_test.sum()
abs_error_lgbm = abs(sum_lgbm - sum_actual_goals_lgbm)

# Calculate goal difference percentage
goal_diff_percentage = abs_error_lgbm / sum_actual_goals_lgbm * 100

{
    "LightGBM RMSE": rmse_lgbm,
    "LightGBM Brier Score": brier_lgbm,
    "LightGBM R²": r2_lgbm,
    "LightGBM Predicted sum xG": sum_lgbm,
    "LightGBM Actual sum Goals": sum_actual_goals_lgbm,
    "LightGBM Absolute Error": abs_error_lgbm,
    "LightGBM Goal Diff %": f"{goal_diff_percentage:.2f}%",
    "LightGBM Accuracy": accuracy_lgbm,
    "LightGBM Precision": precision_lgbm,
    "LightGBM Recall": recall_lgbm,
    "LightGBM F1 Score": f1_lgbm,
    "LightGBM AUC": auc_lgbm
}

{'LightGBM RMSE': 0.3197993536191779,
 'LightGBM Brier Score': 0.10227162657524398,
 'LightGBM R²': 0.019805393753173473,
 'LightGBM Predicted sum xG': 206.71060943132014,
 'LightGBM Actual sum Goals': 200,
 'LightGBM Absolute Error': 6.710609431320137,
 'LightGBM Goal Diff %': '3.36%',
 'LightGBM Accuracy': 0.8781065088757396,
 'LightGBM Precision': 0.42857142857142855,
 'LightGBM Recall': 0.09,
 'LightGBM F1 Score': 0.1487603305785124,
 'LightGBM AUC': 0.658993288590604}

In [34]:
import joblib

# Save the trained LightGBM pipeline to a file
joblib.dump(reg_pipeline_lgbm, 'reg_pipeline_lgbm.pkl')

['reg_pipeline_lgbm.pkl']

In [35]:
# Load the saved pipeline
reg_pipeline_lgbm_loaded = joblib.load('reg_pipeline_lgbm.pkl')

# Use it to predict
y_pred_loaded = np.clip(reg_pipeline_lgbm_loaded.predict(X_test), 0, 1)

#### Model Tunning

In [36]:
param_grid = {
    'regressor__n_estimators': [300],
    'regressor__learning_rate': [0.01],
    'regressor__max_depth': [3],
    'regressor__num_leaves': [15],
    'regressor__min_child_samples': [10],
    'regressor__subsample': [0.8]
}

# Grid search with 3-fold CV
grid_search = GridSearchCV(
    reg_pipeline_lgbm,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

# Run search with the specified parameters
grid_search.fit(X_train, y_train)

# Best model (which will be the one with the single set of parameters)
best_model_lgbm = grid_search.best_estimator_

# Predict and evaluate
y_pred_lgbm = np.clip(best_model_lgbm.predict(X_test), 0, 1)

mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
best_rmse_lgbm = np.sqrt(mse_lgbm)
best_brier_lgbm = brier_score_loss(y_test, y_pred_lgbm)
best_sum_lgbm = y_pred_lgbm.sum()
best_sum_actual_goals_lgbm = y_test.sum()
best_abs_error_lgbm = abs(best_sum_lgbm - best_sum_actual_goals_lgbm)

print("\nBest LightGBM Parameters:", grid_search.best_params_)
print({
    "LightGBM RMSE": best_rmse_lgbm,
    "LightGBM Brier Score": best_brier_lgbm,
    "LightGBM Predicted sum xG": best_sum_lgbm,
    "LightGBM Actual sum Goals": best_sum_actual_goals_lgbm,
    "LightGBM Absolute Error": best_abs_error_lgbm
})

Fitting 3 folds for each of 1 candidates, totalling 3 fits

Best LightGBM Parameters: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__min_child_samples': 10, 'regressor__n_estimators': 300, 'regressor__num_leaves': 15, 'regressor__subsample': 0.8}
{'LightGBM RMSE': 0.3081268098223102, 'LightGBM Brier Score': 0.09494213093127414, 'LightGBM Predicted sum xG': 206.33352685391839, 'LightGBM Actual sum Goals': 200, 'LightGBM Absolute Error': 6.333526853918386}


In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, r2_score
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid with the specified values
param_grid = {
    'regressor__n_estimators': [300],
    'regressor__learning_rate': [0.01],
    'regressor__max_depth': [3],
    'regressor__num_leaves': [15],
    'regressor__min_child_samples': [10],
    'regressor__subsample': [0.8]
}

# Grid search with 3-fold CV
grid_search = GridSearchCV(
    reg_pipeline_lgbm,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)



# Run search with the specified parameters
grid_search.fit(X_train, y_train)

# Best model
best_model_lgbm = grid_search.best_estimator_

# Predict probabilities
y_pred_lgbm = np.clip(best_model_lgbm.predict(X_test), 0, 1)

# Convert probabilities to binary predictions (threshold=0.5)
y_pred_lgbm_binary = (y_pred_lgbm >= 0.5).astype(int)

# Calculate classification metrics
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm_binary)
precision_lgbm = precision_score(y_test, y_pred_lgbm_binary)
recall_lgbm = recall_score(y_test, y_pred_lgbm_binary)
f1_lgbm = f1_score(y_test, y_pred_lgbm_binary)
auc_lgbm = roc_auc_score(y_test, y_pred_lgbm)

# Evaluate the LightGBM model
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
best_rmse_lgbm = np.sqrt(mse_lgbm)
best_brier_lgbm = brier_score_loss(y_test, y_pred_lgbm)
best_r2_lgbm = r2_score(y_test, y_pred_lgbm)

# Sum metrics
best_sum_lgbm = y_pred_lgbm.sum()
best_sum_actual_goals_lgbm = y_test.sum()
best_abs_error_lgbm = abs(best_sum_lgbm - best_sum_actual_goals_lgbm)
goal_diff_percentage = best_abs_error_lgbm / best_sum_actual_goals_lgbm * 100

# Print comprehensive results
print("\nBest LightGBM Parameters:", grid_search.best_params_)
print("\nOptimized LightGBM Performance:")
print(f"RMSE: {best_rmse_lgbm:.6f}")
print(f"Brier Score: {best_brier_lgbm:.6f}")
print(f"R²: {best_r2_lgbm:.6f}")
print(f"Accuracy: {accuracy_lgbm:.6f}")
print(f"Precision: {precision_lgbm:.6f}")
print(f"Recall: {recall_lgbm:.6f}")
print(f"F1 Score: {f1_lgbm:.6f}")
print(f"AUC: {auc_lgbm:.6f}")
print(f"Predicted sum xG: {best_sum_lgbm:.6f}")
print(f"Actual sum Goals: {best_sum_actual_goals_lgbm}")
print(f"Absolute Error: {best_abs_error_lgbm:.6f}")
print(f"Goal Diff %: {goal_diff_percentage:.2f}%")

# Return results as dictionary for potential comparison
optimized_metrics = {
    "LightGBM RMSE": best_rmse_lgbm,
    "LightGBM Brier Score": best_brier_lgbm,
    "LightGBM R²": best_r2_lgbm,
    "LightGBM Accuracy": accuracy_lgbm,
    "LightGBM Precision": precision_lgbm,
    "LightGBM Recall": recall_lgbm,
    "LightGBM F1 Score": f1_lgbm,
    "LightGBM AUC": auc_lgbm,
    "LightGBM Predicted sum xG": best_sum_lgbm,
    "LightGBM Actual sum Goals": best_sum_actual_goals_lgbm,
    "LightGBM Absolute Error": best_abs_error_lgbm,
    "LightGBM Goal Diff %": f"{goal_diff_percentage:.2f}%"
}

Fitting 3 folds for each of 1 candidates, totalling 3 fits

Best LightGBM Parameters: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__min_child_samples': 10, 'regressor__n_estimators': 300, 'regressor__num_leaves': 15, 'regressor__subsample': 0.8}

Optimized LightGBM Performance:
RMSE: 0.308127
Brier Score: 0.094942
R²: 0.090053
Accuracy: 0.884024
Precision: 0.700000
Recall: 0.035000
F1 Score: 0.066667
AUC: 0.739413
Predicted sum xG: 206.333527
Actual sum Goals: 200
Absolute Error: 6.333527
Goal Diff %: 3.17%


In [38]:
import joblib

# Save the best LightGBM model to a .pkl file
joblib.dump(best_model_lgbm, 'best_model_lgbm.pkl')

['best_model_lgbm.pkl']

In [39]:
# Load the saved model
best_model_lgbm_loaded = joblib.load('best_model_lgbm.pkl')

# Predict using the loaded model
y_pred_loaded = np.clip(best_model_lgbm_loaded.predict(X_test), 0, 1)