# Ingénierie des données II

In [3]:
from ift6758.data import load_train_test_dataframes

import pandas as pd
import numpy as np

In [4]:
# Load the training and test event frames through the project loader.
# NOTE(review): `all_types=True` presumably keeps every event type (penalties,
# faceoffs, ...) rather than only shots — confirm against ift6758.data.
train_data, test_data = load_train_test_dataframes(all_types=True)

Found 413339 events
Found 434744 events
Found 433203 events
Found 367886 events
Found 287926 events


In [5]:
# Order the events by game, then period, then clock time within the period.
# `time_in_period` is compared as a string here; the "MM:SS" values shown in
# this notebook are zero-padded, so string order matches clock order.
chronological_keys = ["game_id", "period_number", "time_in_period"]
train_data = train_data.sort_values(by=chronological_keys)
test_data = test_data.sort_values(by=chronological_keys)


In [6]:
# Restrict both frames to one game so the engineered features can be inspected by hand.
# NOTE(review): the same id is applied to test_data; if this game only exists in
# the training split, test_data ends up empty — confirm this is intended.
selected_game_id = 2017021065
train_data = train_data.loc[train_data["game_id"] == selected_game_id]
test_data = test_data.loc[test_data["game_id"] == selected_game_id]

### Fonctions utiles

In [7]:
# Convert time_in_period from MM:SS to seconds
def convert_to_seconds(time_str):
    """
    Convert a clock string of the form "MM:SS" into a number of seconds.

    The string is split on ':' and must yield exactly two integer fields,
    otherwise a ValueError is raised.
    """
    fields = time_str.split(':')
    mins, secs = (int(field) for field in fields)
    return 60 * mins + secs

In [8]:
def add_previous_event_features(data):
    """
    Add features describing the event that immediately precedes each row.

    The frame is modified in place and also returned. Columns added:
    - `last_event_type`: `type_desc_key` of the previous row.
    - `last_x` / `last_y`: `x_coord` / `y_coord` of the previous row.
    - `time_since_last_event`: `game_seconds` minus the previous row's `game_seconds`.
    - `distance_from_last_event`: Euclidean distance between the current and
      previous coordinates (NaN when any coordinate involved is missing).

    "Previous" means the previous row in the frame's current order, so the
    caller must sort the events chronologically beforehand. When a `game_id`
    column is present, the first event of each game gets NaN for every feature
    instead of inheriting values from the last event of the preceding game
    (same rule as `last_angle`). Rows of one game are expected to be contiguous.
    """
    n_rows = len(data)

    # Positional mask: True when the previous row belongs to the same game.
    # A plain array is used so that `.where` is applied by position and does
    # not depend on the index labels being unique.
    same_game = np.ones(n_rows, dtype=bool)
    if "game_id" in data.columns:
        game_ids = data["game_id"].to_numpy()
        same_game[:1] = False
        same_game[1:] = game_ids[1:] == game_ids[:-1]

    # Type and coordinates of the previous event
    data["last_event_type"] = data["type_desc_key"].shift(1).where(same_game)
    data["last_x"] = data["x_coord"].shift(1).where(same_game)
    data["last_y"] = data["y_coord"].shift(1).where(same_game)

    # Time elapsed since the previous event
    elapsed = data["game_seconds"] - data["game_seconds"].shift(1)
    data["time_since_last_event"] = elapsed.where(same_game)

    # Vectorised Euclidean distance; NaN coordinates propagate to NaN, which
    # matches the former row-by-row computation and also works on empty frames.
    delta_x = data["x_coord"] - data["last_x"]
    delta_y = data["y_coord"] - data["last_y"]
    data["distance_from_last_event"] = np.sqrt(delta_x ** 2 + delta_y ** 2)

    return data

In [9]:
def add_last_angle(data):
    """
    Add a `last_angle` column to the DataFrame (in place) and return it.

    `last_angle` holds the `goal_angle` of the previous row. It is NaN for
    the first row and whenever the previous row has a different `game_id`.
    """
    game_ids = data["game_id"]

    # True where the previous row belongs to another game (or does not exist)
    starts_new_game = (game_ids != game_ids.shift(1)).to_numpy()

    # Previous row's angle, blanked out at every game boundary
    previous_angle = data["goal_angle"].shift(1)
    data["last_angle"] = previous_angle.mask(starts_new_game)

    return data


In [10]:
def calculate_absolute_angle_change(row):
    """
    Return the angle change for a rebound as |goal_angle| + |last_angle|.

    Returns 0 when `row["is_rebound"]` is falsy.

    NOTE(review): the sum of absolute values equals the difference between the
    two angles only when they have opposite signs; for two angles of the same
    sign it over-states the change — confirm this is the intended definition.
    """
    if row["is_rebound"]:
        current_magnitude = abs(row["goal_angle"])
        previous_magnitude = abs(row["last_angle"])
        return current_magnitude + previous_magnitude
    # Not a rebound: no angle change is reported
    return 0


In [11]:
def calculate_speed(row):
    """
    Return the speed since the previous event (distance / elapsed time).

    Returns 0 when the elapsed time is missing, zero or negative.
    """
    elapsed = row["time_since_last_event"]
    if pd.notna(elapsed) and elapsed > 0:
        return row["distance_from_last_event"] / elapsed
    # No usable elapsed time: report a speed of 0
    return 0



In [12]:
def calculate_power_play_time(data):
    """
    Return a copy of `data` with a `power_play_time_elapsed` column.

    For each row (in the frame's current order, which must be chronological):
    - penalties whose expiry time is not after the row's `game_seconds` are dropped;
    - a "penalty" row with `details_type_code` "MIN" adds a 120 s penalty,
      "MAJ" adds a 300 s penalty, any other code adds nothing;
    - the value is the time since the earliest still-active penalty of the team
      that currently has MORE active penalties, or 0 when both teams have the
      same number (including none).

    Rows are walked by position, so the index labels do not need to be a
    0..n-1 range or unique. When a `game_id` column is present, the active
    penalties are cleared at every change of game.

    Double minors: two consecutive "MIN" penalty rows at the same game second
    for the same `event_owner_team_id` are treated as one double minor
    (expiring after 120 s and 240 s); the second row of the pair does not add
    a further penalty.
    NOTE(review): two different players of the same team penalized at the same
    second would look identical to this heuristic — confirm against the data.
    NOTE(review): penalties are never ended early (e.g. by a goal) — confirm
    whether that matters for the feature.
    """
    MINOR_SECONDS = 120
    MAJOR_SECONDS = 300

    data = data.copy()
    data["power_play_time_elapsed"] = 0

    # Plain Python lists give cheap positional access to the fields we need.
    event_types = data["type_desc_key"].tolist()
    penalty_codes = data["details_type_code"].tolist()
    clock = data["game_seconds"].tolist()
    owner_ids = data["event_owner_team_id"].tolist()
    away_ids = data["away_team_id"].tolist()
    game_ids = data["game_id"].tolist() if "game_id" in data.columns else None
    n_rows = len(clock)

    def _is_second_half_of_double_minor(first_pos):
        """True when the row after `first_pos` completes a double minor."""
        nxt = first_pos + 1
        if nxt >= n_rows:
            return False
        if game_ids is not None and game_ids[nxt] != game_ids[first_pos]:
            return False
        return (
            event_types[nxt] == "penalty"
            and penalty_codes[nxt] == "MIN"
            and clock[nxt] == clock[first_pos]
            and owner_ids[nxt] == owner_ids[first_pos]
        )

    # Each entry is (expiration_time, start_time)
    active_penalties = {"home": [], "away": []}
    elapsed_values = []
    already_counted = False  # set when the previous row absorbed this row's penalty

    for pos in range(n_rows):
        current_time = clock[pos]

        # A new game starts with a clean slate
        if game_ids is not None and pos > 0 and game_ids[pos] != game_ids[pos - 1]:
            active_penalties = {"home": [], "away": []}
            already_counted = False

        # Drop penalties that have expired
        for team in ("home", "away"):
            active_penalties[team] = [
                (end_time, start_time)
                for end_time, start_time in active_penalties[team]
                if end_time > current_time
            ]

        if already_counted:
            # Second row of a double minor: both halves were registered on the first row
            already_counted = False
        elif event_types[pos] == "penalty":
            # The label only keeps the two teams apart: the value computed below
            # depends solely on which side holds more penalties, so swapping the
            # two labels would not change the result.
            # NOTE(review): whether event_owner_team_id is the penalized team or
            # the team that drew the penalty is not visible here — confirm.
            bucket = "home" if owner_ids[pos] == away_ids[pos] else "away"
            code = penalty_codes[pos]
            if code == "MIN":
                active_penalties[bucket].append((current_time + MINOR_SECONDS, current_time))
                if _is_second_half_of_double_minor(pos):
                    active_penalties[bucket].append((current_time + 2 * MINOR_SECONDS, current_time))
                    already_counted = True
            elif code == "MAJ":
                active_penalties[bucket].append((current_time + MAJOR_SECONDS, current_time))
            # Other penalty codes do not add a timed penalty

        # Elapsed power-play time for this row
        n_home = len(active_penalties["home"])
        n_away = len(active_penalties["away"])
        if n_home == n_away:
            elapsed = 0
        else:
            shorthanded = "home" if n_home > n_away else "away"
            earliest_start = min(start for _, start in active_penalties[shorthanded])
            # Never report a negative duration
            elapsed = max(0, current_time - earliest_start)

        elapsed_values.append(elapsed)

    if elapsed_values:
        data["power_play_time_elapsed"] = elapsed_values

    return data


### Preprocessing function (Advanced)

In [13]:
def preprocess_advanced(data):
    """
    Advanced preprocessing of an event frame. Works on a copy of `data`.

    Steps, in order:
    - convert `time_in_period` ("MM:SS") to `time_in_period_seconds`;
    - compute `game_seconds` as (period_number - 1) * 1200 + time_in_period_seconds;
    - add previous-event features (type, coordinates, elapsed time, distance);
    - flag `is_rebound` when the previous event is a shot-on-goal, missed-shot
      or blocked-shot;
    - compute `speed` and `absolute_angle_change`;
    - add `power_play_time_elapsed` (computed before filtering, so that
      penalty rows are still available);
    - keep only "goal" and "shot-on-goal" rows.

    The previous-event features follow the frame's row order, so the caller
    must pass events already sorted chronologically.
    """
    # Work on a copy so the caller's frame is left untouched
    data = data.copy()

    # Convert `time_in_period` to seconds with the shared module-level helper
    # (it was previously re-defined here, shadowing the identical function).
    data['time_in_period_seconds'] = data['time_in_period'].apply(convert_to_seconds)

    # Seconds since the start of the game, counting 1200 s per earlier period
    data['game_seconds'] = (data['period_number'] - 1) * 1200 + data['time_in_period_seconds']

    # Previous-event features
    data = add_previous_event_features(data)

    # A rebound is a row whose previous event was a shot of any kind
    data["is_rebound"] = data["last_event_type"].isin(["shot-on-goal", "missed-shot", "blocked-shot"])

    # Speed since the previous event
    data["speed"] = data.apply(calculate_speed, axis=1)

    # Previous angle, then angle change (the latter reads `is_rebound` and `last_angle`)
    data = add_last_angle(data)
    data["absolute_angle_change"] = data.apply(calculate_absolute_angle_change, axis=1)

    # Power-play time must be computed while penalty rows are still present
    data = calculate_power_play_time(data)

    # Keep only goals and shots on goal
    data = data[data["type_desc_key"].isin(["goal", "shot-on-goal"])].copy()

    return data


In [17]:
# Show every column when rendering frames (global pandas display option)
pd.set_option('display.max_columns', None)

# Run the advanced preprocessing on the single-game training frame
data_2017021065 = preprocess_advanced(train_data)

# Render at most the first 100 processed rows
display(data_2017021065.iloc[:100])


Unnamed: 0,game_id,season,game_type,game_date,venue,venue_location,away_team_id,away_team_abbrev,away_team_name,home_team_id,home_team_abbrev,home_team_name,event_id,event_idx,sort_order,period_number,period_type,max_regulation_periods,time_in_period,time_remaining,situation_code,is_empty_net,is_goal,type_code,type_desc_key,away_score,home_score,away_sog,home_sog,x_coord,y_coord,zone_code,shot_type,description,event_owner_team_id,details_type_code,scoring_player_total,assist1_player_total,assist2_player_total,goal_distance,goal_angle,goal_side,goal_x_coord,shooting_player_id,shooting_player_name,shooting_player_team_id,shooting_player_position_code,goalie_in_net_id,goalie_in_net_name,goalie_in_net_team_id,goalie_in_net_position_code,scoring_player_id,scoring_player_name,scoring_player_team_id,scoring_player_position_code,assist1_player_id,assist1_player_name,assist1_player_team_id,assist1_player_position_code,assist2_player_id,assist2_player_name,assist2_player_team_id,assist2_player_position_code,time_in_period_seconds,game_seconds,last_event_type,last_x,last_y,time_since_last_event,distance_from_last_event,is_rebound,speed,last_angle,absolute_angle_change,power_play_time_elapsed
339923,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,13,9,22,1,REG,3,01:51,18:09,1551,False,False,506,shot-on-goal,,,0.0,1.0,-50.0,36.0,O,snap,Connor Hellebuyck stops a shot from Michal Kempny,15.0,,,,,53.075418,-42.709390,left,-89,8479482.0,Michal Kempny,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,111,111,,,,,,False,0.000000,,0.000000,0
339924,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,14,10,23,1,REG,3,01:55,18:05,1551,False,False,506,shot-on-goal,,,0.0,2.0,-85.0,-25.0,O,wrist,Connor Hellebuyck stops a shot from John Carlson,15.0,,,,,25.317978,80.909723,right,-89,8474590.0,John Carlson,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,115,115,shot-on-goal,-50.0,36.0,4.0,70.327804,True,17.581951,-42.709390,123.619113,0
339928,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,16,14,29,1,REG,3,02:04,17:56,1551,False,False,506,shot-on-goal,,,1.0,2.0,73.0,-16.0,O,backhand,Philipp Grubauer stops a shot from Brandon Tanev,52.0,,,,,22.627417,-45.000000,left,89,8479293.0,Brandon Tanev,52.0,L,8475831.0,Philipp Grubauer,15.0,G,,,,,,,,,,,,,124,124,shot-on-goal,-85.0,-25.0,9.0,158.256122,True,17.584014,80.909723,125.909723,0
339932,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,17,18,33,1,REG,3,02:31,17:29,1551,False,False,506,shot-on-goal,,,1.0,3.0,-29.0,-6.0,O,slap,Connor Hellebuyck stops a shot from Brooks Orpik,15.0,,,,,60.299254,5.710593,right,-89,8468498.0,Brooks Orpik,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,151,151,shot-on-goal,73.0,-16.0,27.0,102.489024,True,3.795890,-45.000000,50.710593,0
339933,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,18,19,35,1,REG,3,02:39,17:21,1551,False,False,506,shot-on-goal,,,2.0,3.0,23.0,-34.0,N,wrist,Philipp Grubauer stops a shot from Joel Armia,52.0,,,,,74.242845,-27.255328,left,89,8476469.0,Joel Armia,52.0,R,8475831.0,Philipp Grubauer,15.0,G,,,,,,,,,,,,,159,159,shot-on-goal,-29.0,-6.0,8.0,59.059292,True,7.382412,5.710593,32.965922,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340261,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,775,347,672,4,OT,3,01:35,03:25,1341,False,False,506,shot-on-goal,,,26.0,37.0,71.0,-8.0,O,snap,Connor Hellebuyck stops a shot from Nicklas Ba...,15.0,,,,,19.697716,-23.962489,left,89,8473563.0,Nicklas Backstrom,15.0,C,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,95,3695,shot-on-goal,76.0,-5.0,7.0,5.830952,True,0.832993,-21.037511,45.000000,0
340262,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,776,348,673,4,OT,3,01:46,03:14,1341,False,False,506,shot-on-goal,,,26.0,38.0,58.0,10.0,O,slap,Connor Hellebuyck stops a shot from Alex Ovechkin,15.0,,,,,32.572995,17.878697,right,89,8471214.0,Alex Ovechkin,15.0,L,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,106,3706,shot-on-goal,71.0,-8.0,11.0,22.203603,True,2.018509,-23.962489,41.841186,0
340264,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,777,350,677,4,OT,3,02:14,02:46,1341,False,False,506,shot-on-goal,,,26.0,39.0,60.0,23.0,O,slap,Connor Hellebuyck stops a shot from Alex Ovechkin,15.0,,,,,37.013511,38.418055,right,89,8471214.0,Alex Ovechkin,15.0,L,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,134,3734,shot-on-goal,58.0,10.0,28.0,13.152946,True,0.469748,17.878697,56.296752,0
340270,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,780,356,690,4,OT,3,03:27,01:33,1441,False,False,506,shot-on-goal,,,26.0,40.0,74.0,1.0,O,wrist,Connor Hellebuyck stops a shot from John Carlson,15.0,,,,,15.033296,3.814075,right,89,8474590.0,John Carlson,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,207,3807,shot-on-goal,60.0,23.0,73.0,26.076810,True,0.357217,38.418055,42.232130,0


In [15]:
# Print the column index of train_data (the raw event columns; the derived
# features live in the frame returned by preprocess_advanced, not here).
print(train_data.columns)

Index(['game_id', 'season', 'game_type', 'game_date', 'venue',
       'venue_location', 'away_team_id', 'away_team_abbrev', 'away_team_name',
       'home_team_id', 'home_team_abbrev', 'home_team_name', 'event_id',
       'event_idx', 'sort_order', 'period_number', 'period_type',
       'max_regulation_periods', 'time_in_period', 'time_remaining',
       'situation_code', 'is_empty_net', 'is_goal', 'type_code',
       'type_desc_key', 'away_score', 'home_score', 'away_sog', 'home_sog',
       'x_coord', 'y_coord', 'zone_code', 'shot_type', 'description',
       'event_owner_team_id', 'details_type_code', 'scoring_player_total',
       'assist1_player_total', 'assist2_player_total', 'goal_distance',
       'goal_angle', 'goal_side', 'goal_x_coord', 'shooting_player_id',
       'shooting_player_name', 'shooting_player_team_id',
       'shooting_player_position_code', 'goalie_in_net_id',
       'goalie_in_net_name', 'goalie_in_net_team_id',
       'goalie_in_net_position_code', 'scoring_p