# Ingénierie des données II

## Set up

### Importing Libraries

In [26]:
import pandas as pd
import numpy as np
from ift6758.data import load_train_test_dataframes

### Loading Data

In [27]:
# Load train and test datasets
train_data, test_data = load_train_test_dataframes(all_types=True)

# Merge train and test data
full_data = pd.concat([train_data, test_data])

# Sort data by game_id, period_number, and time_in_period
full_data = full_data.sort_values(by=["game_id", "period_number", "time_in_period"])

# Filter data for game_id: 2017021065
data_2017021065 = full_data[full_data["game_id"] == 2017021065]


Found 413339 events
Found 434744 events
Found 433203 events
Found 367886 events
Found 287926 events


## Features Engineering

### Helper Functions

In [29]:
def convert_to_seconds(time_str):
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds

def add_previous_event_features(data):
    data["last_event_type"] = data["type_desc_key"].shift(1)
    data["last_x"] = data["x_coord"].shift(1)
    data["last_y"] = data["y_coord"].shift(1)
    data["time_since_last_event"] = data["game_seconds"] - data["game_seconds"].shift(1)
    
    def calculate_distance(row):
        if pd.isna(row["last_x"]) or pd.isna(row["last_y"]):
            return np.nan
        return np.sqrt((row["x_coord"] - row["last_x"])**2 + (row["y_coord"] - row["last_y"])**2)
    
    data["distance_from_last_event"] = data.apply(calculate_distance, axis=1)
    return data

def add_last_angle(data):
    data["last_angle"] = data["goal_angle"].shift(1)
    data.loc[data["game_id"] != data["game_id"].shift(1), "last_angle"] = np.nan
    return data

def calculate_absolute_angle_change(row):
    if not row["is_rebound"]:
        return 0
    return abs(row["goal_angle"]) + abs(row["last_angle"])

def calculate_speed(row):
    if pd.isna(row["time_since_last_event"]) or row["time_since_last_event"] <= 0:
        return 0
    return row["distance_from_last_event"] / row["time_since_last_event"]

def calculate_power_play_time(data):
    data = data.copy()
    data["power_play_time_elapsed"] = 0
    active_penalties = {"home": [], "away": []}

    for idx, row in data.iterrows():
        current_time = row["game_seconds"]
        for team in ["home", "away"]:
            active_penalties[team] = [
                (end_time, start_time) for end_time, start_time in active_penalties[team] if end_time > current_time
            ]
        if row["type_desc_key"] == "penalty":
            penalized_team = "home" if row["event_owner_team_id"] == row["away_team_id"] else "away"
            penalty_duration = 120 if row["details_type_code"] == "MIN" else 300
            active_penalties[penalized_team].append((current_time + penalty_duration, current_time))
        home_penalties = len(active_penalties["home"])
        away_penalties = len(active_penalties["away"])
        if home_penalties > away_penalties:
            power_play_time = current_time - min(start for _, start in active_penalties["home"])
        elif away_penalties > home_penalties:
            power_play_time = current_time - min(start for _, start in active_penalties["away"])
        else:
            power_play_time = 0
        power_play_time = max(0, power_play_time)
        data.at[idx, "power_play_time_elapsed"] = power_play_time
    return data

def add_skater_counts_for_shooting_team(data):
    def parse_skater_counts(row):
        try:
            home_skaters = int(str(row["situation_code"])[1])
            away_skaters = int(str(row["situation_code"])[2])
            if row["event_owner_team_id"] == row["home_team_id"]:
                return home_skaters, away_skaters
            else:
                return away_skaters, home_skaters
        except (ValueError, TypeError):
            return 5, 5

    skater_counts = data.apply(parse_skater_counts, axis=1)
    data["shooting_team_skaters"] = skater_counts.apply(lambda x: x[0])
    data["opposing_team_skaters"] = skater_counts.apply(lambda x: x[1])
    return data


### Preprocessing function (Advanced)

In [30]:
def preprocess_advanced(data):
    data = data.copy()
    data['time_in_period_seconds'] = data['time_in_period'].apply(convert_to_seconds)
    data['game_seconds'] = (data['period_number'] - 1) * 1200 + data['time_in_period_seconds']
    data = add_previous_event_features(data)
    data["is_rebound"] = data["last_event_type"].isin(["shot-on-goal", "missed-shot", "blocked-shot"])
    data["speed"] = data.apply(calculate_speed, axis=1)
    data = add_last_angle(data)
    data["absolute_angle_change"] = data.apply(calculate_absolute_angle_change, axis=1)
    data = calculate_power_play_time(data)
    data = add_skater_counts_for_shooting_team(data)
    data = data[data["type_desc_key"].isin(["shot-on-goal", "goal"])]
    return data

# Apply preprocessing
processed_data = preprocess_advanced(data_2017021065)


In [24]:
# Preprocessing the train_data explicitly
data_2017021065 = preprocess_advanced(train_data)

# Display the processed train_data
pd.set_option('display.max_columns', None)
display(data_2017021065.head(100))


Unnamed: 0,game_id,season,game_type,game_date,venue,venue_location,away_team_id,away_team_abbrev,away_team_name,home_team_id,home_team_abbrev,home_team_name,event_id,event_idx,sort_order,period_number,period_type,max_regulation_periods,time_in_period,time_remaining,situation_code,is_empty_net,is_goal,type_code,type_desc_key,away_score,home_score,away_sog,home_sog,x_coord,y_coord,zone_code,shot_type,description,event_owner_team_id,details_type_code,scoring_player_total,assist1_player_total,assist2_player_total,goal_distance,goal_angle,goal_side,goal_x_coord,shooting_player_id,shooting_player_name,shooting_player_team_id,shooting_player_position_code,goalie_in_net_id,goalie_in_net_name,goalie_in_net_team_id,goalie_in_net_position_code,scoring_player_id,scoring_player_name,scoring_player_team_id,scoring_player_position_code,assist1_player_id,assist1_player_name,assist1_player_team_id,assist1_player_position_code,assist2_player_id,assist2_player_name,assist2_player_team_id,assist2_player_position_code,time_in_period_seconds,game_seconds,last_event_type,last_x,last_y,time_since_last_event,distance_from_last_event,is_rebound,speed,last_angle,absolute_angle_change,power_play_time_elapsed,shooting_team_skaters,opposing_team_skaters
339923,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,13,9,22,1,REG,3,01:51,18:09,1551,False,False,506,shot-on-goal,,,0.0,1.0,-50.0,36.0,O,snap,Connor Hellebuyck stops a shot from Michal Kempny,15.0,,,,,53.075418,-42.709390,left,-89,8479482.0,Michal Kempny,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,111,111,,,,,,False,0.000000,,0.000000,0,5,5
339924,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,14,10,23,1,REG,3,01:55,18:05,1551,False,False,506,shot-on-goal,,,0.0,2.0,-85.0,-25.0,O,wrist,Connor Hellebuyck stops a shot from John Carlson,15.0,,,,,25.317978,80.909723,right,-89,8474590.0,John Carlson,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,115,115,shot-on-goal,-50.0,36.0,4.0,70.327804,True,17.581951,-42.709390,123.619113,0,5,5
339928,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,16,14,29,1,REG,3,02:04,17:56,1551,False,False,506,shot-on-goal,,,1.0,2.0,73.0,-16.0,O,backhand,Philipp Grubauer stops a shot from Brandon Tanev,52.0,,,,,22.627417,-45.000000,left,89,8479293.0,Brandon Tanev,52.0,L,8475831.0,Philipp Grubauer,15.0,G,,,,,,,,,,,,,124,124,shot-on-goal,-85.0,-25.0,9.0,158.256122,True,17.584014,80.909723,125.909723,0,5,5
339932,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,17,18,33,1,REG,3,02:31,17:29,1551,False,False,506,shot-on-goal,,,1.0,3.0,-29.0,-6.0,O,slap,Connor Hellebuyck stops a shot from Brooks Orpik,15.0,,,,,60.299254,5.710593,right,-89,8468498.0,Brooks Orpik,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,151,151,shot-on-goal,73.0,-16.0,27.0,102.489024,True,3.795890,-45.000000,50.710593,0,5,5
339933,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,18,19,35,1,REG,3,02:39,17:21,1551,False,False,506,shot-on-goal,,,2.0,3.0,23.0,-34.0,N,wrist,Philipp Grubauer stops a shot from Joel Armia,52.0,,,,,74.242845,-27.255328,left,89,8476469.0,Joel Armia,52.0,R,8475831.0,Philipp Grubauer,15.0,G,,,,,,,,,,,,,159,159,shot-on-goal,-29.0,-6.0,8.0,59.059292,True,7.382412,5.710593,32.965922,0,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340261,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,775,347,672,4,OT,3,01:35,03:25,1341,False,False,506,shot-on-goal,,,26.0,37.0,71.0,-8.0,O,snap,Connor Hellebuyck stops a shot from Nicklas Ba...,15.0,,,,,19.697716,-23.962489,left,89,8473563.0,Nicklas Backstrom,15.0,C,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,95,3695,shot-on-goal,76.0,-5.0,7.0,5.830952,True,0.832993,-21.037511,45.000000,0,3,4
340262,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,776,348,673,4,OT,3,01:46,03:14,1341,False,False,506,shot-on-goal,,,26.0,38.0,58.0,10.0,O,slap,Connor Hellebuyck stops a shot from Alex Ovechkin,15.0,,,,,32.572995,17.878697,right,89,8471214.0,Alex Ovechkin,15.0,L,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,106,3706,shot-on-goal,71.0,-8.0,11.0,22.203603,True,2.018509,-23.962489,41.841186,0,3,4
340264,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,777,350,677,4,OT,3,02:14,02:46,1341,False,False,506,shot-on-goal,,,26.0,39.0,60.0,23.0,O,slap,Connor Hellebuyck stops a shot from Alex Ovechkin,15.0,,,,,37.013511,38.418055,right,89,8471214.0,Alex Ovechkin,15.0,L,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,134,3734,shot-on-goal,58.0,10.0,28.0,13.152946,True,0.469748,17.878697,56.296752,0,3,4
340270,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,780,356,690,4,OT,3,03:27,01:33,1441,False,False,506,shot-on-goal,,,26.0,40.0,74.0,1.0,O,wrist,Connor Hellebuyck stops a shot from John Carlson,15.0,,,,,15.033296,3.814075,right,89,8474590.0,John Carlson,15.0,D,8476945.0,Connor Hellebuyck,52.0,G,,,,,,,,,,,,,207,3807,shot-on-goal,60.0,23.0,73.0,26.076810,True,0.357217,38.418055,42.232130,0,4,4


In [32]:
# Save the final dataset to a CSV
processed_data.to_csv("wpg_v_wsh_2017021065.csv", index=False)

# Display a preview
pd.set_option('display.max_columns', None)
display(processed_data.head(100))


Unnamed: 0,game_id,season,game_type,game_date,venue,venue_location,away_team_id,away_team_abbrev,away_team_name,home_team_id,home_team_abbrev,home_team_name,event_id,event_idx,sort_order,period_number,period_type,max_regulation_periods,time_in_period,time_remaining,situation_code,is_empty_net,is_goal,type_code,type_desc_key,away_score,home_score,away_sog,home_sog,x_coord,y_coord,zone_code,shot_type,description,event_owner_team_id,details_type_code,scoring_player_total,assist1_player_total,assist2_player_total,goal_distance,goal_angle,goal_side,goal_x_coord,shooting_player_id,shooting_player_name,shooting_player_team_id,shooting_player_position_code,goalie_in_net_id,goalie_in_net_name,goalie_in_net_team_id,goalie_in_net_position_code,scoring_player_id,scoring_player_name,scoring_player_team_id,scoring_player_position_code,assist1_player_id,assist1_player_name,assist1_player_team_id,assist1_player_position_code,assist2_player_id,assist2_player_name,assist2_player_team_id,assist2_player_position_code,time_in_period_seconds,game_seconds,last_event_type,last_x,last_y,time_since_last_event,distance_from_last_event,is_rebound,speed,last_angle,absolute_angle_change,power_play_time_elapsed,shooting_team_skaters,opposing_team_skaters
339914,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,8,0,8,1,REG,3,00:00,20:00,1551,False,False,520,period-start,,,,,,,,,Event period-start,,,,,,0.000000,0.000000,center,89,,,,,,,,,,,,,,,,,,,,,0,0,,,,,,False,0.000000,,0.000000,0,5,5
339915,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,9,1,9,1,REG,3,00:00,20:00,1551,False,False,502,faceoff,,,,,0.0,0.0,N,,Event faceoff,52.0,,,,,89.000000,0.000000,center,89,,,,,,,,,,,,,,,,,,,,,0,0,period-start,,,0.0,,False,0.000000,0.000000,0.000000,0,5,5
339916,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,51,2,10,1,REG,3,00:17,19:43,1551,False,False,503,hit,,,,,-94.0,35.0,O,,Event hit,15.0,,,,,35.355339,-98.130102,left,-89,,,,,,,,,,,,,,,,,,,,,17,17,faceoff,0.0,0.0,17.0,100.304536,False,5.900267,0.000000,0.000000,0,5,5
339917,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,10,3,11,1,REG,3,00:31,19:29,1551,False,False,508,blocked-shot,,,,,-68.0,31.0,D,,Event blocked-shot,15.0,,,,,37.443290,-55.885527,left,-89,8471214.0,Alex Ovechkin,15.0,L,,,,,,,,,,,,,,,,,31,31,hit,-94.0,35.0,14.0,26.305893,False,1.878992,-98.130102,0.000000,0,5,5
339918,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,52,4,15,1,REG,3,01:20,18:40,1551,False,False,516,stoppage,,,,,,,,,Event stoppage,,,,,,0.000000,0.000000,center,89,,,,,,,,,,,,,,,,,,,,,80,80,blocked-shot,-68.0,31.0,49.0,,True,,-55.885527,55.885527,0,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340009,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,251,95,186,1,REG,3,16:05,03:55,1541,False,False,516,stoppage,,,,,,,,,Event stoppage,,,,,,0.000000,0.000000,center,89,,,,,,,,,,,,,,,,,,,,,965,965,faceoff,69.0,-22.0,4.0,,False,,7.926927,0.000000,81,4,5
340010,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,209,96,187,1,REG,3,16:05,03:55,1541,False,False,502,faceoff,,,,,69.0,22.0,D,,Event faceoff,15.0,,,,,159.524293,-7.926927,left,-89,,,,,,,,,,,,,,,,,,,,,965,965,stoppage,,,0.0,,False,0.000000,0.000000,0.000000,81,5,4
340011,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,252,97,188,1,REG,3,16:12,03:48,1541,False,False,504,giveaway,,,,,81.0,-36.0,D,,Event giveaway,15.0,,,,,173.769963,11.956584,right,-89,,,,,,,,,,,,,,,,,,,,,972,972,faceoff,69.0,22.0,7.0,59.228372,False,8.461196,-7.926927,0.000000,88,5,4
340012,2017021065,20172018,2,2018-03-12,Capital One Arena,Washington,52,WPG,Jets,15,WSH,Capitals,210,98,189,1,REG,3,16:23,03:37,1541,False,False,506,shot-on-goal,,,6.0,10.0,89.0,17.0,O,wrist,Philipp Grubauer stops a shot from Joel Armia,52.0,,,,,17.000000,90.000000,right,89,8476469.0,Joel Armia,52.0,R,8475831.0,Philipp Grubauer,15.0,G,,,,,,,,,,,,,983,983,giveaway,81.0,-36.0,11.0,53.600373,False,4.872761,11.956584,0.000000,99,4,5
