# 3. Data Cleaning

In [3]:
import pandas as pd
import numpy as np
from ift6758.data import load_events_dataframe

In [4]:
def data_cleaning(df):
    """Keep the shot-related columns of an events frame and derive tidy features.

    The input frame is not modified; all work happens on a copy. Every step is
    vectorised, so no Python-level function is called per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Events table. It must contain every column named in ``source_columns``
        below; a missing column raises ``KeyError``. Extra columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``df`` with the columns ``time_in_period``,
        ``time_remaining``, ``period_number``, ``type_desc_key``, ``x_coord``,
        ``y_coord``, ``shooting_player_name``, ``goalie_in_net_name``,
        ``shot_type`` followed by three derived columns:

        * ``is_empty_net`` -- ``'yes'`` when ``goalie_in_net_position_code``
          is missing, ``'no'`` otherwise.
        * ``event_team_name`` -- the home team name when
          ``event_owner_team_id`` equals ``home_team_id``, otherwise the away
          team name.
        * ``situation`` -- ``'even strength'``, ``'power play'`` or
          ``'shorthanded'`` from the point of view of the event's team, or
          ``'unknown'`` when ``situation_code`` is missing or is not a number
          between 0 and 9999.
    """
    source_columns = [
        'time_in_period', 'time_remaining', 'period_number',
        'event_owner_team_id', 'home_team_id', 'home_team_name',
        'away_team_id', 'away_team_name', 'type_desc_key', 'x_coord',
        'y_coord', 'shooting_player_name', 'goalie_in_net_name',
        'shot_type', 'goalie_in_net_position_code', 'situation_code',
    ]
    df_clean = df[source_columns].copy()

    # A missing goalie position code is treated as "nobody in net".
    df_clean['is_empty_net'] = np.where(
        df_clean['goalie_in_net_position_code'].isna(), 'yes', 'no'
    )

    # Name of the team that owns the event: home on an id match, away otherwise.
    is_home_event = df_clean['event_owner_team_id'] == df_clean['home_team_id']
    df_clean['event_team_name'] = df_clean['home_team_name'].where(
        is_home_event, df_clean['away_team_name']
    )

    # situation_code is read as a four-digit number: the first two digits are
    # summed into the away player count and the last two into the home count
    # (presumably a goalie flag plus a skater count per side -- TODO confirm
    # against the data source). Parsing numerically instead of by character
    # keeps codes whose leading zero was lost (651 == '0651') and turns
    # missing / malformed codes into NaN instead of raising.
    codes = pd.to_numeric(df_clean['situation_code'], errors='coerce').astype('float64')
    has_code = codes.between(0, 9999)  # NaN compares False, so it is excluded
    digits = codes.where(has_code, 0).astype('int64')
    away_players = (digits // 1000) + (digits // 100) % 10
    home_players = (digits // 10) % 10 + digits % 10

    away_advantage = (away_players > home_players).to_numpy()
    home_advantage = (home_players > away_players).to_numpy()
    event_by_away = (df_clean['away_team_name'] == df_clean['event_team_name']).to_numpy()
    event_by_home = (df_clean['home_team_name'] == df_clean['event_team_name']).to_numpy()

    # np.select takes the first matching condition; rows that match none have
    # a player advantage held by the *other* team, i.e. they are shorthanded.
    df_clean['situation'] = np.select(
        [
            ~has_code.to_numpy(),
            ~away_advantage & ~home_advantage,
            (away_advantage & event_by_away) | (home_advantage & event_by_home),
        ],
        ['unknown', 'even strength', 'power play'],
        default='shorthanded',
    )

    # Drop the helper columns that were only needed to derive the features.
    return df_clean.drop(columns=[
        'goalie_in_net_position_code', 'away_team_name', 'home_team_name',
        'situation_code', 'event_owner_team_id', 'home_team_id', 'away_team_id',
    ])

In [5]:
# Loading the full events table and cleaning it can take a few minutes.
df = load_events_dataframe()

# Keep the cleaned frame under a name so later cells can reuse it without
# recomputing; the bare last expression still renders it as the cell output.
df_clean = data_cleaning(df)
df_clean

Found 647679 events


Unnamed: 0,time_in_period,time_remaining,period_number,type_desc_key,x_coord,y_coord,shooting_player_name,goalie_in_net_name,shot_type,is_empty_net,event_team_name,situation
0,01:11,18:49,1,shot-on-goal,-77.0,5.0,Mitch Marner,Craig Anderson,wrist,no,Maple Leafs,even strength
1,02:53,17:07,1,shot-on-goal,86.0,13.0,Chris Kelly,Frederik Andersen,wrist,no,Senators,even strength
2,04:01,15:59,1,shot-on-goal,23.0,-38.0,Cody Ceci,Frederik Andersen,wrist,no,Senators,even strength
3,04:46,15:14,1,shot-on-goal,33.0,-15.0,Erik Karlsson,Frederik Andersen,slap,no,Senators,even strength
4,06:46,13:14,1,shot-on-goal,-34.0,28.0,Martin Marincin,Craig Anderson,wrist,no,Maple Leafs,even strength
...,...,...,...,...,...,...,...,...,...,...,...,...
647674,12:57,07:03,3,shot-on-goal,85.0,1.0,Zach Hyman,Sergei Bobrovsky,poke,no,Oilers,even strength
647675,14:25,05:35,3,shot-on-goal,-53.0,-27.0,Vladimir Tarasenko,Stuart Skinner,wrist,no,Panthers,even strength
647676,15:23,04:37,3,shot-on-goal,-59.0,-29.0,Aleksander Barkov,Stuart Skinner,snap,no,Panthers,even strength
647677,15:48,04:12,3,shot-on-goal,57.0,-26.0,Darnell Nurse,Sergei Bobrovsky,wrist,no,Oilers,even strength
