In [246]:
import pandas as pd
import numpy as np

In [247]:
# Read the play-by-play and shift CSVs from the RAW folder into two DataFrames.
df_pbp, df_shifts = map(
    pd.read_csv,
    (r"RAW/nhl_pbp_20212022.csv", r"RAW/nhl_shifts_20212022.csv"),
)

# Cleaning play-by-play data

In [248]:
# Preview the first five rows (head() defaults to n=5).
df_pbp.head()

Unnamed: 0.1,Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,...,Away_Score,Home_Score,Away_Goalie,Away_Goalie_Id,Home_Goalie,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach
0,0,20001,2021-10-12,1,PSTR,Period Start- Local time: 7:43 EDT,0:00,0.0,5x5,,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,,,Jon Cooper,Mike Sullivan
1,1,20001,2021-10-12,1,FAC,PIT won Neu. Zone - PIT #77 CARTER vs TBL #21 ...,0:00,0.0,5x5,Neu,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,0.0,0.0,Jon Cooper,Mike Sullivan
2,2,20001,2021-10-12,1,HIT,"TBL #18 PALAT HIT PIT #77 CARTER, Off. Zone",0:18,18.0,5x5,Off,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,46.0,40.0,Jon Cooper,Mike Sullivan
3,3,20001,2021-10-12,1,STOP,PUCK IN NETTING,0:38,38.0,5x5,,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,,,Jon Cooper,Mike Sullivan
4,4,20001,2021-10-12,1,FAC,TBL won Neu. Zone - PIT #9 RODRIGUES vs TBL #7...,0:38,38.0,5x5,Neu,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,0.0,0.0,Jon Cooper,Mike Sullivan


In [249]:
# Keep only the columns listed here; every other column is discarded.
columns = [
    # game / event context
    'Game_Id', 'Date', 'Period', 'Event', 'Description', 'Strength', 'Type',
    # teams
    'Away_Team', 'Home_Team',
    # players attached to the event
    'p1_name', 'p1_ID',
    'p2_name', 'p2_ID',
    'p3_name', 'p3_ID',
]

df_pbp = df_pbp.loc[:, columns]

In [250]:
# Three goals are missing a 'Type'. They could simply be dropped, but since there are so few,
# the shot type was filled in manually after watching the game highlights to see how each
# goal was scored.
# NOTE: the row numbers below are positions in df_pbp at this point in the notebook; they
# are tied to this exact input file and must be re-checked if the data changes.
list_shots = ['SNAP SHOT', 'BACKHAND', 'BACKHAND']
# Look the column position up by name instead of hard-coding 6, so the write keeps
# targeting 'Type' even if the column list above is reordered or extended.
df_pbp.iloc[[96417, 110893, 225472], df_pbp.columns.get_loc('Type')] = list_shots

In [251]:
# Include the type of shot used in the label of each shot on goal or goal.

# Misses are grouped in with shots on goal. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on df_pbp['Event']: an in-place call on a
# selected column is chained assignment, which emits a FutureWarning in recent pandas and
# leaves df_pbp unchanged once Copy-on-Write is enabled.
df_pbp['Event'] = df_pbp['Event'].replace('MISS', 'SHOT')

# Penalty types that are labelled as fighting. Built once here rather than on every row.
_FIGHTING_PENALTY_TYPES = frozenset({
    'Fighting (maj)',
    'Instigator(2 min)',
    'Instigator - Misconduct(10 min)',
    'Aggressor(10 min)',
})


def concat_event_type(row):
    """Build the combined event label for a single play-by-play row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series produced by ``DataFrame.apply(axis=1)``)
        Must provide ``row['Event']``; ``row['Type']`` is read only for
        'SHOT', 'GOAL' and 'PENL' events.

    Returns
    -------
    The label for the row:
      * ``"SHOT (<Type>)"`` / ``"GOAL (<Type>)"`` for shots and goals,
      * ``"PENL (FIGHTING)"`` for penalties whose type is one of the fighting types,
      * ``"PENL (OTHER)"`` for every other penalty,
      * the unchanged ``row['Event']`` value for all other events.

    The function only computes a label and does not modify ``row``. The previous
    version had statements after the final ``return`` that tried to drop 'Event' and
    'Type'; they could never run and were mis-indented, so they have been removed.
    """
    event = row['Event']

    # Shots and goals carry the shot type in their label.
    if event in ('SHOT', 'GOAL'):
        return f"{event} ({row['Type']})"

    # Penalties are split into fighting-related and everything else.
    if event == 'PENL':
        category = 'FIGHTING' if row['Type'] in _FIGHTING_PENALTY_TYPES else 'OTHER'
        return f"{event} ({category})"

    # Any other event keeps its original name.
    return event

# Run concat_event_type over the frame one row at a time and store the resulting
# labels in a new 'Event_New' column.
df_pbp['Event_New'] = df_pbp.apply(concat_event_type, axis='columns')


In [255]:
# Keep only the rows whose label is one of the events we care about
# (goals, hits, shots, takeaways, giveaways, blocks, penalties).
relevant_events = [
    'HIT',
    'SHOT (WRIST SHOT)',
    'TAKE',
    'BLOCK',
    'SHOT (SLAP SHOT)',
    'SHOT (BACKHAND)',
    'SHOT (TIP-IN)',
    'SHOT (SNAP SHOT)',
    'GIVE',
    'GOAL (SNAP SHOT)',
    'GOAL (WRIST SHOT)',
    'SHOT (WRAP-AROUND)',
    'SHOT (DEFLECTED)',
    'PENL (OTHER)',
    'GOAL (TIP-IN)',
    'GOAL (BACKHAND)',
    'GOAL (DEFLECTED)',
    'GOAL (SLAP SHOT)',
    'PENL (FIGHTING)',
    'GOAL (WRAP-AROUND)',
]

df_pbp = df_pbp[df_pbp['Event_New'].isin(relevant_events)]


In [257]:
# Restrict the data to plays whose 'Strength' value is exactly '5x5'.
df_pbp = df_pbp[df_pbp['Strength'].eq('5x5')]

In [262]:
# Preview the first five rows after filtering (head() defaults to n=5).
df_pbp.head()

Unnamed: 0,Game_Id,Date,Period,Event,Description,Strength,Type,Away_Team,Home_Team,p1_name,p1_ID,p2_name,p2_ID,p3_name,p3_ID,Event_New
2,20001,2021-10-12,1,HIT,"TBL #18 PALAT HIT PIT #77 CARTER, Off. Zone",5x5,,PIT,T.B,ONDREJ PALAT,8476292.0,JEFF CARTER,8470604.0,,,HIT
5,20001,2021-10-12,1,HIT,"TBL #71 CIRELLI HIT PIT #28 PETTERSSON, Off. Zone",5x5,,PIT,T.B,ANTHONY CIRELLI,8478519.0,MARCUS PETTERSSON,8477969.0,,,HIT
6,20001,2021-10-12,1,SHOT,"TBL ONGOAL - #91 STAMKOS, Wrist, Off. Zone, 42...",5x5,WRIST SHOT,PIT,T.B,STEVEN STAMKOS,8474564.0,,,,,SHOT (WRIST SHOT)
9,20001,2021-10-12,1,TAKE,"TBL TAKEAWAY - #7 JOSEPH, Off. Zone",5x5,,PIT,T.B,MATHIEU JOSEPH,8478472.0,,,,,TAKE
10,20001,2021-10-12,1,BLOCK,"TBL #24 BOGOSIAN BLOCKED BY PIT #23 MCGINN, W...",5x5,WRIST SHOT,PIT,T.B,BROCK MCGINN,8476934.0,ZACH BOGOSIAN,8474567.0,,,BLOCK
