In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from sklearn.cluster import KMeans

### Load the event-level data from its CSV export and preview the first rows
EVENTS_CSV = 'Linhac_df_keyed_20_games.csv'
df = pd.read_csv(EVENTS_CSV)
df.head(10)

Unnamed: 0,gameid,opposingteamgoalieoniceid,opposingteamid,playerid,teamgoalieoniceid,teamid,teaminpossession,currentpossession,xg,compiledgametime,...,manpowersituation,opposingteamskatersonicecount,outcome,period,playerprimaryposition,scoredifferential,teamskatersonicecount,type,xadjcoord,yadjcoord
0,66445,506563.0,916,358235,940804.0,742,,,,0.0,...,evenStrength,5,failed,1,F,0,5,none,0.305008,-0.252941
1,66445,940804.0,742,586302,506563.0,916,,,,0.0,...,evenStrength,5,successful,1,F,0,5,recoveredwithentry,-0.305008,0.252941
2,66445,506563.0,916,358235,940804.0,742,,,,0.033333,...,evenStrength,5,failed,1,F,0,5,contested,-0.197929,0.752941
3,66445,940804.0,742,689086,506563.0,916,916.0,0.0,,0.1,...,evenStrength,5,successful,1,D,0,5,faceoff,-7.849129,-4.77647
4,66445,940804.0,742,689086,506563.0,916,916.0,0.0,,0.166667,...,evenStrength,5,successful,1,D,0,5,south,-6.843246,-3.267647
5,66445,940804.0,742,591556,506563.0,916,916.0,0.0,,1.0,...,evenStrength,5,successful,1,D,0,5,regular,-23.943245,7.294117
6,66445,940804.0,742,591556,506563.0,916,916.0,0.0,,2.6,...,evenStrength,5,successful,1,D,0,5,south,-21.428535,7.797058
7,66445,940804.0,742,689086,506563.0,916,916.0,0.0,,3.3,...,evenStrength,5,successful,1,D,0,5,regular,-29.475601,-18.355881
8,66445,940804.0,742,689086,506563.0,916,916.0,0.0,,3.633333,...,evenStrength,5,successful,1,D,0,5,none,-24.949127,-24.894119
9,66445,940804.0,742,689086,506563.0,916,,,,3.633333,...,evenStrength,5,successful,1,D,0,5,carrywithplay,-24.949127,-24.894119


In [2]:
### Drop shootout situations due to minimal counts and unique rules.
### .copy() makes the filtered frame independent, so the column assignments below
### never write into a slice of the original frame (avoids SettingWithCopyWarning).
SHOOTOUT_EVENTS = ['soshot', 'sogoal', 'sopuckprotection']
df = df[~df['eventname'].isin(SHOOTOUT_EVENTS)].copy()

### Create a feature for manpower difference between the two teams
df['manpower_diff'] = df['teamskatersonicecount'] - df['opposingteamskatersonicecount']

### Discretize the x,y locations of each event by clustering into (at most) 10 locations for Q-learning.
### This Q-learning approach uses a "Q-table" to track and update q-values; the dimensions of the table are
### (# of states, # of actions). If we used raw x,y coordinates the q-table would be large & sparse, and the
### model would not generalize across states.
N_LOCATION_CLUSTERS = 10
df['location'] = 0
for event in df['eventname'].unique():
    # Compute the row mask once and reuse it for both the read and the write
    event_rows = df['eventname'] == event
    X = df.loc[event_rows, ['xadjcoord', 'yadjcoord']]
    # KMeans raises if n_clusters exceeds the number of samples, so rare event
    # types get one cluster per observation instead of crashing the cell
    n_clusters = min(N_LOCATION_CLUSTERS, len(X))
    clusters = KMeans(n_clusters=n_clusters, random_state=99).fit_predict(X)
    df.loc[event_rows, 'location'] = clusters

### Define the unique states that occur over the 20 games in the dataset.
### A state is the tuple (scoredifferential, period, manpower_diff).
unique_states = df[['scoredifferential', 'period', 'manpower_diff']] \
                   .drop_duplicates() \
                   .to_records(index=False) \
                   .tolist()

### Define the unique actions that occur over the 20 games in the dataset.
### An action is the tuple (eventname, location cluster).
unique_actions = df[['eventname', 'location']] \
                   .drop_duplicates() \
                   .to_records(index=False) \
                   .tolist()

num_states = len(unique_states)
num_actions = len(unique_actions)

### Lookup tables between the state/action tuples and their Q-table row/column indices
state_mapping = {state: idx for idx, state in enumerate(unique_states)}
id_to_state_mapping = dict(enumerate(unique_states))

action_to_id_mapping = {action: idx for idx, action in enumerate(unique_actions)}
id_to_action_mapping = dict(enumerate(unique_actions))

# id_to_action_mapping

In [3]:
### Base code used from Assignment #2 in SIADS 644
### We are creating a Q-learning class to build and train our Q-model

class QLearningAgent:
    """
    Tabular Q-learning agent with epsilon-greedy exploration.

    The agent keeps a (num_states, num_actions) table of Q-values. Both the
    learning rate and epsilon are decayed each time `train` is told that an
    episode has ended (`done != 0`).
    """

    def __init__(self, num_states, num_actions, seed=None, discount_factor=0.95):
        """
        Args:
            num_states: number of rows of the Q-table
            num_actions: number of columns of the Q-table
            seed: optional seed for the agent's random generator; None draws
                fresh entropy on every construction
            discount_factor: weight applied to the best next-state Q-value in
                the update target
        """
        ### Hyperparameters for Q-learning model
        self.num_states = num_states
        self.num_actions = num_actions
        self.discount_factor = discount_factor

        ### Q-table
        self.q_table = np.zeros((num_states, num_actions))
        self.learning_rate = 0.2
        ### Decay the learning rate to lower the magnitude of updates as we approach minimum Q-delta
        self.learning_rate_decay = 0.998
        self.min_learning_rate = 0.001
        ### Epsilon is to allow for exploration, which helps to generalize the model
        self.epsilon = 0.9
        ### Decay the epsilon value to decrease exploration as training converges
        self.epsilon_decay = 0.99
        # default_rng(None) is the unseeded generator, so no branching is needed
        self.rng = np.random.default_rng(seed)

    def select_action(self, state):
        """
        This function returns an action for the agent to take.
        Args:
            state: the state (Q-table row index) in the current step
        Returns:
            action: the action (Q-table column index) that the agent plans to
                take in the current step
        """

        if self.rng.random() > self.epsilon:
            # Exploit: best known action for this state
            action = np.argmax(self.q_table[state, :])
        else:
            # Explore: sample from the agent's own action count rather than a
            # module-level global, so the agent works wherever it is constructed
            action = self.rng.choice(self.num_actions, 1)[0]

        return action

    def train(self, cur_state, cur_action, reward, next_state, done):
        """
        This function is used for the update of the Q table
        Args:
            - cur_state: the current state
            - cur_action: the current action
            - reward: the reward received
            - next_state: the next state observed
            - `done=1` means that the agent reaches the terminal state ('goal')
              `done=0` means that the current episode does not terminate
              `done=-1` means that the current episode reaches the maximum length and terminates.
        """

        # A terminal transition has no future return, so the target is just the
        # reward. A truncated episode (done == -1) still bootstraps because the
        # next state was not actually terminal.
        if done == 1:
            future_value = 0.0
        else:
            future_value = self.discount_factor * np.max(self.q_table[next_state, :])

        td_error = reward + future_value - self.q_table[cur_state, cur_action]
        self.q_table[cur_state, cur_action] += self.learning_rate * td_error

        # Update epsilon and learning rate at the end of every episode
        if done != 0:
            self.learning_rate = max(self.learning_rate * self.learning_rate_decay,
                                     self.min_learning_rate)
            self.epsilon = self.epsilon * self.epsilon_decay
        
        

In [4]:
### Define state changes in a hockey game based on specific actions
### Reward a sequence with 1 when a goal is scored
### Finish the training episode when a goal is scored

def hockey_game(state, action):
    """
    Apply one action to a game state and report the outcome.

    Args:
        state: (goal_diff, period, manpower) tuple
        action: (event name, location) pair; only the event name affects the result
    Returns:
        (next_state, reward, done): a 'goal' raises goal_diff by one and yields
        reward 1 with done 1. 'penalty' / 'penaltydrawn' shift manpower by one,
        kept within [-2, 2]. Any other event returns the state unchanged.
        Everything except a goal yields reward 0 and done 0.
    """
    goal_diff, period, manpower = state
    event_name, _location = action

    # A goal is the only rewarded, episode-ending event
    if event_name == 'goal':
        return (goal_diff + 1, period, manpower), 1, 1

    if event_name == 'penalty':
        following_state = (goal_diff, period, max(manpower - 1, -2))
    elif event_name == 'penaltydrawn':
        following_state = (goal_diff, period, min(manpower + 1, 2))
    else:
        following_state = state

    return following_state, 0, 0


In [5]:
### Initiate the Q-learning agent
### Walk through the events in dataframe order and train the Q-model on each
### consecutive pair: the previous event supplies (state, action), the current
### event supplies the next state.

agent = QLearningAgent(num_states, num_actions, seed=99)

# Only these five columns are needed, so zip them directly instead of building
# a full row Series per event with iterrows()
event_stream = zip(df['scoredifferential'], df['period'], df['manpower_diff'],
                   df['eventname'], df['location'])

prev_state = None
prev_action = None
for score_diff, period, manpower_diff, event_name, location in event_stream:
    cur_state = (score_diff, period, manpower_diff)
    cur_action = (event_name, location)

    # The first event has no predecessor to update
    if prev_state is None:
        prev_state = cur_state
        prev_action = cur_action
        continue

    # A goal is the only rewarded event and it ends the episode
    if prev_action[0] == 'goal':
        reward = 1
        done = 1
    else:
        reward = 0
        done = 0

    agent.train(state_mapping[prev_state], action_to_id_mapping[prev_action],
                reward, state_mapping[cur_state], done)

    prev_state = cur_state
    prev_action = cur_action

    


In [6]:
### Here we roll out one episode with the previously trained agent from a randomly chosen starting state.
### We use the hockey_game function to determine the next state, and keep training on each step.
### The learning rate & epsilon are decayed when the episode ends (goal or step limit).
### The printed values are the action ids the agent selected during the episode.

MAX_STEPS = 100

print("Your actions during the last episode:")
# Seeded so a Restart & Run All reproduces the same starting state
state = int(np.random.default_rng(99).choice(num_states))
for step in range(MAX_STEPS):
    action = int(agent.select_action(state))
    print(action, end=" ")

    state_decode = id_to_state_mapping[state]
    action_decode = id_to_action_mapping[action]

    next_state_decode, reward, done = hockey_game(state_decode, action_decode)

    # The simulated transition can leave the set of states seen in the data;
    # there is no Q-table row for such a state, so the episode stops here
    if next_state_decode not in state_mapping:
        break
    next_state = state_mapping[next_state_decode]

    # done: 1 = goal, -1 = step limit reached on the last step, 0 = keep going
    if not done and step == MAX_STEPS - 1:
        done = -1
    agent.train(state, action, reward, next_state, done)

    if done == 1:
        break
    state = next_state

Your actions during the last episode:
181 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=52c67ded-f4f7-4855-8de6-8871afe84448' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>