<a href="https://colab.research.google.com/github/kgrossnickle/Deep_RL_EPL_Predictions/blob/master/epl_deep_rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install tensorflow-gpu


In [0]:
# Mount Google Drive at /content/drive; the CSV data and saved-model paths
# used further down all live under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

# The Deep Q Learning Class

This class implements Q-learning and uses a neural network as the function approximator for the optimal Q-value.

In [0]:
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

class deep_q_learning_agent(object):
    """Epsilon-greedy Deep Q-Learning agent with a circular replay memory.

    A single ``deep_q_learning_nn`` instance is used both to choose greedy
    actions and to evaluate next-state values when building training targets
    (there is no separate target network).
    """

    def __init__(self, alpha, gamma, memory_size, num_actions, epsilon, batch_size,
                 num_games, input_dims=(210,160,4), epsilon_decay_close_to_one=0.9,
                 epsilon_min_val=0.0, save_dir='/content/drive/My Drive/Colab Notebooks/epl_data/saved_models/'):
        """Create the Q-network and allocate the replay memory.

        Args:
            alpha: Learning rate handed to the Q-network.
            gamma: Discount factor applied to the best next-state Q-value.
            memory_size: Number of transitions the replay memory can hold.
            num_actions: Size of the discrete action space.
            epsilon: Initial probability of taking a random action.
            batch_size: Number of transitions sampled per learning step.
            num_games: Stored on the agent; not otherwise used by this class.
            input_dims: Shape of a single state (without the batch axis).
            epsilon_decay_close_to_one: Multiplicative epsilon decay applied
                after every learning step.
            epsilon_min_val: Lower bound for epsilon.
            save_dir: Checkpoint directory handed to the Q-network.
        """
        self.action_space = list(range(num_actions))
        self.num_games = num_games
        self.epsilon = epsilon
        self.epsilon_decay_close_to_one = epsilon_decay_close_to_one
        self.epsilon_min_val = epsilon_min_val
        self.gamma = gamma
        self.num_actions = num_actions
        self.memory_size = memory_size
        # Total number of transitions ever stored; wraps via modulo on write.
        self.memory_loc = 0
        self.batch_size = batch_size
        self.q_val_for_cur_state = deep_q_learning_nn(alpha, num_actions, input_dims=input_dims,
                                   name='q_val_for_cur_state', chkpt_dir=save_dir)
        self.state_memory = np.zeros((self.memory_size, *input_dims))
        self.new_state_mem = np.zeros((self.memory_size, *input_dims))
        # Actions are stored one-hot, one row per transition.
        self.action_mem = np.zeros((self.memory_size, self.num_actions),
                                   dtype=np.int8)
        self.reward_mem = np.zeros(self.memory_size)
        # Holds ``1 - terminal``: 0 for terminal transitions, 1 otherwise.
        self.is_terminal_mem = np.zeros(self.memory_size, dtype=np.int8)

    def put_transition_in_NN(self, state, action, reward, next_state, terminal):
        """Store one transition in the replay memory (despite the name, no
        network call happens here).

        Args:
            state: State in which the action was taken.
            action: Integer index of the action taken.
            reward: Reward received.
            next_state: State observed after the action.
            terminal: 1 if the transition ended the episode, else 0.
        """
        slot = self.memory_loc % self.memory_size  # overwrite oldest entry when full
        one_hot = np.zeros(self.num_actions)
        one_hot[action] = 1.0

        self.state_memory[slot] = state
        self.action_mem[slot] = one_hot
        self.reward_mem[slot] = reward
        self.new_state_mem[slot] = next_state
        self.is_terminal_mem[slot] = 1 - terminal
        self.memory_loc += 1

    def take_action_based_on_epsilon(self, state):
        """Return a random action with probability epsilon, else the action
        with the highest predicted Q-value for ``state``."""
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        net = self.q_val_for_cur_state
        batched_state = np.asarray(state)[np.newaxis, :]  # add the batch axis
        q_values = net.sess.run(net.Q_values, feed_dict={net.input: batched_state})
        return np.argmax(q_values)

    def save_models(self):
        """Write the Q-network checkpoint."""
        self.q_val_for_cur_state.save_model()

    def load_models(self):
        """Restore the Q-network checkpoint and switch to purely greedy play."""
        self.epsilon = 0.0
        self.epsilon_min_val = 0.0
        self.q_val_for_cur_state.load_model()

    def learn_from_transition(self):
        """Run one training step on a random batch from the replay memory.

        Sampling is with replacement from the filled part of the memory. The
        target equals the network's current prediction everywhere except at
        the taken action, where it is
        ``reward + gamma * max_a Q(next_state, a) * (1 - terminal)``.
        Epsilon is decayed afterwards and never drops below
        ``epsilon_min_val``. Does nothing if no transition has been stored.
        """
        filled = min(self.memory_loc, self.memory_size)
        if filled == 0:
            # Nothing to sample from yet; np.random.choice(0, n) would raise.
            return

        batch = np.random.choice(filled, self.batch_size)
        rewards = self.reward_mem[batch]
        next_states = self.new_state_mem[batch]
        not_terminal = self.is_terminal_mem[batch]
        states = self.state_memory[batch]
        one_hot_actions = self.action_mem[batch]
        # Recover action indices straight from the one-hot rows. argmax avoids
        # the int8 arithmetic that overflows once there are > 127 actions.
        action_idxs = np.argmax(one_hot_actions, axis=1)

        net = self.q_val_for_cur_state
        q_current = net.sess.run(net.Q_values, feed_dict={net.input: states})
        q_next = net.sess.run(net.Q_values, feed_dict={net.input: next_states})

        targets = q_current.copy()
        rows = np.arange(self.batch_size, dtype=np.int32)
        targets[rows, action_idxs] = rewards + (self.gamma * (np.max(q_next, axis=1) * not_terminal))

        net.sess.run(net.train_object,
                     feed_dict={net.input: states,
                                net.actions: one_hot_actions,
                                net.optimal_q_from_bellman: targets})

        # Decay and clamp in one step so epsilon never undershoots the floor.
        self.epsilon = max(self.epsilon * self.epsilon_decay_close_to_one,
                           self.epsilon_min_val)



class deep_q_learning_nn(object):
    """Fully connected Q-network (two ReLU hidden layers) built with the
    TensorFlow 1.x graph API, owning its own session and saver."""

    def __init__(self, alpha_val, num_actions, name, input_dims,
                 fc1_dims=256, fc2_dims=256, chkpt_dir='/content/drive/My Drive/Colab Notebooks/epl_data/saved_models/deppqinfo'):
        """Build the graph, start a session and initialise all variables.

        Args:
            alpha_val: Learning rate for the Adam optimizer.
            num_actions: Number of outputs (one Q-value per action).
            name: Variable scope under which the network is built.
            input_dims: Shape of a single input state (without batch axis).
            fc1_dims: Units in the first hidden layer.
            fc2_dims: Units in the second hidden layer.
            chkpt_dir: Directory checkpoints are written to and read from.
        """
        self.alpha_val = alpha_val
        self.chkpt_dir = chkpt_dir
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.name = name
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.sess = tf.compat.v1.Session()
        self.build_NN()
        self.sess.run(tf.compat.v1.global_variables_initializer())
        # Created after build_NN so it covers every variable of the graph.
        self.saver = tf.compat.v1.train.Saver()
        self.save_fie = os.path.join(chkpt_dir,'tf_deepqnet.ckpt')
        self.trainable_params = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=self.name)

    def build_NN(self):
        """Create placeholders, the dense layers, the MSE loss and the Adam
        training op inside this network's variable scope."""
        with tf.compat.v1.variable_scope(self.name):
            self.input = tf.compat.v1.placeholder(tf.float32,
                                                  shape=[None, *self.input_dims],
                                                  name='inputs')
            # Fed during training but not referenced by the loss below.
            self.actions = tf.compat.v1.placeholder(tf.float32,
                                                    shape=[None, self.num_actions],
                                                    name='action_taken')
            self.optimal_q_from_bellman = tf.compat.v1.placeholder(tf.float32,
                                                                   shape=[None, self.num_actions],
                                                                   name='q_value')

            flattened_input = tf.compat.v1.layers.flatten(self.input)
            dense_layer_1 = tf.compat.v1.layers.dense(flattened_input, units=self.fc1_dims,
                                                      activation=tf.nn.relu)
            dense_layer_2 = tf.compat.v1.layers.dense(dense_layer_1, units=self.fc2_dims,
                                                      activation=tf.nn.relu)
            # Linear output layer: one unbounded Q-value per action.
            self.Q_values = tf.compat.v1.layers.dense(dense_layer_2, units=self.num_actions)

            self.loss = tf.reduce_mean(tf.square(self.Q_values - self.optimal_q_from_bellman))
            self.train_object = tf.compat.v1.train.AdamOptimizer(self.alpha_val).minimize(self.loss)

    def save_model(self):
        """Write a checkpoint of all graph variables to ``self.save_fie``."""
        # Saver.save does not create missing parent directories.
        os.makedirs(self.chkpt_dir, exist_ok=True)
        self.saver.save(self.sess, self.save_fie)

    def load_model(self):
        """Restore the latest checkpoint in ``self.chkpt_dir`` into the
        variables of the already-built graph.

        Raises:
            FileNotFoundError: If the directory holds no checkpoint.
        """
        print(self.save_fie)
        # Restore through the saver that belongs to this graph. Importing the
        # meta graph again would add a second copy of the network, and the
        # restore would then not target the tensors this object evaluates.
        latest = tf.compat.v1.train.latest_checkpoint(self.chkpt_dir)
        if latest is None:
            raise FileNotFoundError("No checkpoint found in " + str(self.chkpt_dir))
        self.saver.restore(self.sess, latest)

**Environment Class**

How we interact with the environment.

We store running averages of each team's recent home games and recent away games in two separate tables. (The text says 15 games; the code below caps the averaging window at 10.)

We also implement steps here: a step returns the reward for the prediction on the current game, the updated stats of the two teams that just played, and the stored stats of the two teams in the next game, which are the input for the next prediction.

In [0]:
class environment():
    """Match-by-match EPL environment built from season CSV files.

    Matches are replayed in file order. For every team the environment keeps
    running averages of FTHG, FTAG and FTR over its home matches and,
    separately, over its away matches. A state is the home team's home-table
    averages followed by the away team's away-table averages.

    Outcome encoding in ``self.df['FTR']``: 0.0 = away win, 1.0 = draw,
    2.0 = anything else (home win).
    """

    DEFAULT_CSV_PATHS = (
        "/content/drive/My Drive/Colab Notebooks/epl_data/season-1617_csv.csv",
        "/content/drive/My Drive/Colab Notebooks/epl_data/season-1718_csv.csv",
        "/content/drive/My Drive/Colab Notebooks/epl_data/season-1819_csv.csv",
    )
    DROPPED_COLUMNS = ("Date", "Div", "Referee", "HTR")
    STATE_COLUMNS = ('Team', 'Num_Games', 'FTHG', 'FTAG', 'FTR')
    # Only the first 21 remaining CSV columns are kept.
    KEPT_COLUMN_COUNT = 21
    # Num_Games is capped here, so older matches fade out of the average.
    # The attribute names say "15", but the cap has always been 10.
    AVERAGING_WINDOW = 10

    def __init__(self, csv_paths=None):
        """Load and encode the match data.

        Args:
            csv_paths: Iterable of CSV paths, concatenated in order. Defaults
                to the three season files on the mounted Google Drive.
        """
        paths = self.DEFAULT_CSV_PATHS if csv_paths is None else csv_paths
        seasons = [pd.read_csv(path).drop(columns=list(self.DROPPED_COLUMNS))
                   for path in paths]
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df = pd.concat(seasons, ignore_index=True, sort=False)

        outcome_codes = {"A": 0.0, "D": 1.0}
        df["FTR"] = np.array([outcome_codes.get(result, 2.0) for result in df["FTR"]],
                             dtype=np.float64)

        # Number teams in order of first appearance (home before away per row).
        team_ids = {}
        for home, away in zip(df["HomeTeam"], df["AwayTeam"]):
            team_ids.setdefault(home, len(team_ids))
            team_ids.setdefault(away, len(team_ids))
        self.teams_idx_val_array = list(team_ids)
        df["HomeTeam"] = [team_ids[team] for team in df["HomeTeam"]]
        df["AwayTeam"] = [team_ids[team] for team in df["AwayTeam"]]

        self.home_team_15_game_state = pd.DataFrame(columns=list(self.STATE_COLUMNS),
                                                    dtype=np.float64)
        self.away_team_15_game_state = pd.DataFrame(columns=list(self.STATE_COLUMNS),
                                                    dtype=np.float64)

        self.df = df.iloc[:, :self.KEPT_COLUMN_COUNT].copy()
        self.cur_line = 0
        self.number_of_games = len(self.df) - 1

    @staticmethod
    def _mirror_column(col):
        """Swap the home/away letter of a stat column (FTHG <-> FTAG);
        names without such a letter (FTR) come back unchanged."""
        if col.count("H") == 1 and "A" not in col:
            return col.replace("H", "A")
        if "A" in col and "H" not in col:
            return col.replace("A", "H")
        if col.count("H") == 2:
            return col[:2] + "A" + col[3:]
        return col.replace("A", "H")

    def _fold_current_match(self, state, team, mirrored):
        """Fold row ``self.cur_line`` of ``self.df`` into ``team``'s running
        averages and return the (possibly re-allocated) state table."""
        team_rows = state.index[(state["Team"] == team).to_numpy()]
        if len(team_rows) == 0:
            fresh = {col: 0.0 for col in state.columns}
            fresh["Team"] = team
            fresh_row = pd.DataFrame([fresh], columns=state.columns)
            state = fresh_row if state.empty else pd.concat([state, fresh_row],
                                                            ignore_index=True)
            row = state.index[-1]
        else:
            row = team_rows[0]

        games = min(float(state.at[row, "Num_Games"]) + 1.0, float(self.AVERAGING_WINDOW))
        state.at[row, "Num_Games"] = games

        for col in state.columns:
            if str(col) in ("Num_Games", "Team"):
                continue
            if mirrored:
                if str(col) == "FTR":
                    # NOTE(review): this flips the *stored average* when it is
                    # exactly 0 or 2, not the incoming match result. Kept
                    # as-is so features stay compatible with saved models.
                    stored = state.at[row, col]
                    if stored == 0.0:
                        state.at[row, col] = 2.0
                    elif stored == 2.0:
                        state.at[row, col] = 0.0
                col = self._mirror_column(col)
            previous = float(state.at[row, col])
            latest = float(self.df.at[self.cur_line, col])
            state.at[row, col] = (previous * (games - 1) + latest) / games
        return state

    def update_team_info(self, home_team, away_team):
        """Update the home table for ``home_team`` and the away table for
        ``away_team`` with the match at ``self.cur_line``."""
        self.home_team_15_game_state = self._fold_current_match(
            self.home_team_15_game_state, home_team, mirrored=False)
        self.away_team_15_game_state = self._fold_current_match(
            self.away_team_15_game_state, away_team, mirrored=True)

    @staticmethod
    def _stats_vector(state, team):
        """Return ``team``'s averaged stats (without Team/Num_Games), or
        zeros if the team has no row yet."""
        stats = state.loc[state["Team"] == team].drop(columns=["Num_Games", "Team"])
        if stats.empty:
            return np.zeros(stats.shape[1], dtype=np.float64)
        return stats.to_numpy(dtype=np.float64)[0]

    def get_team_info(self, home_team, away_team, is_swapped):
        """Build the state vector for a fixture.

        Args:
            home_team: Team index looked up in the home table.
            away_team: Team index looked up in the away table.
            is_swapped: If true, the away half is placed before the home half.

        Returns:
            1-D float array: home-table stats followed by away-table stats
            (or the reverse when ``is_swapped``).
        """
        home_stats = self._stats_vector(self.home_team_15_game_state, home_team)
        away_stats = self._stats_vector(self.away_team_15_game_state, away_team)
        if is_swapped:
            return np.concatenate([away_stats, home_stats])
        return np.concatenate([home_stats, away_stats])

    def get_reward_and_outcome(self, agent_predicted_outcome, is_swapped):
        """Score a prediction for the current match and advance one match.

        Args:
            agent_predicted_outcome: Predicted result code (0, 1 or 2).
            is_swapped: If true, the actual result is mirrored (0 <-> 2)
                before comparison and the first returned state is swapped.

        Returns:
            Tuple ``(current_state, next_state, reward)``. ``current_state``
            is the state of the teams that just played, after folding this
            match in. ``next_state`` is the state for the following fixture,
            or all zeros when the data is exhausted. ``reward`` is 1 for a
            correct prediction and -1 otherwise.
        """
        actual_outcome = self.df.at[self.cur_line, "FTR"]
        if is_swapped and actual_outcome == 0:
            actual_outcome = 2
        elif is_swapped and actual_outcome == 2:
            actual_outcome = 0

        home_team = self.df.at[self.cur_line, "HomeTeam"]
        away_team = self.df.at[self.cur_line, "AwayTeam"]
        self.update_team_info(home_team, away_team)

        reward = 1 if agent_predicted_outcome == actual_outcome else -1

        self.cur_line += 1
        current_state = self.get_team_info(home_team, away_team, is_swapped)
        if self.cur_line < len(self.df):
            next_state = self.get_team_info(self.df.at[self.cur_line, "HomeTeam"],
                                            self.df.at[self.cur_line, "AwayTeam"],
                                            False)
        else:
            # No fixture follows the last row; return a zero state, not a KeyError.
            next_state = np.zeros_like(current_state)
        return current_state, next_state, reward
    
    

**Training / Running**

If training, we get the arrays of averaged past home/away results for the home and away teams. We then make our prediction from the Q function and learn from the actual result. We also update the running-average arrays with the current game.

If we are just running the model, we do the same as above but elect not to learn from the actual result.

In [26]:
import random
import time

# Replays all matches once as warm-up, rewinds, then replays them again;
# predictions made once the loop counter reaches `num_warmup_games` are
# scored as the test set.
tf.reset_default_graph()
env = environment()
alpha = .1
num_dims = 6
train = False
threshold = .58
eps_decay = .995
# Loop-counter threshold separating warm-up from scored games (2 x 380).
num_warmup_games = 760
if train:
  eps = 1.0
  eps_min = .2
else:
  eps = 0.0
  eps_min = 0.0
agent = deep_q_learning_agent(gamma=1, epsilon=eps, alpha=alpha, input_dims=[num_dims], num_actions=3,
                              memory_size=5000000, num_games=env.number_of_games, batch_size=512,
                              epsilon_min_val=eps_min, epsilon_decay_close_to_one=eps_decay)

# Load the pretrained model when only evaluating.
if not train:
  agent.load_models()

list_of_scores = []
list_of_test_scores = []
epsilons_values_over_time = []
score = 0

# The first state is all zeros because no match has been seen yet.
cur_team_info = np.zeros(num_dims)
league_table = [0] * len(env.teams_idx_val_array)
i = 0
backthrough = 0
is_swapped = False

while True:
    score = 0
    have_finished_episode = False
    cur_action = agent.take_action_based_on_epsilon(cur_team_info)
    if i % 10 == 0:
      # Guard the empty slice: np.mean([]) emits a RuntimeWarning and yields nan.
      recent_scores = list_of_scores[max(0, i-100):i]
      average_score = np.mean(recent_scores) if recent_scores else 0.0

    # The first returned value is the post-match state of the teams that just
    # played (the name is historical); it is what gets stored as next_state.
    actual_outcome, new_team_info, reward = env.get_reward_and_outcome(cur_action, False)

    if i > num_warmup_games:
      # Build the league table from the *predicted* results. Teams are taken
      # from the row just played (env.cur_line was already advanced); the
      # state vector holds averaged stats, not team indices.
      played_row = env.cur_line - 1
      home_idx = int(env.df.at[played_row, "HomeTeam"])
      away_idx = int(env.df.at[played_row, "AwayTeam"])
      predicted_res = cur_action
      if predicted_res == 0:
        league_table[away_idx] += 3
      elif predicted_res == 1:
        league_table[home_idx] += 1
        league_table[away_idx] += 1
      else:
        league_table[home_idx] += 3

    score += reward
    if train:
      # terminal=1 on every transition, so the stored mask zeroes the
      # bootstrap term and each prediction is learned as a one-step problem.
      agent.put_transition_in_NN(cur_team_info, cur_action,
                                 reward, actual_outcome, int(1))
      agent.learn_from_transition()
    cur_team_info = new_team_info

    epsilons_values_over_time.append(agent.epsilon)
    # Map the +1/-1 reward onto 1/0 so the mean is an accuracy.
    if score > 1.0:
      score = 1.0
    elif score < 0:
      score = 0
    list_of_scores.append(score)

    if i >= num_warmup_games:
      if backthrough < 1:
        # First time here: rewind and replay everything once more.
        # NOTE(review): after this rewind `i` runs one ahead of `env.cur_line`,
        # so the `i`-based thresholds are offset by one match in the second
        # pass - confirm that is intended.
        i = 0
        env.cur_line = 0
        backthrough += 1
      else:
        agent.epsilon_min_val = 0.0
        agent.epsilon = 0.0
      list_of_test_scores.append(score)
    i += 1
    if i >= len(env.df):
      break

print("\n\n Of "+str(len(list_of_test_scores))+" Games in 18-19 EPL, we predicted "+ str(np.mean(list_of_test_scores))[:4] +" Correctly" )
if train and np.mean(list_of_test_scores) > threshold:
  agent.save_models()

/content/drive/My Drive/Colab Notebooks/epl_data/saved_models/tf_deepqnet.ckpt


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)




 Of 381 Games in 18-19 EPL, we predicted 0.58 Correctly
