<a href="https://colab.research.google.com/github/ianog88/Statistics_DataScience/blob/main/Actor_Critic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Actor Critic Model
The goal is to train an actor critic model to learn, for each time step, the optimal policy (actor) and the value function (critic).

In [None]:
import pandas as pd
import numpy as np
import math
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

from tqdm import tqdm_notebook
from collections import deque

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw table from a CSV in the working directory
# (presumably AAPL 5-minute bars, going by the file name — confirm against the data source).
df = pd.read_csv('AAPL_5min.csv')

In [None]:
# Keep only the columns labelled '0' through '78' (79 columns), dropping anything else
# that came in from the CSV. The labels are strings, so they are generated with str().
# .copy() makes the result an independent frame, so adding a column to it later does not
# write to a slice of the original (which would raise SettingWithCopyWarning).
df = df[[str(col) for col in range(79)]].copy()

In [None]:
# Display the selected frame (pandas truncates the rendering of large frames).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,69,70,71,72,73,74,75,76,77,78
0,0.005021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,0.000000
1,0.005021,0.000567,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,-0.000773
2,0.005021,0.000567,0.000309,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,-0.000463
3,0.005021,0.000567,0.000309,0.001673,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,-0.000437
4,0.005021,0.000567,0.000309,0.001673,-0.001927,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.00000,-0.002672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4363,0.002684,-0.006115,0.000591,-0.002809,0.000235,0.004458,-0.001051,0.003625,-0.00134,-0.001546,...,-0.000954,-0.000029,0.000637,0.000559,-0.00041,0.00000,0.000000,0.000000,0.00000,-0.000732
4364,0.002684,-0.006115,0.000591,-0.002809,0.000235,0.004458,-0.001051,0.003625,-0.00134,-0.001546,...,-0.000954,-0.000029,0.000637,0.000559,-0.00041,0.00043,0.000000,0.000000,0.00000,-0.000062
4365,0.002684,-0.006115,0.000591,-0.002809,0.000235,0.004458,-0.001051,0.003625,-0.00134,-0.001546,...,-0.000954,-0.000029,0.000637,0.000559,-0.00041,0.00043,-0.000838,0.000000,0.00000,-0.001359
4366,0.002684,-0.006115,0.000591,-0.002809,0.000235,0.004458,-0.001051,0.003625,-0.00134,-0.001546,...,-0.000954,-0.000029,0.000637,0.000559,-0.00041,0.00043,-0.000838,-0.001504,0.00000,-0.001619


In [None]:
# For row i, pick the value stored at column position i % 78, i.e. the "diagonal" entry
# of each consecutive 78-row block (presumably one trading day of 5-minute bars — confirm).
# The row count is taken from the frame instead of being hard-coded, and the lookup is
# done in one vectorised NumPy indexing step rather than a Python loop over iloc.
values = df.to_numpy()
row_positions = np.arange(len(values))
new_col = values[row_positions, row_positions % 78].tolist()

In [None]:
# Append the per-row values as a new last column. Note the label is the integer 79,
# unlike the string labels '0'..'78'; later code reads this column by position (-1).
df[79] = new_col

In [None]:
# Preview the first rows to check the newly added column 79.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,0.005021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005021
1,0.005021,0.000567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000773,0.000567
2,0.005021,0.000567,0.000309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000463,0.000309
3,0.005021,0.000567,0.000309,0.001673,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000437,0.001673
4,0.005021,0.000567,0.000309,0.001673,-0.001927,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.002672,-0.001927


# Convert to numpy and scale the data

In [None]:
# NumPy array of the whole frame (columns '0'..'78' plus 79); this is what the scaler is fitted on.
b = df.to_numpy()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of `b` independently.
# fit_transform() fits the scaler and transforms in one call, so a separate fit() on the
# same data beforehand is redundant and has been dropped.
scaler = MinMaxScaler()
scaled_array = scaler.fit_transform(b)

# Create the env.step() function
`env.step()` is typically imported from OpenAI Gym. It is a function that keeps track of what state we are currently in, takes in an action and outputs the next state, the result from taking that action and a boolean value indicating whether the episode is done or not. I created my own equivalent to this function with the following steps:
*   The global variable "count" keeps track of how many times the function is called and returns the next state by iterating through the dataframe
*   The logic compares the action to the price movement and returns done if the action would've exited the stock
*   The result is simply the return received if we are still in the stock






In [None]:
def test_two(action):
  """Advance the hand-rolled environment by one step (stand-in for `env.step()`).

  Increments the global `count`, then reads row `count` of `df` and row
  `count - 1` of `scaled_array`.

  Args:
    action: numeric level that is compared with the second-to-last column of
      `df` at the new row.

  Returns:
    A tuple `(state, result, dones)`:
      state:  the first 78 scaled values of the row preceding the new `count`.
      result: `action` itself when `action` exceeds the limit, otherwise the
              value in the last column of `df` at the new row.
      dones:  True when `action` exceeds the limit, else False.

  Raises:
    IndexError: when `count` moves past the last row of `df`.
  """
  global count
  count += 1
  row = count

  state = scaled_array[row - 1, 0:78]
  # Read the two scalars by position instead of converting the whole frame to
  # NumPy on every step.
  limit = df.iloc[row, -2]
  reward = df.iloc[row, -1]

  if action > limit:
    return state, action, True
  return state, reward, False

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Create the models
Both the policy and the state value networks take in a 78x1 vector which represents the current time step. The policy network uses the softmax function to output the probability of taking each possible action. The state value network outputs a scalar value representing the estimated cumulative reward of that state.

In [None]:
class PolicyNetwork(nn.Module):
    """Actor: maps a state vector to a probability distribution over actions.

    Args:
        n_features: length of the input state vector (default 78).
        n_hidden: width of the single hidden layer (default 128).
        n_actions: number of discrete actions, i.e. output size (default 10).
    """

    def __init__(self, n_features=78, n_hidden=128, n_actions=10):
        super().__init__()
        self.input_layer = nn.Linear(n_features, n_hidden)
        self.output_layer = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return action probabilities for `x`.

        `x` may be a batch of shape (batch, n_features) or a single state of
        shape (n_features,); the output has the same leading shape with
        n_actions probabilities along the last axis.
        """
        hidden = F.relu(self.input_layer(x))
        logits = self.output_layer(hidden)
        # Normalise over the last axis: identical to dim=1 for batched input,
        # and also valid for an unbatched 1-D state (where dim=1 would raise).
        return F.softmax(logits, dim=-1)

In [None]:
class StateValueNetwork(nn.Module):
    """Critic: maps a state vector to a single scalar value estimate.

    Args:
        n_features: length of the input state vector (default 78).
        n_hidden: width of the single hidden layer (default 128).
    """

    def __init__(self, n_features=78, n_hidden=128):
        super().__init__()
        self.input_layer = nn.Linear(n_features, n_hidden)
        self.output_layer = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Return the value estimate for `x`, with a trailing axis of size 1."""
        hidden = F.relu(self.input_layer(x))
        return self.output_layer(hidden)

# Action Function
The action function takes in the policy network and selects the action by sampling from the probability distribution produced by the policy network. It returns the action and the log probability of that action to be used in the training process

In [None]:
def select_action(
    network,
    state,
    action_list=(-.0001, -.0002, -.0004, -.0006, -.0012, -.0025, -.004, -.006, -.008, -.01),
):
  """Sample an action from the policy network's output distribution.

  Args:
    network: policy module returning, for a (1, n_features) input, a
      (1, n_actions) tensor of action probabilities.
    state: 1-D NumPy array holding the current state.
    action_list: sequence mapping the sampled index to the action value that
      is returned; defaults to the ten levels used in this notebook.

  Returns:
    (action_value, log_prob): the entry of `action_list` at the sampled index,
    and the log-probability tensor of that sample (shape (1,), still attached
    to the network's graph so it can be used in the policy loss).
  """
  # Put the input on whatever device the network's weights live on
  # (CPU if the module has no parameters).
  first_param = next(network.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  state_batch = torch.from_numpy(state).float().unsqueeze(0).to(device)
  action_probs = network(state_batch)

  distribution = Categorical(action_probs)
  action_index = distribution.sample()

  return action_list[action_index.item()], distribution.log_prob(action_index)

# Define the Optimizer and Learning Rate
The networks are trained using the stochastic gradient descent algorithm with a learning rate of 0.0001

In [None]:
# Instantiate the actor and the critic on the selected device.
policy_network = PolicyNetwork().to(DEVICE)
stateval_network = StateValueNetwork().to(DEVICE)

# One plain-SGD optimizer per network, both with the same step size.
policy_optimizer, stateval_optimizer = (
    optim.SGD(net.parameters(), lr=0.0001)
    for net in (policy_network, stateval_network)
)

# Training Loop
*   The training loop is composed of 1000 episodes, where backpropagation is performed after each episode
*   The critic network is trained by minimising the difference between the estimated state value and the next state value plus the reward, using the mean-squared error loss function
*   The policy network is trained using the advantage function which measures the difference in rewards between taking a certain action in the current state and the expected return of the current state with the current policy. This will encourage actions that increase the expected returns
*   A discount factor is not appropriate in this scenario as the value of current returns does not differ from the value of returns an hour or two into the future


In [None]:
NUM_EPISODES = 1000

for episode in range(NUM_EPISODES):
  # Start on the first row of a randomly chosen 78-row block (56 blocks in total).
  count = (random.randrange(56))*78
  # This first call is only used to fetch the initial state. -0.99 lies far below every
  # level select_action can return, and the reward/done it yields are discarded.
  state, reward, dones = test_two(-0.99)
  dones = False

  for i in range(78):
    # test_two() reads row count + 1 of df; stop the episode rather than index past the end.
    if count + 1 >= len(df):
      break

    action, lp = select_action(policy_network, state)

    new_state, reward, dones = test_two(action)
    # The reward may arrive as a NumPy float64; a plain float keeps the arithmetic
    # with float32 tensors below in float32.
    reward = float(reward)

    state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
    state_val = stateval_network(state_tensor)

    if dones:
      # Terminal step: there is no future value to bootstrap from.
      new_state_val = torch.zeros(1, 1, device=DEVICE)
    else:
      new_state_tensor = torch.from_numpy(new_state).float().unsqueeze(0).to(DEVICE)
      # The bootstrap value is treated as a fixed target, so no gradient flows through it.
      with torch.no_grad():
        new_state_val = stateval_network(new_state_tensor)

    # One-step TD target (no discount factor) and the resulting critic loss.
    td_target = reward + new_state_val
    val_loss = F.mse_loss(state_val, td_target)

    # Advantage = TD error, used as a plain scalar weight on the log-probability.
    advantage = td_target.item() - state_val.item()
    policy_loss = -lp * advantage

    # The update runs on every step, not only on the terminal one. The actor and critic
    # graphs are disjoint, so neither backward() needs retain_graph.
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    stateval_optimizer.zero_grad()
    val_loss.backward()
    stateval_optimizer.step()

    if dones:
      break

    # Carry the observation forward so the next action is chosen from the new state.
    state = new_state


In [None]:
# Short alias for the scaled data, used by the forward-pass sanity check that follows.
a = scaled_array

In [None]:
# Take row 78 of the scaled data and keep its first 78 columns (the network's input width).
sample_state = a[78, :78]
sample_state

array([0.67832413, 0.46023796, 0.6046463 , 0.67394136, 0.53854249,
       0.48857364, 0.49430669, 0.4703401 , 0.46863767, 0.32766847,
       0.60585644, 0.46317952, 0.6494716 , 0.74584292, 0.46056028,
       0.49919701, 0.60907401, 0.62861202, 0.54144403, 0.54880303,
       0.46440001, 0.35467455, 0.47742959, 0.37430279, 0.3655624 ,
       0.38974808, 0.64180292, 0.60806041, 0.44085608, 0.39826647,
       0.47647899, 0.54235127, 0.64731776, 0.29496649, 0.51795282,
       0.52651398, 0.40375645, 0.56252686, 0.53536016, 0.72678073,
       0.68117729, 0.59986399, 0.54362421, 0.48043564, 0.34273209,
       0.40233641, 0.71410412, 0.40181112, 0.59670338, 0.49769747,
       0.51436764, 0.45759426, 0.60486361, 0.508969  , 0.46865904,
       0.52977528, 0.54065764, 0.32462102, 0.67392637, 0.46540454,
       0.37857207, 0.22239173, 0.46344798, 0.66327391, 0.61152229,
       0.59137042, 0.3894091 , 0.67546782, 0.62561462, 0.60955262,
       0.52015379, 0.4839219 , 0.70735797, 0.51056531, 0.61785

In [None]:
# Freshly initialised value network (separate from the trained stateval_network),
# used only to check that a forward pass works.
test_network = StateValueNetwork().to(DEVICE)

In [None]:
# Convert the sample to a float32 batch of one row on DEVICE.
# as_tensor + reshape accept either the NumPy row or an already converted tensor, so
# re-running this cell is harmless (torch.from_numpy would raise on the second run).
sample_state = torch.as_tensor(sample_state).float().reshape(1, -1).to(DEVICE)

In [None]:
# Forward pass through the untrained network; the result is a 1x1 tensor (see the output below).
test_network(sample_state)

tensor([[-0.1205]], grad_fn=<AddmmBackward0>)