In [1]:
!pip install tf_agents
!pip install dm-reverb[tensorflow]
!pip install pandapower
!pip install numba==0.56.4

Collecting tf_agents
  Downloading tf_agents-0.18.0-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting gym<=0.23.0,>=0.17.0 (from tf_agents)
  Downloading gym-0.23.0.tar.gz (624 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.4/624.4 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pygame==2.1.3 (from tf_agents)
  Downloading pygame-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.23.0-py

In [2]:
import os
import reverb
import tempfile

import tensorflow as tf

from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.ddpg import actor_network
from tf_agents.agents.sac import sac_agent
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.metrics import py_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import greedy_policy
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_py_policy
from tf_agents.policies import PolicySaver
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.train import actor
from tf_agents.train import learner
from tf_agents.train import triggers
from tf_agents.train.utils import spec_utils
from tf_agents.train.utils import strategy_utils
from tf_agents.train.utils import train_utils

import shutil



import abc
#import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

import pandapower as pp
import pandapower.networks as pn
import pandas as pd
import math
import random

import sqlite3
from sqlite3 import Error

import time

from calendar import monthrange

In [3]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:



def get_number_of_days_in_month(month):
  year = 2016
  return monthrange(year, month)[1]


#method to adjust profile length to different month length and comply with clock changes in march and october
def get_month_profile_length(month):
  month_profile_length = get_number_of_days_in_month(month) * 24 * 4
  if month == 3 : month_profile_length -= 4
  if month == 10: month_profile_length += 4
  return month_profile_length


def get_month_profile_start_in_list(month):
  month_profile_start_counter = 0
  for i in range(1,month):
    month_profile_start_counter += get_month_profile_length(i)

  return month_profile_start_counter





#class to implement cigre medium voltage distribution net with pv and wind (https://pandapower.readthedocs.io/en/v2.3.0/networks/cigre.html) from pandas power as a learning environment for reinforcement learning
class CigrePPEnv(py_environment.PyEnvironment):

  #environment object constructor;
  #bool_log_env: true for logging, false to shut off logging
  #log_file_name: name of logging file, can be any string if bool_log_env if false
  #month_list: list for months that shall be used in training
  #train_mode: 0=use all months in month list for one episode; 1=use one random month in month list for one episode
  #name: string used for text output overview

  def __init__(self, bool_log_env, log_file_name, month_list, train_mode, name):
    #action are 9 shift angels of s_gen
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(9,), dtype=np.float32, minimum=0, maximum=1, name='action')
    #observation is all 15 bus voltages and all 9 previous shift angles
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(24,), dtype=np.float32, minimum=0, maximum=1, name='observation')
    #action and observation are normalized to values between 0 and 1. Usage of these values is adapted

    self._log_env = bool_log_env
    self._month_list = month_list
    self._train_mode = train_mode
    self._name = name

    if self._train_mode < 0 or self._train_mode > 1:
      print("train mode must be 0 or 1!")
      quit()


    self._state = [0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5]
    self._step_counter = 0
    self._continuous_step_counter = 0
    self._episode_ended = False
    self._episode_average_error = 0

    if self._train_mode == 0:
     self._current_training_month = self._month_list[0]
    elif self._train_mode == 1:
      self._current_training_month = random.choice(self._month_list)

    self._month_profile_length = get_month_profile_length(self._current_training_month)
    self._current_profile_pointer = get_month_profile_start_in_list(self._current_training_month)
    self._month_iterator = 0
    self._mode0_month_step_counter = 0


    load_df = pd.read_csv("/content/drive/MyDrive/DRL_volt-var_control/LoadProfile.csv", sep=";")

    self.load_time_list = load_df['time']
    self.load1_p_list = load_df['mv_semiurb_pload']
    self.load1_q_list = load_df['mv_semiurb_qload']
    self.load2_p_list = load_df['mv_urban_pload']
    self.load2_q_list = load_df['mv_urban_qload']
    self.load3_p_list = load_df['mv_comm_pload']
    self.load3_q_list = load_df['mv_comm_qload']
    self.load4_p_list = load_df['lv_rural3_pload']
    self.load4_q_list = load_df['lv_rural3_qload']
    self.load5_p_list = load_df['lv_urban6_pload']
    self.load5_q_list = load_df['lv_urban6_qload']
    self.load6_p_list = load_df['mv_semiurb_pload']
    self.load6_q_list = load_df['mv_semiurb_qload']
    self.load7_p_list = load_df['mv_urban_pload']
    self.load7_q_list = load_df['mv_urban_qload']
    self.load9_p_list = load_df['mv_comm_pload']
    self.load9_q_list = load_df['mv_comm_qload']

    self.load11_p_list = load_df['lv_rural1_pload']
    self.load11_q_list = load_df['lv_rural1_qload']
    self.load12_p_list = load_df['lv_rural2_pload']
    self.load12_q_list = load_df['lv_rural2_qload']
    self.load13_p_list = load_df['lv_semiurb4_pload']
    self.load13_q_list = load_df['lv_semiurb4_qload']
    self.load14_p_list = load_df['lv_semiurb5_pload']
    self.load14_q_list = load_df['lv_semiurb5_qload']

    self.load16_p_list = load_df['mv_rural_pload']
    self.load16_q_list = load_df['mv_rural_qload']
    self.load17_p_list = load_df['lv_semiurb4_pload']
    self.load17_q_list = load_df['lv_semiurb4_qload']


    gen_df = pd.read_csv("/content/drive/MyDrive/DRL_volt-var_control/RESProfile.csv", sep=";")

    generation_multiplier = 1.5

    self.res_time_list = gen_df['time']
    self.res1_s_list = generation_multiplier*gen_df['PV8']
    self.res2_s_list = generation_multiplier*gen_df['PV2']
    self.res3_s_list = generation_multiplier*gen_df['PV5']
    self.res4_s_list = generation_multiplier*gen_df['PV1']
    self.res5_s_list = generation_multiplier*gen_df['PV6']
    self.res6_s_list = generation_multiplier*gen_df['PV3']
    self.res7_s_list = generation_multiplier*gen_df['PV4']
    self.res8_s_list = generation_multiplier*gen_df['PV7']
    self.res9_s_list = generation_multiplier*gen_df['WP8']



    self.net = pp.create_empty_network()
    self.init_net()

    if self._log_env:
      try:
        self._connection = sqlite3.connect(log_file_name)
        self._cursor = self._connection.cursor()
      except Error as e:
        print(e)

      create_steps_table_query = """ CREATE TABLE IF NOT EXISTS steps (
                                                  step integer PRIMARY KEY,
                                                  time string,
                                                  shift_angle_1 float,
                                                  shift_angle_2 float,
                                                  shift_angle_3 float,
                                                  shift_angle_4 float,
                                                  shift_angle_5 float,
                                                  shift_angle_6 float,
                                                  shift_angle_7 float,
                                                  shift_angle_8 float,
                                                  shift_angle_9 float,
                                                  average_vm_pu_deviation float
                                              ); """

      self.create_log_table(create_steps_table_query)

      self.episodeStartTime = time.perf_counter()



  def action_spec(self):
    return self._action_spec



  def observation_spec(self):
    return self._observation_spec



  #method to reset environment to default values
  def _reset(self):
    self._state = [0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5]
    self._step_counter = 0
    self._episode_average_error = 0
    self._episode_ended = False
    self._mode0_month_step_counter = 0


    if self._train_mode == 0:
      self._current_training_month = self._month_list[0]
      print(f"current {self._name} month {self._current_training_month}")
    elif self._train_mode == 1:
      self._current_training_month = random.choice(self._month_list)
      print(f"current {self._name} month {self._current_training_month}")

    self._month_profile_length = get_month_profile_length(self._current_training_month)
    self._current_profile_pointer = get_month_profile_start_in_list(self._current_training_month)
    self._month_iterator = 0

    self.episodeStartTime = time.perf_counter()
    return ts.restart(np.array(self._state, dtype=np.float32))



  #method to do oe step in environment with one action and get one observation
  def _step(self, action):

    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start a new episode.
      return self.reset()

    #database logging made issues witout float conversion, converted floats might deliver slightly different results
    action_casted = [float(action[0]), float(action[1]), float(action[2]), float(action[3]), float(action[4]), float(action[5]), float(action[6]), float(action[7]), float(action[8])]


    # update state and vary load for current step
    self.update_loads(self._current_profile_pointer)
    self.update_RES(self._current_profile_pointer)

    #check if load_time_list and res_time_list are the same, only needed if profiles are changed to make sure they work together
    #if(self.load_time_list[self._current_profile_pointer] != self.res_time_list[self._current_profile_pointer]):
    #  print("load and res lists had a mismatch, quitting Training")
    #  quit()


    current_vm_pu_list = self.createAndRunNetSimulation(action_casted)

    current_vm_pu_list = current_vm_pu_list.astype(np.float32)




    #skip first element of vm_pu list since it is te bus connected to ext_grid whic has const voltage
    current_average_vm_pu_deviation = 0
    for vm_pu in current_vm_pu_list[1:]:
      current_average_vm_pu_deviation += abs(1.0-vm_pu)
    current_average_vm_pu_deviation /= len(current_vm_pu_list[1:])

    #normalize current_vm_pu_list to values between 0 and 1
    for i in range(len(current_vm_pu_list)):
        current_vm_pu_list[i] /= 2


    self._state = [*current_vm_pu_list , *action_casted]
    #print(f"state {self._state}")


    if self._log_env:
      log_query = ''' INSERT INTO steps(step,time, shift_angle_1, shift_angle_2, shift_angle_3, shift_angle_4, shift_angle_5, shift_angle_6, shift_angle_7, shift_angle_8, shift_angle_9, average_vm_pu_deviation)
                    VALUES(?,?,?,?,?,?,?,?,?,?,?,?) '''

      self.log_action(log_query, [self._continuous_step_counter, self.load_time_list[self._current_profile_pointer], action_casted[0], action_casted[1], action_casted[2], action_casted[3], action_casted[4], action_casted[5], action_casted[6], action_casted[7], action_casted[8], current_average_vm_pu_deviation])


    self._step_counter += 1
    self._continuous_step_counter += 1
    self._current_profile_pointer += 1
    self._mode0_month_step_counter += 1

    self._episode_average_error += current_average_vm_pu_deviation

    #multipy reward with 100 because vm_pu_deviation is very small, since vm_pu_deviation is to be minimized it must be converted tobe negative because the reward is maximized
    reward = -100 * current_average_vm_pu_deviation
    #print(f"reward {reward}")


    if(self._mode0_month_step_counter == (self._month_profile_length/2) or self._mode0_month_step_counter == self._month_profile_length):
      current_average_error_per_step = self._episode_average_error/self._step_counter
      print(f"Current average Error per step in episode {current_average_error_per_step}")



    if(self._train_mode == 0):
      if(self._mode0_month_step_counter >= self._month_profile_length):
        #if the iterator was at last month of month_list the iteration over all month is complete and the episode must end in mode 0
        if(self._month_iterator + 1 == len(self._month_list)):
           self._episode_ended = True
        else :
          self._month_iterator += 1
          self._current_training_month = self._month_list[self._month_iterator]
          print(f"current {self._name} month {self._current_training_month}")
          self._month_profile_length = get_month_profile_length(self._current_training_month)
          self._current_profile_pointer = get_month_profile_start_in_list(self._current_training_month)
          self._mode0_month_step_counter = 0


    elif(self._train_mode == 1):
      if(self._step_counter >= self._month_profile_length):
        self._episode_ended = True





    if self._episode_ended:
      if self._log_env:
        self.sql_commit()

      #print _episode_average_error to check how training is going
      self._episode_average_error /= self._step_counter
      print(f"Average Error per step in episode {self._episode_average_error}")
      episodeEndTime = time.perf_counter()
      print(f"Epsidoe Time: {episodeEndTime - self.episodeStartTime:0.4f} seconds")

      return ts.termination(np.array(self._state, dtype=np.float32), reward)
    else:
      return ts.transition(np.array(self._state, dtype=np.float32), reward=reward, discount=0.0)



  def init_net(self):
    self.net = pn.create_cigre_network_mv(with_der="pv_wind")
    self.net.load.p_mw[0] = 0
    self.net.load.q_mvar[0] = 0
    self.net.load.p_mw[8] = 0
    self.net.load.q_mvar[8] = 0
    self.net.load.p_mw[10] = 0
    self.net.load.q_mvar[10] = 0
    self.net.load.p_mw[15] = 0
    self.net.load.q_mvar[15] = 0


  def is_episode_finished(self):
    return self._episode_ended


  def update_loads(self, step_counter):

    self.net.load.p_mw[1] = self.load1_p_list[step_counter]
    self.net.load.q_mvar[1] = self.load1_q_list[step_counter]
    self.net.load.p_mw[2] = self.load2_p_list[step_counter]
    self.net.load.q_mvar[2] = self.load2_q_list[step_counter]
    self.net.load.p_mw[3] = self.load3_p_list[step_counter]
    self.net.load.q_mvar[3] = self.load3_q_list[step_counter]
    self.net.load.p_mw[4] = self.load4_p_list[step_counter]
    self.net.load.q_mvar[4] = self.load4_q_list[step_counter]
    self.net.load.p_mw[5] = self.load5_p_list[step_counter]
    self.net.load.q_mvar[5] = self.load5_q_list[step_counter]
    self.net.load.p_mw[6] = self.load6_p_list[step_counter]
    self.net.load.q_mvar[6] = self.load6_q_list[step_counter]
    self.net.load.p_mw[7] = self.load7_p_list[step_counter]
    self.net.load.q_mvar[7] = self.load7_q_list[step_counter]
    self.net.load.p_mw[9] = self.load9_p_list[step_counter]
    self.net.load.q_mvar[9] = self.load9_q_list[step_counter]
    self.net.load.p_mw[11] = self.load11_p_list[step_counter]
    self.net.load.q_mvar[11] = self.load11_q_list[step_counter]
    self.net.load.p_mw[12] = self.load12_p_list[step_counter]
    self.net.load.q_mvar[12] = self.load12_q_list[step_counter]
    self.net.load.p_mw[13] = self.load13_p_list[step_counter]
    self.net.load.q_mvar[13] = self.load13_q_list[step_counter]
    self.net.load.p_mw[14] = self.load14_p_list[step_counter]
    self.net.load.q_mvar[14] = self.load14_q_list[step_counter]
    self.net.load.p_mw[16] = self.load16_p_list[step_counter]
    self.net.load.q_mvar[16] = self.load16_q_list[step_counter]
    self.net.load.p_mw[17] = self.load17_p_list[step_counter]
    self.net.load.q_mvar[17] = self.load17_q_list[step_counter]

    #print(self.net.load)



  def update_RES(self, step_counter):
    self.net.sgen['sn_mva'][0] = self.res1_s_list[step_counter]
    self.net.sgen['sn_mva'][1] = self.res2_s_list[step_counter]
    self.net.sgen['sn_mva'][2] = self.res3_s_list[step_counter]
    self.net.sgen['sn_mva'][3] = self.res4_s_list[step_counter]
    self.net.sgen['sn_mva'][4] = self.res5_s_list[step_counter]
    self.net.sgen['sn_mva'][5] = self.res6_s_list[step_counter]
    self.net.sgen['sn_mva'][6] = self.res7_s_list[step_counter]
    self.net.sgen['sn_mva'][7] = self.res8_s_list[step_counter]
    self.net.sgen['sn_mva'][8] = self.res9_s_list[step_counter]

    #print(self.net.sgen)



  def createAndRunNetSimulation(self, action):

    #print(f"action {action}")
    #print(f"pre sgens {self.net.sgen}")

    generator_counter = 0
    for generator in self.net.sgen.name:
      #print(f"generator {generator}")

      current_phase_shift_rad = math.radians(action[generator_counter]*51.68-25.84)
      self.net.sgen['p_mw'][generator_counter] = math.cos(current_phase_shift_rad) * self.net.sgen['sn_mva'][generator_counter]
      self.net.sgen['q_mvar'][generator_counter] = math.sin(current_phase_shift_rad) * self.net.sgen['sn_mva'][generator_counter]
      generator_counter += 1

    #print(f"post sgens {self.net.sgen}")

    try:
      pp.runpp(self.net)
    except:
      print("could not calculate power flow")
      return 0

    #print(net.res_trafo)
    #print(net.res_line)
    #print(net.res_load)
    #print(net.res_ext_grid)
    #print(f"result {self.net.res_bus['vm_pu']}")

    return (self.net.res_bus['vm_pu'])



  def create_log_table(self, create_table_sql):
    try:
        self._cursor.execute(create_table_sql)
    except Error as e:
        print(e)



  def log_action(self, log_query, log_data):
    try:
        self._cursor.execute(log_query, log_data)
        #self._connection.commit()
    except Error as e:
        print(e)



  def sql_commit(self):
    try:
        self._connection.commit()
    except Error as e:
        print(e)


In [5]:

%rm -rf /content/tempdir
os.mkdir("/content/tempdir")

In [6]:


#This file is mainly copied from Tensorflow SAC Minitaur example
#https://www.tensorflow.org/agents/tutorials/7_SAC_minitaur_tutorial

tempdir = "/content/tempdir"

eval_save_dir = "/content/tempdir/eval"

#if os.path.exists(eval_save_dir):
#    shutil.rmtree(eval_save_dir)


##########################################################
# Hyperparamertes
##########################################################


num_iterations = 1500000 # @param {type:"integer"} number of train steps

initial_collect_steps = 50000 # @param {type:"integer"} number of random steps in the beginning
replay_buffer_capacity = 1000000 # @param {type:"integer"}
#smaller replay buffer capacity than num_iterations can lead to big mistakes in later part of the training if there are no more bad experiences in buffer anymore. In this training it is partly avoided by deleting old replays uniformly and not by fifo

batch_size = 1000 # @param {type:"integer"} #number of fetched steps from replay buffer per training dataset. Can be varied quiet a bit but smaller batch_sizes lead to longer training time


critic_learning_rate = 3e-4 # @param {type:"number"} learning rate for critic NN
actor_learning_rate = 3e-4 # @param {type:"number"} learning rate for actor NN
alpha_learning_rate = 3e-4 # @param {type:"number"} learning rate for alpha factor which regulates entropy which is the regulating factor for exploration/exploitation tradeoff in SAC
#more on SAC here https://spinningup.openai.com/en/latest/algorithms/sac.html

#haven't changed these factors from the example
target_update_tau = 0.005 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
gamma = 0.99 # @param {type:"number"}

reward_scale_factor = 1.0 # @param {type:"number"} at some point I set the factor to two but since reward is also scaled in env small changes should not matter to much

#fully connected layer describtion of NN
actor_fc_layer_params = (100, 100)
critic_joint_fc_layer_params = (100, 100)

num_eval_episodes = 1 # @param {type:"integer"}
train_episodes_per_eval_episode = 2 # @param {type:"integer"}


##########################################################
# Environment
##########################################################
#env = CigrePPEnv(False, "", [1], 0)
#env.reset()

#print('Observation Spec:')
#print(env.time_step_spec().observation)
#print('Action Spec:')
#print(env.action_spec())

#utils.validate_py_environment(env, episodes=2)



collect_env = CigrePPEnv(False, "collect_env_log.db",[1,3,5,7,9,11], 1, "collect")
eval_env = CigrePPEnv(True, "eval_env_log.db", [2,4,6,8,10,12], 0, "evaluation")



##########################################################
# Distribution Strategy
##########################################################
#strategy describes the hardware usesage which depends on the training platform
use_gpu = False

strategy = strategy_utils.get_strategy(tpu=False, use_gpu=use_gpu)



##########################################################
# Agent
##########################################################
observation_spec, action_spec, time_step_spec = (
      spec_utils.get_tensor_specs(collect_env))


with strategy.scope():
  critic_net = critic_network.CriticNetwork(
        (observation_spec, action_spec),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=critic_joint_fc_layer_params,
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')


#used ActorNetwork instead of ActorDistributionNetwork to get deterministic results
#set activation layer to None if negative value should be trained although input/output normalization is advised
with strategy.scope():
    actor_net = actor_network.ActorNetwork(
      observation_spec,
      action_spec,
      fc_layer_params=actor_fc_layer_params,
      activation_fn=tf.keras.activations.relu)


with strategy.scope():
  train_step = train_utils.create_train_step()

  tf_agent = sac_agent.SacAgent(
        time_step_spec,
        action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.keras.optimizers.Adam(
            learning_rate=actor_learning_rate),
        critic_optimizer=tf.keras.optimizers.Adam(
            learning_rate=critic_learning_rate),
        alpha_optimizer=tf.keras.optimizers.Adam(
            learning_rate=alpha_learning_rate),
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=tf.math.squared_difference,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=train_step)

  tf_agent.initialize()



##########################################################
# Replay Buffer
##########################################################
#not really changed these variables from example, test trainings of different values 2-100 seemed not to make a big difference
rate_limiter=reverb.rate_limiters.SampleToInsertRatio(samples_per_insert=3.0, min_size_to_sample=3, error_buffer=3.0)

#mainly as in example, just changed remover to Uniform to avoid bad results if the buffer starts deleting the first experiences
table_name = 'uniform_table'
table = reverb.Table(
    table_name,
    max_size=replay_buffer_capacity,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Uniform(),
    rate_limiter=reverb.rate_limiters.MinSize(1))

reverb_server = reverb.Server([table])

reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
    tf_agent.collect_data_spec,
    sequence_length=2,
    table_name=table_name,
    local_server=reverb_server)


#preftch(tf.data.AUTOTUNE) did not seem to make a runtime improvement
dataset = reverb_replay.as_dataset(
      sample_batch_size=batch_size, num_steps=2).prefetch(1)
experience_dataset_fn = lambda: dataset



##########################################################
# Policies
##########################################################
#eager polcies just accelerate the training
tf_eval_policy = tf_agent.policy
eval_policy = py_tf_eager_policy.PyTFEagerPolicy(
  tf_eval_policy, use_tf_function=True)

tf_collect_policy = tf_agent.collect_policy
collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
  tf_collect_policy, use_tf_function=True)


random_policy = random_py_policy.RandomPyPolicy(
  collect_env.time_step_spec(), collect_env.action_spec())



##########################################################
# Actors
##########################################################
rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
  reverb_replay.py_client,
  table_name,
  sequence_length=2,
  stride_length=1)


initial_collect_actor = actor.Actor(
  collect_env,
  random_policy,
  train_step,
  steps_per_run=initial_collect_steps,
  observers=[rb_observer])

print(f"Initial Collect Actor")
initial_collect_actor.run()
print(f"Initial Collect Actor finished")

#smaller collect metric, no summary_dir leading to no collect metric both did not make a significant runtime improvement
env_step_metric = py_metrics.EnvironmentSteps()
collect_actor = actor.Actor(
  collect_env,
  collect_policy,
  train_step,
  steps_per_run=1,
  metrics=actor.collect_metrics(10),
  #summary_dir=os.path.join(tempdir, learner.TRAIN_DIR),
  observers=[rb_observer, env_step_metric])


eval_actor = actor.Actor(
  eval_env,
  eval_policy,
  train_step,
  episodes_per_run=num_eval_episodes,
  metrics=actor.eval_metrics(num_eval_episodes),
  summary_dir=eval_save_dir,
)

saver = PolicySaver(tf_agent.policy)


##########################################################
# Learners
##########################################################

agent_learner = learner.Learner(
  tempdir,
  train_step,
  tf_agent,
  experience_dataset_fn,
  strategy=strategy)


##########################################################
# Metrics and Evaluation
##########################################################
def get_eval_metrics():
  eval_actor.run()
  results = {}
  for metric in eval_actor.metrics:
    results[metric.name] = metric.result()
  return results

#metrics = get_eval_metrics()

#logs AverageReturn as sum of rewards per Episode
def log_eval_metrics(step, metrics):
  eval_results = (', ').join(
      '{} = {:.6f}'.format(name, result) for name, result in metrics.items())
  print('step = {0}: {1}'.format(step, eval_results))
  with open("training_log.txt", "a") as logfile:
    logfile.write('step = {0}: {1}\n'.format(step, eval_results))

#log_eval_metrics(0, metrics)


##########################################################
# Training
##########################################################
#try:
#  %%time
#except:
#  pass

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = get_eval_metrics()["AverageReturn"]
max_return = avg_return
print(f"Initial max return set to {max_return}")


print(f"Training")

train_episode_counter = 0

average_total_loss = 0
average_actor_loss = 0
average_critic_loss = 0
average_alpha_loss = 0
episode_step_counter = 0

for _ in range(num_iterations):
  # Training, collect one step, train one step from replay buffer
  collect_actor.run()
  loss_info = agent_learner.run(iterations=1)
  episode_step_counter += 1

  average_total_loss += loss_info.loss.numpy()
  average_actor_loss += loss_info.extra.actor_loss.numpy()
  average_critic_loss += loss_info.extra.critic_loss.numpy()
  average_alpha_loss += loss_info.extra.alpha_loss.numpy()

  #print(f"actor_loss: {loss_info.extra.actor_loss.numpy()} ; average_critic_loss:{loss_info.extra.critic_loss.numpy()} ; alpha_loss:{loss_info.extra.alpha_loss.numpy()}")

  #print(loss_info)

  # Evaluating.
  step = agent_learner.train_step_numpy

  if collect_env.is_episode_finished():
    train_episode_counter += 1

    average_total_loss /= episode_step_counter
    average_actor_loss /= episode_step_counter
    average_critic_loss /= episode_step_counter
    average_alpha_loss /= episode_step_counter


    print('step = {0}: loss = {1}'.format(step, average_total_loss))
    print('average_actor_loss = {0} : average_critic_loss = {1} : average_alpha_loss={2}\n'.format(average_actor_loss, average_critic_loss, average_alpha_loss))
    with open("training_log.txt", "a") as logfile:
      logfile.write('step = {0}: loss = {1}\n'.format(step, average_total_loss))
      logfile.write('average_actor_loss = {0} : average_critic_loss = {1} : average_alpha_loss={2}\n'.format(average_actor_loss, average_critic_loss, average_alpha_loss))

    average_total_loss = 0
    average_actor_loss = 0
    average_critic_loss = 0
    average_alpha_loss = 0
    episode_step_counter = 0

    if train_episode_counter == train_episodes_per_eval_episode:
      print(f"eval episode at step {step}")
      metrics = get_eval_metrics()
      log_eval_metrics(step, metrics)
      train_episode_counter = 0

      #export best actor network, prevents overfitting
      if (metrics["AverageReturn"] > max_return):
        print(f"New best eval, saving policy")
        max_return = metrics["AverageReturn"]
        saver.save('actor_policy')
        #saver.save_checkpoint('actor_policy_checkpoint')


rb_observer.close()
reverb_server.stop()

#shutil.rmtree(eval_save_dir)
#shutil.rmtree(tempdir)


current collect month 1
Initial Collect Actor
Current average Error per step in episode 0.018151969653189475
Current average Error per step in episode 0.018271633030449024
Average Error per step in episode 0.018271633030449024
Epsidoe Time: 145.5384 seconds
current collect month 11
Current average Error per step in episode 0.018039634865191254
Current average Error per step in episode 0.01806468192990574
Average Error per step in episode 0.01806468192990574
Epsidoe Time: 135.2733 seconds
current collect month 7
Current average Error per step in episode 0.021722852537113778
Current average Error per step in episode 0.0224166814976024
Average Error per step in episode 0.0224166814976024
Epsidoe Time: 139.6501 seconds
current collect month 5
Current average Error per step in episode 0.021568605453119345
Current average Error per step in episode 0.022944085767775933
Average Error per step in episode 0.022944085767775933
Epsidoe Time: 140.5183 seconds
current collect month 11
Current averag



Current average Error per step in episode 0.016123424242592818
Average Error per step in episode 0.016123424242592818
Epsidoe Time: 1773.4354 seconds
current evaluation month 2
step = 5856: AverageReturn = -28177.296875, AverageEpisodeLength = 17476.000000
New best eval, saving policy




current collect month 5
Current average Error per step in episode 0.015698563959878717
Current average Error per step in episode 0.01535434468710843
Average Error per step in episode 0.01535434468710843
Epsidoe Time: 472.2180 seconds
step = 8832: loss = -8.011952195276496
average_actor_loss = 1.5612911246716976 : average_critic_loss = 0.3408553214763762 : average_alpha_loss=-9.914098636117032

current collect month 11
Current average Error per step in episode 0.016261676719619166
Current average Error per step in episode 0.016978240561568072
Average Error per step in episode 0.016978240561568072
Epsidoe Time: 458.4623 seconds
step = 11712: loss = -11.9934986088011
average_actor_loss = 1.5620917854209742 : average_critic_loss = 0.31045282936861945 : average_alpha_loss=-13.866043213009835

eval episode at step 11712
Current average Error per step in episode 0.017257635568823766
Current average Error per step in episode 0.017394922404551563
current evaluation month 4
Current average Error



current collect month 5
Current average Error per step in episode 0.01569790817717071
Current average Error per step in episode 0.015353998092050344
Average Error per step in episode 0.015353998092050344
Epsidoe Time: 478.2736 seconds
step = 14688: loss = -15.977908980461859
average_actor_loss = 1.550817130874562 : average_critic_loss = 0.2892023042715605 : average_alpha_loss=-17.81792843309782

current collect month 9
Current average Error per step in episode 0.015402337409082893
Current average Error per step in episode 0.015359556744436917
Average Error per step in episode 0.015359556744436917
Epsidoe Time: 463.2280 seconds
step = 17568: loss = -19.969615166054833
average_actor_loss = 1.54093014680677 : average_critic_loss = 0.2592683702862511 : average_alpha_loss=-21.769813656806946

eval episode at step 17568
Current average Error per step in episode 0.0172576692432459
Current average Error per step in episode 0.01739495083512294
current evaluation month 4
Current average Error pe



current collect month 5
Current average Error per step in episode 0.015712947360084002
Current average Error per step in episode 0.015376254446953334
Average Error per step in episode 0.015376254446953334
Epsidoe Time: 480.1110 seconds
step = 26400: loss = -31.90063913278682
average_actor_loss = 1.5381219846506913 : average_critic_loss = 0.1867082222184587 : average_alpha_loss=-33.6254693295366

current collect month 3
Current average Error per step in episode 0.017274582326996673
Current average Error per step in episode 0.01648117573937184
Average Error per step in episode 0.01648117573937184
Epsidoe Time: 479.5779 seconds
step = 29372: loss = -35.915865067678375
average_actor_loss = 1.5463574763743424 : average_critic_loss = 0.17985027599302628 : average_alpha_loss=-37.64207282008586

eval episode at step 29372
Current average Error per step in episode 0.017203064384596522
Current average Error per step in episode 0.017354385970138976
current evaluation month 4
Current average Error



current collect month 11
Current average Error per step in episode 0.016248329229179816
Current average Error per step in episode 0.016867127968737516
Average Error per step in episode 0.016867127968737516
Epsidoe Time: 465.7970 seconds
step = 32252: loss = -39.896386990282274
average_actor_loss = 1.5269437962108188 : average_critic_loss = 0.17417976821565795 : average_alpha_loss=-41.59751057094998

current collect month 1
Current average Error per step in episode 0.016900785450470256
Current average Error per step in episode 0.017224076341339812
Average Error per step in episode 0.017224076341339812
Epsidoe Time: 482.7302 seconds
step = 35228: loss = -43.843683447889106
average_actor_loss = 1.538834303417193 : average_critic_loss = 0.17316086023985858 : average_alpha_loss=-45.55567862782427

eval episode at step 35228
Current average Error per step in episode 0.01563818970533033
Current average Error per step in episode 0.01594436737098569
current evaluation month 4
Current average Er



current collect month 3
Current average Error per step in episode 0.015501993424134683
Current average Error per step in episode 0.015283838811229713
Average Error per step in episode 0.015283838811229713
Epsidoe Time: 487.1781 seconds
step = 38200: loss = -47.86870759139953
average_actor_loss = 1.5345785361921482 : average_critic_loss = 0.17274483862868228 : average_alpha_loss=-49.576030951971

current collect month 11
Current average Error per step in episode 0.015459164350278814
Current average Error per step in episode 0.015917063075753447
Average Error per step in episode 0.015917063075753447
Epsidoe Time: 469.5627 seconds
step = 41080: loss = -51.84449823697408
average_actor_loss = 1.51803221884701 : average_critic_loss = 0.16896489304490386 : average_alpha_loss=-53.53149532874425

eval episode at step 41080
Current average Error per step in episode 0.015770364008317814
Current average Error per step in episode 0.015929244880752622
current evaluation month 4
Current average Error



current collect month 11
Current average Error per step in episode 0.015468797732203745
Current average Error per step in episode 0.01592029048987324
Average Error per step in episode 0.01592029048987324
Epsidoe Time: 467.7783 seconds
step = 43960: loss = -55.73919494549433
average_actor_loss = 1.5185020974526802 : average_critic_loss = 0.16707836002784057 : average_alpha_loss=-57.42477538055844

current collect month 1
Current average Error per step in episode 0.015182457257708799
Current average Error per step in episode 0.016129400074687966
Average Error per step in episode 0.016129400074687966
Epsidoe Time: 492.3202 seconds
step = 46936: loss = -59.700504111987286
average_actor_loss = 1.5164878581880883 : average_critic_loss = 0.1659513820283195 : average_alpha_loss=-61.38294338923629

eval episode at step 46936
Current average Error per step in episode 0.01573907779086206
Current average Error per step in episode 0.01581652862997044
current evaluation month 4
Current average Error



current collect month 7
Current average Error per step in episode 0.015044726497177507
Current average Error per step in episode 0.015113405275305932
Average Error per step in episode 0.015113405275305932
Epsidoe Time: 487.9825 seconds
step = 55768: loss = -71.69826599346695
average_actor_loss = 1.4594991533185846 : average_critic_loss = 0.16018748543505626 : average_alpha_loss=-73.31795262521312

current collect month 9
Current average Error per step in episode 0.014466642940209959
Current average Error per step in episode 0.014599404985173823
Average Error per step in episode 0.014599404985173823
Epsidoe Time: 471.2318 seconds
step = 58648: loss = -75.59362738132477
average_actor_loss = 1.5118344732042817 : average_critic_loss = 0.15828100815674084 : average_alpha_loss=-77.2637429051929

eval episode at step 58648
Current average Error per step in episode 0.01540064903768313
could not calculate power flow


AttributeError: ignored