# gym Environment and Testing of Algorithms

In [None]:
# Basics
import os
import sys
import random
import warnings

# Data 
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import numpy as np
from collections import OrderedDict
from functools import reduce
import json
from scipy.stats import norm

# Logging
import logging
import wandb
from wandb.integration.sb3 import WandbCallback

# Plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.io import to_html
import matplotlib.pyplot as plt


# RL
from gym import Env
from gym import make
from gym.spaces import Discrete, Box, Dict, Tuple, MultiDiscrete, MultiBinary
from gym.spaces import flatdim, flatten_space, unflatten, flatten

# Auxiliary: ignore pandas' SettingWithCopyWarning everywhere from this point on
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## Environment Definition

In [None]:
class VPPBiddingEnv(Env):
    
    def __init__(self,
                 config_path,
                 log_level,
                 env_type
                ):
        """Configure logging, load all input data and define the gym spaces.

        Args:
            config_path: Path of the JSON configuration file. It is read with
                ``_load_config`` and must provide ``config.csv_paths``,
                ``config.time`` and ``assets`` (see the lookups below).
            log_level: Level handed to ``setLevel`` of the root logger.
            env_type: ``"training"`` (log to ``logs/training.log``),
                ``"eval"`` (log to ``logs/eval.log``) or ``"test"`` (log to a
                ``StreamHandler``).

        Raises:
            ValueError: If ``env_type`` is not one of the three supported
                values.
        """
        supported_env_types = ("training", "eval", "test")
        if env_type not in supported_env_types:
            # Fail early with a clear message instead of hitting an unbound
            # handler variable further down.
            raise ValueError(
                "env_type must be one of %s, got %r" % (supported_env_types, env_type))
        self.env_type = env_type

        # Route root logging through exactly one handler for this environment.
        logger = logging.getLogger()
        for old_handler in list(logger.handlers):
            logger.removeHandler(old_handler)

        if env_type == "test":
            fhandler = logging.StreamHandler()
        else:
            # mode='w' already truncates an existing log file, so the file does
            # not have to be removed first (removing crashed on the first run,
            # when no log file existed yet).
            os.makedirs("logs", exist_ok=True)
            fhandler = logging.FileHandler(filename="logs/" + env_type + ".log", mode='w')

        logger.addHandler(fhandler)
        logger.setLevel(log_level)

        logging.debug("log level = debug")
        logging.info("log level = info")
        logging.warning("log level = warning")

        # data
        self.config = self._load_config(config_path)

        self.renewables_df = self._load_data("renewables")
        self.tenders_df = self._load_data("tenders")
        self.market_results = self._load_data("market_results")
        self.bids_df = self._load_data("bids")
        self.time_features_df = self._load_data("time_features")

        # The VPP capacities are derived from the full (not yet time-sliced)
        # renewables data.
        self.asset_data, self.asset_data_FCR = self._configure_vpp()
        self.asset_data_total = self.asset_data.loc[:, "Total"]
        self.asset_data_FCR_total = self.asset_data_FCR.loc[:, "Total"]
        # Small margin on top of the rounded maximum; used as upper bound of
        # the observation and action boxes.
        self.maximum_possible_VPP_capacity = round(self.asset_data_total.max(), 2) + 0.01

        self.total_slot_FCR_demand = None

        # window_size
        self.hist_window_size = self.config["config"]["time"]["hist_window_size"]
        self.forecast_window_size = self.config["config"]["time"]["forecast_window_size"]

        # episode
        self.first_slot_date_start = pd.to_datetime(self.config["config"]["time"]["first_slot_date_start"])
        self.last_slot_date_end = pd.to_datetime(self.config["config"]["time"]["last_slot_date_end"])

        # Time selection of the DataFrames
        self.renewables_df = self.renewables_df[self.first_slot_date_start:self.last_slot_date_end]
        self.tenders_df = self.tenders_df[self.first_slot_date_start:self.last_slot_date_end]
        # market results start prior to first_slot_date_start because historic
        # market results are needed for the price prediction
        self.market_results = self.market_results[:self.last_slot_date_end]
        self.bids_df = self.bids_df[self.first_slot_date_start:self.last_slot_date_end]
        self.time_features_df = self.time_features_df[self.first_slot_date_start:self.last_slot_date_end]

        # Lazy %-style arguments: the (large) DataFrames are only rendered to
        # text when debug logging is actually enabled.
        logging.debug("selection self.renewables_df%s", self.renewables_df)
        logging.debug("selection self.tenders_df%s", self.tenders_df)
        logging.debug("selection self.market_results%s", self.market_results)
        logging.debug("selection self.bids_df%s", self.bids_df)
        logging.debug("selection self.time_features_df%s", self.time_features_df)

        # slot start, gate closure, auction time
        self.lower_slot_start_boundary = self.first_slot_date_start
        upcoming_tenders = self.tenders_df[self.lower_slot_start_boundary:]
        # .iloc[0] = first row by position; plain [0] on a datetime-indexed
        # Series is deprecated positional indexing.
        self.gate_closure = pd.to_datetime(upcoming_tenders["GATE_CLOSURE_TIME"].iloc[0])
        self.slot_start = upcoming_tenders.index[0]
        # the bid is submitted one hour before gate closure
        self.bid_submission_time = self.gate_closure - pd.offsets.DateOffset(hours=1)

        self.initial = True
        self.done = None
        self.total_reward = 0.
        self.total_profit = 0.
        self.history = None

        self.delivery_results = {}
        self.previous_delivery_results = {}

        self.logging_step = -1

        # Spaces

        # Observation Space
        obs_low = np.float32(np.array([0.0] * 96))  # 96 timesteps, min 0.0
        # 96 timesteps, max = maximum possible VPP capacity
        obs_high = np.float32(np.array([self.maximum_possible_VPP_capacity] * 96))

        # Create an observation space with all observations inside
        self.observation_space = Dict({
            "asset_data_historic": Box(obs_low, obs_high, dtype=np.float32),
            "asset_data_forecast": Box(obs_low, obs_high, dtype=np.float32),
            "predicted_market_prices":  Box(low=0.0, high=np.float32(4257.07), shape=(6,), dtype=np.float32), # for each slot, can be prices of same day last week
            "weekday": Discrete(7), # for the days of the week
            "week": Discrete(53),  # for week of the year
            "month": Discrete(12),
            "isHoliday": Discrete(2), # holiday = 1, no holiday = 0
            "followsHoliday": Discrete(2), # followsHoliday = 1, no followsHoliday = 0
            "priorHoliday": Discrete(2), # priorHoliday = 1, no priorHoliday = 0
            "slots_won": MultiBinary(6), #boolean for each slot, 0 if loss , 1 if won
            "slot_prices_DE": Box(low=0.0, high=np.float32(4257.07), shape=(6,), dtype=np.float32)
            })

        self.observation = None

        # Action Space: one flat Box with 12 values.
        #   values 0-5  = bid sizes, bounded by the maximum possible VPP capacity
        #                 (bounding them by the maximum FCR capacity,
        #                 round(self.asset_data_FCR_total.max(), 2), was an
        #                 earlier variant)
        #   values 6-11 = bid prices, bounded by the highest settlement price
        #                 found in the bids data
        maximum_possible_market_price = self.bids_df["settlement_price"].max()

        action_low = np.float32(np.array([0.0] * 12))
        action_high = np.float32(np.array([self.maximum_possible_VPP_capacity] * 6 + [maximum_possible_market_price] * 6))

        self.action_space = Box(low=action_low, high=action_high, shape=(12,), dtype=np.float32)

        # Earlier action-space designs (no longer active):
        #   VERSION 2: flattened Box with 156 values (150 values in [0, 1] for
        #              the bid sizes + 6 values up to 100. for the bid prices).
        #   VERSION 1: Tuple(MultiDiscrete([25] * 6), Box(0.0, 100., shape=(6,)))
        #              flattened with gym.spaces.flatten_space.
        #   INFO: TSOs allow divisible and indivisible bids. Biggest divisible
        #         bid was 188 MW, maximum price was 4257.07.
    
    def _load_config(self, config_path):        
        with open(config_path, 'r') as f:
            config = json.load(f)
        return config
            
        
    def _load_data(self, data_source):
        df = pd.read_csv(self.config["config"]["csv_paths"][data_source], sep = ";", index_col = 0)
        df.index = pd.to_datetime(df.index)
        return df


    def _configure_vpp(self):
        # list to concat all dfs later on
        asset_frames_total = []
        asset_frames_FCR = []
        # for each asset type defined in the config (e.g.: "hydro", "wind")
        for asset_type in self.config["assets"].keys():
            # for every plant configuration there is per asset type
            for plant_config in range(len(self.config["assets"][asset_type])):
                # get the qunatity of plants
                quantity = self.config["assets"][asset_type][plant_config]["quantity"]
                # get the maximum capacity of these plants 
                max_capacity_MW = self.config["assets"][asset_type][plant_config]["max_capacity_MW"]
                max_FCR_capacity_share = self.config["assets"][asset_type][plant_config]["max_FCR_capacity_share"]
                # get the name of the column in the renewables csv
                asset_column_names = self.config["assets"][asset_type][plant_config]["asset_column_names"]
                # initialize a array with zeros and the length of the renewables dataframe
                total_asset_capacity = np.array([0.0] * len(self.renewables_df))
                total_asset_FCR_capacity = np.array([0.0] * len(self.renewables_df))

                i = 1
                while i < quantity: 
                    for asset_column_name in (asset_column_names):
                        asset_data = self.renewables_df[[asset_column_name]].values.flatten()
                        asset_data *= max_capacity_MW
                        asset_FCR_capacity = asset_data * max_FCR_capacity_share
                        
                        total_asset_FCR_capacity += asset_FCR_capacity 
                        total_asset_capacity += asset_data
                        
                        i += 1
                        if i < quantity:
                            continue
                        else: 
                            break                        
                    
                total_df = pd.DataFrame(index=self.renewables_df.index)
                FCR_df = pd.DataFrame(index=self.renewables_df.index)
                
                total_df[asset_type + "_class_" + str(plant_config)] = total_asset_capacity
                FCR_df[asset_type + "_class_" + str(plant_config)] = total_asset_FCR_capacity
                asset_frames_total.append(total_df)
                asset_frames_FCR.append(FCR_df)

        if not asset_frames_total: 
            logging.error("No asset data found")
        all_asset_data = reduce(lambda x, y: pd.merge(x, y, on = "time"), asset_frames_total)
        all_asset_data_FCR = reduce(lambda x, y: pd.merge(x, y, on = "time"), asset_frames_FCR)

        all_asset_data['Total'] = all_asset_data.iloc[:,:].sum(axis=1)
        all_asset_data_FCR['Total'] = all_asset_data_FCR.iloc[:,:].sum(axis=1)
        
        return all_asset_data, all_asset_data_FCR
    
    
    
    def reset(self):
        
        if self.initial is False: 
            self.lower_slot_start_boundary = self.lower_slot_start_boundary  + pd.offsets.DateOffset(days=1)
            self.gate_closure = pd.to_datetime(self.tenders_df[self.lower_slot_start_boundary:]["GATE_CLOSURE_TIME"][0])
            self.slot_start = self.tenders_df[self.lower_slot_start_boundary:].index[0]
            self.bid_submission_time = self.gate_closure - pd.offsets.DateOffset(hours = 1)
            
            logging.info("new self.lower_slot_start_boundary = " + str(self.lower_slot_start_boundary))
            logging.info("self.gate_closure = " + str(self.gate_closure))
            logging.info("self.slot_start = " + str(self.slot_start))
            logging.info("self.bid_submission_time = " + str(self.bid_submission_time))

        self.total_slot_FCR_demand = self.tenders_df[str(self.slot_start):]["total"][0] 
        self.done = False

        self.previous_delivery_results = self.delivery_results.copy()
        logging.debug("self.delivery_results = " + str(self.delivery_results))
        self.delivery_results.clear()
        logging.debug("self.delivery_results after clearing")
        logging.debug("self.delivery_results = " + str(self.delivery_results))
        logging.debug("self.previous_delivery_results after clearing")
        logging.debug("self.previous_delivery_results = " + str(self.previous_delivery_results))
        
        self.delivery_results["slots_won"] = [0, 0, 0, 0, 0, 0]
        self.delivery_results["slot_prices_DE"] = [0., 0., 0., 0., 0., 0.]
        
        # reset for each episode 
        self._get_new_timestamps()
        
        # get new observation
        self._get_observation()
        
        # when first Episode is finished, set boolean.  
        self.initial = False
        
        self.logging_step += 1
        logging.debug("logging_step: " + str(self.logging_step))
        
        return self.observation
                
    
    def _get_new_timestamps(self):
                
        self.historic_data_start = self.bid_submission_time - pd.offsets.DateOffset(days=self.hist_window_size)
        self.historic_data_end =  self.bid_submission_time - pd.offsets.DateOffset(minutes = 15)
        logging.debug("self.historic_data_start = " + str(self.historic_data_start))
        logging.debug("self.historic_data_end = " + str(self.historic_data_end))
        
        self.forecast_start = self.slot_start
        self.forecast_end = self.forecast_start + pd.offsets.DateOffset(days=self.forecast_window_size) - pd.offsets.DateOffset(minutes=15) 
        logging.debug("self.forecast_start = " + str(self.forecast_start))
        logging.debug("self.forecast_end = " + str(self.forecast_end))

        self.market_start = self.slot_start
        self.market_end = self.market_start + pd.offsets.DateOffset(hours=24) - pd.offsets.DateOffset(minutes = 15)
        logging.debug("self.market_start = " + str(self.market_start))
        logging.debug("self.market_end = " + str(self.market_end))

        self.slot_date_list = self.tenders_df[self.market_start:][0:6].index
        
        '''self.slot_date_list = []
        slot_date = self.market_start 
        for i in range(0,6):
            self.slot_date_list.append(str(slot_date))
            slot_date = slot_date + pd.offsets.DateOffset(hours=4)  '''
            
        logging.debug("self.slot_date_list = " + str( self.slot_date_list))
    
    
    def _add_gaussian_noise(self, data, whole_data):
        mean = 0.0
        standard_deviation = np.std(whole_data)
        standard_deviation_gaussian = standard_deviation *  0.2 # for 20% Gaussian noise
        noise = np.random.normal(mean, standard_deviation_gaussian, size = data.shape)
        data_noisy = data + noise
        # Set negative values to 0 
        data_noisy = data_noisy.clip(min=0)

        return data_noisy 
    
    
    def _get_observation(self):
        
        '''if (self.done is False) and (self.initial is False):
            print("if schleife 1 ")
            print("done = " + str(self.done))
            print("initial = " + str(self.initial))
            
            self.observation["slots_won"] = np.array(self.delivery_results["slots_won"], dtype=np.int32)
            self.observation["slot_prices_DE"] = np.array(self.delivery_results["slot_prices_DE"], dtype=np.float32)
            
            
        if (self.done is True) or (self.initial is True):
            print("if schleife 2 ")
            print("done = " + str(self.done))
            print("initial = " + str(self.initial))'''

        asset_data_historic = self.asset_data_total[str(self.historic_data_start) : str(self.historic_data_end)].to_numpy(dtype=np.float32)
        logging.debug("asset_data_historic = " + str(self.asset_data_total[str(self.historic_data_start) : str(self.historic_data_end)]) )

        asset_data_forecast = self.asset_data_total[str(self.forecast_start) : str(self.forecast_end)].to_numpy(dtype=np.float32)
        logging.debug("asset_data_forecast = "  + str(self.asset_data_total[str(self.forecast_start) : str(self.forecast_end)]))

        # add gaussian noise to data
        noisy_asset_data_forecast = self._add_gaussian_noise(asset_data_forecast, self.asset_data_total)
        noisy_asset_data_forecast = noisy_asset_data_forecast.astype(np.float32)
        logging.debug("noisy_asset_data_forecast = "  + str(noisy_asset_data_forecast))

        # for predicted market Prices try naive prediction: retrieve price of same day last week 
        market_start_last_week = self.market_start - pd.offsets.DateOffset(days=7) 
        market_end_last_week = self.market_end - pd.offsets.DateOffset(days=7)
        logging.debug("market_start_last_week = "  + str(market_start_last_week))
        logging.debug("market_end_last_week = "  + str(market_end_last_week))
        predicted_market_prices = self.market_results["DE_SETTLEMENTCAPACITY_PRICE_[EUR/MW]"][str(market_start_last_week) : str(market_end_last_week)].to_numpy(dtype=np.float32)
        logging.debug("predicted_market_prices = "  + str(predicted_market_prices))
        if len(predicted_market_prices) < 6:
            # predicted_market_prices list is smaller than 6 so fake is generated mean of first week
            predicted_market_prices = np.array([ 17.48, 17.48, 17.48, 17.48, 17.48, 17.48], dtype=np.float32) 
            logging.debug("predicted_market_prices list is smaller than 6 so fake is generated: "  + str(predicted_market_prices))
        
        time_features = self.time_features_df[str(self.market_start) : str(self.market_end)]
        logging.debug(self.time_features_df[str(self.market_start) : str(self.market_end)])

        weekday = int(time_features["weekday"][0])
        week = int(time_features["week"][0])
        month = int(time_features["month"][0])
        isHoliday = int(time_features["is_holiday"][0])
        followsHoliday = int(time_features["followsHoliday"][0])
        priorHoliday = int(time_features["priorHoliday"][0])

        slots_won =  np.array(self.delivery_results["slots_won"], dtype=np.int32)
        slot_prices_DE = np.array(self.delivery_results["slot_prices_DE"], dtype=np.float32)

        self.observation = OrderedDict({
            "asset_data_historic": asset_data_historic,
            "asset_data_forecast": noisy_asset_data_forecast,
            "predicted_market_prices": predicted_market_prices,
            "weekday": weekday, 
            "week": week, 
            "month": month,
            "isHoliday": isHoliday, 
            "followsHoliday": followsHoliday,
            "priorHoliday": priorHoliday,
            "slots_won": slots_won,
            "slot_prices_DE": slot_prices_DE
            })
        logging.debug("NEW Observation = "  + str(self.observation))
            
    
    
    def step(self, action):
        """Run one auction day for the given bid and return the gym step tuple.

        Args:
            action: Sequence of 12 values; entries 0-5 are the bid sizes and
                entries 6-11 the bid prices of the six slots.

        Returns:
            tuple: ``(observation, step_reward, done, info)``. ``done`` is
            always ``True`` because every episode consists of a single step.
        """
        # split the flat action into sizes and prices
        bid = {"size": action[0:6], "price": action[6:]}

        # VPP simulation, market clearing with the agent's bid, delivery
        # preparation and finally the reward for this bid
        self._simulate_vpp()
        self._simulate_market(bid)
        self._prepare_delivery()
        step_reward = self._calculate_reward(bid)
        self.total_reward += step_reward

        info = {
            "bid_submission_time": str(self.bid_submission_time),
            "step_reward": round(step_reward, 2),
            "total_reward": round(self.total_reward, 2),
            "total_profit": round(self.total_profit, 2),
        }
        self._update_history(info)

        self.done = True
        self._get_observation()

        # metrics are only sent to wandb for training and evaluation runs
        if self.env_type in ("training", "eval"):
            metrics = {
                "global_step": self.logging_step,
                "total_reward": self.total_reward,
                "total_profit": self.total_profit,
                "step_reward": step_reward,
            }
            if self.env_type == "training":
                # no explicit step here; render() commits the row afterwards
                wandb.log(metrics, commit=False)
                self.render()
            else:
                wandb.log(metrics, step=self.logging_step, commit=False)

        return self.observation, step_reward, self.done, info
    
    
    def _calculate_reward(self, action_dict):        
        # Step 1 of Reward Function: The Auction
        # did the agent win the auction? 
        # what was the revenue ?
        
        step_reward = 0
        
        # per slot won: + 100
        # per slot won: + (bid size *  marginal prize)
        # per slot lost: -100

        logging.info("Reward Overview:")
        logging.debug("self.delivery_results['slots_won']: " + str(self.delivery_results["slots_won"]))
        logging.debug("len(self.delivery_results['slots_won']) : "  + str(len(self.delivery_results["slots_won"])))       
        
        for slot in range(0, len(self.delivery_results["slots_won"])):
            
            logging.debug("slot no. " + str(slot))
            
            if self.delivery_results["slots_won"][slot] == 0:
                logging.debug("slot no " + str(slot) + " was lost")
                step_reward -= 100

            if self.delivery_results["slots_won"][slot] == 1:
                logging.debug("slot no. " + str(slot)+  " was won")

                # Approach 1 : first reward the won slot, then check if it could be delivered and give huge negative reward (-1000)
                # Approach 2 : first check if won slot could be delivered and then calculate partial reward (60 minutes - penalty minutes / 60 ) * price * size 
                # we try Approach 1 
    
                
                # Step 1: award the agent for a won slot
                step_reward += 100
                
                # Step 2: Calculate the Profit of the bid if won 
                
                # extract the bid size of the agent 
                agents_bid_size = self.delivery_results["agents_bid_sizes_round"][slot]
                # and calculate the reward by multiplying the bid size with the settlement price of the slot
                step_profit = (agents_bid_size * self.delivery_results["slot_prices_DE"][slot])
                
                # Step 3: validate if the VPP can deliver the traded capacity
                self._simulate_delivery(slot, action_dict)
                logging.debug("self.delivery_results['delivered_slots']")
                logging.debug(self.delivery_results["delivered_slots"])

                # Step 4: if the capacity can not be delivered give a high Penalty
                if self.delivery_results["delivered_slots"][slot] == False:
                    step_reward -= 5000
                
                # Update the total profit and Step Reward. 
                self._update_profit(step_profit)
                step_reward +=  step_profit
                
                logging.debug("agents_bid_size: " + str(agents_bid_size))
                logging.debug("self.delivery_results['slot_prices_DE'][slot]: " + str(self.delivery_results["slot_prices_DE"][slot]))
                logging.debug("step_profit: " + str(step_profit))
                
            logging.info("step_reward Slot " + str(slot) +" = " + str(step_reward))
        
    
        
        # further rewards? 
        # diff to the settlement price
        # diff to the max. forecasted capacity of the VPP
        # incentive to go nearer to settlement price or forecasted capacity can be: 1- (abs(diff_to_capacity)/max_diff_to_capacity)^0.5
        # idea: reward for positive and negative reward separate. 
        
        # Alternative solution: 
        # A reward function, that combines penalty and delivered FCR: 
        # compensation = (60 minutes - penalty minutes / 60 ) * price * size 
        # penalty  = (penalty minutes / 60 ) * price * size 
        # reputation_damage = reputation_factor *  penalty_min/ 60 * size
            # penalty_min = number of minutes where capacity could not be provided
        # in total: r = compensation − penalty − reputation_damage,
        
        return step_reward
    
    
    def _update_profit(self, step_profit):
        
        self.total_profit += step_profit
        
    
    def _update_history(self, info):
        if not self.history:
            self.history = {key: [] for key in info.keys()}

        for key, value in info.items():
            self.history[key].append(value)

            
    def render(self, mode="human"):
        if not self.delivery_results:
            logging.debug("self.delivery_results is empty, not plotting it ")
        else:
            # only plot to wandb when not in test mode
            if self.env_type != "test":
                if self.logging_step > 0: 
                    logging.debug(" now in render()")        
                    logging.debug(" self.previous_delivery_results " + str(self.previous_delivery_results))      
                    logging.debug(" self.delivery_results['slots_won'] " + str(self.delivery_results["slots_won"]))      

                    # Render Won / Lost Slots 
                    slots_won = self.previous_delivery_results["slots_won"]
                    logging.debug(" slots_won " + str(slots_won))      
                    slots_lost = [None,None,None,None,None,None]
                    for x in range(len(slots_won)):
                        if slots_won[x] == 1:
                            slots_lost[x] = 0
                        else:
                            slots_lost[x] = 1

                    data = {'Slot Won': slots_won, 'Slot Lost': slots_lost}
                    slots_df = pd.DataFrame(data=data, index=[1, 2, 3, 4, 5, 6])
                    logging.debug(" slots_df " + str(slots_df))
                    slots_won_plot = px.bar(slots_df,  x= slots_df.index, y=['Slot Won', 'Slot Lost'], color_discrete_sequence=[ "green", "gainsboro"] )

                    # Render Delivery for Capacity 
                    delivery_plot = go.Figure()
                    delivery_plot.add_trace(go.Scatter(x=list(range(1, 97)), y=self.previous_delivery_results["vpp_total"], fill='tozeroy', fillcolor='rgba(0, 85, 255, 0.4)',  line_color="blue", name="VPP Cap."))
                    delivery_plot.add_trace(go.Scatter(x=list(range(1, 97)), y=self.previous_delivery_results["vpp_total_FCR"], fill='tozeroy', line_color="green", name="VPP FCR Cap." )) 
                    delivery_plot.add_trace(go.Scatter(x=list(range(1, 97)), y=self.previous_delivery_results["bid_sizes_all_slots"], fill='tozeroy', fillcolor='rgba(255, 0, 0, 0.5)', line_color="red", name="Agents Bid" )) 

                    # Render Delivery for each Slot 
                    slots_delivered = [None,None,None,None,None,None]
                    for slot in range(6):
                        if self.previous_delivery_results["delivered_slots"][slot] == True:
                            slots_delivered[slot] = 1
                        else: 
                            slots_delivered[slot] = 0
                    slots_not_delivered = [None,None,None,None,None,None]
                    for x in range(len(slots_delivered)):
                        if slots_delivered[x] == 1:
                            slots_not_delivered[x] = 0
                        else:
                            slots_not_delivered[x] = 1
                            
                    data = {'delivered': slots_delivered, 'NOT deliv.': slots_not_delivered}
                    slots_delivered_df = pd.DataFrame(data=data, index=[1, 2, 3, 4, 5, 6])
                    logging.debug(" slots_delivered_df " + str(slots_delivered_df))
                    slots_delivered_plot = px.bar(slots_delivered_df,  x= slots_delivered_df.index, y=['delivered', 'NOT deliv.'], color_discrete_sequence=[ "lawngreen", "red"] )

    
                    # Render Agents Slot Prices and Settlement Prices 

                    price_plot = go.Figure()
                    price_plot.add_trace(go.Scatter(x=list(range(1,7)), y=self.previous_delivery_results["settlement_price_DE"], line_color="blue", name="Market Price"))
                    price_plot.add_trace(go.Scatter(x=list(range(1,7)), y=self.previous_delivery_results["agents_bid_prices"] , line_color="red", name="Agents Price" )) 


                    if self.env_type != "test":
                        
                        if self.env_type == "training":
                            wandb.log({
                                "Won / Loss of Slots": slots_won_plot,
                                "Sold and Available Capacity" : delivery_plot,
                                "Agents and Settlement Prices per Slot" : price_plot,
                                "Delivery per Slot": slots_delivered_plot},
                                #step=self.logging_step,
                                commit=True
                            )
                            
                        if self.env_type == "eval":
                            wandb.log({
                                "global_step": self.logging_step,
                                "Won / Loss of Slots": slots_won_plot,
                                "Sold and Available Capacity" : delivery_plot,
                                "Agents and Settlement Prices per Slot" : price_plot,
                                "Delivery per Slot": slots_delivered_plot},
                                step=self.logging_step,
                                commit=False
                            )

    
    def _simulate_vpp(self):
        
        vpp_total = self.asset_data_total[str(self.market_start) : str(self.market_end)].to_numpy(dtype=np.float32)
        vpp_total_FCR = self.asset_data_FCR_total[str(self.market_start) : str(self.market_end)].to_numpy(dtype=np.float32)
        
        self.delivery_results["vpp_total"] = vpp_total
        self.delivery_results["vpp_total_FCR"] = vpp_total_FCR
        self.delivery_results["bid_sizes_all_slots"] = [0] * 96
        
    

    def _simulate_market(self, action_dict):
        
        auction_bids = self.bids_df[self.market_start : self.market_end]
        logging.debug("auction_bids = ")        
        logging.debug(self.bids_df[self.market_start : self.market_end])
        
        logging.info("Bid Submission time (D-1) = %s" % (self.bid_submission_time))
        logging.info("Gate Closure time (D-1) = %s" % (self.gate_closure))
        logging.info("Historic Data Window: from %s to %s " % (self.historic_data_start, self.historic_data_end))
        logging.info("Forecast Data Window: from %s to %s " % (self.forecast_start, self.forecast_end))

        self.delivery_results["agents_bid_prices"] = [None,None,None,None,None,None]
        self.delivery_results["settlement_price_DE"] = [None,None,None,None,None,None]
        self.delivery_results["agents_bid_sizes_round"] = [None,None,None,None,None,None]
        self.delivery_results["slots_won"] = [None,None,None,None,None,None]

        for slot in range(0, len(self.slot_date_list)):
            slot_date = self.slot_date_list[slot]
            logging.info("Current Slot Time: (D) = %s" % (slot_date)) 
            slot_bids = auction_bids[slot_date : slot_date].reset_index(drop=True).reset_index(drop=False)
            logging.debug("slot_bids = " + str(slot_bids))
            slot_bids_list = slot_bids.to_dict('records')
            logging.debug("slot_bids_list = " + str(slot_bids_list))
            # extract the bid size out of the agents action
            # ROUND TO FULL INTEGER
            agents_bid_size = round(action_dict["size"][slot])
            self.delivery_results["agents_bid_sizes_round"][slot] = agents_bid_size
            # extract the bid price out of the agents action
            agents_bid_price = action_dict["price"][slot]
            # TODO: add to delivery results
            self.delivery_results["agents_bid_prices"][slot] = agents_bid_price
            logging.info("agents_bid_size = %s" % (agents_bid_size))
            logging.info("agents_bid_price = %s" % (agents_bid_price))            
            # get settlement price
            settlement_price_DE = [bid['settlement_price'] for bid in slot_bids_list if bid['country']== "DE"][0] 
            logging.info( "settlement_price_DE : " + str(settlement_price_DE))
            self.delivery_results["settlement_price_DE"][slot] = settlement_price_DE

            
            # First check if agents bid price is higher than the settlement price of Germany 
            # OR if agents bid size is 0 
            if (agents_bid_price > settlement_price_DE) or (agents_bid_size == 0):
                # if it is higher, the slot is lost. 
                self.delivery_results["slots_won"][slot] = 0
                # set settlement price for the current auctioned slot in slot_prices_DE list
                self.delivery_results["slot_prices_DE"][slot] = settlement_price_DE
            else: 
                # If agents bid price is lower than settlement price (bid could be in awarded bids)
                # get CBMP of countries without LMP
                unique_country_bids = list({v['country']:v for v in slot_bids_list}.values())
                grouped_prices = [x['settlement_price'] for x in unique_country_bids]
                cbmp = max(set(grouped_prices), key = grouped_prices.count)
                logging.info( "cbmp : " + str(cbmp))
                # check if settlement_price_DE is same as CBMP (no limit constraints where hit)
                if cbmp == settlement_price_DE:
                    price_filter = cbmp
                    logging.debug("DE has CBMP")
                else: 
                    # if Germany has a price based on limit constraints
                    price_filter = settlement_price_DE
                    logging.debug("DE has LMP")
                                
                # as the probability is high that the agents bid moved the last bid out of the list, 
                # we have to check which bids moved out of the list and what is the new settlement price
                
                # sort the bid list based on the price
                slot_bids_list_sorted_by_price = sorted(slot_bids_list, key=lambda x: x['price'])
                # filter the bid list by the settlement price of either the CBMP or the LMP of germany 
                #slot_bids_prices_filtered = [bid['price'] for bid in slot_bids_list_sorted_by_price if bid['settlement_price']== price_filter]
                #logging.debug(slot_bids_prices_filtered)
                slot_bids_filtered = [bid for bid in slot_bids_list_sorted_by_price if bid['settlement_price']== price_filter]
                accumulated_replaced_capacity = 0
                
                slot_bids_filtered_size_sum = sum([bid['size'] for bid in slot_bids_filtered])
                    # for the case the action_dict space is not dynamic and agent can choose any bid size,
                    # it needs to be checked here if 
                if agents_bid_size >= slot_bids_filtered_size_sum:
                    logging.debug("unrealistic bid size")
                    # set auction won to false
                    self.delivery_results["slots_won"][slot] = 0
                    # set settlement price to zero as it is an unrealistic auciton
                    self.delivery_results["slot_prices_DE"][slot] = 0
                else:
                    for bid in range(0, len(slot_bids_filtered)): 
                        logging.debug("bid size = " + str(slot_bids_filtered[-(bid+1)]["size"]))
                        logging.debug("bid price = " + str(slot_bids_filtered[-(bid+1)]["price"]))
                        bid_capacity = slot_bids_filtered[-(bid+1)]["size"]
                        accumulated_replaced_capacity += bid_capacity
                        logging.debug("accumulated_replaced_capacity = " + str( accumulated_replaced_capacity))
                            
                        if accumulated_replaced_capacity >= agents_bid_size:
                            logging.debug("realistic bid size")
                            if slot_bids_filtered[-(bid+1)]["indivisible"] is False:
                                logging.debug("bid is divisible, so current bids price is new settlement price")
                                new_settlement_price_DE = slot_bids_filtered[-(bid+1)]["price"]
                            else:
                                logging.debug("bid is INDIVISIBLE, so move one bids further is new settlement price")
                                accumulated_replaced_capacity -= bid_capacity
                                continue
                            logging.info("new_settlement_price_DE = " + str( new_settlement_price_DE))
                            # set boolean for auction win
                            self.delivery_results["slots_won"][slot] = 1
                            # set settlement price for the current auctioned slot in slot_prices_DE list
                            self.delivery_results["slot_prices_DE"][slot] = new_settlement_price_DE
                            break

            logging.info("self.delivery_results['slots_won'] = ")
            logging.info("\n".join("slot won: \t{}".format(k) for k in self.delivery_results["slots_won"]))
            logging.info("     agents bid_size = ")
            logging.info("\n".join("size: \t{}".format(round(k) )for k in action_dict["size"]))            
            logging.info("self.delivery_results['slot_prices_DE'] = ")
            logging.info("\n".join("price: \t{}".format(k) for k in self.delivery_results["slot_prices_DE"]))
            
            
    def _prepare_delivery(self):
        '''
        Was macht die funktion? 
        
        parameter
        
        return
        
        '''
        
        # extend slot bid size format from 6 slots to 96 time steps
        bid_sizes_list = []
        for slot_x in range (0,6): 
            for time_step in range(0,16):
                #bid_sizes_list.append(action_dict["size"][slot_x])
                bid_sizes_list.append(self.delivery_results["agents_bid_sizes_round"][slot_x])
        bid_sizes_all_slots = np.array(bid_sizes_list)
        self.delivery_results["agents_bid_sizes_round_all_slots"] = bid_sizes_all_slots
        self.delivery_results["bid_sizes_all_slots"] = bid_sizes_all_slots
        logging.debug("self.delivery_results['bid_sizes_all_slots'] : "  + str(self.delivery_results['bid_sizes_all_slots']))
        
        # initialize slots dict
        self.delivery_results["delivered_slots"] = {}
        # initialize slots in dict 
        for slot in range (0,6):
            self.delivery_results["delivered_slots"][slot] = None
            
            
    def _check_delivery_possible(self, agent_bid_size, vpp_total_FCR_slot):
        '''
        
        '''
                
        # 2. Probability of maximum Delivery Amount (74 % :  0 - 5 %  Capacity , 18 % : 5 - 10 % , 5% : 10-15% ( 97%: max 15 %)

        max_delivery_share = random.choices(
             population=[0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0],
             weights=   [0.74, 0.18, 0.05, 0.02, 0.007, 0.001, 0.001, 0.001],
             k=1
         )

        capacity_to_deliver = max_delivery_share[0] * agent_bid_size

        logging.debug("agent_bid_size : " + str(agent_bid_size))
        logging.debug("max_delivery_share : " + str(max_delivery_share[0]))
        logging.debug("capacity_to_deliver : " + str(capacity_to_deliver))
        
        # 3. Probability of successfull Delivery (100%: 10% of HPP Capacity = Probability Curve)

        mean = 0 # symmetrical normal distribution at 0 
        sd = self.maximum_possible_VPP_capacity/7

        max_at_10_percent = norm.pdf(self.maximum_possible_VPP_capacity*0.1,mean,sd)
        scale_factor = 1 / max_at_10_percent

        logging.debug("max_at_10_percent = " + str(max_at_10_percent))
        logging.debug("scale_factor = " + str(scale_factor))

        # Plot between -max_power and max_power with .001 steps.
        #x_axis = np.arange(-max_power, max_power, 0.001)
        #plt.plot(x_axis, (norm.pdf(x_axis, mean, sd)) * scale_factor + shift_to_100)
        #plt.show()

        propab_of_delivery = round((norm.pdf(capacity_to_deliver, mean,sd) * scale_factor),3)

        if propab_of_delivery > 1.0: 
            propab_of_delivery =  1.0

        logging.debug("propab_of_delivery = " + str(propab_of_delivery))

        delivery_possible = random.choices(
             population=[True, False],
             weights=   [propab_of_delivery , (1-propab_of_delivery)],
             k=1
         )
        delivery_possible = delivery_possible[0] # as bool is in list 
        
        # 4. Check VPP Boundaries: In case of a very high or low operating point (nearly 100% or 0% power output of the HPP): 
        # then the delivery is not possible. 
        
        # check if probability of delivery is high and capacity could be deliverd
        if delivery_possible == True:
            # when negative FCR :
            if capacity_to_deliver < 0:
                if (vpp_total_FCR_slot - abs(capacity_to_deliver)) < 0:
                    logging.error("Error, FCR is smaller than vpp_total_FCR_slot")
                    delivery_possible = False
            # if positive FCR
            else:
                if (vpp_total_FCR_slot + capacity_to_deliver) > self.maximum_possible_VPP_capacity: 
                    delivery_possible = False
      
        return delivery_possible

    def _simulate_delivery(self, slot, action_dict): 
        logging.debug("Delivery Simulation for Slot No. " + str(slot))
        
        #vpp_total_slot = self.delivery_results["vpp_total"][slot *16 : (slot+1)*16]
        vpp_total_FCR_slot = self.delivery_results["vpp_total_FCR"][slot *16 : (slot+1)*16]
        bid_sizes_per_slot = self.delivery_results["bid_sizes_all_slots"][slot *16 : (slot+1)*16]
        
        logging.debug("vpp_total_FCR_slot " + str(vpp_total_FCR_slot))
        logging.debug("bid_sizes_per_slot " + str(bid_sizes_per_slot))


        delivery_possible = None
        delivery_possible_list = []

        # check for every timestep
        for time_step in range(0, 16):
        
            agent_bid_size = bid_sizes_per_slot[time_step]
           
            logging.debug("vpp_total_FCR_slot[time_step] : " + str(vpp_total_FCR_slot[time_step]))
            logging.debug("bid_sizes_per_slot[time_step] : " + str(bid_sizes_per_slot[time_step]))

            # check if positive FCR could be provided 
            delivery_possible = self._check_delivery_possible(agent_bid_size, vpp_total_FCR_slot[time_step])
            delivery_possible_list.append(delivery_possible)
            # check if negative FCR could be provided 
            delivery_possible = self._check_delivery_possible(-agent_bid_size, vpp_total_FCR_slot[time_step])
            delivery_possible_list.append(delivery_possible)

        if all(delivery_possible_list): 
            total_delivery_possible = True 
        else: 
            total_delivery_possible = False 
            
        logging.debug("total_delivery_possible for slot " + str(slot) + " : " + str(total_delivery_possible))
        self.delivery_results["delivered_slots"][slot] = total_delivery_possible
           

Idea

1. Not considered:
    - Probability of delivery length (60 %: maximum of 10 s, 15 %: 11-20 s, 5 %: 21-30 s, 5 %: 31-60 s, 5 %: 1-5 min)
2. Considered:
    - Probability of delivery amount (50 %: 0-5 % of capacity, 27.5 %: 5-10 %, 12 %: 10-15 %; i.e. roughly 90 %: max. 15 %)
    - Probability of successful delivery (100 % up to 10 % of HPP capacity, then decreasing along a probability curve)


## Register the Environment 

In [None]:
from gym.envs.registration import register

# The same environment class is registered three times. The variants only
# differ in their id, log level ("DEBUG", "INFO" or "WARNING") and env_type.
for env_id, log_level, env_type in (
    ("VPPBiddingEnv-TRAIN-v1", "DEBUG", "training"),
    ("VPPBiddingEnv-EVAL-v1", "DEBUG", "eval"),
    ("VPPBiddingEnv-TEST-v1", "INFO", "test"),
):
    register(
        id=env_id,
        entry_point="__main__:VPPBiddingEnv",
        max_episode_steps=1,
        kwargs={
            'config_path': "vpp_config_4.json",
            'log_level': log_level,
            'env_type': env_type,
        },
    )

In [None]:
from stable_baselines3.common.env_checker import check_env
# Instantiate the registered TEST variant of the environment and run the
# Stable-Baselines3 environment checker on it. The checker inspects the custom
# environment and outputs additional warnings if needed.
env_to_check = make('VPPBiddingEnv-TEST-v1')
check_env(env_to_check)


## Stable Baselines

## DDPG: Deep Deterministic Policy Gradient (DDPG) 

### Train

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped first in the SB3 Monitor and then in gym's
# RecordEpisodeStatistics.
env = make('VPPBiddingEnv-TRAIN-v1')
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


# Run configuration; it is also handed to wandb.init below.
config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": 697
}

# The noise objects for DDPG: zero-mean Gaussian noise with a standard
# deviation of 0.1 for each of the n_actions action dimensions.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
    entity="jlu237", 
    tags=["new_action_high_price", "new_action_high_size", "delivery_simulation_4.0", "DDPG","training", "wind_config", "single vpp obs", "vpp config", "13MW config", "4kprice", "updated_reward" , "delivery_against_FCR", "pred_market_prices"], 
    job_type="training"
)

# `model` stays available as a notebook global; the evaluation cell calls
# model.predict() on it.
model = DDPG(config['policy'], env, action_noise=action_noise, verbose=1, tensorboard_log=f"runs/ddpg")

model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=2))
# Close the W&B run that was opened by wandb.init above.
wandb.finish()


### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment (EVAL variant), wrapped in RecordEpisodeStatistics.
eval_env = make('VPPBiddingEnv-EVAL-v1')
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=False,  # tensorboard syncing is switched off for this run
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
    entity="jlu237", 
    tags=["delivery_simulation_4.0","eval", "single vpp obs", "vpp config", "price plots","updated_reward" , "delivery_against_FCR", "pred_market_prices"],
    job_type="eval"
)


# Collects one row (episode number, bid submission time) per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 697

# `model` is not created in this cell; it has to exist already from a
# previously executed training cell.
for i_episode in range(episodes):
    observation = eval_env.reset()
    # A single step per episode (the environments are registered with
    # max_episode_steps=1).
    for t in range(1):
        eval_env.render(mode="human")
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            # Log the episode reward at the environment's own logging step.
            wandb.log({
                #"global_step": eval_env.logging_step
                "episode_reward": reward,
                "episode": i_episode},
                step=eval_env.logging_step,
                commit=True)
            break
            
# The following string literal is a disabled wandb.log call; it has no effect.
'''wandb.log({"bid_submission_time" : tbl,
          "global_step": eval_env.logging_step},
         commit=True)'''

wandb.run.summary["bid_submission_time_table"] = tbl

eval_env.close()
# Uses the `info` dict returned by the last step of the loop above.
# NOTE(review): assumes info["total_reward"] is the reward accumulated by the
# environment over all evaluated episodes - confirm against the env's step().
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()

.predict() parameters: 
    
- deterministic (bool)

## Tuning DDPG

### Parameters

- policy = "MlpPolicy" , "CnnPolicy" , "MultiInputPolicy"
- **learning_rate** = static value or a schedule, i.e. a function of the remaining progress (which goes from 1 to 0)
- buffer_size (int) – size of the replay buffer
- **learning_starts (int)** – how many steps of the model to collect transitions for before learning starts
    -  For a fixed number of steps at the beginning (set with the start_steps keyword argument), the agent takes actions which are sampled from a uniform random distribution over valid actions. After that, it returns to normal DDPG exploration.
- batch_size (int) – Minibatch size for each gradient update
- **tau (float)** – the soft update coefficient (“Polyak update”, between 0 and 1)
- gamma (float) – the discount factor
- train_freq (Union[int, Tuple[int, str]]) – Update the model every train_freq steps. Alternatively pass a tuple of frequency and unit like (5, "step") or (2, "episode").
- gradient_steps (int) – How many gradient steps to do after each rollout (see train_freq) Set to -1 means to do as many gradient steps as steps done in the environment during the rollout.
- action_noise (Optional[ActionNoise]) – the action noise type (None by default), this can help for hard exploration problem. Cf common.noise for the different action noise type.
    -  uncorrelated, mean-zero Gaussian noise works perfectly well. 
    -  To facilitate getting higher-quality training data, you may reduce the scale of the noise over the course of training. (We do not do this in our implementation, and keep noise scale fixed throughout.)


- replay_buffer_class (Optional[ReplayBuffer]) – Replay buffer class to use (for instance HerReplayBuffer). If None, it will be automatically selected.
- optimize_memory_usage (bool) – Enable a memory efficient variant of the replay buffer at a cost of more complexity. See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
- create_eval_env (bool) – Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment)

- seed (Optional[int]) – Seed for the pseudo random generators
- _init_setup_model (bool) – Whether or not to build the network at the creation of the instance





stable_baselines3.ddpg.MlpPolicy Parameters
- lr_schedule (Callable[[float], float]) – Learning rate schedule (could be constant)
- n_critics (int) – Number of critic networks to create.

stable_baselines3.ddpg.MlpPolicy.set_training_mode()
- mode (bool) – if true, set to training mode, else set to evaluation mode

stable_baselines3.ddpg.CnnPolicy

stable_baselines3.ddpg.MultiInputPolicy


In [None]:
# hide all deprecation warnings from tensorflow
#import tensorflow as tf
#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import optuna

#from stable_baselines import PPO2
from stable_baselines3 import DDPG
from stable_baselines3 import HerReplayBuffer
from gym.wrappers import RecordEpisodeStatistics
from stable_baselines3.common.noise import NormalActionNoise
#from stable_baselines.common.evaluation import evaluate_policy
#from stable_baselines.common.cmd_util import make_vec_env

# https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb
#from custom_env import GoLeftEnv

# The noise objects for DDPG.
# `env` is not created in this cell; it has to exist already from a previously
# executed cell. The noise is zero-mean Gaussian with a standard deviation of
# 0.1 per action dimension and is offered to the trials as the "action_noise"
# choice in optimize_ddpg.
n_actions = env.action_space.shape[-1]
normal_action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))


def optimize_ddpg(trial):
    """Sample the DDPG hyperparameters to optimise for one Optuna trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial that suggests the values.

    Returns
    -------
    dict
        The sampled values under the keys ``learning_rate``,
        ``learning_starts``, ``batch_size``, ``tau``, ``gamma``,
        ``replay_buffer_class`` and ``action_noise``. ``optimize_agent``
        passes them as keyword arguments to ``DDPG``.
    """
    # The categorical choices are plain strings; they are mapped to the
    # actual objects afterwards.
    replay_buffer_choice = trial.suggest_categorical("replay_buffer_class", ["HER", "None"])
    replay_buffer_class = {"HER": HerReplayBuffer, "None": None}[replay_buffer_choice]

    action_noise_choice = trial.suggest_categorical("action_noise", ["action_noise", "None"])
    action_noise = {"action_noise": normal_action_noise, "None": None}[action_noise_choice]

    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0, log=True),  # default: 0.001
        'learning_starts': trial.suggest_int('learning_starts', 0, 200, step=10),  # default: 100
        # The lower bound is 10, not 0: a batch size of 0 would make every
        # gradient update run on an empty minibatch.
        'batch_size': trial.suggest_int('batch_size', 10, 200, step=10),  # default: 100
        'tau': trial.suggest_float('tau', 0.001, 1.0, log=True),  # default: 0.005
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999, log=True),  # default: 0.99
        'replay_buffer_class': replay_buffer_class,
        'action_noise': action_noise,
    }
        



def optimize_agent(trial):
    """Train a DDPG agent with the trial's hyperparameters and score it.

    The hyperparameters come from ``optimize_ddpg``. Each trial is tracked
    as its own W&B run, which is closed even when training fails.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial that suggests the hyperparameters.

    Returns
    -------
    float
        The negated mean reward of the episodes recorded by the ``Monitor``
        wrapper during training. The reward is negated because the study is
        created with Optuna's default direction, which minimises the
        objective. ``nan`` is returned if no episode was recorded.
    """
    model_params = optimize_ddpg(trial)

    # Init tracking experiment: hyperparameters and trial id are stored.
    config = dict(trial.params)
    config["trial.number"] = trial.number
    wandb.init(
        project="RL-optuna",
        entity="jlu237",
        sync_tensorboard=True,
        config=config,
        reinit=True
    )

    try:
        # Keep a handle on the Monitor wrapper to read the episode rewards
        # after training.
        monitored_env = Monitor(make('VPPBiddingEnv-TRAIN-v1'))
        env = RecordEpisodeStatistics(monitored_env)  # record stats such as returns

        model = DDPG('MultiInputPolicy', env, verbose=0, tensorboard_log="runs/ddpg", seed=1, **model_params)
        model.learn(total_timesteps=697, log_interval=1)

        episode_rewards = monitored_env.get_episode_rewards()
    finally:
        wandb.finish()

    if not episode_rewards:
        return float("nan")
    return -float(np.mean(episode_rewards))
    
# create_study() is called without a direction, so Optuna's default applies
# and the objective value is minimised. Runs 20 trials of optimize_agent; a
# keyboard interrupt stops the optimisation without raising.
study = optuna.create_study()
try:
    study.optimize(optimize_agent, n_trials=20)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

In [None]:
# Inspection cell: create a fresh TRAIN environment (this rebinds the notebook
# global `env`) and display the "observation" entry of its observation space.
env = make('VPPBiddingEnv-TRAIN-v1')
env.observation_space.spaces["observation"]

In [None]:
# Report the outcome of the study stored in the notebook global `study`:
# number of trials, then value and parameters of the best trial.
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


In [None]:
# Inspection cell: show the parameter groups of the critic optimizer of the
# notebook-global `model`.
model.get_parameters()["critic.optimizer"]["param_groups"]

In [None]:
# Inspection cell: show the parameter groups of the actor optimizer of the
# notebook-global `model`.
model.get_parameters()["actor.optimizer"]["param_groups"]

In [None]:
# !apt-get install swig cmake ffmpeg freeglut3-dev xvfb

In [None]:
# Alternative from araffin for optuna from: https://github.com/optuna/optuna-examples/blob/52ed3aff3e3e936be3873b5acc6ee3ccdadea914/rl/sb3_simple.py#L60

""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a OpenAI Gym environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gym
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
import torch
import torch.nn as nn


# Settings of the A2C example study below.
N_TRIALS = 100            # trials passed to study.optimize
N_STARTUP_TRIALS = 5      # startup trials of the TPE sampler and the median pruner
N_EVALUATIONS = 2         # evaluations per trial; determines EVAL_FREQ
N_TIMESTEPS = int(2e4)    # training time steps per trial
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)  # evaluate every EVAL_FREQ callback calls
N_EVAL_EPISODES = 3       # episodes per evaluation

# NOTE: the example runs on CartPole, not on the VPP bidding environment.
ENV_ID = "CartPole-v1"

# Model arguments shared by every trial; the sampled values are added on top.
DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": ENV_ID,
}


def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Draw one A2C hyperparameter configuration from ``trial``.

    ``gamma`` and ``gae_lambda`` are sampled as their distance to 1 on a log
    scale, ``n_steps`` as a power of two. The effective values of these
    three are also stored as user attributes of the trial.

    :param trial: trial that supplies the suggested values
    :return: dict with ``n_steps``, ``gamma``, ``gae_lambda``,
        ``learning_rate``, ``ent_coef``, ``max_grad_norm`` and
        ``policy_kwargs`` (``net_arch``, ``activation_fn``, ``ortho_init``)
    """
    # Raw suggestions, requested in a fixed order.
    gamma_gap = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    gae_lambda_gap = trial.suggest_float("gae_lambda", 0.001, 0.2, log=True)
    n_steps_exponent = trial.suggest_int("exponent_n_steps", 3, 10)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    arch_name = trial.suggest_categorical("net_arch", ["tiny", "small"])
    activation_name = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # Derived values that are actually handed to the model.
    gamma = 1.0 - gamma_gap
    gae_lambda = 1.0 - gae_lambda_gap
    n_steps = 2 ** n_steps_exponent

    # Display true values
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("gae_lambda_", gae_lambda)
    trial.set_user_attr("n_steps", n_steps)

    if arch_name == "tiny":
        layer_spec = {"pi": [64], "vf": [64]}
    else:
        layer_spec = {"pi": [64, 64], "vf": [64, 64]}

    activation_classes = {"tanh": nn.Tanh, "relu": nn.ReLU}
    policy_kwargs = {
        "net_arch": [layer_spec],
        "activation_fn": activation_classes[activation_name],
        "ortho_init": ortho_init,
    }

    sampled = {}
    sampled["n_steps"] = n_steps
    sampled["gamma"] = gamma
    sampled["gae_lambda"] = gae_lambda
    sampled["learning_rate"] = learning_rate
    sampled["ent_coef"] = ent_coef
    sampled["max_grad_norm"] = max_grad_norm
    sampled["policy_kwargs"] = policy_kwargs
    return sampled


class TrialEvalCallback(EvalCallback):
    """Evaluation callback that reports its results to an Optuna trial.

    After every evaluation the last mean reward is reported to the trial
    together with a running evaluation index. If the trial asks to be
    pruned, ``is_pruned`` is set and ``_on_step`` returns ``False``.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        parent_kwargs = dict(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        super().__init__(**parent_kwargs)
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        """Evaluate when due, report to the trial; ``False`` means pruned."""
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        # Prune trial if needed
        if not self.trial.should_prune():
            return True
        self.is_pruned = True
        return False


def objective(trial: optuna.Trial) -> float:
    """Train one A2C model with sampled hyperparameters and score it.

    :param trial: trial that supplies the hyperparameters
    :return: the last mean evaluation reward, or ``nan`` when training was
        aborted by an ``AssertionError``
    :raises optuna.exceptions.TrialPruned: if the evaluation callback marked
        the trial as pruned
    """
    # Defaults first, sampled hyperparameters on top.
    model_kwargs = {**DEFAULT_HYPERPARAMS, **sample_a2c_params(trial)}
    model = A2C(**model_kwargs)

    # Separate environment for the periodic evaluation.
    eval_env = gym.make(ENV_ID)
    eval_callback = TrialEvalCallback(
        eval_env,
        trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
        deterministic=True,
    )

    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as err:
        # Sometimes, random hyperparams can generate NaN
        print(err)
        training_failed = True
    else:
        training_failed = False
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    if training_failed:
        # Tell the optimizer that the trial failed
        return float("nan")
    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()
    return eval_callback.last_mean_reward


# Runs the A2C example study and prints its outcome. The names `study` and
# `trial` assigned here are module-level names, so earlier bindings of those
# names are replaced.
if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    # The objective returns a mean reward, hence the study maximises.
    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        # Stops after N_TRIALS trials or when the timeout of 600 is reached.
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        # A manual interrupt ends the optimisation; the results are still printed.
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))

In [None]:
# code from https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/utils/hyperparams_opt.py#L340

def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Draw one DDPG hyperparameter configuration from ``trial``.

    Besides the suggestion methods the function reads ``trial.n_actions``
    (only when an action noise type is selected) and
    ``trial.using_her_replay_buffer``. If the latter is truthy, the result
    is passed through ``sample_her_params``.
    NOTE(review): neither attribute is set anywhere in the visible code -
    presumably the caller attaches them to the trial; confirm.

    :param trial: trial that supplies the suggested values
    :return: dict of the sampled hyperparameters
    """
    # Suggestions are requested in a fixed order.
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)
    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
    arch_name = trial.suggest_categorical("net_arch", ["small", "medium", "big"])

    layer_sizes = {"small": [64, 64], "medium": [256, 256], "big": [400, 300]}[arch_name]

    hyperparams = dict(
        gamma=gamma,
        tau=tau,
        learning_rate=learning_rate,
        batch_size=batch_size,
        buffer_size=buffer_size,
        train_freq=train_freq,
        # One gradient step per collected training step.
        gradient_steps=train_freq,
        policy_kwargs={"net_arch": layer_sizes},
    )

    if noise_type in ("normal", "ornstein-uhlenbeck"):
        noise_class = NormalActionNoise if noise_type == "normal" else OrnsteinUhlenbeckActionNoise
        hyperparams["action_noise"] = noise_class(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )

    if not trial.using_her_replay_buffer:
        return hyperparams
    return sample_her_params(trial, hyperparams)


In [None]:
!git clone --recursive https://github.com/DLR-RM/rl-baselines3-zoo

In [None]:
#!cd rl-baselines3-zoo/

In [None]:
!pip install -r rl-baselines3-zoo/requirements.txt

In [None]:
!python rl-baselines3-zoo/train.py --algo ddpg --env VPPBiddingEnv-TRAIN-v1 -n 697 -optimize --n-trials 5 --n-jobs -1 \
  --sampler tpe --pruner median

In [None]:
!python rl-baselines3-zoo/scripts/parse_study.py -i path/to/study.pkl --print-n-best-trials 10 --save-n-best-hyperparameters 10


## TD3 

### Train

In [None]:
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped twice for episode bookkeeping: SB3's Monitor and
# gym's RecordEpisodeStatistics (records stats such as returns).
env = make('VPPBiddingEnv-TRAIN-v1')
env = Monitor(env)
env = RecordEpisodeStatistics(env)

# Run configuration; also uploaded to W&B as the run's config.
config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": 697
}

# Exploration noise for TD3: zero-mean Gaussian with sigma 0.1, one component
# per entry of the last action-space dimension.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
    entity="jlu237",
    tags=["new_action_high_price", "new_action_high_size", "delivery_simulation_4.0", "TD3","training", "wind_config", "single vpp obs", "vpp config", "13MW config", "4kprice", "updated_reward" , "delivery_against_FCR", "pred_market_prices"],
    job_type="training"
)

try:
    # TD3 runs get their own TensorBoard directory (the same one the TD3 tuning
    # cell uses) instead of being mixed into the DDPG logs.
    model = TD3(config['policy'], env, action_noise=action_noise, verbose=1, tensorboard_log="runs/td3")

    model.learn(total_timesteps=config['total_timesteps'],
                log_interval=1,
                callback=WandbCallback(
                    gradient_save_freq=1,
                    verbose=2))
finally:
    # Close the W&B run even when training fails or is interrupted, so the next
    # wandb.init() in this notebook does not attach to a stale run.
    wandb.finish()


### Eval

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluate the TD3 `model` trained in the previous cell on the evaluation
# environment and report per-episode results to W&B.
eval_env = make('VPPBiddingEnv-EVAL-v1')
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
    entity="jlu237",
    # "TD3" and "wind_config" are two separate tags. The original literal
    # `"TD3, ""wind_config"` was fused by implicit string concatenation into
    # the single tag "TD3, wind_config".
    tags=["TD3", "wind_config", "eval", "single vpp obs", "vpp config", "price plots", "updated_reward", "delivery_against_FCR", "pred_market_prices"],
    job_type="eval"
)


# One row per finished episode: (episode index, bid submission time from `info`).
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 697

for i_episode in range(episodes):
    observation = eval_env.reset()
    # At most one step is taken per episode; an episode that is not `done`
    # after that step is not logged.
    for t in range(1):
        eval_env.render(mode="human")
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })

            break
wandb.log({"bid_submission_time" : tbl})
eval_env.close()
# Uses the `info` dict of the last step only.
# NOTE(review): this assumes the env accumulates `total_reward` across all
# episodes of the run - confirm against the environment implementation.
mean_ep_rew = info["total_reward"] / episodes

wandb.run.summary["mean_ep_rew"] = mean_ep_rew
print("Mean Episode Reward: " + str(mean_ep_rew))
wandb.finish()

### Tuning TD3

In [None]:
import optuna
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback


def sample_td3_params(trial: "optuna.Trial", n_actions: int = 12):
    """
    Sample one TD3 hyperparameter configuration from an Optuna trial.

    :param trial: Optuna trial used to draw every value.
    :param n_actions: Length of the action-noise mean/sigma vectors. Defaults
        to 12, the value that used to be hard-coded here; pass
        ``env.action_space.shape[-1]`` to follow the environment instead.
    :return: Dict of keyword arguments that can be unpacked into the ``TD3``
        constructor. ``action_noise`` is only present when a noise type other
        than ``None`` was sampled.
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 200])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])

    # train_freq / gradient_steps are deliberately not sampled here, so they
    # are not part of the returned dict.

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    # Sampled even when noise_type is None so the search space stays the same
    # for every trial.
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])

    # Translate the sampled size label into hidden-layer widths.
    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        # Uncomment for tuning HER
        # "verybig": [256, 256, 256],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "policy_kwargs": dict(net_arch=net_arch),
        "tau": tau,
    }

    # Zero-mean noise with the same sigma on every action component; no
    # `action_noise` entry at all when noise_type is None.
    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
        )

    return hyperparams




def optimize_agent_td3(trial, n_eval_episodes=10):
    """
    Optuna objective: train one TD3 agent and return its score.

    Optuna minimises the objective by default, so the mean episode reward is
    negated before it is returned. Without a returned value Optuna has nothing
    to optimise.

    :param trial: Optuna trial supplying the hyperparameters.
    :param n_eval_episodes: Number of episodes averaged for the score.
    :return: Negative mean episode reward of the trained agent (lower is better).
    """
    # Local import: only this function needs it.
    from stable_baselines3.common.evaluation import evaluate_policy

    model_params = sample_td3_params(trial)

    # init tracking experiment.
    # hyper-parameters, trial id are stored.
    config = dict(trial.params)
    config["trial.number"] = trial.number
    wandb.init(
        project="RL-optuna",
        entity="jlu237",
        sync_tensorboard=True,
        config=config,
        tags=["TD3"],
        reinit=True
    )

    try:
        env = make('VPPBiddingEnv-TRAIN-v1')
        env = Monitor(env)
        env = RecordEpisodeStatistics(env)  # record stats such as returns

        model = TD3('MultiInputPolicy', env, verbose=0, tensorboard_log="runs/td3", seed=1, **model_params)

        model.learn(total_timesteps=697,
                    log_interval=1,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))

        # Score the trained policy.
        # NOTE(review): this evaluates on the training environment; consider
        # scoring on the held-out evaluation environment instead.
        mean_reward, _std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)
        mean_reward = float(mean_reward)
        wandb.run.summary["mean_eval_reward"] = mean_reward
    finally:
        # Always close the W&B run, also when the trial fails or the study is
        # interrupted from the keyboard.
        wandb.finish()

    return -mean_reward
    
# Run the TD3 hyperparameter search: 20 trials of `optimize_agent_td3`.
# "minimize" is Optuna's default direction; it is spelled out here for clarity.
# A keyboard interrupt stops the search without a traceback.
study = optuna.create_study(direction="minimize")
try:
    study.optimize(func=optimize_agent_td3, n_trials=20)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

### PPO - Proximal Policy Optimization algorithm 

#### Train the agent

#### Evaluate Agent

## A2C - synchronous, deterministic variant of Asynchronous Advantage Actor Critic (A3C)

#### Training

#### Eval

## Other Algorithms

In [None]:
# todo

## DQN -- requires a discrete action space

# Testing

#### Run Episodes

### Check the Environment

# 2. Create a Deep Learning Model with Keras

# 3. Build Agent with Keras-RL


# 4. Reloading Agent from Memory


# Archive
