In [65]:
import json

import gym
import numpy as np
from gym import spaces

# Step 1: Defining the Environment using Gymnasium

User guide for defining custom environments: https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/

The only thing we need to do, really, is to map the Microgrid to an observation space, an action space and a reward function. We also need to implement a reset and step function that returns the current state.

For our actions, we have [produce, not produce] for Wind, Solar and Gas. Let's just start by having it binary, so that the production is either at max capacity or zero at a given time step.

The reward is just the Cost function from the assignment text at a time step.

The thing I struggle to wrap my head around is the observation space. Is this where we include the data from the files, so that the "state" of our environment is how much energy is produced given an action sequence and the wind speed / solar irradiance at a specific time step?

### Questions

- Our "episode" is one hour. But how many iterations are there per episode? Could we just get rid of this level? I see that most people use `episodes * iterations_per_episode` as the total number of steps, but do we just do one step per hour because of our data?

In [70]:
class Microgrid(gym.Env):
    """Work-in-progress Gym environment wrapping the microgrid.

    The class is still a skeleton: the observation sub-spaces, the termination
    flag and the reward are `...` placeholders that must be filled in before
    the environment can be used for training. What is already in place follows
    the Gym API: `reset` returns `(observation, info)` and `step` returns
    `(observation, reward, terminated, truncated, info)`.
    """

    def __init__(self, microcrid_settings=None):
        """Set up the observation and action spaces.

        Args:
            microcrid_settings: Optional dict of microgrid settings. Defaults
                to an empty dict. (The parameter name is kept as-is so that
                existing keyword callers keep working.)
        """
        super().__init__()

        # Use None as the default and copy the input: a `{}` default would be
        # one dict object shared by every instance created without arguments.
        self.microgrid_settings = dict(microcrid_settings or {})

        # TODO: Define observation space.
        # Each `...` must be replaced by a concrete gym space; spaces.Dict
        # expects Space instances as values, so construction is expected to
        # fail until these placeholders are filled in.
        self.observation_space = spaces.Dict(
            {
                "solarIrradience": ...,
                "wind": ...,
                "load": ...
            }
        )

        # TODO: Define action space.
        # Produce/Not-produce for 3 energy producing nodes.
        # NOTE(review): three independent on/off units have 2**3 = 8 joint
        # combinations; 6 only fits a "(unit, on/off)" encoding -- confirm
        # which encoding is intended.
        self.action_space = spaces.Discrete(3*2)

    # TODO: Define the _get_obs method
    def _get_obs(self):
        """Translate the environment's state into an observation.

        Kept as a separate private method so `reset` and `step` can share it.
        Not implemented yet: currently returns None.
        """
        return None

    # TODO: Define the Reset method
    def reset(self, seed=None, options=None):
        """Reset the environment and return `(observation, info)`.

        Args:
            seed: Optional seed forwarded to `gym.Env.reset`.
            options: Accepted for API compatibility; currently unused.
        """
        # We need the following line to seed self.np_random
        super().reset(seed=seed)

        observation = self._get_obs()
        info = {}  # no auxiliary diagnostics yet

        return observation, info

    # TODO: Define the Step method
    def step(self, action):
        """Advance one time step.

        Args:
            action: The chosen action; not used yet.

        Returns:
            `(observation, reward, terminated, truncated, info)`. `reward` and
            `terminated` are still `...` placeholders; `truncated` is always
            False and `info` is an empty dict.
        """
        # Return the state of the environment. I guess this is the energy
        # produced at this time step?
        observation = self._get_obs()

        # We only terminate after episodes, so I guess this is unnecessary?
        terminated = ...

        # I guess the reward is the OperationCost from the Microgrid class?
        reward = ...

        # Previously `info` was returned without ever being defined.
        info = {}

        return observation, reward, terminated, False, info

    

# Step 2: Define Q learning procedure

### Example using a pre-defined environment from Gymnasium

In [67]:
# Environment used for the Q-learning example
env = gym.make("FrozenLake-v1", render_mode="rgb_array")

# Q-table with one row per discrete observation and one column per action,
# initialised to zero.
n_observations = env.observation_space.n
n_actions = env.action_space.n
Q_table = np.zeros(shape=(n_observations, n_actions))

# Constants
# -- run length
n_episodes = 10000
max_iter_episode = 1000
# -- update rule: `lr` weights the new estimate, `gamma` scales the
#    next-state value in the Q-table update
lr = 0.1
gamma = 0.99
# -- exploration schedule: start fully random, decay per episode, never go
#    below the minimum
exploration_proba = 1
min_exploration_proba = 0.01
exploration_decreasing_decay = 0.001

# One total reward is appended here per finished episode.
rewards_per_episode = []

In [69]:
# Tabular Q-learning with an epsilon-greedy policy.
# Reads env, Q_table and the hyperparameters from the setup cell, updates
# Q_table in place and appends one total reward per episode to
# rewards_per_episode.
for e in range(n_episodes):
    # reset() returns (observation, info) in the 5-tuple step API; accept a
    # bare observation as well instead of testing for the exact type `int`,
    # which misfires on numpy integer observations.
    reset_result = env.reset()
    current_state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    # Sum of the rewards that the agent gets from the environment this episode
    total_episode_reward = 0

    for i in range(max_iter_episode):
        # Explore with probability exploration_proba, otherwise act greedily.
        if np.random.uniform(0, 1) < exploration_proba:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[current_state, :])

        # The environment runs the chosen action and returns the next state,
        # a reward, and two end-of-episode flags: `terminated` (a terminal
        # state was reached) and `truncated` (the episode was cut short,
        # e.g. by a time limit).
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Q-learning update. A terminal state has no future return, so only
        # bootstrap from the next state when the episode did not terminate.
        future_value = 0.0 if terminated else np.max(Q_table[next_state, :])
        Q_table[current_state, action] = (
            (1 - lr) * Q_table[current_state, action]
            + lr * (reward + gamma * future_value)
        )
        total_episode_reward = total_episode_reward + reward

        # Leave the loop on either flag; previously `truncated` was discarded
        # and the loop kept stepping an episode that had already ended.
        if terminated or truncated:
            break
        current_state = next_state

    # We update the exploration proba using the exponential decay formula
    exploration_proba = max(min_exploration_proba, np.exp(-exploration_decreasing_decay * e))
    rewards_per_episode.append(total_episode_reward)

print("Mean reward per thousand episodes")
report_every = 1000
# Report only this run's episodes: rewards_per_episode lives in another cell
# and keeps growing if this cell is executed again without re-running setup.
latest_rewards = rewards_per_episode[len(rewards_per_episode) - n_episodes:]
for start in range(0, len(latest_rewards), report_every):
    stop = min(start + report_every, len(latest_rewards))
    print(f"{stop}. Mean espiode reward: {np.mean(latest_rewards[start:stop])}")

Mean reward per thousand episodes
1000. Mean espiode reward: 0.047
2000. Mean espiode reward: 0.216
3000. Mean espiode reward: 0.414
4000. Mean espiode reward: 0.624
5000. Mean espiode reward: 0.722
6000. Mean espiode reward: 0.709
7000. Mean espiode reward: 0.703
8000. Mean espiode reward: 0.715
9000. Mean espiode reward: 0.73
10000. Mean espiode reward: 0.709
