## Submission Notebook Template 

<h3> <font color='red'> WARNING : </font>  </h3>

<font color='red'> No matter which approach you've chosen, you need to re-install any custom packages you had to install to make your code work ! </font>

<b> Install your packages below: </b>

In [None]:
!pip install git+https://github.com/Total-RD/pymgrid/
## Other packages 

In the section below, you must run your methodology for solving the problem from start to finish :

In [1]:
import pickle

"""
The buildings mentioned below are specific to the hackathon and are not available in this repo.
You can replace them with any MicroGrid object generated from pymgrid
"""

# NOTE: pickle.load can execute arbitrary code; only load building files
# that come from a trusted source.
buildings = []
for building_path in ('building_1.pkl', 'building_2.pkl', 'building_3.pkl'):
    with open(building_path, 'rb') as f:
        loaded_building = pickle.load(f)
        # Each building is split into its train/test parts right after loading.
        loaded_building.train_test_split()
    buildings.append(loaded_building)

# Keep the individual names available alongside the list.
building_1, building_2, building_3 = buildings

<h2> Evaluation for "Simple" Reinforcement Learning based approaches </h2>

 <font color='red'> <b> Be careful, the rewards returned by the Gym environment are negative ! Don't forget to multiply by -1 the total reward as the Profitability you will need to submit needs to be positive ! </b> </font>

<b> 1) Import all used libraries and scripts here </b>

In [2]:
import time # Necessary to evaluate frugality
import json # Necessary to export your results
import DiscreteEnvironment as DiscreteEnvironment # Imposed Discrete Environment
import numpy as np

## Other packages

<b> 2) Agent & Environment Setup before your training </b>

In [3]:
# One discrete environment per building, in the same order as `buildings`.
# Iterating over the list directly avoids a hard-coded building count.
building_environments = [
    DiscreteEnvironment.Environment(env_config={'building': building})
    for building in buildings
]
"""
Agent, potential Q-Table & other necessary setup code here 
"""

def init_qtable(env, nb_action):
    """Build a Q-table with every state/action value initialised to zero.

    States are ``(i, j)`` tuples:

    * ``i`` runs over the integers of
      ``range(-int(PV_rated_power - 1), int(load + 2))``, both values being
      read from ``env.mg.parameters``;
    * ``j`` starts at ``round(soc_min, 1)`` and advances in steps of 0.1,
      stopping before ``round(soc_max + 0.1, 1)``; each value is rounded to
      one decimal. The bounds come from ``env.mg.battery``.

    Parameters
    ----------
    env:
        Environment exposing ``env.mg.parameters`` and ``env.mg.battery``.
    nb_action: int
        Number of actions; every state gets the keys ``0 .. nb_action - 1``.

    Returns
    -------
    dict
        Nested mapping such that ``Q[state][action] == 0`` for all pairs.
    """
    microgrid = env.mg
    first_components = range(
        -int(microgrid.parameters['PV_rated_power'] - 1),
        int(microgrid.parameters['load'] + 2),
    )
    # Rounding removes the floating-point drift accumulated by np.arange so
    # that the values can be used as exact dictionary keys.
    soc_levels = [
        round(soc, 1)
        for soc in np.arange(
            round(microgrid.battery.soc_min, 1),
            round(microgrid.battery.soc_max + 0.1, 1),
            0.1,
        )
    ]

    return {
        (first, soc): {action: 0 for action in range(nb_action)}
        for first in first_components
        for soc in soc_levels
    }


def choose_action_greedy(Q_table, state):
    """Return the index of the highest-valued action for ``state``.

    Used for policy evaluation (no exploration). Actions are assumed to be
    keyed ``0 .. n - 1`` in ``Q_table[state]``; on ties ``np.argmax`` returns
    the lowest index.
    """
    q_row = Q_table[state]
    values_by_action = [q_row[action] for action in range(len(q_row))]
    return np.argmax(values_by_action)


def choose_action_boltzmann(Q_table, state):
    """Sample an action for ``state`` from a softmax over its Q-values.

    Used for policy optimisation (exploration during training). Actions are
    assumed to be keyed ``0 .. n - 1`` in ``Q_table[state]``.

    Parameters
    ----------
    Q_table: dict
        Nested mapping ``Q_table[state][action] -> value``.
    state:
        Key of the current state in ``Q_table``.

    Returns
    -------
    int
        The sampled action index.
    """
    q_row = Q_table[state]
    nb_actions = len(q_row)
    actions = list(range(nb_actions))
    action_values = np.array([q_row[a] for a in actions], dtype=float)
    best_value = action_values.max()

    if best_value < -300:
        # All actions are bad: fall back to a uniform random choice.
        softmax = np.ones(nb_actions) / nb_actions
    else:
        # Subtracting the maximum leaves the softmax unchanged but keeps every
        # exponent <= 0, so np.exp cannot overflow to inf and produce NaN
        # probabilities (which np.random.choice rejects with a ValueError).
        exp_action_values = np.exp(action_values - best_value)
        softmax = exp_action_values / np.sum(exp_action_values)

    return np.random.choice(actions, p=softmax)


def update_q(Q_table, state, action, reward, new_state, learning_rate, discount):
    """Return the Q-learning (TD) update of ``Q_table[state][action]``.

    Computes ``Q(s, a) + lr * (reward + discount * max_a' Q(s', a') - Q(s, a))``.
    The table itself is not modified; the caller stores the returned value.

    Parameters
    ----------
    Q_table: dict
        Nested mapping ``Q_table[state][action] -> value``.
    state, action:
        The state/action pair that was just executed.
    reward: float
        Reward received for that transition.
    new_state:
        State reached after the transition. If it has no entry in
        ``Q_table`` its value is taken as 0 (no bootstrapping).
    learning_rate: float
        TD learning rate.
    discount: float
        Discount factor applied to the next-state value.

    Returns
    -------
    float
        The updated value for ``Q_table[state][action]``.
    """
    current_q = Q_table[state][action]

    # Bootstrap from the state that was reached, not the one that was left.
    next_action_values = Q_table.get(new_state)
    max_next_q = max(next_action_values.values()) if next_action_values else 0

    td_target = reward + discount * max_next_q
    # The current estimate is subtracted exactly once (in the TD error).
    return current_q + learning_rate * (td_target - current_q)


def train_q_table(building, n_epis=5, alpha=0.6, gamma=1):
    """Train a Q_table for the given building

    Parameters
    ----------
    building: pymgrid.Microgrid.Microgrid
        The microgrid object to train the policy on
    n_epis: int
        The number of episodes
    alpha: float
        The TD learning rate
    gamma: float
        The discount factor

    Returns
    -------
    Q_table: dict
        The trained Q_table defining the agent strategy
    rew_hist: list(float)
        History of the cumulative rewards obtained during the training phase
        (one entry per episode)
    """

    env_config = {'building': building}
    building_env = DiscreteEnvironment.Environment(env_config)

    nb_actions = building_env.Na

    Q_table = init_qtable(building_env, nb_action=nb_actions)
    rew_hist = []  # cumulative reward of each episode

    for _episode in range(n_epis):
        # Reset the environment on the training data
        s = building_env.reset(testing=False)
        rAll = 0
        done = False
        # The Q-table learning loop
        while not done:
            # Choose an exploratory action from the Q-table
            try:
                a = choose_action_boltzmann(Q_table, s)
            except ValueError:
                # Sampling failed (e.g. np.random.choice rejected the softmax
                # probabilities). Explore uniformly instead of leaving `a`
                # unbound or silently reusing the previous action.
                a = np.random.randint(nb_actions)
            # Get the new state and reward from the environment
            s1, r, done, _info = building_env.step(a)
            # Update the Q-table with the new knowledge
            Q_table[s][a] = update_q(Q_table, s, a, r, s1, alpha, gamma)
            rAll += r
            s = s1
        rew_hist.append(rAll)

    return Q_table, rew_hist


def evaluate_policy(building, Q_table):
    """Run the greedy policy of ``Q_table`` on the building's test data.

    A fresh environment is created for ``building``, reset with
    ``testing=True`` and stepped with greedy actions until it reports
    ``done``.

    Parameters
    ----------
    building:
        The building object passed to ``DiscreteEnvironment.Environment``.
    Q_table: dict
        Nested mapping ``Q_table[state][action] -> value``.

    Returns
    -------
    float
        The negated cumulative reward of the episode.
    """
    env_config = {'building': building}
    building_env = DiscreteEnvironment.Environment(env_config)

    s = building_env.reset(testing=True)
    rAll = 0
    done = False
    # Policy evaluation only: the Q-table is read but never updated here.
    while not done:
        # Choose the best known action from the Q-table
        a = choose_action_greedy(Q_table, s)
        # Get the new state and reward from the environment
        s, r, done, _info = building_env.step(a)
        rAll += r

    return -rAll


def average_Q_tables(Q_table_list):
    """Take a list of Q_tables and return the element-wise averaged Q_table.

    Used for building 3. All tables are assumed to share the structure
    (states and actions) of the first one.

    Parameters
    ----------
    Q_table_list: list(dict)
        Q-tables of the form ``Q[state][action] -> value``.

    Returns
    -------
    dict
        Q-table whose entries are the mean of the corresponding entries.

    Raises
    ------
    ValueError
        If ``Q_table_list`` is empty.
    """
    if not Q_table_list:
        raise ValueError("Q_table_list must contain at least one Q-table")

    reference_table = Q_table_list[0]
    return {
        state: {
            act: np.mean([Q[state][act] for Q in Q_table_list])
            for act in action_values
        }
        for state, action_values in reference_table.items()
    }

<b> 3) Training of the agent </b>

In [None]:

# Start of the timed training phase.
train_start = time.process_time()

"""
Training code
"""

# (result key, building, number of episodes, learning rate), in run order.
training_plan = [
    ("building0", buildings[0], 25, 0.6),
    ("building1", buildings[1], 25, 0.6),
    ("building2", buildings[2], 10, 0.4),
]

# Every entry starts as None and is filled in as each building is trained.
Q_table_results = {plan_key: None for plan_key, *_ in training_plan}

for plan_key, plan_building, plan_episodes, plan_alpha in training_plan:
    # rew_hist is overwritten on each run: only the last history is kept.
    Q_table_results[plan_key], rew_hist = train_q_table(
        plan_building, n_epis=plan_episodes, alpha=plan_alpha
    )

# End of the timed training phase.
train_end = time.process_time()



In [None]:
# Training cost: difference between the two time.process_time() readings
# taken immediately before and after the training code.
train_frugality = train_end - train_start

<b> 4) Test of the agent </b>

In [None]:
"""
Below is an example for a Random Agent 

Note :
* To make your work as reproducible as possible, have a full-greedy approach (no exploration) on the test buildings
* If your algorithm has some unavoidable randomness, consider running it for many loops and return a
  mean profitability and mean frugality
  
"""

# Start of the timed test phase.
test_start = time.process_time()
total_cost = [0, 0, 0]

for i, building_env in enumerate(building_environments):
    # Each building is evaluated with the Q-table trained for it.
    Q_table = Q_table_results[f"building{i}"]
    obs = building_env.reset(testing=True)
    episode_reward = 0
    while True:
        # Greedy action only: no exploration during the test run.
        action = choose_action_greedy(Q_table, obs)
        obs, reward, done, info = building_env.step(action)
        episode_reward += reward
        if done:
            break
    total_cost[i] += episode_reward

# End of the timed test phase.
test_end = time.process_time()

In [None]:
# Test cost: difference between the two time.process_time() readings taken
# immediately before and after the evaluation loop.
test_frugality = test_end - test_start

<b> 5) Store & Export Results in JSON format </b>

In [None]:
# The accumulated rewards are negated before being reported as performance.
profitability = [-cost for cost in total_cost]

final_results = {
    "building_1_performance": profitability[0],
    "building_2_performance": profitability[1],
    "building_3_performance": profitability[2],
}
# Frugality combines the measured training and test times.
final_results["frugality"] = train_frugality + test_frugality

print(final_results)