In [23]:
import os
import numpy as np
import matplotlib.pyplot as plt
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
from datetime import datetime
import time

## Agent

In [24]:
class Agent():
    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 max_size=1000000, fc1_dims=400, fc2_dims=300, 
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='actor', 
                                chkpt_dir = DIRECTORIO_BASE+'/tmp/ddpg')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='critic', 
                                chkpt_dir = DIRECTORIO_BASE+'/tmp/ddpg')

        self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='target_actor',
                                chkpt_dir = DIRECTORIO_BASE+'/tmp/ddpg')

        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='target_critic',
                                 chkpt_dir = DIRECTORIO_BASE+'/tmp/ddpg')

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(), 
                                    dtype=T.float).to(self.actor.device)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = \
                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.actor.device)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)

        target_actions = self.target_actor.forward(states_)
        critic_value_ = self.target_critic.forward(states_, target_actions)
        critic_value = self.critic.forward(states, actions)

        critic_value_[done] = 0.0
        critic_value_ = critic_value_.view(-1)

        target = rewards + self.gamma*critic_value_
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                (1-tau)*target_critic_state_dict[name].clone()

        for name in actor_state_dict:
             actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                 (1-tau)*target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
        #self.target_critic.load_state_dict(critic_state_dict, strict=False)
        #self.target_actor.load_state_dict(actor_state_dict, strict=False)

## Precios red
Lee fichero csv de precios de red descargados.

Hay varios identificadores.
Se selecciona el identificador 1014

**Todo** Problema con la hora verano/invierno (+01, +02) que no se está teniendo en cuenta

In [25]:
class Precios_red():
    def __init__(self):
        filename = '/content/drive/MyDrive/Colab_Notebooks/DDPG/export_PrecioDeLaEnergíaExcedentariaDelAutoconsumoParaElMecanismoDeCompensaciónSimplificada (PVPC)_2020-12-07_22_18.csv'
        filename = 'export_PrecioDeLaEnergíaExcedentariaDelAutoconsumoParaElMecanismoDeCompensaciónSimplificada (PVPC)_2020-12-07_22_18.csv'

        df = pd.read_csv(filename, sep=';')
        # view head
        # df2 = df[df['id']==1739]
        # df2 = df[df['id']==1013]
        # df2 = df[df['id']==1014]
        df2 = df[df['id']==1015]

        datetime_v = df2['datetime'].values
        datetime = np.array([self.convert_datetime_str(date_str) for date_str in datetime_v])
        datetime = datetime - datetime[0]
        self.datetime_hour = datetime / (60*60)
        self.value    = np.array(df2['value'].values)

    def convert_datetime_str(self, date_time_str):
        str_split = date_time_str.split(sep='T')
        str_0 = str_split[0]
        str_0_split = str_0.split('-')
        year  = int(str_0_split[0])
        month = int(str_0_split[1])
        day   = int(str_0_split[2])
        str_1 = str_split[1]
        str_1_split = str_1.split(':')
        hour = int(str_1_split[0])

        a = time.struct_time((year, month, day, hour, 0, 0,0,0,0,'Romance Standard Time',3600))
        x = time.mktime(a)
        return x

## Env

In [26]:
import numpy as np

class Env2():
    # cada 15 minutos
    def __init__(self):
        self.Q = 24
        self.N_T = 24
        self.q_space=np.linspace(0,self.Q)
        self.a_space=np.array([-3,-1,0,1,3])
        self.t_space = np.linspace(0,23,24)
        self.t_final_epoch = 0
        self.precios_red = Precios_red()
        self.t_precios_red_t0 = 0
        self.obs = self.reset()
        self.info = []

    def set_obs(self, t_0, q_0):
        dia = np.linspace(0,23,24, dtype=int)
        x = np.zeros((25,))
        x[0] = q_0
        x[1:25] = self.precios_red.value[ int(t_0) + dia ]
        return x

    def reset(self):
        # se coge un dia aleatorio
        self.t_precios_red_t0 = int(np.random.uniform(0,650))*24
        self.t = self.t_precios_red_t0
        self.q = np.random.choice(self.q_space)
        # se fuerza a cero
        # self.q = 0 ################
        self.t_final_epoch = 24*30 + self.t
        return self.set_obs(self.t, self.q)

    # fija un estado (q)
    def reset_fix_q_0(self, q_0):
        self.t = self.t_precios_red_t0
        self.q = q_0
        self.obs = self.set_obs(self.t, self.q)
        return self.obs 

    # fija un estado (q)
    def reset_fix_t_q_0(self, t_0, q_0):
        self.reset_fix_q_0 (q_0)
        self.t = t_0 # ojo que reset_fix_q_0 pisa
        self.obs = self.set_obs(self.t, self.q)
        return self.obs 

    def step(self, action):
        penalizacion_q = 0
        done = False

        action = action[0] * 1.5 # 6 Ah
        # q_action = self.a_space[action]
        q_action = action

        t_ = self.t + 1 /4

        # la accion es continua
        q_ = self.q + q_action

        if (q_ > self.Q):
            penalizacion_q = -5
            q_ = self.Q

        if (q_ < 0):
            penalizacion_q = -5
            q_ = 0

        delta_q = q_ - self.q

        # precio
        reward = -self.precios_red.value[ int(self.t) ] * delta_q + penalizacion_q

        # consumo bateria. para todas las acciones excepto para accion ==2
        reward -= np.abs(delta_q)*2
        # fin consumo bateria

        # se actualizan 
        self.t = t_
        self.q = q_
        self.obs = self.set_obs(self.t, self.q)

        # batch
        if t_ > self.t_final_epoch:
            done = True

        return  self.obs, reward, done, self.info

## Buffer

In [27]:
class ReplayBuffer():
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape))
        self.new_state_memory = np.zeros((self.mem_size, *input_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones


## OUActionNoise



In [28]:
class OUActionNoise():
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x

        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

## ActorNetwork

In [29]:
class CriticNetwork(nn.Module):
    def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name,
                 chkpt_dir='tmp/ddpg'):
        super(CriticNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg')

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)

        self.bn1 = nn.LayerNorm(self.fc1_dims)
        self.bn2 = nn.LayerNorm(self.fc2_dims)
        #self.bn1 = nn.BatchNorm1d(self.fc1_dims)
        #self.bn2 = nn.BatchNorm1d(self.fc2_dims)

        self.action_value = nn.Linear(self.n_actions, self.fc2_dims)
        
        self.q = nn.Linear(self.fc2_dims, 1)

        f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-f1, f1)
        self.fc1.bias.data.uniform_(-f1, f1)

        f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-f2, f2)
        self.fc2.bias.data.uniform_(-f2, f2)

        f3 = 0.003
        self.q.weight.data.uniform_(-f3, f3)
        self.q.bias.data.uniform_(-f3, f3)

        f4 = 1./np.sqrt(self.action_value.weight.data.size()[0])
        self.action_value.weight.data.uniform_(-f4, f4)
        self.action_value.bias.data.uniform_(-f4, f4)

        self.optimizer = optim.Adam(self.parameters(), lr=beta,
                                    weight_decay=0.01)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        state_value = self.fc1(state)
        state_value = self.bn1(state_value)
        state_value = F.relu(state_value)
        state_value = self.fc2(state_value)
        state_value = self.bn2(state_value)
        #state_value = F.relu(state_value)
        #action_value = F.relu(self.action_value(action))
        action_value = self.action_value(action)
        state_action_value = F.relu(T.add(state_value, action_value))
        #state_action_value = T.add(state_value, action_value)
        state_action_value = self.q(state_action_value)

        return state_action_value

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file))

    def save_best(self):
        print('... saving best checkpoint ...')
        checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best')
        T.save(self.state_dict(), checkpoint_file)

class ActorNetwork(nn.Module):
    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name,
                 chkpt_dir='tmp/ddpg'):
        super(ActorNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg')

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)

        self.bn1 = nn.LayerNorm(self.fc1_dims)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        #self.bn1 = nn.BatchNorm1d(self.fc1_dims)
        #self.bn2 = nn.BatchNorm1d(self.fc2_dims)

        self.mu = nn.Linear(self.fc2_dims, self.n_actions)

        f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-f2, f2)
        self.fc2.bias.data.uniform_(-f2, f2)

        f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-f1, f1)
        self.fc1.bias.data.uniform_(-f1, f1)

        f3 = 0.003
        self.mu.weight.data.uniform_(-f3, f3)
        self.mu.bias.data.uniform_(-f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        x = self.fc1(state)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = T.tanh(self.mu(x))

        return x

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)
        print('checkpoint_file = ', self.checkpoint_file)


    def load_checkpoint(self):
        print('... loading checkpoint ...')
        print('self.checkpoint_file=',self.checkpoint_file)
        self.load_state_dict(T.load(self.checkpoint_file))

    def save_best(self):
        print('... saving best checkpoint ...')
        checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best')
        T.save(self.state_dict(), checkpoint_file)

## Plot_learning_curve

In [30]:
def plot_learning_curve(x, scores, figure_file):
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)


    print('figure_file = ', figure_file)


# main

In [31]:
if __name__ == '__main__':
    DIRECTORIO_BASE = '/content/drive/MyDrive/Colab_Notebooks/DDPG'
    DIRECTORIO_BASE = 'D:/github/Practicando_RL_Aprendiendo/DDPG'

    env = Env2()
    agent = Agent(alpha=0.0001, beta=0.001, 
                    input_dims=(25,), tau=0.001,
                    batch_size=64, fc1_dims=400, fc2_dims=300, 
                    n_actions=1)
    n_games = 3
    filename = 'car_' + str(agent.alpha) + '_beta_' + \
                str(agent.beta) + '_' + str(n_games) + '_games'
    figure_file = DIRECTORIO_BASE + '/plots/' + filename + '.png'

    # load models
    agent.load_models()
    # best_score = env.reward_range[0]
    best_score = 1
     
    score_history = []
    replayBuffer = ReplayBuffer(max_size=10000, input_shape=(25,), n_actions=1)

    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        agent.noise.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)

            replayBuffer.store_transition(observation, action, reward, observation_, done)

            score += reward
            observation = observation_
        score_history.append(score)

        print('episode ', i, 'score %.1f' % score)

    x = [i+1 for i in range(n_games)]
    plot_learning_curve(x, score_history, figure_file)
    

... loading checkpoint ...
self.checkpoint_file= D:/github/Practicando_RL_Aprendiendo/DDPG/tmp/ddpg\actor_ddpg


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [10]:
hora_ini = 24*20
hora_fin = hora_ini + 24*2

t_15 = np.array([k for k in range(hora_ini*4, hora_fin*4)])

q = np.array([replayBuffer.state_memory[int(k)][0] for k in t_15])
p = np.array([replayBuffer.state_memory[int(k)][1] for k in t_15])
a = np.array([replayBuffer.action_memory[int(k)][0] for k in t_15])

delta_q = q[1:-1] - q[0:-2]

NameError: name 'replayBuffer' is not defined

In [11]:
plt.figure(figsize=(20,16))
plt.subplot(211)
plt.plot(t_15, p,'.-',label='p')
plt.subplot(212)
plt.bar(t_15, q,label='q')
plt.bar(t_15[1:-1], delta_q * 4, color='black', label='delta_q', alpha=0.5)
plt.grid(True)
plt.legend()

NameError: name 'p' is not defined

In [12]:
plt.plot(a)

NameError: name 'a' is not defined

In [13]:
import math

x = np.linspace(0,100)
plt.plot(math.tanh(x))


TypeError: only size-1 arrays can be converted to Python scalars

In [None]:
pwd

In [None]:
from google.colab import drive
drive.mount('/content/drive')