In [1]:
import asyncio
import os
import random
from collections import deque, namedtuple, OrderedDict
from time import sleep
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import torch as T
import torchsummary
import torchvision
from IPython.display import display
from promise import Promise
from torch import nn
from torch import optim
from torchvision.io import decode_png

from infant import Environment

In [2]:
def prefer_gpu():
    """Return the torch device string to use: 'cuda:0' when CUDA is available, else 'cpu'."""
    if T.cuda.is_available():
        return 'cuda:0'
    return 'cpu'

In [3]:
# Presumably the endpoint of the simulation server (TODO confirm); no other
# cell visible in this notebook reads this constant.
BASE_URL = 'http://localhost:3000'
# Notebook-wide torch device string, as chosen by prefer_gpu().
device = prefer_gpu()

In [4]:
# Immutable record with the four fields: state, action, next_state, reward.
Transition = namedtuple('Transition', 'state action next_state reward')

In [5]:
class ConvNetModel(nn.Module):
    """Convolutional feature extractor: 3-channel image batch -> flat feature vector.

    Two strided convolutions and a max-pool are followed by a linear
    "bottleneck" that projects the flattened feature map to `output_dims`.

    Args:
        output_dims: Size of the returned feature vector.
        device: Torch device (string or `torch.device`) to place the module on.
            Defaults to `prefer_gpu()` when omitted.

    Note:
        The bottleneck input size (16920 = 5 * 47 * 72) is what the
        convolution stack yields for a 400x600 input; other image sizes will
        fail at the linear layer.
    """

    def __init__(self, output_dims=1024, device=None):
        super(ConvNetModel, self).__init__()
        self.device = device if device else prefer_gpu()

        self.conv_layer = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=9, stride=2),
            # nn.BatchNorm2d(5),
            nn.ReLU(),
            nn.Conv2d(32, 5, kernel_size=5, stride=2),
            # nn.BatchNorm2d(5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=2),
            nn.Flatten(),
        )
        self.bottleneck = nn.Sequential(
            nn.Linear(16920, output_dims),
            nn.ReLU(),
        )
        # Move the WHOLE module. Previously only the bottleneck was moved, so
        # a bare ConvNetModel() kept its conv weights on the CPU while
        # forward() pushed the input to the GPU.
        self.to(self.device)

    def forward(self, x:T.Tensor):
        """Return the feature vector for image batch `x` of shape (N, 3, H, W)."""
        # Follow the parameters' actual device so the model still works if a
        # caller relocates it with .to(...) after construction.
        target = self.bottleneck[0].weight.device
        x = self.conv_layer(x.to(target))
        x = self.bottleneck(x)
        return x

# conv_net = ConvNetModel(output_dims=512)
# torchsummary.summary(conv_net, input_data=T.ones(1, 3, 400, 600))

In [6]:
class NeuNetModel(nn.Module):
    """Three-layer fully connected network that owns its Adam optimizer.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        input_dims: Size of the input feature vector.
        h1_dims: Width of the first hidden layer.
        h2_dims: Width of the second hidden layer.
        output_dims: Size of the output vector.
        device: Torch device to place the module on; defaults to
            `prefer_gpu()` when omitted.
    """

    def __init__(self, lr=1e-5, input_dims=1024, h1_dims=64, h2_dims=64, output_dims=3, device=None):
        super(NeuNetModel, self).__init__()
        self.lr = lr
        self.model = nn.Sequential(
            nn.Linear(input_dims, h1_dims),
            nn.ReLU(),
            nn.Linear(h1_dims, h2_dims),
            nn.ReLU(),
            nn.Linear(h2_dims, output_dims)
        )

        self.device = device if device else prefer_gpu()
        # Use the resolved device: the raw `device` argument may be None, in
        # which case the module used to stay on the CPU regardless of
        # what self.device reported.
        self.to(self.device)
        # Build the optimizer only after the parameters sit on their final device.
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)

    def forward(self, x:T.Tensor):
        """Return the network output for a batch of feature vectors `x`."""
        # Move the input next to the weights (mirrors Actor.forward).
        x = x.to(next(self.parameters()).device)
        x = self.model(x)
        return x

In [7]:
class Actor(nn.Module):
    """Policy network with a shared trunk and two linear heads.

    The `mu` head returns one value per action dimension; the `var` head
    returns a same-shaped tensor that the caller exponentiates, i.e. it is
    used as a log standard deviation. The attribute is still called `var` so
    existing checkpoints keep loading.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        input_dims: Size of the input feature vector.
        h1_dims: Width of the first hidden layer.
        h2_dims: Width of the second hidden layer.
        n_action: Number of action dimensions.
        device: Torch device to place the module on; defaults to
            `prefer_gpu()` when omitted.
    """

    def __init__(self, lr, input_dims=1024, h1_dims=128, h2_dims=128, n_action=3, device=None):
        super(Actor, self).__init__()
        self.lr = lr
        self.base_layer = nn.Sequential(
            nn.Linear(input_dims, h1_dims),
            nn.ReLU(),
            nn.Linear(h1_dims, h2_dims),
            nn.ReLU(),
        )
        self.mu = nn.Sequential(nn.Linear(h2_dims, n_action))
        self.var = nn.Sequential(nn.Linear(h2_dims, n_action))

        self.device = device if device else prefer_gpu()
        # Use the resolved device: the raw `device` argument may be None, which
        # used to leave the weights on the CPU while forward() moved the input
        # to self.device.
        self.to(self.device)
        # Build the optimizer only after the parameters sit on their final device.
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)

    def forward(self, x:T.Tensor) -> Tuple[T.Tensor, T.Tensor]:
        """Return `(mu, log_std)` for a batch of feature vectors `x`."""
        x = x.to(self.device)
        x = self.base_layer(x)
        mu:T.Tensor = self.mu(x)
        log_std:T.Tensor = self.var(x)  # log standard deviation
        return mu, log_std

In [8]:
class Agent:
    """One-step actor-critic agent with a Gaussian policy over continuous actions.

    A `ConvNetModel` turns image observations into feature vectors; an `Actor`
    produces the mean and log standard deviation of a Normal distribution per
    action dimension; a `NeuNetModel` with a single output acts as the critic.

    Args:
        alpha: Learning rate of the actor.
        beta: Learning rate of the critic.
        gamma: Discount factor used in the TD target.
        epsilon: Stored on the instance; not read by any method of this class.
        max_replay: Capacity of the replay deque (not read by any method here).
        n_actions: Number of action dimensions.
    """

    def __init__(self, alpha, beta, gamma=.99, epsilon=.2, max_replay=1_000, n_actions=3):
        self.gamma = gamma
        self.epsilon = epsilon
        self._current_v = None
        self._timestep_ct = 0
        self._replay_memory = deque(maxlen=max_replay)
        self.n_actions = n_actions
        self.device = prefer_gpu()
        # Log-probabilities of the most recently sampled action, set by
        # choose_action() and consumed by train().
        self.log_probs = None

        # NOTE(review): `knowledges` is not used by any method of this class;
        # kept for backward compatibility.
        self.knowledges = [ConvNetModel()]
        self.input_layer = ConvNetModel().to(self.device)
        self.global_knowledge = Actor(alpha, n_action=self.n_actions, device=self.device)
        self.critic = NeuNetModel(beta, output_dims=1, device=self.device)
        self.actor = self.global_knowledge

    def save(self, pathname:str):
        """Write model and optimizer state dicts to `pathname`.

        The parent directory is created when missing so that a finished
        training run is not lost to a missing folder.
        """
        directory = os.path.dirname(pathname)
        if directory:
            os.makedirs(directory, exist_ok=True)
        T.save({
            'input_layer': self.input_layer.state_dict(),
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'actor_op': self.actor.optimizer.state_dict(),
            'critic_op': self.critic.optimizer.state_dict(),
        }, pathname)

    def load(self, pathname:str):
        """Restore model and optimizer state from a checkpoint written by `save`.

        Raises:
            FileNotFoundError: If `pathname` does not exist.
        """
        # map_location lets a checkpoint saved on a GPU machine load on a
        # CPU-only one (and vice versa).
        checkpoint:dict = T.load(pathname, map_location=self.device)
        self.input_layer.load_state_dict(checkpoint['input_layer'])
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.actor.optimizer.load_state_dict(checkpoint['actor_op'])
        self.critic.optimizer.load_state_dict(checkpoint['critic_op'])

    def choose_action(self, observation:T.Tensor) -> T.Tensor:
        """Sample an action for `observation` and remember its log-probability.

        Returns:
            The tanh-squashed sample with singleton dimensions removed, so each
            component lies in [-1, 1].
        """
        observation = observation.to(device=self.device)
        x = self.input_layer(observation)

        mus, log_stds = self.actor(x)
        sigmas = T.exp(log_stds)
        action_probs = T.distributions.Normal(mus, sigmas)
        prob_samples = T.squeeze(action_probs.sample((1,)), 2)
        # The log-probability is taken on the raw (pre-tanh) sample.
        self.log_probs = action_probs.log_prob(prob_samples)
        action = T.squeeze(T.tanh(prob_samples))

        return action

    def train(self, state:T.Tensor, reward:float, next_state:T.Tensor, done:bool):
        """Run one actor-critic update for the transition that followed the last action.

        Args:
            state: Observation the last action was chosen from.
            reward: Reward received for that action (number or tensor).
            next_state: Observation that followed.
            done: Whether the episode ended; disables bootstrapping when true.

        Does nothing when `choose_action` has not been called yet.
        """
        if self.log_probs is None:
            # No sampled action to reinforce yet.
            return

        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()

        try:
            x = self.input_layer(state.to(self.device))
            x_ = self.input_layer(next_state.to(self.device))
            critic_value = self.critic(x)
            critic_value_ = self.critic(x_)
        except Exception as e:
            print('Critic: ', e)
            print(state.shape, next_state.shape)
            return

        # as_tensor accepts both plain numbers and tensors without a copy warning.
        reward = T.as_tensor(reward, dtype=T.float, device=self.device)
        delta = reward + self.gamma * critic_value_ * (1-int(done)) - critic_value

        # The TD error is a fixed weight for the policy gradient: detach it so
        # the actor loss does not push gradients into the critic.
        actor_loss:T.Tensor = -self.log_probs * delta.detach()
        critic_loss:T.Tensor = delta**2

        # Reduce each term on its own; adding them first would broadcast the
        # critic loss across every action dimension and count it several times.
        grand_loss:T.Tensor = actor_loss.sum() + critic_loss.sum()
        grand_loss.backward()

        self.actor.optimizer.step()
        self.critic.optimizer.step()

# Smoke test: build an agent (alpha=1e-6, beta=1e-5) and sample one action for
# a random batch of shape (1, 3, 400, 600).
agent = Agent(1e-6, 1e-5)
agent.choose_action(T.rand(1, 3, 400, 600))


tensor([-0.8990, -0.7000, -0.4764], device='cuda:0')

In [24]:
def create_responder(agent: "Agent", action_multiplier=600):
    """Build the per-step callback that trains `agent` and picks its next action.

    Args:
        agent: Object providing `train(...)`, `choose_action(state)` and
            (optionally) `n_actions`.
        action_multiplier: Factor applied to the agent's action before it is
            returned.

    Returns:
        A function `responder(observation, reward, env)` that returns the
        scaled action tensor.
    """
    def responder(observation: T.Tensor, reward: T.Tensor, env: "Environment"):
        """Get observation and returns action"""
        if reward > 0:
            print('Reward: ', reward)
        try:
            # Preprocess image: scale by 1/255 and add a batch dimension.
            image = observation / 255
            state = T.unsqueeze(image, dim=0)

            # Train on the previous transition, if there is one. getattr keeps
            # the first step working when `env` has no `prev_state` attribute
            # yet; `is not None` avoids comparing a tensor with `!=`.
            prev_state = getattr(env, 'prev_state', None)
            if prev_state is not None:
                try:
                    agent.train(prev_state, reward, state, env._is_done)
                except Exception as e:
                    # Best effort: a failed update must not stop the episode.
                    print('Training Error')
                    print(e)

            # Pick the next action and scale it.
            action = agent.choose_action(state)
            action = T.squeeze(action) * action_multiplier

            env.prev_state = state

            return action
        except Exception as e:
            print(e)
            env.stop()
            # Fallback action sized to the agent's action space (3 when the
            # agent does not expose `n_actions`).
            return T.ones((getattr(agent, 'n_actions', 3),))

    return responder

In [18]:
# Checkpoint name and the path derived from it; the path is passed to
# agent.load(...) and agent.save(...) in later cells.
model_name = 'cache-v1'
model_path = f'model/{model_name}.pkl'

In [12]:
# Create the agent used for training and resume from the checkpoint at
# model_path when that file exists; otherwise start from fresh weights.
agent = Agent(alpha=5e-6, beta=1e-5, epsilon=.8)
try:
    agent.load(model_path)
except FileNotFoundError:
    print('Model does not exist yet')

Model does not exist yet


In [28]:
# Put an id inside the quotes to pass it to Environment; the empty string is
# falsy, so `"" or None` evaluates to None.
sim_id = "" or None
env = Environment(sim_id) 
# With no id given, call env.create() and print the resulting env.sim_id.
if sim_id is None:
    env.create()
    print(env.sim_id)

506bfbae


In [30]:
# Training run against the environment. `Environment` is project-local, so the
# notes below are read from the method names and arguments only — TODO confirm.
env.reset()
# Register the agent's responder; actions are scaled by 300 here.
env.on_state(create_responder(agent, action_multiplier=300))
env.connect(disconnect_on_done=True)
env.init(max_episodes=1000)
# Presumably blocks until the run has finished.
env.wait()

Connected to /sim-506bfbae
Reward:  40
Reward:  40
Reward:  40


In [66]:
# env.disconnect()
# env.destroy()

In [31]:
# Persist the agent's model and optimizer state to model_path.
agent.save(model_path)

In [10]:
# # Using step fn from environment API.

# loop = asyncio.get_running_loop()
# env.set_running_loop(loop)

# env.connect(close_on_stop=True)
# fut = env.step(T.tensor([10,0,0], dtype=T.float))
# data_uri, reward = await fut
# data_uri[30:100], reward

In [10]:
# envs = [Environment() for _ in range(2)]

# print('Initializing...')
# for env in envs:
#     env.create()
#     agent = Agent().to(device)
#     env.on_state(create_responder(agent))

# print('Connecting...')
# for env in envs:
#     env.connect()

# print('Starting...')
# for env in envs:
#     env.init(max_episodes=10)

# print('Working...')
# for env in envs:
#     env.wait()

# print('Cleaning...')
# for env in envs:
#     env.destroy()

Initializing...
Connecting...
Connected to /sim-4e7732c7
Connected to /sim-688daa83
Starting...
Working...
Cleaning...
