In [7]:
import argparse
import copy
import math
import os
import random
import sys
import time
from collections import deque, namedtuple
from typing import Optional

import gym
import numpy as np
import pygame
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from gym import spaces
from gym.utils import seeding
from pygame import gfxdraw
from torch.distributions import MultivariateNormal, Normal
from torch.nn.utils import clip_grad_norm_
#from torch.utils.tensorboard import SummaryWriter

In [8]:
class TwoJoint(gym.Env):
    """Two-joint planar arm reaching task with differentiable dynamics.

    Same environment as in model.py but written in PyTorch to enable
    backpropagation: the state is kept as torch tensors, so the reward
    returned by ``step`` stays connected to the action tensor that was
    passed in.

    State layout: ``[x, dx, y, dy, goal1, goal2]`` where ``x`` and ``y`` are
    the two joint angles, ``dx`` and ``dy`` their velocities, and
    ``(goal1, goal2)`` the target for the arm tip, which sits at
    ``(cos(x) + cos(x + y), sin(x) + sin(x + y))``.

    Observations are NumPy arrays (not tensors): each of the six normalized
    state entries is encoded as a ``resolution``-wide indicator vector, so no
    gradient flows through the observation itself.
    """

    metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 30}

    def __init__(self, g=10.0, device=None):
        """Set up the action/observation spaces.

        Args:
            g: Unused; kept so existing ``TwoJoint(g=...)`` calls still work.
            device: Torch device on which ``reset`` creates the state
                tensors. Defaults to the CPU.
        """
        self.max_speed = 8
        self.max_torque = 2.0
        self.dt = 0.2
        self.t = 0
        self.screen = None
        self.isopen = True
        self.device = device if device is not None else torch.device("cpu")

        self.screen_dim = 500
        self.resolution = 10

        # Six state entries, each encoded with `resolution` indicator bins.
        # float32 bounds match the dtype declared for the Box below.
        high = np.ones([self.resolution * 6], dtype=np.float32)
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(2,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)

    def step(self, action):
        """Advance the arm by one time step.

        Args:
            action: Tensor holding two values ``(u, v)``; they are integrated
                into the joint velocities ``dx`` and ``dy`` respectively.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is the
            negative cost as a tensor and ``done`` becomes True once 10 steps
            have been taken since the last reset.
        """
        self.t += 1
        u, v = action.flatten()

        u = torch.clamp(u, -100.0, 100.0)
        v = torch.clamp(v, -100.0, 100.0)

        x, dx, y, dy, goal1, goal2 = self.state

        dt = self.dt

        # Arm tip position from the two joint angles (before this step's update).
        vecx = torch.stack([torch.cos(x), torch.sin(x)])
        vecy = torch.stack([torch.cos(x + y), torch.sin(x + y)])
        vec = vecx + vecy
        pos1, pos2 = vec.flatten()

        self.last_u = u  # for rendering
        # Squared distance to the goal plus small velocity and action penalties.
        costs = torch.sum(
            (pos1 - goal1) ** 2 + 0.01 * dx ** 2 + 0.01 * u ** 2
            + (pos2 - goal2) ** 2 + 0.01 * dy ** 2 + 0.01 * v ** 2
        )

        # Integrate velocities first, then angles, clamping each to its range.
        newdx = dx + u * dt
        newdy = dy + v * dt
        newdx = torch.clamp(newdx, -2.0, 2.0)
        newdy = torch.clamp(newdy, -2.0, 2.0)
        newx = x + newdx * dt
        newy = y + newdy * dt
        newx = torch.clamp(newx, 0, np.pi)
        newy = torch.clamp(newy, 0, np.pi / 2)

        self.state = [newx, newdx, newy, newdy, goal1, goal2]
        return self._get_obs(), -costs, self.t >= 10, {}

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        return_info: bool = False,
        options: Optional[dict] = None
    ):
        """Start a new episode with random joint angles and a random goal.

        Both velocities start at 0, ``goal1`` is drawn from [-1, 1] and
        ``goal2`` is fixed to 1. Sampling goes through ``self.np_random`` so
        that the ``seed`` argument actually controls the initial state.

        Returns:
            The first observation, or ``(observation, {})`` when
            ``return_info`` is True.
        """
        super().reset(seed=seed)
        self.t = 0
        initial_state = [
            self.np_random.uniform(low=0, high=np.pi),
            0,
            self.np_random.uniform(low=0, high=np.pi / 2),
            0,
            self.np_random.uniform(low=-1, high=1),
            1,
        ]

        self.state = [
            torch.tensor(float(s), dtype=torch.float32, device=self.device)
            for s in initial_state
        ]
        self.last_u = None
        if not return_info:
            return self._get_obs()
        else:
            return self._get_obs(), {}

    def _one_hot_bin(self, value):
        """Encode a scalar in [-1, 1] as a ``resolution``-long indicator vector.

        Bin edges carry a 1e-8 tolerance, so a value sitting exactly on an
        edge activates both neighbouring bins; a value outside [-1, 1]
        activates none.
        """
        resolution = self.resolution
        step = 2.0 / resolution
        encoding = np.zeros([resolution])
        for idx, lower in enumerate(np.arange(-1.0, 1.0 - 1e-8, 2.0 / resolution)):
            if value >= lower - 1e-8 and value <= lower + step + 1e-8:
                encoding[idx] = 1.0
        return encoding

    def _get_obs(self):
        """Return the binned observation as a NumPy array of length 6 * resolution.

        Order of the concatenated encodings: x, y, dx, dy, goal1, goal2.
        """
        x, dx, y, dy, goal1, goal2 = self.state

        # Rescale angles and velocities from their clamp ranges to [-1, 1].
        x = (x - np.pi / 2) / (np.pi / 2)
        y = (y - np.pi / 4) / (np.pi / 4)
        dx = dx / 2.0
        dy = dy / 2.0

        features = [x, y, dx, dy, goal1, goal2]
        return np.concatenate([self._one_hot_bin(f) for f in features], 0)




def angle_normalize(x):
    """Wrap an angle given in radians into the interval [-pi, pi)."""
    full_turn = 2 * np.pi
    shifted = x + np.pi
    return (shifted % full_turn) - np.pi

In [9]:
class Maze(gym.Env):
    """Point-reaching task in the square [-1, 1] x [-1, 1] with differentiable dynamics.

    Same environment as in model.py but written in PyTorch to enable
    backpropagation: ``step`` combines the state with the action tensor using
    torch operations, so the returned reward stays connected to the action.

    State layout: ``[x, dx, y, dy, goal1, goal2]`` where ``(x, y)`` is the
    position, ``(dx, dy)`` the velocity and ``(goal1, goal2)`` the target
    position.

    Observations are NumPy arrays (not tensors): each of the six state
    entries is encoded as a ``resolution``-wide indicator vector, so no
    gradient flows through the observation itself.
    """

    metadata = {"render.modes": ["human", "rgb_array"], "video.frames_per_second": 30}

    def __init__(self, g=10.0):
        """Set up the action/observation spaces.

        Args:
            g: Unused; kept so existing ``Maze(g=...)`` calls still work.
        """
        self.max_speed = 8
        self.max_torque = 2.0
        self.dt = 0.2
        self.t = 0
        self.screen = None
        self.isopen = True

        self.screen_dim = 500
        self.resolution = 10

        # Six state entries, each encoded with `resolution` indicator bins.
        # float32 bounds match the dtype declared for the Box below.
        high = np.ones([self.resolution * 6], dtype=np.float32)
        self.action_space = spaces.Box(
            low=-self.max_torque, high=self.max_torque, shape=(2,), dtype=np.float32
        )
        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)

    def step(self, action):
        """Advance the point by one time step.

        Args:
            action: Tensor holding two values ``(u, v)``; they are integrated
                into the velocities ``dx`` and ``dy`` respectively.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is the
            negative cost as a tensor and ``done`` becomes True once 10 steps
            have been taken since the last reset.
        """
        self.t += 1
        u, v = action.flatten()

        u = torch.clamp(u, -100.0, 100.0)
        v = torch.clamp(v, -100.0, 100.0)

        x, dx, y, dy, goal1, goal2 = self.state

        dt = self.dt

        # The position is the state itself (no arm kinematics in this task).
        pos1, pos2 = x, y

        self.last_u = u  # for rendering
        # Squared distance to the goal plus small velocity and action penalties.
        costs = torch.sum((pos1-goal1) ** 2 + 0.01*dx**2 + 0.01*u**2 + (pos2-goal2) ** 2 + 0.01*dy**2 + 0.01*v**2)

        # Integrate velocities first, then positions, clamping each to its range.
        newdx = dx + u * dt
        newdy = dy + v * dt
        newdx = torch.clamp(newdx, -2.0, 2.0)
        newdy = torch.clamp(newdy, -2.0, 2.0)
        newx = x + newdx * dt
        newy = y + newdy * dt
        newx = torch.clamp(newx, -1, 1)
        newy = torch.clamp(newy, -1, 1)

        self.state = [newx, newdx, newy, newdy, goal1, goal2]
        return self._get_obs(), -costs, self.t >= 10, {}

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        return_info: bool = False,
        options: Optional[dict] = None
    ):
        """Start a new episode with every state entry drawn from [-1, 1].

        Sampling goes through ``self.np_random`` so that the ``seed``
        argument actually controls the initial state. The state is stored as
        plain floats here; it turns into tensors after the first ``step``.

        Returns:
            The first observation, or ``(observation, {})`` when
            ``return_info`` is True.
        """
        super().reset(seed=seed)
        self.t = 0
        # Order: x, dx, y, dy, goal1, goal2.
        self.state = [self.np_random.uniform(low=-1, high=1) for _ in range(6)]
        self.last_u = None
        if not return_info:
            return self._get_obs()
        else:
            return self._get_obs(), {}

    def _one_hot_bin(self, value):
        """Encode a scalar in [-1, 1] as a ``resolution``-long indicator vector.

        Bin edges carry a 1e-8 tolerance, so a value sitting exactly on an
        edge activates both neighbouring bins; a value outside [-1, 1]
        activates none.
        """
        resolution = self.resolution
        step = 2.0 / resolution
        encoding = np.zeros([resolution])
        for idx, lower in enumerate(np.arange(-1.0, 1.0 - 1e-8, 2.0 / resolution)):
            if value >= lower - 1e-8 and value <= lower + step + 1e-8:
                encoding[idx] = 1.0
        return encoding

    def _get_obs(self):
        """Return the binned observation as a NumPy array of length 6 * resolution.

        Order of the concatenated encodings: x, y, dx, dy, goal1, goal2.
        """
        x, dx, y, dy, goal1, goal2 = self.state

        # Velocities are clamped to [-2, 2]; rescale them to [-1, 1].
        dx = dx / 2.0
        dy = dy / 2.0

        features = [x, y, dx, dy, goal1, goal2]
        return np.concatenate([self._one_hot_bin(f) for f in features], 0)



def angle_normalize(x):
    """Map an angle in radians onto the equivalent angle in [-pi, pi)."""
    # NOTE: this repeats the identical definition from the TwoJoint cell and
    # simply rebinds the same name when this cell runs.
    wrapped = (x + np.pi) % (2 * np.pi)
    wrapped = wrapped - np.pi
    return wrapped

In [10]:
class FF(nn.Module):
    """Feed-forward policy: one ReLU hidden layer followed by a linear action head."""

    def __init__(self, state_size, action_size, layer_size, seed):
        """Build the layers.

        Args:
            state_size: Length of the input observation vector.
            action_size: Number of action outputs.
            layer_size: Width of the hidden layer.
            seed: Passed to ``torch.manual_seed`` before the layers are
                created, which makes the initial weights reproducible.
        """
        super(FF, self).__init__()

        self.seed = torch.manual_seed(seed)
        self.input_shape = state_size
        self.action_size = action_size
        self.layer_size = layer_size

        self.head_1 = nn.Linear(self.input_shape, layer_size)
        # `rec` is not used by forward(); it is kept so that the parameter
        # set (and therefore saved state_dict keys) stays the same.
        self.rec = nn.Linear(self.layer_size, layer_size)
        self.action = nn.Linear(layer_size, action_size)

        # Last hidden activation; overwritten on every forward pass.
        self.x = torch.zeros([layer_size])

    def forward(self, input):
        """Return the action for ``input`` and cache the hidden activation in ``self.x``."""
        self.x = torch.relu(self.head_1(input))

        action = self.action(self.x)

        return action

    def reset(self):
        """Zero the cached hidden activation.

        Uses ``self.layer_size`` rather than a notebook-level ``layer_size``
        variable, so the method works regardless of which globals exist.
        """
        self.x = torch.zeros([self.layer_size])
    
    

In [17]:
# Train one feed-forward policy per (task, training budget) pair and save its
# weights. Each episode's summed reward is differentiable with respect to the
# policy parameters, so the policy is updated by backpropagating through the
# environment dynamics at the end of every episode.
task_names = ["two_joint", "maze"]

# Suffix used in the checkpoint name and the matching number of training frames.
# 0 frames means the saved network is left at its random initialization.
length_names = ["_random", "_short", ""]
lengths = [0, 2000, 100000]

layer_size = 256
LR = 1e-3
output_dir = "../output"

# Training runs on the CPU: the network is never moved to another device and
# the episode reward is converted with .numpy() below.
device = torch.device('cpu')

# Create the checkpoint directory up front so a finished run cannot fail at save time.
os.makedirs(output_dir, exist_ok=True)

for seed in range(1):
    for task_idx in range(len(task_names)):
        for length_idx in range(len(length_names)):

            print(task_idx, length_idx, seed)

            if task_idx == 0:
                env = TwoJoint()
            elif task_idx == 1:
                env = Maze()
            else:
                raise ValueError("no environment defined for task index " + str(task_idx))
            action_size = env.action_space.shape[0]
            state_size = env.observation_space.shape[0]

            frames = lengths[length_idx]
            net = FF(state_size, action_size, layer_size, seed)
            state = env.reset()
            state = torch.from_numpy(state).float().to(device)
            rewards = []

            optimizer = optim.Adam(net.parameters(), lr=LR)

            cur_rew = 0
            for frame in range(1, frames + 1):

                action = net(state)
                next_state, reward, done, _ = env.step(action)
                next_state = torch.from_numpy(next_state).float().to(device)
                state = next_state
                cur_rew += reward
                if done:
                    # One gradient step per finished episode: maximize the
                    # episode's summed reward.
                    rewards.append(cur_rew.detach().numpy())
                    loss = -cur_rew
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    net.reset()
                    state = env.reset()
                    state = torch.from_numpy(state).float().to(device)
                    cur_rew = 0

            checkpoint_path = "{}/ff_{}{}_{}.pth".format(
                output_dir, task_names[task_idx], length_names[length_idx], seed
            )
            torch.save(net.state_dict(), checkpoint_path)

0 0 0
0 1 0
0 2 0
1 0 0
1 1 0
1 2 0


In [None]:
# Build a linear policy that ignores its input and always outputs 1 for every
# action component (zero weights, unit bias), then save it.
# NOTE: this cell needs LinearPolicy (defined in a later cell) as well as
# state_size, action_size, layer_size, seed and device from the training cells.
net = LinearPolicy(state_size, action_size, layer_size, seed)

# Shapes follow the policy's own dimensions instead of hard-coded sizes.
fixed_policy_weight = np.zeros([action_size, state_size])
fixed_policy_bias = np.ones([action_size])
net.action.weight = torch.nn.Parameter(torch.Tensor(fixed_policy_weight).to(device))
net.action.bias = torch.nn.Parameter(torch.Tensor(fixed_policy_bias).to(device))
torch.save(net.state_dict(), "linear_maze_constant1.pth")

In [6]:
class LinearPolicy(nn.Module):
    """Policy consisting of a single linear map from observation to action."""

    def __init__(self, state_size, action_size, layer_size, seed):
        """Create the linear layer.

        ``layer_size`` is accepted but not used. ``seed`` is handed to
        ``torch.manual_seed`` before the layer is created.
        """
        super().__init__()

        # Seed first so the layer below is initialized reproducibly.
        self.seed = torch.manual_seed(seed)
        self.input_shape, self.action_size = state_size, action_size

        self.action = nn.Linear(state_size, action_size)

    def forward(self, input):
        """Apply the linear layer to ``input`` and return the result."""
        return self.action(input)

    def reset(self):
        """Do nothing; the policy keeps no per-episode state."""
        return None

In [8]:
# Train a linear policy on the Maze task. The summed reward of each episode is
# differentiable with respect to the policy parameters, so one Adam step is
# taken per finished episode by backpropagating through the environment.
env = Maze()
action_size = env.action_space.shape[0]
state_size = env.observation_space.shape[0]
seed = 0
layer_size = 256

# Training runs on the CPU: the network is never moved to another device and
# the episode reward is converted with .numpy() below.
device = torch.device('cpu')
frames = 100000
net = LinearPolicy(state_size, action_size, layer_size, seed)
state = env.reset()
state = torch.from_numpy(state).float().to(device)
rewards = []
LR = 1e-3

optimizer = optim.Adam(net.parameters(), lr=LR)


cur_rew = 0
for frame in range(1, frames + 1):

    action = net(state)
    next_state, reward, done, _ = env.step(action)
    next_state = torch.from_numpy(next_state).float().to(device)
    state = next_state
    cur_rew += reward
    if done:
        # Episode finished: record its return and maximize it.
        rewards.append(cur_rew.detach().numpy())
        loss = -cur_rew
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        net.reset()
        state = env.reset()
        state = torch.from_numpy(state).float().to(device)
        cur_rew = 0
    if frame % 100 == 0:
        # Mean return over (at most) the last 100 finished episodes.
        print('frame', frame, np.mean(rewards[-100:]))

frame 100 -10.482031
frame 200 -16.885832
frame 300 -17.884733
frame 400 -16.27606
frame 500 -16.262705
frame 600 -15.396638
frame 700 -16.266544
frame 800 -16.265789
frame 900 -16.737232
frame 1000 -16.484436
frame 1100 -16.78951
frame 1200 -16.41757
frame 1300 -16.044035
frame 1400 -16.749367
frame 1500 -16.852896
frame 1600 -17.182377
frame 1700 -17.322973
frame 1800 -17.191887
frame 1900 -16.867836
frame 2000 -16.988539
frame 2100 -17.54432
frame 2200 -16.901941
frame 2300 -16.140461
frame 2400 -15.752722
frame 2500 -15.574137
frame 2600 -15.357919
frame 2700 -13.928328
frame 2800 -13.298912
frame 2900 -13.403425
frame 3000 -14.1079235
frame 3100 -13.960306
frame 3200 -13.200535
frame 3300 -13.560199
frame 3400 -13.968367
frame 3500 -14.476321
frame 3600 -14.458386
frame 3700 -16.135073
frame 3800 -16.299803
frame 3900 -15.794668
frame 4000 -14.991053
frame 4100 -14.657197
frame 4200 -15.2142
frame 4300 -15.406001
frame 4400 -15.363971
frame 4500 -15.021215
frame 4600 -15.18729
fra

frame 37100 -7.716319
frame 37200 -7.540091
frame 37300 -7.7354865
frame 37400 -7.8625093
frame 37500 -8.248736
frame 37600 -7.7913833
frame 37700 -7.8612885
frame 37800 -8.285663
frame 37900 -7.4889917
frame 38000 -7.778471
frame 38100 -7.277485
frame 38200 -7.448852
frame 38300 -7.515285
frame 38400 -7.493073
frame 38500 -7.7222967
frame 38600 -7.593137
frame 38700 -7.3194995
frame 38800 -7.0227013
frame 38900 -7.692904
frame 39000 -7.2986193
frame 39100 -7.4591246
frame 39200 -7.308008
frame 39300 -7.1133814
frame 39400 -6.9835405
frame 39500 -6.839724
frame 39600 -7.353622
frame 39700 -7.4348216
frame 39800 -7.57308
frame 39900 -7.1410737
frame 40000 -7.1607833
frame 40100 -7.437264
frame 40200 -7.04201
frame 40300 -7.4012446
frame 40400 -7.571008
frame 40500 -7.2630773
frame 40600 -6.8875346
frame 40700 -7.1021986
frame 40800 -7.189188
frame 40900 -7.4502983
frame 41000 -7.5798006
frame 41100 -7.173469
frame 41200 -7.364398
frame 41300 -7.0395293
frame 41400 -7.1599226
frame 41500

frame 73700 -4.821878
frame 73800 -5.066266
frame 73900 -5.110594
frame 74000 -4.875841
frame 74100 -4.9957843
frame 74200 -4.820057
frame 74300 -4.769343
frame 74400 -5.136549
frame 74500 -5.397806
frame 74600 -5.355113
frame 74700 -5.2278943
frame 74800 -5.0023437
frame 74900 -5.0134406
frame 75000 -5.1477995
frame 75100 -5.1559987
frame 75200 -5.188739
frame 75300 -5.197649
frame 75400 -5.1795974
frame 75500 -4.883237
frame 75600 -4.7689753
frame 75700 -5.047185
frame 75800 -5.0406737
frame 75900 -4.9706984
frame 76000 -5.182876
frame 76100 -5.5310106
frame 76200 -5.695027
frame 76300 -5.698863
frame 76400 -5.4817615
frame 76500 -5.257704
frame 76600 -5.511476
frame 76700 -5.1215267
frame 76800 -5.0527105
frame 76900 -4.9168224
frame 77000 -4.8939056
frame 77100 -4.4688106
frame 77200 -4.2336135
frame 77300 -4.4410686
frame 77400 -4.6984177
frame 77500 -4.727156
frame 77600 -4.5214834
frame 77700 -5.0301743
frame 77800 -5.086943
frame 77900 -5.1565795
frame 78000 -5.07876
frame 7810

In [9]:
# Write the current parameters of `net` to disk.
torch.save(obj=net.state_dict(), f="linear_maze_trained.pth")