## PPO Controller Training (RL)

In [1]:
import gym
import numpy as np
from ppo_torch_conv1d import Agent
from utils import plot_learning_curve
import MFC_aileron_env_discrete_pytorch_variable_target_obs_is_state as mfc_env
import tensorflow as tf


if __name__ == '__main__':
    #env = gym.make('CartPole-v1')
    env = mfc_env.MFC_aileron_Env()
    N = 20
    batch_size = 5
    n_epochs = 4
    alpha = 0.00003
    agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 
                    alpha=alpha, n_epochs=n_epochs, 
                    input_dims=env.observation_space.shape, 
                    fc1_dims=512, fc2_dims=512, chk_dir='tmp/ppo/GITHUB_X')
    n_games = 5000

    figure_file = 'plots/mfc_aileron_flex.png'

    best_score = env.reward_range[0]
    score_history = []

    learn_iters = 0
    avg_score = 0
    n_steps = 0
    
    for i in range(n_games):
        goal_pos = np.random.rand()*4-2
        observation = env.reset(goal_pos)
        done = False
        score = 0
        while not done:
            action, prob, val = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            n_steps += 1
            score += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % N == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        
        if i>100:
            if avg_score > best_score:
                best_score = avg_score
                agent.save_models()

        print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
                'time_steps', n_steps, 'learning_steps', learn_iters)
    x = [i+1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history, figure_file)
    
    agent.save_models(end=True)
    
  



actor using:  cpu
Critic using:  cpu
episode 0 score -912.3 avg score -912.3 time_steps 200 learning_steps 10
episode 1 score -110.1 avg score -511.2 time_steps 400 learning_steps 20
episode 2 score -47.0 avg score -356.5 time_steps 600 learning_steps 30
episode 3 score -287.4 avg score -339.2 time_steps 800 learning_steps 40
episode 4 score -265.8 avg score -324.5 time_steps 1000 learning_steps 50
episode 5 score -869.8 avg score -415.4 time_steps 1200 learning_steps 60
episode 6 score -53.1 avg score -363.7 time_steps 1400 learning_steps 70
episode 7 score -9.1 avg score -319.3 time_steps 1600 learning_steps 80
episode 8 score -775.1 avg score -370.0 time_steps 1800 learning_steps 90
episode 9 score -717.8 avg score -404.8 time_steps 2000 learning_steps 100
episode 10 score -230.1 avg score -388.9 time_steps 2200 learning_steps 110
episode 11 score -262.2 avg score -378.3 time_steps 2400 learning_steps 120
episode 12 score -180.2 avg score -363.1 time_steps 2600 learning_steps 130
ep

episode 107 score -9.1 avg score -111.7 time_steps 21600 learning_steps 1080
... saving models ...
episode 108 score -40.9 avg score -104.3 time_steps 21800 learning_steps 1090
... saving models ...
episode 109 score -6.3 avg score -97.2 time_steps 22000 learning_steps 1100
... saving models ...
episode 110 score -22.2 avg score -95.1 time_steps 22200 learning_steps 1110
... saving models ...
episode 111 score -8.3 avg score -92.6 time_steps 22400 learning_steps 1120
... saving models ...
episode 112 score -27.6 avg score -91.1 time_steps 22600 learning_steps 1130
... saving models ...
episode 113 score -8.5 avg score -82.3 time_steps 22800 learning_steps 1140
... saving models ...
episode 114 score -7.4 avg score -80.3 time_steps 23000 learning_steps 1150
... saving models ...
episode 115 score -65.1 avg score -79.0 time_steps 23200 learning_steps 1160
... saving models ...
episode 116 score -64.1 avg score -78.8 time_steps 23400 learning_steps 1170
... saving models ...
episode 117 s

KeyboardInterrupt: 

## Quick Testing on simulation

In [None]:
import matplotlib.pyplot as plt
import gym
import numpy as np
from ppo_torch_conv1d import Agent
from utils import plot_learning_curve
import MFC_aileron_env_discrete_pytorch_variable_target_obs_is_state as mfc_env
import tensorflow as tf
env = mfc_env.MFC_aileron_Env()
N = 20
batch_size = 5
n_epochs = 4
alpha = 0.00003
steps_per_test_ep = 200
goals = [0,0.5,-0.5,0]
goal_len = len(goals)

agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 
                    alpha=alpha, n_epochs=n_epochs, 
                    input_dims=env.observation_space.shape, 
                    fc1_dims=512, fc2_dims=512, chk_dir='tmp/ppo/10obs_AUG5')

states = np.zeros((steps_per_test_ep*goal_len+1, 4))
observation = env.test_reset()
states[0,:] = observation[:,-1]
agent.load_models(end=True)
for ii, g in enumerate(goals):
    env.next_goal(g)
    for i in range(steps_per_test_ep):
        action, prob, val = agent.choose_action(observation, avg=True)
        observation_, reward, done, info = env.step(action)
        observation = observation_
        states[ii*steps_per_test_ep+i+1,:] = observation[:,-1]
            
plt.plot(np.arange(steps_per_test_ep*goal_len+1), (states[:,0]*226+497.5)*30/1023)
plt.plot(np.arange(steps_per_test_ep*goal_len+1), (states[:,3]*226+497.5)*30/1023, '--r')
plt.xlabel('Timesteps')
plt.ylabel('Position (mm)')
plt.show()
plt.plot(np.arange(steps_per_test_ep*goal_len+1), states[:,2]*26.8+50)
plt.show()