# Imitation Learning notebook - Inverse Reinforcement Learning - Use sliced-Wasserstein rewards to imitate expert behavioural movements

In [1]:
import gym
import numpy as np
import cvxpy as cp
import sys
import pylab
import matplotlib.pyplot as plt
import pandas as pd 
import torch
from torch.autograd import Variable
import copy
import torch.nn.functional as F
import random
import torch.nn as nn
import math
from itertools import count
from PIL import Image
import torch
import torch.optim 
import torchvision.transforms as transforms
from collections import namedtuple, deque
from torch import nn
from gym import make
import torch.optim as optim
from numpy import save
from tqdm.notebook import tqdm
import pickle

import pickle
from typing import Optional
import IPython
from IPython.display import set_matplotlib_formats; set_matplotlib_formats('svg')
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, FFMpegWriter
import os
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
from Code.utils import Net, Memory, Agent
from Code.sliced_wasserstein_rewards import *
from Code.plotting import preprocess_states, animate
from Code.helper_functions import *
from Code.pwil_rewarder import *

## Load expert trajectories 

In [3]:
environment = 'Pendulum-v0'

# Directory that holds the saved expert state trajectories. It can be
# overridden with the RL_RESULTS_DIR environment variable so the notebook is
# not tied to one machine; the default is the path originally used here.
RESULTS_DIR = os.environ.get(
    'RL_RESULTS_DIR',
    '/Users/ilanasebag/Documents/Thesis_Code/RL_results',
)

# Expert states are stored as "<tag>_exp_states_<environment>.npy".
t1 = 'multi_diff_lengths_excl_1'
traj1 = np.load(os.path.join(RESULTS_DIR, '%s_exp_states_%s.npy' % (t1, environment)))


## Imitation Learning Model

In [4]:
# One point of the learning curve: episode index and the running score at that episode.
TrainingRecord = namedtuple('TrainingRecord', 'ep reward')

# One replay-memory entry: state, action index, reward, next state.
Transition = namedtuple('Transition', 's a r s_')

In [5]:
def _imitation_rewards(states_tens, exp, simple, MMOT, wass_PWIL, PWIL):
    """Compute the imitation signal for one agent rollout.

    The first flag that is exactly ``True`` wins, in the order
    ``MMOT``, ``simple``, ``wass_PWIL``, ``PWIL``.

    Args:
        states_tens: float tensor stacking the states visited by the agent.
        exp: expert trajectories (indexed 0..4 for ``MMOT``, 0 for ``simple``,
            passed whole to ``concatenate_and_sample`` otherwise).
        simple, MMOT, wass_PWIL, PWIL: reward-mode switches.

    Returns:
        Whatever the selected rewarder returns; the caller reads it as
        ``result[t, 0]`` for every step ``t``.

    Raises:
        ValueError: if none of the mode flags is ``True``.
    """
    if MMOT is True:
        # Agent rollout followed by exactly five expert trajectories.
        experts = [torch.tensor(exp[k]).float() for k in range(5)]
        return rewarder_multi([states_tens] + experts, num_projections=50)

    if simple is True:
        # Agent rollout against the first expert trajectory only.
        return rewarder_multi([states_tens, torch.tensor(exp[0]).float()], num_projections=50)

    if wass_PWIL is True:
        pwil_exp = torch.tensor(concatenate_and_sample(exp)).float()
        return rewarder_multi([states_tens, pwil_exp], num_projections=50)

    if PWIL is True:
        # NOTE(review): unlike the branches above, the expert sample is handed
        # to `rp` without a torch.tensor(...).float() conversion, and the run
        # recorded in this notebook with PWIL=True ended in a TypeError --
        # confirm what input type `rp` expects.
        pwilexp = concatenate_and_sample(exp)
        return rp([states_tens, pwilexp])

    raise ValueError('Select a reward mode: set one of simple, MMOT, wass_PWIL or PWIL to True.')


def main(environment, exp, seeds, simple = False, MMOT = False, wass_PWIL = False, PWIL = False,
         num_episodes = 800, horizon = 200, render = True):
    """Train an agent whose replay rewards come from an imitation rewarder.

    Each episode rolls the agent out for ``horizon`` steps from a fixed start
    state, computes imitation rewards for the visited states, and only then
    stores the transitions (with ``exp(-5 * reward)`` as the reward) in the
    agent's memory. The environment's own reward is used solely for the
    reported score.

    Args:
        environment: gym environment id passed to ``gym.make``.
        exp: expert trajectories consumed by the selected rewarder.
        seeds: seed passed to ``env.seed``.
        simple, MMOT, wass_PWIL, PWIL: reward-mode switches; at least one must
            be ``True``. If several are set, the first in the order
            MMOT, simple, wass_PWIL, PWIL is used.
        num_episodes: number of training episodes (default 800).
        horizon: number of environment steps per episode (default 200).
        render: whether to call ``env.render()`` on every step (default True).

    Returns:
        list of ``TrainingRecord(ep, reward)`` where ``reward`` is the
        exponentially smoothed environment score.

    Raises:
        ValueError: if no reward mode is selected.
    """
    # Fail before creating the environment instead of hitting a NameError
    # after a full rollout.
    if not any(flag is True for flag in (simple, MMOT, wass_PWIL, PWIL)):
        raise ValueError('Select a reward mode: set one of simple, MMOT, wass_PWIL or PWIL to True.')

    training_records = []
    env = gym.make(environment)
    try:
        env.seed(seeds)

        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.shape[0]
        action_dim = env.action_space.shape[0] * 5 #discretization  of the unique continuous action of the pendulum

        agent = Agent(input_dim, output_dim, action_dim, environment)

        running_reward = -1000

        for i_ep in tqdm(range(num_episodes)):

            new_states = []
            old_states = []
            action_indexes = []

            score = 0

            # We fix the departure state: reset the wrapper, then overwrite the
            # inner environment's state and re-read the observation.
            # (To make it more robust, use `state = env.reset()` alone.)
            env.reset()
            env.env.state = np.array([np.pi/2, 0.5])
            env.env.last_u = None
            state = env.env._get_obs()

            for t in range(horizon):
                action, action_index = agent.select_action(state)
                state_, reward, _, _ = env.step(action)
                score += reward
                old_states.append(state)

                if render:
                    env.render()
                state = state_
                if agent.memory.isfull:
                    agent.update()

                action_indexes.append(action_index)
                new_states.append(state_)

            states_tens = torch.stack([torch.tensor(elt) for elt in old_states]).float() #agent rollout

            rewards_multitask = _imitation_rewards(states_tens, exp, simple, MMOT, wass_PWIL, PWIL)

            # Transitions are stored only now, because their reward depends on
            # the whole rollout.
            for t, (s, a, s_next) in enumerate(zip(old_states, action_indexes, new_states)):
                shaped_reward = torch.exp(-5*rewards_multitask[t,0])
                agent.store_transition(Transition(s, a, shaped_reward, s_next))

            running_reward = running_reward * 0.9 + score * 0.1
            training_records.append(TrainingRecord(i_ep, running_reward))

            print('Ep', i_ep, 'Average score:', running_reward, 'score of current env', score )
    finally:
        # Always release the environment (and its render window), even when an
        # episode raises.
        env.close()

    return training_records
    
    
    

In [6]:
if __name__ == '__main__':
    # Run configuration: single seed, expert data loaded earlier in the notebook.
    seeds = 1
    exp = traj1
    environment = 'Pendulum-v0'

    # Only the PWIL reward mode is enabled; the other mode flags keep their
    # False defaults.
    trialpwil = main(environment, exp, seeds, PWIL = True)

HBox(children=(FloatProgress(value=0.0, max=800.0), HTML(value='')))




TypeError: object of type 'int' has no len()

In [None]:
# Learning curve of the PWIL run: each record holds the episode index and the
# exponentially smoothed environment score at that episode.
episodes = [record.ep for record in trialpwil]
running_scores = [record.reward for record in trialpwil]

fig, ax = plt.subplots()
ax.plot(episodes, running_scores, color = 'green')
ax.set(xlabel = 'Episode', ylabel = 'Running average score', title = 'PWIL imitation run on Pendulum-v0')
plt.show()