In [1]:
import numpy as np
import torch
from collections import defaultdict
import dill

In [85]:
class MCAgent:
    """Tabular first-visit Monte Carlo control agent with an epsilon-greedy policy.

    Action values are kept in ``Q_table`` (state key -> array of ``action_dim``
    values) and are the running average of the first-visit discounted returns
    observed for each (state, action) pair.
    """

    def __init__(self, action_dim, cfg):
        """Create an agent with an empty Q-table.

        Args:
            action_dim: Number of discrete actions.
            cfg: Object providing ``epsilon`` (exploration probability) and
                ``gamma`` (discount factor).
        """
        self.action_dim = action_dim
        self.epsilon = cfg.epsilon
        self.gamma = cfg.gamma
        # The lambda default factory cannot be serialised by the stdlib pickle,
        # which is presumably why save()/load() pass dill as the pickle module.
        self.Q_table = defaultdict(lambda: np.zeros(action_dim))
        self.returns_sum = defaultdict(float)
        self.returns_count = defaultdict(float)

    @staticmethod
    def _state_key(state):
        """Return the hashable key under which ``state`` is stored in the tables."""
        try:
            return tuple(state)
        except TypeError:
            # Non-iterable (scalar) states are used as their own key.
            return state

    def choose_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        States that are not yet in the Q-table get a uniformly random action;
        the lookup does not insert the state into the table.

        Args:
            state: Environment state; normalised with the same key used by
                ``update`` so list/array states hit the stored entries.

        Returns:
            Index of the selected action.
        """
        key = self._state_key(state)
        if key in self.Q_table:
            best_action = np.argmax(self.Q_table[key])
            # Every action gets epsilon / action_dim; the greedy one gets the rest.
            action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
            action_probs[best_action] += (1.0 - self.epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        else:
            action = np.random.randint(0, self.action_dim)
        return action

    def update(self, one_ep_transition):
        """Update the Q-table from one finished episode (first-visit MC).

        Args:
            one_ep_transition: Sequence of steps in time order; each step is
                indexed as ``step[0]`` = state, ``step[1]`` = action,
                ``step[2]`` = reward.
        """
        # Walk the episode backwards so the return of every step follows from
        # the next one: G_t = r_t + gamma * G_{t+1}. Earlier occurrences of a
        # pair are visited later in this loop and overwrite the later ones, so
        # the dict ends up holding the first-visit return of each pair.
        discounted_return = 0.0
        first_visit_returns = {}
        for step in reversed(one_ep_transition):
            discounted_return = step[2] + self.gamma * discounted_return
            first_visit_returns[(self._state_key(step[0]), step[1])] = discounted_return

        for sa_pair, G in first_visit_returns.items():
            state, action = sa_pair
            self.returns_sum[sa_pair] += G
            self.returns_count[sa_pair] += 1.0
            self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]

    def save(self, path):
        """Write the Q-table to the file ``path + "Q_table"``.

        ``path`` is used as a plain string prefix, so it should end with a
        path separator if it names a directory.
        """
        torch.save(
            obj=self.Q_table,
            f=path+"Q_table",
            pickle_module=dill
        )

    def load(self, path):
        """Replace the Q-table with the one stored at ``path + "Q_table"``.

        SECURITY: this unpickles the file through dill, which can execute
        arbitrary code. Only load files that this notebook produced.
        """
        self.Q_table =torch.load(f=path+"Q_table",pickle_module=dill)
            

In [86]:
import sys, os

In [87]:
# Append the parent of the working directory to sys.path so that packages
# living next to this notebook's folder (e.g. common.utils) can be imported.
# NOTE: '__file__' is a quoted string literal, not the module attribute, so
# realpath() resolves it against the current working directory; curr_path is
# therefore the kernel's working directory with symlinks resolved.
curr_path = os.path.dirname(os.path.realpath('__file__'))
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)

In [88]:
import torch
import datetime

from common.utils import save_results, make_dir, plot_rewards
from envs.racetrack_env import RacetrackEnv

%matplotlib inline

In [89]:
# Timestamp taken once when this cell runs; MCConfig embeds it in its output
# paths, so re-running this cell starts a new output location.
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [90]:
class MCConfig:
    """Hyper-parameters and output locations for the Monte Carlo racetrack run."""

    def __init__(self):
        self.algo = 'MC'
        self.env = 'Racetrack'
        # One directory per run: <cwd>/outputs/<env>/<algo>/<timestamp>/
        run_dir = f'{curr_path}/outputs/{self.env}/{self.algo}/{curr_time}'
        # Both paths end with a separator because consumers in this notebook
        # append the file name directly (e.g. MCAgent.save uses path + "Q_table").
        # Without the separators the files were written next to the created
        # directories with the directory name glued onto the file name.
        self.result_path = run_dir + '/results/'
        self.model_path = run_dir + '/models/'
        self.epsilon = 0.15  # exploration probability of the epsilon-greedy policy
        self.gamma = 0.9  # discount factor applied to future rewards
        self.train_eps = 200  # number of episodes run by train() and eval()
        # Only reported in the log banners of train()/eval() in this notebook.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [91]:
def env_agent_config(cfg, seed=1):
    """Build the racetrack environment and a Monte Carlo agent for it.

    Args:
        cfg: Configuration object forwarded to ``MCAgent``, which reads
            ``epsilon`` and ``gamma`` from it.
        seed: Seed applied to the global NumPy and stdlib random generators
            before anything is constructed. Pass ``None`` to leave the
            generators untouched.

    Returns:
        Tuple ``(env, agent)``.
    """
    if seed is not None:
        import random  # local import: `random` is not imported at notebook level

        # MCAgent draws its actions from np.random; the stdlib generator is
        # seeded as well in case the environment uses it (not visible here).
        np.random.seed(seed)
        random.seed(seed)
    env = RacetrackEnv()
    # NOTE(review): 9 is presumably the size of RacetrackEnv's action set —
    # confirm against the environment.
    action_dim = 9
    agent = MCAgent(action_dim, cfg)
    return env, agent

In [92]:
def train(cfg, env, agent):
    """Run ``cfg.train_eps`` training episodes, updating the agent after each.

    Args:
        cfg: Object providing ``env``, ``algo``, ``device`` (logged only) and
            ``train_eps`` (number of episodes).
        env: Environment with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``.
        agent: Agent with ``choose_action(state)`` and
            ``update(one_ep_transition)``.

    Returns:
        Tuple ``(rewards, ma_rewards)``: the total reward of every episode and
        its exponential moving average (weight 0.1 on the newest episode).
    """
    print('Start to train!')
    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
    rewards = []
    ma_rewards = []
    for i_ep in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
        # (state, action, reward) for every step, in time order.
        one_ep_transition = []
        while True:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            ep_reward += reward
            one_ep_transition.append((state, action, reward))
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        # Monte Carlo learning needs the complete episode, so the agent is
        # updated only once the episode has terminated.
        agent.update(one_ep_transition)
        if (i_ep + 1) % 10 == 0:
            print(f'Episode:{i_ep + 1}/{cfg.train_eps}: Reward:{ep_reward}')
    print('Complete training!')
    return rewards, ma_rewards

In [93]:
def eval(cfg, env, agent):
    """Roll out ``cfg.train_eps`` episodes with the agent's policy, without learning.

    Note that this function shadows the builtin ``eval`` in the notebook.

    Args:
        cfg: Object providing ``env``, ``algo``, ``device`` (logged only) and
            ``train_eps`` (number of episodes to run).
        env: Environment with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``.
        agent: Agent with ``choose_action(state)``; it is never updated here.

    Returns:
        Tuple ``(rewards, ma_rewards)``: per-episode total reward and its
        exponential moving average (weight 0.1 on the newest episode).
    """
    print('Start to eval!')
    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
    rewards, ma_rewards = [], []
    for i_ep in range(cfg.train_eps):
        ep_reward = 0
        state = env.reset()
        done = False
        # Step until the environment reports the end of the episode.
        while not done:
            state, reward, done = env.step(agent.choose_action(state))
            ep_reward += reward
        rewards.append(ep_reward)
        # The first episode seeds the moving average; later ones are blended in.
        smoothed = ma_rewards[-1] * 0.9 + ep_reward * 0.1 if ma_rewards else ep_reward
        ma_rewards.append(smoothed)
        if (i_ep + 1) % 10 == 0:
            print(f'Episode:{i_ep + 1}/{cfg.train_eps}: Reward:{ep_reward}')
    return rewards, ma_rewards

In [99]:
# Train a fresh agent, then persist its Q-table and the reward curves.
cfg = MCConfig()

env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
# make_dir comes from common.utils (not shown here); presumably it creates the
# two output directories. It runs only after training has finished.
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
# Plotting is disabled; uncomment to draw the training curves.
# plot_rewards(rewards, ma_rewards, cfg, tag='train')

Start to eval!
Env:Racetrack, Algorithm:MC, Device:cuda
Episode:10/200: Reward:-3905
Episode:20/200: Reward:-11
Episode:30/200: Reward:-505
Episode:40/200: Reward:-44
Episode:50/200: Reward:0
Episode:60/200: Reward:-45
Episode:70/200: Reward:1
Episode:80/200: Reward:-288
Episode:90/200: Reward:1
Episode:100/200: Reward:-11
Episode:110/200: Reward:-22
Episode:120/200: Reward:2
Episode:130/200: Reward:-245
Episode:140/200: Reward:-58
Episode:150/200: Reward:-99
Episode:160/200: Reward:-131
Episode:170/200: Reward:-40
Episode:180/200: Reward:1
Episode:190/200: Reward:-27
Episode:200/200: Reward:-213
Complete training!
结果保存完毕！


In [100]:
# Evaluate: build a new environment/agent pair, replace the new agent's empty
# Q-table with the one saved by the training cell, and roll it out.
# `rewards` / `ma_rewards` from the training cell are overwritten here.
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
# `eval` is the function defined in this notebook, which shadows the builtin.
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)

Start to eval!
Env:Racetrack, Algorithm:MC, Device:cuda
Episode:10/200: Reward:-13
Episode:20/200: Reward:-151
Episode:30/200: Reward:-92
Episode:40/200: Reward:0
Episode:50/200: Reward:-110
Episode:60/200: Reward:-71
Episode:70/200: Reward:2
Episode:80/200: Reward:-3
Episode:90/200: Reward:-37
Episode:100/200: Reward:-1
Episode:110/200: Reward:-7
Episode:120/200: Reward:-68
Episode:130/200: Reward:1
Episode:140/200: Reward:-1
Episode:150/200: Reward:-52
Episode:160/200: Reward:-55
Episode:170/200: Reward:-8
Episode:180/200: Reward:-209
Episode:190/200: Reward:-5
Episode:200/200: Reward:-58
结果保存完毕！
