In [1]:
import pandas as pd
import numpy as np
import random
import os, time
import matplotlib.pyplot as plt
from torchvision import datasets, transforms

import argparse
import pickle
from collections import namedtuple
from collections import deque
from itertools import count

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
from torch.distributions import Normal, Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

from utils.replaybuffer import PrioritizedBuffer
from model import DQN
from utils.AI_Interface import *
from utils.reward import *
from utils.action_transform import action_transform

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class DQNAgent:

    def __init__(self, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = PrioritizedBuffer(buffer_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = DQN(obs_shape=428,                 
                         fStickLat_shape=21, fStickLon_shape=21,
                 fThrottle_shape=11, fRudder_shape=21,
                 eMainTaskMode=2, eEleScanLine_shape=2,
                 eAziScanRange=3, WeaponLaunch=2).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.05):
        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        qvals = self.model.forward(state) #list[tensor]
        
        #这里还需要获得每一种动作的离散化后的列表，将argmax后得到的index放入列表中采样
        #假设叫做action_list
        action_list = []
        action_list.append(np.round(np.arange(-1.0, 1.1, 0.1)),1)
        action_list.append(np.round(np.arange(-1.0, 1.1, 0.1)),1)
        action_list.append(np.round(np.arange(0, 1.1, 0.1)),1)
        action_list.append(np.round(np.arange(-1.0, 1.1, 0.1)),1)
        action_list.append(np.round(np.array([30,60,120])),1)
        action_list.append(np.round(np.array([2,4])),1)
        action_list.append(np.round(np.array([0,1])),1)
        action_list.append(np.round(np.array([0,1])),1)
        
        action = []
        if(np.random.randn() < eps):
            for i in range(len(qvals)):
                action.append(action_list[i][random.randint(0, len(action_list[i])-1)])
        else:
            for i in range(len(qvals)):
                action.append(action_list[i][np.argmax(qvals[i].cpu().detach().numpy())])

        return action

    def compute_td(self, batch_size):
        transitions, idxs, IS_weights = self.replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = transitions
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        IS_weights = torch.FloatTensor(IS_weights).to(self.device)
        done_mask = torch.abs(dones - 1).reshape(batch_size,1)

        q = self.model.forward(states)
        for i in range(len(q)):
            if i == 0:
                curr_Q = q[i].gather(1, actions[:,i].unsqueeze(1))
            else:
                curr_Q = torch.cat([curr_Q,q[i].gather(1, actions[:,i].unsqueeze(1))],1)

        q_ = self.model.forward(next_states)
        for i in range(len(q_)):
            if i == 0:
                max_next_Q = torch.max(q_[i], 1)[0].unsqueeze(1)
            else:
                max_next_Q = torch.cat([max_next_Q, torch.max(q_[i], 1)[0].unsqueeze(1)], 1)
        expected_Q = rewards + self.gamma * max_next_Q * done_mask

        # 第一种 分开输出
#         td_errors = []
#         for i in range(len(q_)):
#             td_errors.append(torch.pow(curr_Q[i] - expected_Q[i], 2) * IS_weights)
        
        #第二种 加起来
        td_errors = 0
        for i in range(len(q_)):
            td_errors += torch.pow(curr_Q[i] - expected_Q[i], 2) * IS_weights
            
#         td_errors = torch.pow(curr_Q - expected_Q, 2) * IS_weights
        
        return td_errors, idxs

    def update(self, batch_size):
        td_errors, idxs = self.compute_td(batch_size)
        
#          分开版
#         self.optimizer.zero_grad()
#         for i in range(len(td_errors)):
#             td_errors[i].mean().backward()
#         self.optimizer.step()
        
#         加和版
        td_errors_mean = td_errors.mean()
        self.optimizer.zero_grad()
        td_errors_mean.backward()
        self.optimizer.step()

        # update priorities
        for idx, td_error in zip(idxs, td_errors.cpu().detach().numpy()):
            self.replay_buffer.update_priority(idx, td_errors)

In [3]:
# 暂时没有交互，只有从buffer中采样，学习
# 把data用循环存入buffer中

agent = DQNAgent()
    
data = pd.read_csv('../data',header=None)

i = 2
while i < (len(data)-3):
    # 假设我们是红色飞机，蓝色飞机的动作默认不知道
    # 把红蓝的input做成state，动作是红色的output
    # 注意：428-609是空空导弹数据，不能用作state
    state = data.iloc[i][:428].tolist()
    
    next_state = data.iloc[i+2][:428].tolist()
    
    action = data.iloc[i][609:].tolist()
    action = action_transform(action)
    
    # 为了方便做奖励，将所有变量都归类
    # 需要当前input，当前output，上一步input
    input_r_cur, output_r_cur = getStateAndAction(data.iloc[i])
    input_b_cur, output_b_cur = getStateAndAction(data.iloc[i+1])
    input_r_pre, _ = getStateAndAction(data.iloc[i-2])
    input_b_pre, _ = getStateAndAction(data.iloc[i-1])
    reward = getReward(input_r_pre, input_b_pre,
             output_r_cur, output_b_cur,
             input_r_cur, input_b_cur)
    
    # 终止flag
    # 不知道时间单位，暂时没有考虑终止条件2
    if ((input_r_cur.m_AircraftBasicInfo.m_bAlive == 0 or 
        input_r_cur.m_AircraftBasicInfo.m_fFuel <= 0 or
        input_r_cur.m_AircraftMoveInfo.m_dSelfAlt <= 0) or
        (input_b_cur.m_AircraftBasicInfo.m_bAlive == 0 or 
        input_b_cur.m_AircraftBasicInfo.m_fFuel <= 0 or
        input_b_cur.m_AircraftMoveInfo.m_dSelfAlt <= 0)):
        for i in range(len(input_r_cur.m_AAMDataSet.m_AAMData)):
            if (input_r_cur.m_AAMDataSet.m_AAMData[i].m_eAAMState != 0) or \
            (input_b_cur.m_AAMDataSet.m_AAMData[i].m_eAAMState != 0):
                done = 0
            else:
                done = 1
    else:
        done = 0
    
    agent.replay_buffer.push(state, action, reward, next_state, done)
    
    i = i + 2

  self.tree[parent] += change


In [4]:
MAX_EPISODES = 1000
MAX_STEPS = 500
BATCH_SIZE = 32

agent.update(BATCH_SIZE)

  a = segment * i
  IS_weight = (self.sum_tree.total() * prob) ** (-self.beta)
  return a + (b-a) * self.random()


TypeError: cannot unpack non-iterable int object

In [5]:
torch.onnx.export(agent,
                  (state),
                  "F:\TX_Distribution\Deploy\BVR\AIPilots\Intelligame\model.onnx", 
                   export_params=True,        # 是否保存训练好的参数在网络中
                   opset_version=10,          # ONNX算子版本
                   do_constant_folding=True,  # 是否不保存常数输出（优化选项）
                   input_names = ['input0'],   
                   output_names = ['output0', 'output1', 'output2', 'output3', 'output4', 'output5', 'output6'])

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (3742748864.py, line 3)