In [1]:
from drl4dypm.agent import *
from drl4dypm.env import *
import time

# Data source

## load data

In [2]:
data_source = DataSource(num_steps=252, asset_names=['AAPL','BC'], k=10)

In [3]:
path_to_data = 'data/assets.h5'
data_source.load_data(path_to_data)

Loading data from data/assets.h5 ...


In [6]:
len(data_source.data)

4585

In [10]:
data_source.data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,close,low,high
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,AAPL,3.596463,3.267146,3.614454
2000-01-03,BC,17.012791,16.818935,17.129104
2000-01-04,AAPL,3.29317,3.251081,3.554053
2000-01-04,BC,16.237367,16.190842,17.152367
2000-01-05,AAPL,3.341362,3.309234,3.552125


## process data

In [4]:
data_source.preprocess_data()

In [5]:
data_source.data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,close,low,high
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-14,AAPL,3.226985,3.192607,3.285138
2000-01-14,BC,16.524274,16.330418,16.71813
2000-01-18,AAPL,3.339435,3.226985,3.405619
2000-01-18,BC,16.625079,15.80313,16.671604
2000-01-19,AAPL,3.423611,3.321121,3.493973


## step

In [10]:
data_source.offset = 0

In [13]:
(data, state, feat), end = data_source.take_step()

In [14]:
feat

array([ 0.03813952, -0.01159555,  0.07427025,  0.00285714,  0.03544303,
       -0.01462523])

In [6]:
data

array([ 3.22698497, 16.52427396])

In [7]:
state.shape

(10, 9)

In [8]:
state

array([[1.        , 1.11449587, 1.02956352, 1.        , 1.01244493,
        1.01783194, 1.        , 1.12007134, 1.03660247],
       [1.        , 1.02050944, 0.9826372 , 1.        , 1.00746683,
        0.97982162, 1.        , 1.1013537 , 1.03801026],
       [1.        , 1.03544373, 0.98826836, 1.        , 1.02548754,
        0.97090562, 1.        , 1.10075633, 1.0145471 ],
       [1.        , 0.94583802, 0.99718436, 1.        , 0.94583802,
        0.97982162, 1.        , 1.06531229, 0.99999994],
       [1.        , 0.99064087, 1.00328478, 1.        , 0.95081611,
        0.9826372 , 1.        , 1.00557516, 1.0145471 ],
       [1.        , 0.97321754, 1.00610036, 1.        , 0.94334897,
        0.99718436, 1.        , 1.01802039, 1.02064752],
       [1.        , 0.92343659, 1.00610036, 1.        , 0.90103516,
        1.00328478, 1.        , 0.98934657, 1.02674794],
       [1.        , 0.86808018, 1.00328478, 1.        , 0.86121041,
        0.98545278, 1.        , 0.95081611, 1.02956352],


In [8]:
end

False

# Simulator

In [6]:
simulator = TradingSimulator(num_assets=2, 
                             cost_bps=1e-3)

In [7]:
simulator.reset()

In [8]:
action = np.ones(2) * 0.5
prices = np.ones(2)
simulator.take_step(action, prices)

-0.0010005003335835344

In [9]:
action = np.ones(2) * 0.5
prices = np.ones(2)
simulator.take_step(action, prices)

0.0

# Benchmark (CRP)

In [2]:
trading_days = 252
asset_names = ['AAPL','BC']
k = 10
cost_bps = 1e-3
path_to_data = 'data/assets.h5'

In [3]:
# trading environment
env = TradingEnvironment(num_steps=trading_days, 
                         asset_names=asset_names, 
                         k=k, 
                         cost_bps=cost_bps,
                         agent_names = ['crp'],
                         path_to_data=path_to_data
                        )


Loading data from data/assets.h5 ...


## by step

In [89]:
env.reset()

In [5]:
actions = {'crp': np.ones(3)/3}

In [4]:
state, end = env.init_step()

In [12]:
rewards, next_state, end = env.take_step(actions, state[0])
state = next_state
print(rewards['crp'], env.get_total_rewards()['crp'])

-0.008016212173172324 -0.024528082445371795


## by episode

In [13]:
env.reset()
# env.data_source.offset = 100

In [19]:
# Roll out one full episode with the fixed `actions` dict and count the steps.
step = 0
state, end = env.init_step()

# Keep stepping until the environment reports the end of the episode.
while not end:
    # state[0] is passed as the second argument of take_step (it is bound to
    # `prices` in the BCM section of this notebook).
    rewards, next_state, end = env.take_step(actions, state[0])
    state = next_state
    
    step += 1

    
# Number of steps taken and the cumulative reward recorded under the 'crp' key.
print(step, env.get_total_rewards()['crp'])

# Reset so the next cell starts from a fresh episode.
env.reset()
# env.data_source.offset = 100  # NOTE(review): presumably pins the episode start offset -- confirm


252 0.08368937840664353


# IPM

In [18]:
def learn(self, in_series, out_series):
    """Run one sequential predict -> learn -> state-update pass over paired series.

    Parameters
    ----------
    self : object
        Model exposing ``predict_next()``, ``learn_one_step(target)`` and
        ``_update_state(pattern)``; it is passed explicitly because this is a
        free function, not a bound method.
    in_series : sequence
        Input patterns, consumed in order.
    out_series : sequence
        Target patterns, indexed in step with ``in_series``.

    Returns
    -------
    list
        The value returned by ``predict_next()`` at every step, in order.
    """
    forecasts = []
    for step, observed in enumerate(in_series):
        # The forecast is taken before the model sees this step's target/input.
        forecasts.append(self.predict_next())
        self.learn_one_step(out_series[step])
        self._update_state(observed)
    return forecasts
        

In [53]:
def RMSE(y,y_hat):
    """Return sqrt of the summed per-column mean squared error.

    The squared error is averaged along axis 0, the resulting per-column
    means are summed, and the square root of that sum is returned. For a
    single-column input this equals the ordinary RMSE.
    """
    squared_error = np.square(y - y_hat)
    per_column_mse = np.mean(squared_error, axis=0)
    return np.sqrt(np.sum(per_column_mse))

## example

In [42]:
from drl4dypm.pydybm.base.generator import NoisySin
from drl4dypm.pydybm.base.sgd import RMSProp

In [32]:
def generate_sin_series(sin_series, num_steps=252):
    """Draw ``num_steps + 1`` samples and return them as input/target columns.

    Parameters
    ----------
    sin_series : object
        Generator-like object whose ``next()`` yields the next sample.
    num_steps : int, optional
        Number of rows in each returned array.

    Returns
    -------
    (in_series, out_series) : tuple of ndarray, each of shape (num_steps, 1)
        ``out_series`` is ``in_series`` shifted one step ahead; its last row
        is one additional sample drawn from ``sin_series``.
    """
    inputs = np.zeros((num_steps, 1))
    for t in range(num_steps):
        inputs[t] = sin_series.next()

    # Targets are the inputs advanced by one step, plus one extra draw.
    targets = np.zeros_like(inputs)
    targets[:-1] = inputs[1:]
    targets[-1] = sin_series.next()

    return inputs, targets

In [36]:
sin_series = NoisySin(600, 80, 0.1, 1)

In [37]:
in_series, out_series = generate_sin_series(sin_series)

In [56]:
# Number of passes over the same (in_series, out_series) pair.
max_iter = 10


# Model under test. NOTE(review): the three positional arguments look like
# (input dim, output dim, RNN dim), matching the later
# RNNGaussianDyBM(feat_dim, feat_dim, rnn_dim, ...) call -- confirm.
ipm_sin = RNNGaussianDyBM(1, 1, 100, 
                          spectral_radius=0.95, sparsity=0.1,
                         leak=1.0, random_seed=2, SGD=RMSProp())


# Table header, each column left-aligned in 10 characters.
line = '|'.join([f'{col:<10}' for col in ['iter','RMSE']])
print(line)

# Uncomment to draw a fresh series instead of reusing the one built above:
# sin_series = NoisySin(600, 80, 0.1, 1)
# in_series, out_series = generate_sin_series(sin_series)

# The model is created once above and is not re-created between passes, so
# each pass continues from whatever the previous pass left in `ipm_sin`.
for i in range(max_iter):
    predictions = learn(ipm_sin, in_series, out_series)
    
    # Error of this pass's predictions against the targets.
    error = RMSE(out_series, predictions)
    
    line = f'{i:<10}|{error:<10.4f}'
    print(line)
    

iter      |RMSE      
0         |0.4997    
1         |0.2296    
2         |0.1905    
3         |0.1543    
4         |0.1383    
5         |0.1393    
6         |0.1406    
7         |0.1416    
8         |0.1424    
9         |0.1428    


## learn on stock prices

In [6]:
def generate_series(data_source, feat_dim, num_steps=252):
    """Collect ``num_steps + 1`` feature vectors as input/target arrays.

    Parameters
    ----------
    data_source : object
        Must provide ``reset()`` and ``take_step()``; the latter is unpacked
        as ``((data, state, feat), end)`` and only ``feat`` is used.
    feat_dim : int
        Width of each feature vector.
    num_steps : int, optional
        Number of rows in each returned array.

    Returns
    -------
    (in_series, out_series) : tuple of ndarray, each (num_steps, feat_dim)
        ``out_series`` is ``in_series`` shifted one step ahead; its last row
        comes from one additional ``take_step()`` call.
    """
    def _next_features():
        # Keep only the third element of the inner tuple.
        (_data, _state, feat), _end = data_source.take_step()
        return feat

    data_source.reset()

    inputs = np.zeros((num_steps, feat_dim))
    for t in range(num_steps):
        inputs[t] = _next_features()

    # Targets are the inputs advanced by one step, plus one extra step.
    targets = np.zeros_like(inputs)
    targets[:-1] = inputs[1:]
    targets[-1] = _next_features()

    return inputs, targets
    

In [2]:
# init data source
data_source = DataSource(num_steps=252, asset_names=['AAPL','BC'], k=10)

path_to_data = 'data/assets.h5'
data_source.load_data(path_to_data)

data_source.preprocess_data()

Loading data from data/assets.h5 ...


In [60]:
# init IPM
num_assets = 2
feat_dim = 3*num_assets
rnn_dim = 20
ipm = RNNGaussianDyBM(feat_dim, feat_dim,
                      rnn_dim, spectral_radius=0.95, sparsity=0.1,
                     leak=1.0, SGD=RMSProp())

In [61]:
# Number of passes over the same (in_series, out_series) pair.
max_iter = 10

# Table header, each column left-aligned in 10 characters.
line = '|'.join([f'{col:<10}' for col in ['iter','RMSE']])
print(line)


# Build the series once; generate_series resets data_source before stepping.
in_series, out_series = generate_series(data_source, feat_dim)

# `ipm` is not re-created between passes, so each pass continues from
# whatever the previous pass left in it.
for i in range(max_iter):
    predictions = learn(ipm, in_series, out_series)
    
    # Error of this pass's predictions against the targets.
    error = RMSE(out_series, predictions)
    
    line = f'{i:<10}|{error:<10.4f}'
    print(line)
    

iter      |RMSE      
0         |0.0489    
1         |0.0487    
2         |0.0487    
3         |0.0486    
4         |0.0486    
5         |0.0486    
6         |0.0485    
7         |0.0485    
8         |0.0485    
9         |0.0485    


# BCM

## example

In [9]:
from scipy.optimize import minimize, Bounds, LinearConstraint

In [34]:
def obj(x):
    """Objective for the toy optimisation: negated linear gain plus a linear penalty.

    ``x`` is read as two halves: ``x[:3]`` is dotted with the fixed vector
    ``[1, 1.01, 0.98]`` and negated; the sum of ``x[3:]`` is scaled by
    ``1e-3`` and added.
    """
    gain_vector = np.array([1, 1.01, 0.98])
    penalty_rate = 1e-3

    head, tail = x[:3], x[3:]
    gain = np.dot(gain_vector, head)
    penalty = penalty_rate * np.sum(tail)
    return -gain + penalty


In [37]:
# Initial guess for the solver.
w_end = [0.4,0.3,0.3]
# First half of the guess: three equal entries that sum to one.
w0 = np.ones(3)/3
# Second half: element-wise absolute gap between w0 and w_end.
# (The original read the undefined names `w_0` / `z_0`, which only worked
# when stale kernel state happened to provide them.)
z0 = np.abs(w0-w_end)

# Solver vector layout: [w (3 entries), z (3 entries)].
x0 = np.append(w0, z0)
x0

array([0.33333333, 0.33333333, 0.33333333, 0.06666667, 0.03333333,
       0.03333333])

In [13]:
bounds = Bounds(np.zeros(6), np.ones(6))

In [14]:
# Linear-constraint matrix acting on x = [w (3 entries), z (3 entries)]:
#   rows 0-2 : w_i - z_i
#   rows 3-5 : w_i + z_i
#   row  6   : w_0 + w_1 + w_2
mtx = np.vstack([
    np.hstack([np.eye(3), -1*np.eye(3)]),
    np.hstack([np.eye(3), np.eye(3)]),
    np.hstack([np.ones(3), np.zeros(3)]),
])
mtx

array([[ 1.,  0.,  0., -1., -0., -0.],
       [ 0.,  1.,  0., -0., -1., -0.],
       [ 0.,  0.,  1., -0., -0., -1.],
       [ 1.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  1.,  1.,  0.,  0.,  0.]])

In [28]:
left_bnd = np.concatenate([-1*np.ones(3), w_end, [1]])
left_bnd

array([-1. , -1. , -1. ,  0.4,  0.3,  0.3,  1. ])

In [19]:
right_bnd = np.concatenate([w_end, 2*np.ones(3), [1]])
right_bnd

array([0.4, 0.3, 0.3, 2. , 2. , 2. , 1. ])

In [31]:
lin_constr = LinearConstraint(mtx, left_bnd, right_bnd)

In [43]:
res = minimize(obj, x0, method='trust-constr', constraints=[lin_constr], bounds=bounds)

In [36]:
res.x

array([9.12974443e-04, 9.98686245e-01, 4.00781041e-04, 4.04899671e-01,
       7.00793884e-01, 3.00763346e-01])

In [38]:
w_star = res.x[:3]
z_star = res.x[3:]

In [39]:
np.abs(w_star-w_end)

array([0.39908703, 0.69868624, 0.29959922])

In [40]:
z_star

array([0.40489967, 0.70079388, 0.30076335])

# Base agent

## set up params

In [2]:
# environment params
trading_days = 252
asset_names = ['AAPL','BC']
k = 10
cost_bps = 1e-3
path_to_data = 'data/assets.h5'

In [3]:
# agent params
num_assets = len(asset_names)
state_dim = 3*(1+num_assets)
action_dim = 1+num_assets

critic_learning_rate = 0.1**3
actor_learning_rate = critic_learning_rate * 0.01

network_params = {
    'actor': {
        'lstm': {
            'hidden_dim': 20,
            'num_layers': 1
        },
        'fc': [64,32],
        'dropout': 0.5,
    },
    'critic': {
        'lstm': {
            'hidden_dim': 20,
            'num_layers': 1
        },
        'fc': [64,32],
        'dropout': 0.5,
    }
}

In [4]:
# training params
max_episode = 10
min_episode_to_train = 5

## initiate modules

In [5]:
# trading environment
env = TradingEnvironment(num_steps=trading_days, 
                         asset_names=asset_names, 
                         k=k, 
                         cost_bps=cost_bps,
                         path_to_data=path_to_data
                        )


Loading data from data/assets.h5 ...


In [12]:
# agent
agent = BaseAgent(state_dim,
                  action_dim,
                  network_params,
                  actor_learning_rate,
                  critic_learning_rate)



## main by step

In [7]:
state, end = env.init_step()

In [39]:
with torch.no_grad():
    action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1))
    print(action)
    
    reward, next_state, end = env.take_step(action.numpy().reshape(-1), state[0])
    print(reward)
    

tensor([[0.3297, 0.3768, 0.4520]])
0.07360835403823848


In [40]:
agent.store_transition({
        'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1)},
        'action': {'action': action},
        'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1)},
        'reward': reward,
        'terminal': end
    })

In [41]:
agent.update()

In [42]:
env.data_source.step

10

## main

In [13]:
env.reset()

In [14]:
sim_crp = TradingSimulator(num_assets, cost_bps)

In [16]:
# Training loop for the base agent, with an equal-weight portfolio run through
# `sim_crp` on the same prices as a benchmark.

# Exponentially smoothed episode rewards (agent / benchmark).
reward_sm = 0
reward_crp_sm = 0

# Cumulative wall-clock time and the timer for the current episode.
elp = 0
start_time = time.time()

# Table header, each column left-aligned in 16 characters.
cols = ['episode','reward','reward_sm','reward_crp','reward_crp_sm','elp','elp_sum']
line = '|'.join([f'{col:<16}' for col in cols])
print(line)


for e in range(max_episode):
    state, end = env.init_step()
    
    while not end:
        # Acting, stepping and storing need no gradients.
        with torch.no_grad():
            # Query the agent for an action; state[1] is reshaped to (1, k, -1).
            # NOTE(review): the original comment said "epsilon-greedy"; the
            # exploration scheme is not visible here -- confirm.
            action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1))
        
            # execute action and move to next step
            reward, next_state, end = env.take_step(action.numpy().reshape(-1), state[0])
            
            # Benchmark: equal weights over (1 + num_assets) entries, applied
            # to the same state[0] that was passed to env.take_step.
            action_crp = np.ones(1+num_assets)/(1+num_assets)
            sim_crp.take_step(action_crp, state[0])
        
            # store experience
            agent.store_transition({
                'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1)},
                'action': {'action': action},
                'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1)},
                'reward': reward,
                'terminal': end
            })
            
            
        state = next_state
        
        # One agent update per environment step, outside no_grad.
        # The strict `>` means updates start at episode index
        # min_episode_to_train + 1.
        # NOTE(review): confirm `>=` was not intended.
        if e > min_episode_to_train:
            agent.update()
        
    
    
    # Episode totals. `reward_sm` is an exponential moving average (decay 0.9)
    # and `reward_corr` divides out the zero-initialisation bias
    # (1 - 0.9**(e+1)). The printed "reward_sm" column shows the corrected value.
    reward = env.get_total_reward()
    reward_sm = 0.9*reward_sm + 0.1*reward
    reward_corr = reward_sm/(1-0.9**(e+1))
    
    reward_crp = sim_crp.get_total_reward()
    reward_crp_sm = 0.9*reward_crp_sm + 0.1*reward_crp
    reward_crp_corr = reward_crp_sm/(1-0.9**(e+1))
    
    # Time since the previous timer reset, plus the running total.
    elp_episode = time.time()-start_time
    elp += elp_episode
    line = f'{e:<16}|' + '|'.join([f'{col:<16.4f}' for col in [reward, reward_corr, 
                                                          reward_crp, reward_crp_corr,
                                                          elp_episode, elp]])
    print(line)
    
    # Reset the environment and the benchmark simulator, then restart the timer.
    env.reset()
    sim_crp.reset()
    start_time = time.time()
        
        
        
        
        
        
        

The framework is not responsible for any un-matching device issues caused by this operation.[0m


episode         |reward          |reward_sm       |reward_crp      |reward_crp_sm   |elp             |elp_sum         
0               |3.9137          |3.9137          |0.1681          |0.1681          |0.5794          |0.5794          
1               |4.0311          |3.9755          |0.3618          |0.2700          |0.5696          |1.1490          
2               |4.0587          |4.0062          |0.1998          |0.2441          |0.5663          |1.7154          
3               |3.7491          |3.9314          |0.0339          |0.1830          |0.5235          |2.2389          
4               |3.7937          |3.8978          |0.0311          |0.1459          |0.5297          |2.7686          


The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m


5               |3.5946          |3.8331          |-0.0196         |0.1106          |0.5242          |3.2928          
6               |3.8580          |3.8379          |0.2161          |0.1308          |16.0240         |19.3169         
7               |3.8685          |3.8433          |0.2071          |0.1442          |16.4764         |35.7933         
8               |3.9687          |3.8637          |0.3318          |0.1748          |16.4415         |52.2348         
9               |4.0896          |3.8984          |0.2995          |0.1940          |16.3143         |68.5491         


In [139]:
# noise param (0,0.1)
# NOTE(review): this cell is a verbatim copy of the previous training loop;
# the header suggests it was re-run with a different agent noise setting,
# which is not set anywhere in this cell -- confirm where it was changed.

# Exponentially smoothed episode rewards (agent / benchmark).
reward_sm = 0
reward_crp_sm = 0

# Cumulative wall-clock time and the timer for the current episode.
elp = 0
start_time = time.time()

# Table header, each column left-aligned in 16 characters.
cols = ['episode','reward','reward_sm','reward_crp','reward_crp_sm','elp','elp_sum']
line = '|'.join([f'{col:<16}' for col in cols])
print(line)


for e in range(max_episode):
    state, end = env.init_step()
    
    while not end:
        # Acting, stepping and storing need no gradients.
        with torch.no_grad():
            # Query the agent for an action; state[1] is reshaped to (1, k, -1).
            # NOTE(review): the original comment said "epsilon-greedy"; the
            # exploration scheme is not visible here -- confirm.
            action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1))
        
            # execute action and move to next step
            reward, next_state, end = env.take_step(action.numpy().reshape(-1), state[0])
            
            # Benchmark: equal weights over (1 + num_assets) entries, applied
            # to the same state[0] that was passed to env.take_step.
            action_crp = np.ones(1+num_assets)/(1+num_assets)
            sim_crp.take_step(action_crp, state[0])
        
            # store experience
            agent.store_transition({
                'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1)},
                'action': {'action': action},
                'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1)},
                'reward': reward,
                'terminal': end
            })
            
            
        state = next_state
        
        # One agent update per environment step, outside no_grad.
        # The strict `>` means updates start at episode index
        # min_episode_to_train + 1.
        # NOTE(review): confirm `>=` was not intended.
        if e > min_episode_to_train:
            agent.update()
        
    
    
    # Episode totals. `reward_sm` is an exponential moving average (decay 0.9)
    # and `reward_corr` divides out the zero-initialisation bias
    # (1 - 0.9**(e+1)). The printed "reward_sm" column shows the corrected value.
    reward = env.get_total_reward()
    reward_sm = 0.9*reward_sm + 0.1*reward
    reward_corr = reward_sm/(1-0.9**(e+1))
    
    reward_crp = sim_crp.get_total_reward()
    reward_crp_sm = 0.9*reward_crp_sm + 0.1*reward_crp
    reward_crp_corr = reward_crp_sm/(1-0.9**(e+1))
    
    # Time since the previous timer reset, plus the running total.
    elp_episode = time.time()-start_time
    elp += elp_episode
    line = f'{e:<16}|' + '|'.join([f'{col:<16.4f}' for col in [reward, reward_corr, 
                                                          reward_crp, reward_crp_corr,
                                                          elp_episode, elp]])
    print(line)
    
    # Reset the environment and the benchmark simulator, then restart the timer.
    env.reset()
    sim_crp.reset()
    start_time = time.time()
        
        
        
        
        
        
        

episode         |reward          |reward_sm       |reward_crp      |reward_crp_sm   |elp             |elp_sum         
0               |34.1880         |34.1880         |0.3078          |0.3078          |0.5122          |0.5122          
1               |34.3944         |34.2966         |0.1820          |0.2416          |0.5390          |1.0511          
2               |34.8612         |34.5049         |0.3532          |0.2828          |0.5336          |1.5848          
3               |35.2292         |34.7155         |0.1356          |0.2400          |0.5316          |2.1163          
4               |35.1234         |34.8151         |0.3251          |0.2607          |0.5417          |2.6580          
5               |33.6763         |34.5721         |-0.0299         |0.1987          |0.5373          |3.1953          
6               |34.0263         |34.4675         |0.0933          |0.1785          |16.3121         |19.5075         
7               |34.6552         |34.5004       

# IPM agent

## set up params

In [2]:
# environment params
trading_days = 252
asset_names = ['AAPL','BC']
k = 10
cost_bps = 1e-3
path_to_data = 'data/assets.h5'

In [3]:
# agent params
num_assets = len(asset_names)
state_dim = 3*(1+num_assets)
action_dim = 1+num_assets
ipm_dim = 3*num_assets

critic_learning_rate = 0.1**3
actor_learning_rate = critic_learning_rate * 0.01
ipm_learning_rate = 0.1**3

network_params = {
    'actor': {
        'lstm': {
            'hidden_dim': 20,
            'num_layers': 1
        },
        'fc': [64,32],
        'dropout': 0.5,
    },
    'critic': {
        'lstm': {
            'hidden_dim': 20,
            'num_layers': 1
        },
        'fc': [64,32],
        'dropout': 0.5,
    },
    'ipm': {
        'rnn_dim': 20,
        'delay': 3,
        'decay_rates': [0.1,0.2,0.5,0.8],
    }
}

In [4]:
# training params
max_episode = 10
min_episode_to_train = 5

## initiate modules

In [5]:
# trading environment
env = TradingEnvironment(num_steps=trading_days, 
                         asset_names=asset_names, 
                         k=k, 
                         cost_bps=cost_bps,
                         path_to_data=path_to_data
                        )


Loading data from data/assets.h5 ...


In [6]:
# agent
agent = IPMAgent(state_dim,
                 action_dim,
                 ipm_dim,
                 network_params,
                 actor_learning_rate,
                 critic_learning_rate,
                 ipm_learning_rate
                )



## main methods

### get action

In [7]:
state, end = env.init_step()

In [8]:
agent.ddpg_per.act_with_noise(
    {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1),
    'ipm': torch.tensor(np.zeros(6), dtype=torch.float32).view(1,-1)},
    noise_param=agent.noise_param, mode=agent.noise_mode
)

The framework is not responsible for any un-matching device issues caused by this operation.[0m


tensor([[0.3037, 0.4444, 0.3560]], grad_fn=<AddBackward0>)

### store transition and update

In [8]:
reward, next_state, end = env.take_step(np.array([0.3037, 0.4444, 0.3560]), state[0])

In [10]:
next_state[-1]

array([-0.00540482,  0.00715746, -0.01699502,  0.01291711, -0.00745109,
        0.00707071])

In [9]:
agent.store_transition({
        'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1),
                 'ipm': torch.tensor(np.zeros(6), dtype=torch.float32).view(1,-1)},
        'action': {'action': torch.tensor([[0.3037, 0.4444, 0.3560]])},
        'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1),
                      'ipm': torch.tensor(np.zeros(6), dtype=torch.float32).view(1,-1)},
        'reward': reward,
        'terminal': end
    })

In [10]:
agent.ddpg_per.update()

The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m


(0.0661206841468811, 0.0009626416433032858)

## main by step

In [16]:
env.reset()

In [17]:
agent.ipm.init_state()

In [18]:
state, end = env.init_step()

In [19]:
ipm_predict = agent.ipm_predict_and_learn(state[-1],)
ipm_predict

array([-0.00335299, -0.00335297, -0.00335296,  0.00335299,  0.00335298,
        0.00335297])

In [41]:
with torch.no_grad():
    action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1),
                             torch.tensor(ipm_predict, dtype=torch.float32).view(1,-1))
    print(action)
    
    reward, next_state, end = env.take_step(action.numpy().reshape(-1), state[0])
    print(reward)
    
    next_ipm_predict = agent.ipm_predict_and_learn(state[-1], next_state[-1])
    print(next_ipm_predict)
    

tensor([[0.4278, 0.4215, 0.3403]])
0.08418597467026154
[-0.00476514 -0.00304944 -0.00380439  0.00024339 -0.00329296 -0.00100855]


In [42]:
agent.store_transition({
        'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1),
                 'ipm': torch.tensor(ipm_predict, dtype=torch.float32).view(1,-1)},
        'action': {'action': action},
        'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1),
                      'ipm': torch.tensor(next_ipm_predict, dtype=torch.float32).view(1,-1)},
        'reward': reward,
        'terminal': end
    })

In [43]:
state = next_state
ipm_predict = next_ipm_predict

agent.update()

print(env.data_source.step, reward, env.get_total_reward())

8 0.08418597467026154 0.7071450234957418


## main

In [11]:
env.reset()
agent.ipm_init()

In [10]:
agent.noise_param=(0,0.01)

In [12]:
sim_crp = TradingSimulator(num_assets, cost_bps)

In [13]:
# Training loop for the IPM agent: the agent receives an IPM prediction
# alongside the state, and an equal-weight portfolio is run through `sim_crp`
# as a benchmark.

# Exponentially smoothed episode rewards (agent / benchmark).
reward_sm = 0
reward_crp_sm = 0

# Cumulative wall-clock time and the timer for the current episode.
elp = 0
start_time = time.time()

# Table header, each column left-aligned in 16 characters.
cols = ['episode','reward','reward_sm','reward_crp','reward_crp_sm','elp','elp_sum']
line = '|'.join([f'{col:<16}' for col in cols])
print(line)


for e in range(max_episode):
    state, end = env.init_step()
    
    # First IPM prediction of the episode, from the last element of the state
    # tuple only (no target is passed on this first call).
    ipm_predict = agent.ipm_predict_and_learn(state[-1],)
    
    while not end:
        with torch.no_grad():
            # Query the agent with the state window reshaped to (1, k, -1) and
            # the IPM prediction reshaped to (1, -1).
            # NOTE(review): the original comment said "epsilon-greedy"; an
            # earlier cell calls ddpg_per.act_with_noise, so exploration looks
            # noise-based -- confirm.
            action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1),
                                     torch.tensor(ipm_predict, dtype=torch.float32).view(1,-1))
        
            # execute action and move to next step
            reward, next_state, end = env.take_step(action.numpy().reshape(-1), state[0])
            
            # Feed the IPM the current and next last-elements of the state
            # tuple and obtain the prediction that accompanies next_state.
            next_ipm_predict = agent.ipm_predict_and_learn(state[-1], next_state[-1])
            
            # Benchmark: equal weights over (1 + num_assets) entries, applied
            # to the same state[0] that was passed to env.take_step.
            action_crp = np.ones(1+num_assets)/(1+num_assets)
            sim_crp.take_step(action_crp, state[0])
        
            # store experience
            agent.store_transition({
                'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1),
                         'ipm': torch.tensor(ipm_predict, dtype=torch.float32).view(1,-1)},
                'action': {'action': action},
                'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1),
                              'ipm': torch.tensor(next_ipm_predict, dtype=torch.float32).view(1,-1)},
                'reward': reward,
                'terminal': end
            })
            
            
        # Advance both the environment state and its paired IPM prediction.
        state = next_state
        ipm_predict = next_ipm_predict
        
        
        # One agent update per environment step, outside no_grad.
        # The strict `>` means updates start at episode index
        # min_episode_to_train + 1.
        # NOTE(review): confirm `>=` was not intended.
        if e > min_episode_to_train:
            agent.update()
        
    
    
    # Episode totals. `reward_sm` is an exponential moving average (decay 0.9)
    # and `reward_corr` divides out the zero-initialisation bias
    # (1 - 0.9**(e+1)). The printed "reward_sm" column shows the corrected value.
    reward = env.get_total_reward()
    reward_sm = 0.9*reward_sm + 0.1*reward
    reward_corr = reward_sm/(1-0.9**(e+1))
    
    reward_crp = sim_crp.get_total_reward()
    reward_crp_sm = 0.9*reward_crp_sm + 0.1*reward_crp
    reward_crp_corr = reward_crp_sm/(1-0.9**(e+1))
    
    # Time since the previous timer reset, plus the running total.
    elp_episode = time.time()-start_time
    elp += elp_episode
    line = f'{e:<16}|' + '|'.join([f'{col:<16.4f}' for col in [reward, reward_corr, 
                                                          reward_crp, reward_crp_corr,
                                                          elp_episode, elp]])
    print(line)
    
    # Reset the environment, the IPM and the benchmark simulator, then restart
    # the timer.
    env.reset()
    agent.ipm_init()
    sim_crp.reset()
    start_time = time.time()
        
        
        
        
        
        
        

episode         |reward          |reward_sm       |reward_crp      |reward_crp_sm   |elp             |elp_sum         
0               |3.9924          |3.9924          |0.2512          |0.2512          |0.6688          |0.6688          
1               |3.6215          |3.7972          |-0.0907         |0.0713          |0.6949          |1.3637          
2               |3.6998          |3.7613          |-0.0771         |0.0165          |0.6348          |1.9985          
3               |3.6508          |3.7292          |0.0501          |0.0263          |0.6265          |2.6249          
4               |3.7776          |3.7410          |0.0197          |0.0247          |0.6177          |3.2426          
5               |3.9247          |3.7802          |0.1929          |0.0606          |0.6175          |3.8601          
6               |3.8890          |3.8010          |0.2299          |0.0930          |17.0033         |20.8634         
7               |3.7786          |3.7971        

# BCM agent

## set up params

In [20]:
# environment params
trading_days = 252
asset_names = ['AAPL','BC']
k = 10
cost_bps = 1e-3
path_to_data = 'data/assets.h5'

In [21]:
# agent params
num_assets = len(asset_names)
state_dim = 3*(1+num_assets)
action_dim = 1+num_assets
bcm_update_rate = .1

critic_learning_rate = 0.1**3
actor_learning_rate = critic_learning_rate * 0.01

network_params = {
    'actor': {
        'lstm': {
            'hidden_dim': 20,
            'num_layers': 1
        },
        'fc': [64,32],
        'dropout': 0.5,
    },
    'critic': {
        'lstm': {
            'hidden_dim': 20,
            'num_layers': 1
        },
        'fc': [64,32],
        'dropout': 0.5,
    }
}

In [22]:
# training params
max_episode = 10
min_episode_to_train = 5

## initiate modules

In [23]:
# trading environment
env = TradingEnvironment(num_steps=trading_days, 
                         asset_names=asset_names, 
                         k=k, 
                         cost_bps=cost_bps,
                         agent_names = ['bcm','crp'],
                         path_to_data=path_to_data
                        )


Loading data from data/assets.h5 ...


In [24]:
# RL agent
agent = BCMAgent(state_dim,
                  action_dim,
                 cost_bps,
                 bcm_update_rate,
                  network_params,
                  actor_learning_rate,
                  critic_learning_rate)



In [25]:
# benchmark rule-based agent
agent_crp = CRPAgent(action_dim)

## main methods

### get BCM action

In [7]:
state, end = env.init_step()

In [8]:
with torch.no_grad():
    action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1))
    print(action)
    
    actions = {'bcm': action.numpy().reshape(-1)}
    rewards, next_state, end = env.take_step(actions, state[0])
    print(rewards)
    

The framework is not responsible for any un-matching device issues caused by this operation.[0m


tensor([[0.3081, 0.3841, 0.3268]])
{'bcm': -0.0014038208010464569}


In [9]:
prices = state[0]
next_prices = next_state[0]
print(prices, next_prices)

[  1.         103.66241648  46.66306948] [  1.         103.08932029  45.09883501]


In [10]:
bcm_action = agent.get_bcm_action(prices, next_prices)
bcm_action

array([9.99484924e-01, 2.30203370e-04, 2.84872551e-04])

### store transition and update

In [11]:
agent.store_transition({
                'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1)},
                'action': {'action': torch.tensor(actions['bcm'], dtype=torch.float32).view(1,-1)},
                'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1)},
                'reward': rewards['bcm'],
                'terminal': False,
                'bcm_action': torch.tensor(bcm_action)
            })

In [13]:
agent.update()

The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m


tensor(-0.0741, grad_fn=<NegBackward>)
tensor(0.7063, dtype=torch.float64, grad_fn=<NegBackward>)


## main by step

In [11]:
env.reset()

In [12]:
state, end = env.init_step()

In [19]:
with torch.no_grad():
    action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1))
    print(action)
    
    actions = {'bcm': action.numpy().reshape(-1)}
    rewards, next_state, end = env.take_step(actions, state[0])
    print(rewards)
    

tensor([[0.2961, 0.3404, 0.3809]])
{'bcm': 0.010843541529001528}


In [20]:
bcm_action = agent.get_bcm_action(state[0], next_state[0])
print(bcm_action)

agent.store_transition({
                'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1)},
                'action': {'action': torch.tensor(actions['bcm'], dtype=torch.float32).view(1,-1)},
                'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1)},
                'reward': rewards['bcm'],
                'terminal': False,
                'bcm_action': torch.tensor(bcm_action)
            })

agent.update()

print(env.data_source.step, rewards['bcm'], env.get_total_rewards()['bcm'])

[5.57962674e-04 9.99366369e-01 7.56685610e-05]
5 0.010843541529001528 0.04247539380764243


## main

In [15]:
env.reset()

In [27]:
# Training loop for the BCM agent. The environment tracks both the 'bcm' and
# 'crp' agents, so no separate benchmark simulator is used here.

# Exponentially smoothed episode rewards (agent / benchmark).
reward_sm = 0
reward_crp_sm = 0

# Cumulative wall-clock time and the timer for the current episode.
elp = 0
start_time = time.time()

# Table header, each column left-aligned in 16 characters.
cols = ['episode','reward','reward_sm','reward_crp','reward_crp_sm','elp','elp_sum']
line = '|'.join([f'{col:<16}' for col in cols])
print(line)



for e in range(max_episode):
    state, end = env.init_step()
    
    while not end:
        with torch.no_grad():
            # Query both agents; state[1] is reshaped to (1, k, -1).
            # NOTE(review): the original comment said "epsilon-greedy"; the
            # exploration scheme is not visible here -- confirm.
            action = agent.get_action(torch.tensor(state[1], dtype=torch.float32).view(1,k,-1))
            action_crp = agent_crp.get_action()
            
            # Step the environment once with both agents' actions, keyed by
            # agent name.
            actions = {'bcm': action.numpy().reshape(-1), 'crp': action_crp}
            rewards, next_state, end = env.take_step(actions, state[0])
            
            # get BCM action -- computed from the current and the *next*
            # state[0] values (bound to `prices` / `next_prices` in an earlier
            # cell).
            bcm_action = agent.get_bcm_action(state[0], next_state[0])
            
        
            # store experience
            # NOTE(review): 'terminal' is hard-coded to False here, whereas the
            # other training loops store `end` -- confirm this is intentional.
            agent.store_transition({
                'state': {'state': torch.tensor(state[1], dtype=torch.float32).view(1,k,-1)},
                'action': {'action': torch.tensor(actions['bcm'], dtype=torch.float32).view(1,-1)},
                'next_state': {'state': torch.tensor(next_state[1], dtype=torch.float32).view(1,k,-1)},
                'reward': rewards['bcm'],
                'terminal': False,
                'bcm_action': torch.tensor(bcm_action)
            })
            
            
        state = next_state
        
        
        # One agent update per environment step, outside no_grad.
        # The strict `>` means updates start at episode index
        # min_episode_to_train + 1.
        # NOTE(review): confirm `>=` was not intended.
        if e > min_episode_to_train:
            agent.update()
        
    
    
    # Episode totals per agent (this rebinds `rewards` from per-step to
    # per-episode values). The smoothed values are exponential moving averages
    # (decay 0.9) with the zero-initialisation bias (1 - 0.9**(e+1)) divided
    # out. The printed "*_sm" columns show the corrected values.
    rewards = env.get_total_rewards()
    reward_sm = 0.9*reward_sm + 0.1*rewards['bcm']
    reward_corr = reward_sm/(1-0.9**(e+1))
    
    reward_crp_sm = 0.9*reward_crp_sm + 0.1*rewards['crp']
    reward_crp_corr = reward_crp_sm/(1-0.9**(e+1))
    
    # Time since the previous timer reset, plus the running total.
    elp_episode = time.time()-start_time
    elp += elp_episode
    line = f'{e:<16}|' + '|'.join([f'{col:<16.4f}' for col in [rewards['bcm'], reward_corr, 
                                                          rewards['crp'], reward_crp_corr,
                                                          elp_episode, elp]])
    print(line)
    
    # Reset the environment and restart the timer.
    env.reset()
    start_time = time.time()
        
        
        
        
        
        
        

The framework is not responsible for any un-matching device issues caused by this operation.[0m


episode         |reward          |reward_sm       |reward_crp      |reward_crp_sm   |elp             |elp_sum         
0               |3.9160          |3.9160          |0.1212          |0.1212          |5.2540          |5.2540          
1               |3.4089          |3.6491          |-0.5495         |-0.2318         |4.9770          |10.2310         
2               |3.9362          |3.7551          |0.1749          |-0.0817         |4.9593          |15.1903         
3               |3.6652          |3.7289          |-0.0352         |-0.0682         |4.9814          |20.1716         
4               |4.0571          |3.8091          |0.2724          |0.0150          |5.0156          |25.1872         


The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m
The framework is not responsible for any un-matching device issues caused by this operation.[0m


5               |4.1861          |3.8895          |0.3973          |0.0966          |5.0161          |30.2034         
6               |3.3478          |3.7857          |-0.2316         |0.0337          |20.2810         |50.4844         
7               |4.1871          |3.8562          |0.4399          |0.1050          |20.1027         |70.5871         
8               |4.0136          |3.8819          |0.2141          |0.1228          |20.0825         |90.6696         
9               |3.8229          |3.8728          |0.1257          |0.1233          |20.1986         |110.8683        


# Note

- Differences from the original paper
    - actions are generated by `act_with_noise`, not by a perturbed actor network
    - the last action is not fed back as an input to the actor network
    - only the last-step output of the LSTM is used as input to the following layers
    - market index performance is not added to the state
    

# Others

In [44]:
# Network hyper-parameters passed to the Actor/Critic constructors below.
net_params = dict(
    lstm=dict(hidden_dim=20, num_layers=1),
    fc=[64, 32],
    dropout=0.5,
)

# Dimensions derived from the number of assets: one extra slot on top of the assets
# (presumably cash -- confirm), and three values per slot in the state.
num_assets = 2
action_dim = num_assets + 1
state_dim = 3 * action_dim


In [45]:
# Online networks; the actor is constructed first, then the critic.
actor, critic = (net(state_dim, action_dim, net_params) for net in (Actor, Critic))

In [46]:
# Target networks, built with the same arguments as the online actor/critic.
actor_target, critic_target = (net(state_dim, action_dim, net_params) for net in (Actor, Critic))

In [47]:
def optimizer(params, lr):
    """Return an Adam optimizer over `params` with learning rate `lr` and weight decay 1e-6."""
    return torch.optim.Adam(params, lr=lr, weight_decay=1e-6)


# Mean-squared-error loss object handed to DDPGPer as its criterion.
criterion = torch.nn.MSELoss()

In [100]:
# Assemble the DDPGPer agent from the online/target networks and training settings.
ddpg_per = DDPGPer(
    actor, actor_target,
    critic, critic_target,
    optimizer=optimizer,
    criterion=criterion,
    batch_size=128,
    actor_learning_rate=1e-3,
    critic_learning_rate=1e-3,
    discount=0.99,
    replay_size=1_000_000,
)

In [49]:
# Window length: the middle dimension of the (1, k, state_dim) state tensors built below.
k = 10

In [101]:
# Random stand-in tensors for a single transition: a batch of one window of k steps.
window_shape = (1, k, state_dim)
state = torch.normal(0, 1, size=window_shape)
next_state = torch.normal(0, 1, size=window_shape)

# Random action rescaled so its entries sum to 1. The draws are unbounded, so
# individual entries may be negative or large.
temp = torch.normal(0, 1, size=(1, action_dim))
action = temp / temp.sum()

# Alternative uniform target, kept for reference:
# bcm_action = torch.ones(1,action_dim)/action_dim

In [104]:
# Hand-picked target action of shape (1, 3) whose entries sum to 1.
bcm_action = torch.tensor([[0.2, 0.3, 0.5]])

In [105]:
# A single transition in the dict layout passed to store_transition below.
experience = dict(
    state=dict(state=state),
    action=dict(action=action),
    next_state=dict(state=next_state),
    reward=0.0,
    terminal=False,
    bcm_action=bcm_action,
)

In [106]:
# Hand the hand-built transition to the agent (presumably appended to its replay buffer -- confirm).
ddpg_per.store_transition(experience)

In [107]:
# Draw 10 samples from the replay buffer and unpack the requested attributes.
# '*' collects whatever is not named explicitly into `others`.
# NOTE(review): the second positional argument (True) is presumably the
# buffer's `concatenate` flag -- confirm against the library signature.
sampled = ddpg_per.replay_buffer.sample_batch(
    10, True,
    sample_attrs=['state', 'action', 'reward', 'next_state', 'bcm_action', 'terminal', '*'],
)
batch_size, (state, action, reward, next_state, bcm_action, terminal, others), index, is_weight = sampled

In [108]:
# Inspect the sampled bcm_action entries.
bcm_action

[tensor([[0.3333, 0.3333, 0.3333]]),
 tensor([[0.3333, 0.3333, 0.3333]]),
 tensor([[0.3333, 0.3333, 0.3333]]),
 tensor([[0.2000, 0.3000, 0.5000]]),
 tensor([[0.2000, 0.3000, 0.5000]]),
 tensor([[0.2000, 0.3000, 0.5000]]),
 tensor([[0.2000, 0.3000, 0.5000]]),
 tensor([[0.2000, 0.3000, 0.5000]]),
 tensor([[0.2000, 0.3000, 0.5000]]),
 tensor([[0.2000, 0.3000, 0.5000]])]

In [114]:
# Stack the sampled bcm_action tensors into one tensor and flatten it to (-1, action_dim).
bcm_action = torch.stack(bcm_action).view(-1,action_dim)

In [110]:
# Run the sampled states through the agent's `_act`, then apply its action transform.
raw_action = ddpg_per._act(state)
cur_action = ddpg_per.action_transform_function(raw_action, state, others)
cur_action['action']

tensor([[0.3069, 0.3160, 0.3771],
        [0.3195, 0.3083, 0.3723],
        [0.3081, 0.3126, 0.3793],
        [0.3087, 0.3172, 0.3741],
        [0.3109, 0.3178, 0.3713],
        [0.3118, 0.3196, 0.3686],
        [0.3090, 0.3160, 0.3750],
        [0.3123, 0.3135, 0.3743],
        [0.3112, 0.3191, 0.3697],
        [0.3064, 0.3164, 0.3773]], grad_fn=<SoftmaxBackward>)

In [111]:
# Keep only the 'action' entry of the transformed action dict.
cur_action = cur_action['action']

In [112]:
# Small constant added inside torch.log below so the logarithm never sees exactly 0.
eps = 1e-8

In [116]:
# Element-wise binary cross-entropy style log-likelihood of cur_action against bcm_action
# (not negated), averaged over all entries.
target_term = bcm_action * torch.log(cur_action + eps)
complement_term = (1 - bcm_action) * torch.log(1 - cur_action + eps)
temp = target_term + complement_term
temp.mean()

tensor(-0.6277, grad_fn=<MeanBackward0>)

In [31]:
# 2x3 float tensor holding 0..5 in row-major order.
a = torch.arange(6, dtype=torch.float).reshape(2, 3)
a

tensor([[0., 1., 2.],
        [3., 4., 5.]])

In [36]:
# Same 0..5 grid as a 2x3 float tensor, with the first column shifted by +2 (row 0) and +1 (row 1).
b = torch.arange(6, dtype=torch.float).reshape(2, 3)
b[:, 0] += torch.tensor([2.0, 1.0])
b

tensor([[2., 1., 2.],
        [4., 4., 5.]])

In [37]:
# L2 (Euclidean) norm of the element-wise difference between a and b.
# torch.square takes no `p` argument and would raise a TypeError; torch.norm with
# p=2 is the call that matches the recorded output (sqrt(5) ~= 2.2361).
torch.norm(a - b, p=2)

tensor(2.2361)