In [1]:
from rljax.algorithm import DQN
from rljax.trainer import Trainer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from micro_price_trading.config import TWENTY_SECOND_DAY
from micro_price_trading import Preprocess, OptimalExecutionEnvironment

In [2]:
# Build the project's Preprocess helper for the TBT/TBF CSV, then run its
# processing step; the result (`data`) is what the environment is built from.
raw = Preprocess(
    'TBT_TBF_data.csv',
    res_bin=6,
)
data = raw.process()

In [3]:
# --- Configuration -----------------------------------------------------------
# Original note: 23,400 seconds between 9:30am and 4pm broken in 10 second increments
# NOTE(review): the episode length actually passed below is TWENTY_SECOND_DAY,
# whose name suggests 20-second increments -- confirm which one is intended.

NUM_AGENT_STEPS = 5000
SEED = 0
# Evaluate at the midpoint and at the end of training (2500 with the value above).
EVAL_INTERVAL = NUM_AGENT_STEPS // 2
# Passed to the Trainer unchanged; kept as a named constant so it is easy to redirect.
LOG_DIR = ""

# --- Environments ------------------------------------------------------------
# Training environment; the evaluation environment is a copy made via copy_env().
env = OptimalExecutionEnvironment(
    data,
    risk_weights=(2, 1),
    trade_penalty=100,
    max_purchase=2,
    steps=TWENTY_SECOND_DAY,
    end_units_risk=TWENTY_SECOND_DAY*2,  # Ideally, this should be `TWENTY_SECOND_DAY//5*2`
    seed=SEED
)
env_test = env.copy_env()

# --- Agent -------------------------------------------------------------------
# DQN sized from the environment's observation/action spaces.
algo = DQN(
    num_agent_steps=NUM_AGENT_STEPS,
    state_space=env.observation_space,
    action_space=env.action_space,
    seed=SEED,
    batch_size=256,
    start_steps=1000,
    update_interval=1,
    update_interval_target=400,
    eps_decay_steps=0,
    loss_type="l2",
    lr=1e-5,  # Have been messing around with this but doesn't seem to make a big difference
)

# --- Training ----------------------------------------------------------------
# The Trainer runs `env` for NUM_AGENT_STEPS steps and reports a return on
# `env_test` every EVAL_INTERVAL steps.
trainer = Trainer(
    env=env,
    env_test=env_test,
    algo=algo,
    log_dir=LOG_DIR,
    num_agent_steps=NUM_AGENT_STEPS,
    eval_interval=EVAL_INTERVAL,
    seed=SEED,
)
trainer.train()



Num steps: 2500     Return: -16671.6   Time: 0:00:38
Num steps: 5000     Return: -16761.9   Time: 0:01:15


In [4]:
# Raise both pandas display limits to 2000 so the frame below is not truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 2000)

# Convert the last entry of the test environment's portfolio history to a
# DataFrame and show it.
latest_portfolios = env_test.portfolio_history[-1]
df = env_test.portfolios_to_df(latest_portfolios)
df

Unnamed: 0,time,cash,shares,prices,total_risk,res_imbalance_state,trade,penalty_trade,trade_asset,trade_shares,trade_risk,trade_price,trade_cost,trade_penalty,risk,next_risk_target,distance_to_next_risk_target,rewards,observations,raw_action,action
0,0,0.0,"(0, 0)","(17.724999999999984, 16.344999999999988)",0,311,,,,,,,,,,10.0,10.0,,,0.0,-2.0
1,1,-35.45,"(2, 0)","(17.814999999999998, 16.435000000000002)",4,110,"Trade(asset=1, shares=2, risk=4, price=17.7249...",,1.0,2.0,4.0,17.725,35.45,False,,10.0,6.0,"(0.020000000000003126, actual)","[12.0, 3, 6]",0.0,-2.0
2,2,-71.08,"(4, 0)","(17.814999999999998, 16.435000000000002)",8,100,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,,10.0,2.0,"(-0.160000000000025, actual)","[9.0, 2, 2]",0.0,-2.0
3,3,-106.71,"(6, 0)","(17.814999999999998, 16.435000000000002)",12,100,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,,10.0,-2.0,"(-10.32000000000005, risk penalty)","[9.0, 1, 0]",0.0,-2.0
4,4,-142.34,"(8, 0)","(17.814999999999998, 16.435000000000002)",16,100,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,,10.0,-6.0,"(-10.96000000000015, risk penalty)","[9.0, 0, 0]",0.0,-2.0
5,5,-177.97,"(10, 0)","(17.814999999999998, 16.435000000000002)",20,100,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,10.0,10.0,-10.0,"(-11.60000000000025, risk penalty)","[9.0, 4, 0]",0.0,-2.0
6,6,-213.6,"(12, 0)","(17.814999999999998, 16.435000000000002)",24,100,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,,20.0,-4.0,"(-10.6400000000001, risk penalty)","[9.0, 3, 0]",0.0,-2.0
7,7,-249.23,"(14, 0)","(17.814999999999998, 16.435000000000002)",28,100,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,,20.0,-8.0,"(-10.0, risk penalty)","[9.0, 2, 0]",0.0,-2.0
8,8,-284.86,"(16, 0)","(17.814999999999998, 16.425)",32,0,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,,20.0,-12.0,"(-10.0, risk penalty)","[0.0, 1, 0]",0.0,-2.0
9,9,-320.49,"(18, 0)","(17.814999999999998, 16.425)",36,201,"Trade(asset=1, shares=2, risk=4, price=17.8149...",,1.0,2.0,4.0,17.815,35.63,False,,20.0,-16.0,"(-10.0, risk penalty)","[19.0, 0, 0]",0.0,-2.0


#### Raw actions input by DQN before they are zero-centered

In [None]:
# Show the last entry ([-1]) of the raw actions recorded on the test environment.
env_test._raw_actions[-1]

#### Rewards along with a flag for the type of reward

In [None]:
# Show the last entry ([-1]) of the rewards recorded on the test environment.
env_test._rewards[-1]

#### Raw observations seen by DQN

In [None]:
# Show the last entry ([-1]) of the observations recorded on the test environment.
env_test._observations[-1]

### Raw format for accessing the portfolio history
Ideally, this should be accessed through `env_test.portfolio_history` instead.

## FIXME: For some reason, the length of this is nowhere near what it should be for the base training env. I am getting around 17 entries for it, but `env_test` seems to be correct.

In [None]:
# Last element of the entry at index 3 of the raw `_portfolios` container.
env_test._portfolios[3][-1]

In [None]:
# The `end_units_risk` value held by the test environment.
env_test.end_units_risk

In [None]:
# Look up key 5 in `_period_risk`, falling back to `end_units_risk` if the key is absent.
env_test._period_risk.get(5, env_test.end_units_risk)

In [None]:
# Difference between `end_units_risk` and the current portfolio's `total_risk`.
env_test.end_units_risk - env_test.current_portfolio.total_risk

In [None]:
# The `prices_at_start` value held by the test environment.
env_test.prices_at_start

In [None]:
# print(env_test.step(2))
# env_test._portfolios[-1][-1]

### Not sure this plot is fully correct: when the shares hit this line perfectly, they fail to hit the risk plot perfectly. The reverse also happens.

In [None]:
# Call the test environment's plot method with its default arguments.
env_test.plot()

### Case in point: shares hit well but risk doesn't

In [None]:
# Call the test environment's plot method with 'risk_history' as its argument.
env_test.plot('risk_history')

### Count number of chosen trades
Can use `len([p.time for p in env_test.portfolio_history[-1] if (p.trade or p.penalty_trade)])` for all trades or `len([p.time for p in env_test.portfolio_history[-1] if (p.penalty_trade)])` for the penalty trades

In [None]:
# Count the portfolios in the last portfolio-history entry whose `trade` is truthy.
trade_times = [
    snapshot.time
    for snapshot in env_test.portfolio_history[-1]
    if snapshot.trade
]
len(trade_times)

### Re-expresses the `env_test._period_risk` dictionary as the amount of risk we should have bought instead of the amount of risk remaining (builds a new dictionary; `env_test._period_risk` itself is not modified)

In [None]:
# Map each `_period_risk` key to `end_units_risk` minus the value stored for it.
period_keys = env_test._period_risk.keys()
stored_risk = np.array(list(env_test._period_risk.values()))
risk_bought = env_test.end_units_risk - stored_risk
dict(zip(period_keys, risk_bought))

In [None]:
# Pair each reward in the last rewards entry with the portfolio at the same
# position in the last portfolio-history entry (stops at the shorter of the two).
[
    (reward, portfolio)
    for reward, portfolio in zip(env_test._rewards[-1], env_test.portfolio_history[-1])
]

In [None]:
# Show the last entry ([-1]) of the observations recorded on the test environment.
env_test._observations[-1]

In [None]:
# Plot the running total of the rewards in the last rewards entry.
# The rewards are stored together with a flag for the reward type (the
# DataFrame output shows entries such as `(-10.0, risk penalty)`), and
# np.cumsum cannot add the flag component, so only the numeric part is summed.
# NOTE(review): assumes each flagged reward is a tuple/list with the numeric
# value first -- confirm against the environment. Plain numbers pass through
# unchanged, so this is never stricter than the original expression.
reward_values = [
    entry[0] if isinstance(entry, (tuple, list)) else entry
    for entry in env_test._rewards[-1]
]
plt.plot(np.cumsum(reward_values))

In [None]:
import numpy as np

In [None]:
# Show the last entry ([-1]) of the test environment's risk history.
env_test.risk_history[-1]

In [None]:
# Distinct values of the numerical gradient (np.gradient) of the last risk-history entry.
risk_gradient = np.gradient(env_test.risk_history[-1])
set(risk_gradient)

In [None]:
# Show the test environment's full portfolio history.
env_test.portfolio_history

In [None]:
# Indices at which the last risk-history entry exceeds `end_units_risk`.
latest_risk = env_test.risk_history[-1]
risk_limit = env_test.end_units_risk
np.argwhere(latest_risk > risk_limit)