In [1]:
import random
from datetime import datetime
import time
import resource
import pickle
import os
import pdb

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.regularizers import l2
import tensorflow.keras.backend as K

import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from IPython.display import clear_output, display, HTML

from matplotlib import pyplot as plt
import seaborn as sns

# Seed every random number generator the notebook draws from (Python, NumPy,
# TensorFlow) so that a Restart & Run All reproduces the same outputs.
# 4465 is an arbitrary constant; the original note suggests it was drawn once
# with np.random.uniform(0, 10000).
random.seed(4465)
np.random.seed(4465)
tf.random.set_seed(4465)

# Record the library versions next to the results.
print("TensorFlow %s" % tf.__version__)
print("Keras %s" % keras.__version__)
print("plotly %s" % plotly.__version__)
print("pandas %s" % pd.__version__)
print("numpy %s" % np.__version__)

# Make sure the model save directory exists.
# makedirs creates the parent 'model_output' as well, and exist_ok=True makes
# the call idempotent (no error when the cell is re-run).
os.makedirs('model_output/trading', exist_ok=True)

TensorFlow 2.3.1
Keras 2.4.0
plotly 4.12.0
pandas 1.1.4
numpy 1.19.4


### 1) Simulate Market Data as Simple Harmonic Motion
- Hooke's law:
- Force is a linear function of the deviation from trend
    - *F = -k (x - trend)*
- Acceleration is a linear function of force
    - *F = ma*
- Acceleration is the 2nd derivative of position w.r.t. time t
    - *m d<sup>2</sup> x / dt<sup>2</sup> = -k (x - trend)* 
    - *d<sup>2</sup> x / dt<sup>2</sup> = -k/m (x - trend)* 
- This gives a differential equation whose solution is a sine wave around the trend
    - *x(t) = trend + A sin(ω t + Θ)*
- Where:  
    - *ω = (k/m)<sup>1/2</sup>*
- Period is T: 
    - *T = 2π / ω*
    - *T = 2π (m/k)<sup>1/2</sup>*


In [2]:
# show memory usage (some versions of TensorFlow gave memory issues)
def sizeof_fmt(num, suffix='B'):
    """Format a size as a string with a binary-prefixed unit, e.g. 1536 -> '1.5 KB'.

    The value is divided by 1024 until its magnitude drops below 1024; the
    matching prefix ('', K, M, G, T, P, E, Z) is then combined with `suffix`.
    Anything still too large after the Z prefix is reported with a Y prefix.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        # not small enough yet: move up to the next prefix
        num /= 1024.0
        position += 1
    return "%.1f %s%s" % (num, 'Y', suffix)

def memusage():
    """Return this process's peak resident set size as a formatted string.

    The value comes from resource.getrusage(RUSAGE_SELF).ru_maxrss and is
    rendered by sizeof_fmt (e.g. '284.7 MB'). Nothing is printed; the caller
    decides what to do with the string.
    """
    import sys  # local import: only needed for the platform check below

    peak = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux;
    # normalise to bytes so sizeof_fmt picks the right unit on both.
    if sys.platform != 'darwin':
        peak *= 1024
    return sizeof_fmt(peak)

memusage()


'284.7 MB'

In [3]:
def shm_gen(dt=0.001,
            coef=100,     # coef = k/m
            amplitude=2,
            start_trend=100,
            trend_per_tick=0.0,
            noise=0.0,
            damping=0.0,
            verbose=False):
    """Generate simple harmonic motion around trend, with noise and damping.

    Infinite generator; each step integrates the oscillator once and yields
    a tuple (t, stock_price, trend_index).

    Parameters
    ----------
    dt : time step used for both the velocity and the price update.
    coef : restoring-force coefficient k/m; the period is 2*pi*sqrt(1/coef).
    amplitude : initial displacement of the price above the trend.
    start_trend : initial trend level the price oscillates around.
    trend_per_tick : amount added to the trend after every yielded tick.
    noise : std-dev of the Gaussian noise added to the velocity; the price
        gets independent noise with std-dev noise/2.
    damping : fraction of the velocity removed on every tick.
    verbose : when truthy, print the amplitude and period once at start.
    """

    period = 2 * np.pi * np.sqrt(1/coef)

    if verbose:
        print("%s Amplitude: %.3f" % (time.strftime("%H:%M:%S"), amplitude))
        print("%s Period: %.3f" % (time.strftime("%H:%M:%S"), period))

    # initial stock price: maximum displacement, at rest
    stock_price = start_trend + amplitude
    stock_velocity = 0.0

    trend_index = start_trend
    t = 0.0

    while True:
        # acceleration based on distance from trend
        acc = - coef * (stock_price - trend_index)
        stock_velocity += acc * dt
        # add noise to velocity
        stock_velocity += np.random.normal(loc=0, scale=noise)
        # damp velocity by a % (could also make this a constant)
        stock_velocity *= (1-damping)
        # increment stock price using this generator's own time step
        # (previously read the notebook-level `tick_length`, ignoring dt)
        stock_price += stock_velocity * dt
        # add noise; doesn't impact velocity which makes velocity a partly hidden state variable
        stock_price += np.random.normal(loc=0, scale=noise/2)

        yield(t, stock_price, trend_index)
        t += dt
        # apply trend drift; skipped when zero so the yielded trend keeps
        # its original value and type
        if trend_per_tick:
            trend_index += trend_per_tick


In [4]:
# simulate market data
total_time=1
ticks = 1000
tick_length = total_time/ticks

# coef = k/m
coef=100
amplitude=2 
start_trend=100 
trend_per_tick=0.0 
noise=0.0 
damping=0.0 

period = 2 * np.pi * np.sqrt(1/coef)
print(period)
# gen = shm_gen(dt=total_time/ticks,
#               coef=coef,
#               amplitude=amplitude, 
#               start_trend=start_trend, 
#               trend_per_tick=trend_per_tick, 
#               noise=noise, 
#               damping=damping, 
#               verbose=1)

gen = shm_gen()

trend_series = []
stock_series = []
time_series = []

for i in range(ticks):
    t, stock_price, trend_index = next(gen)
    stock_series.append(stock_price)
    trend_series.append(trend_index)
    time_series.append(t)


0.6283185307179586


In [5]:
# collect the simulated series into one frame, one row per tick
df = pd.DataFrame({'dateindex': time_series, 'trend' : trend_series, 'stock': stock_series})
# rolling mean over half a period, with the window expressed in ticks;
# the first window-1 rows are NaN because the window is not yet full
df['ma'] = df['stock'].rolling(int(0.5*period*ticks)).mean()
df

Unnamed: 0,dateindex,trend,stock,ma
0,0.000,100,101.999800,
1,0.001,100,101.999400,
2,0.002,100,101.998800,
3,0.003,100,101.998000,
4,0.004,100,101.997001,
...,...,...,...,...
995,0.995,100,98.284832,99.338440
996,0.996,100,98.295205,99.327586
997,0.997,100,98.305749,99.316800
998,0.998,100,98.316462,99.306083


In [6]:
def make_figure(*series, title="", xtitle="", ytitle=""):
    """Show a plotly figure of one or more y-series against a shared x-series.

    The first positional argument supplies the x values; every further
    positional argument is added as its own Scatter trace. The title is
    centred, both axes get a mirrored black border, and the legend is hidden.
    Returns whatever fig.show() returns.
    """
    x_values = series[0]
    y_series = series[1:]

    def boxed_axis(label):
        # axis styling shared by x and y
        return dict(
            title=label,
            linecolor='black',
            linewidth=1,
            mirror=True
        )

    fig = go.Figure()
    for y_values in y_series:
        fig.add_trace(go.Scatter(y=y_values, x=x_values))

    fig.update_layout(
        title=dict(text=title, x=0.5, xanchor='center'),
        xaxis=boxed_axis(xtitle),
        yaxis=boxed_axis(ytitle),
        showlegend=False
    )

    return fig.show()


make_figure(df.index, df['stock'],
            title="Simulated Stock Price Data As Simple Harmonic Motion (Sine Wave)",
            xtitle='Timesteps',
            ytitle='Value'
           )

### 2) Simulate Market Data as Simple Harmonic Motion + Noise + Damping


In [7]:
# simulate market data: harmonic motion with noise and damping switched on
total_time=1
ticks = 1000
# time step per tick (the same value is passed as dt below)
tick_length = total_time/ticks

# coef = k/m
coef=100     
amplitude=1
start_trend=100 
trend_per_tick=0.0 
noise=0.2
damping=0.002

# oscillation period implied by coef; used below to size the moving average
period = 2 * np.pi * np.sqrt(1/coef)

gen = shm_gen(dt=total_time/ticks,
              coef=coef,     # coef = k/m
              amplitude=amplitude, 
              start_trend=start_trend, 
              trend_per_tick=trend_per_tick, 
              noise=noise, 
              damping=damping, 
              verbose=1)

trend_series = []
stock_series = []
time_series = []

# draw one (t, price, trend) tuple per tick
for i in range(ticks):
    t, stock_price, trend_index = next(gen)
    
    stock_series.append(stock_price)
    trend_series.append(trend_index)
    time_series.append(t)

# rebinds the notebook-level name `df` to the noisy simulation
df = pd.DataFrame({'dateindex': time_series, 'trend' : trend_series, 'stock': stock_series})
# rolling mean over half a period, with the window expressed in ticks
df['ma'] = df['stock'].rolling(int(0.5*period*ticks)).mean()

# plot the last 1000 rows (all of them, since ticks is 1000)
make_figure(df['dateindex'][-1000:], df['stock'][-1000:],
            title='Simulated Stock Prices As Simple Harmonic Motion + Noise + Damping',
            xtitle='Timesteps',
            ytitle='Value'
           )

12:34:19 Amplitude: 1.000
12:34:19 Period: 0.628


In [8]:
df

Unnamed: 0,dateindex,trend,stock,ma
0,0.000,100,100.890025,
1,0.001,100,101.019802,
2,0.002,100,101.048027,
3,0.003,100,101.117953,
4,0.004,100,101.056659,
...,...,...,...,...
995,0.995,100,97.692352,100.111833
996,0.996,100,97.674142,100.100149
997,0.997,100,97.612459,100.088297
998,0.998,100,97.801359,100.076960


In [9]:
# save to a file
# attach synthetic daily timestamps, one per row, starting at 1900-01-01
# NOTE: this adds a column to the notebook-level `df` in place
df['datetime'] = pd.date_range(start='1900-01-01', periods=df.shape[0], freq='D')
#df.reset_index(inplace=True)
#df.rename(columns={ df.columns[0]: "timeindex"}, inplace = True)
# write only the timestamp and price columns; the row index is left out
df[['datetime','stock']].to_csv('shm.csv', index=False)

In [10]:
!head shm.csv
!tail shm.csv
!wc shm.csv

datetime,stock
1900-01-01,100.89002471906774
1900-01-02,101.0198017905789
1900-01-03,101.04802703382575
1900-01-04,101.11795332993756
1900-01-05,101.05665929370792
1900-01-06,101.04551079190047
1900-01-07,101.15873461494674
1900-01-08,101.1691250035856
1900-01-09,100.91603123858681
1902-09-18,97.66356382600853
1902-09-19,97.65601585447001
1902-09-20,97.64584786103549
1902-09-21,97.72192505594742
1902-09-22,97.7141437905507
1902-09-23,97.69235199468247
1902-09-24,97.67414183707504
1902-09-25,97.61245870984068
1902-09-26,97.80135877795387
1902-09-27,97.91321319469633
    1001    1001   29432 shm.csv


### 3) Simulate Market Data as Ornstein-Uhlenbeck process

Random walk plus mean reversion

[Wikipedia](https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process)

In [11]:
def ou_gen(dt=0.001,
           sigma=1.0,
           mu=100.0,
           tau=0.05,
           verbose=1):
    """Endless Ornstein-Uhlenbeck path, yielded as (t, x, mu) tuples.

    The path starts at x = mu, t = 0 (yielded first). Every later step pulls
    x toward mu by dt * (x - mu) / tau and adds a Gaussian shock scaled by
    sigma * sqrt(2 / tau) * sqrt(dt). `verbose` is accepted but not used.
    """
    shock_scale = sigma * np.sqrt(2. / tau)
    root_dt = np.sqrt(dt)

    level, clock = mu, 0
    yield clock, level, mu

    while True:
        # mean-reverting drift toward mu
        drift = dt * (-(level - mu) / tau)
        # random shock drawn from the global NumPy RNG
        shock = shock_scale * root_dt * np.random.randn()
        level = level + drift + shock
        clock += dt
        yield clock, level, mu

        
# Ornstein-Uhlenbeck simulation settings
T = 1.  # Total time.
dt = 0.001
ticks = int(T / dt)  # Number of time steps.

sigma = 1.0   # scales the random shocks
mu = 100.0    # starting value and the level the process reverts toward
tau = 0.05    # divides the reversion term: smaller tau means a stronger pull to mu
verbose=1

gen = ou_gen(dt=dt,
             sigma=sigma,
             mu=mu,
             tau=tau,
             verbose=1
            )             

stock_series = []
time_series = []

# draw one (t, price, mu) tuple per tick; mu is constant so it is discarded
for i in range(ticks):
    t, stock_price, _ = next(gen)
    time_series.append(t)
    stock_series.append(stock_price)

# rebinds the notebook-level name `df` to the OU simulation
df = pd.DataFrame({'dateindex': time_series, 'stock': stock_series})

make_figure(df['dateindex'], df['stock'],
            title='Simulated Stock Price Data as an Ornstein-Uhlenbeck process',
            xtitle='Timesteps',
            ytitle='Value'
           )

### Run reinforcement learning algos on last 16 levels (deviation from 100) + changes 

In [12]:
# noise-free, undamped generator; rebinds the notebook-level name `gen`
gen = shm_gen(dt=1/1000,
              coef=100,     # coef = k/m
              amplitude=2,
              start_trend=100, 
              trend_per_tick=0, 
              noise=0.0,
              damping=0.0, 
              verbose=False)

# print the first 32 (t, price, trend) tuples as a sanity check
for i in range(32):
    print(next(gen))


(0.0, 101.9998, 100)
(0.001, 101.99940002, 100)
(0.002, 101.998800099998, 100)
(0.003, 101.99800029998599, 100)
(0.004, 101.997000699944, 100)
(0.005, 101.995801399832, 100)
(0.006, 101.99440251958004, 100)
(0.007, 101.9928041990761, 100)
(0.008, 101.99100659815227, 100)
(0.009000000000000001, 101.98900989656862, 100)
(0.010000000000000002, 101.98681429399531, 100)
(0.011000000000000003, 101.98442000999259, 100)
(0.012000000000000004, 101.98182728398888, 100)
(0.013000000000000005, 101.97903637525677, 100)
(0.014000000000000005, 101.97604756288713, 100)
(0.015000000000000006, 101.9728611457612, 100)
(0.016000000000000007, 101.9694774425207, 100)
(0.017000000000000008, 101.96589679153594, 100)
(0.01800000000000001, 101.96211955087203, 100)
(0.01900000000000001, 101.95814609825304, 100)
(0.02000000000000001, 101.95397683102422, 100)
(0.02100000000000001, 101.9496121661123, 100)
(0.022000000000000013, 101.94505253998376, 100)
(0.023000000000000013, 101.94029840860124, 100)
(0.024000000000

In [13]:
# generator that always returns last n values, levels-100 and changes

def market_gen(gen, lag=16):
    """Turn a (t, price, trend) generator into rolling feature vectors.

    Each yielded value is a new list of 2*lag numbers: the last `lag`
    deviations of price from trend (oldest first), followed by the last
    `lag` tick-to-tick price changes (oldest first). One tick of `gen` is
    consumed up front so the first change has a predecessor, then `lag`
    more before the first yield, then one per subsequent yield.
    """
    levels = []
    changes = []

    # prime with a single tick; only its price is needed
    _, price, trend = next(gen)

    # fill both windows
    for _ in range(lag):
        previous = price
        _, price, trend = next(gen)
        levels.append(price - trend)
        changes.append(price - previous)

    # first full window
    yield levels + changes

    while True:
        previous = price
        _, price, trend = next(gen)
        # slide each window: drop the oldest entry, append the newest
        del levels[0]
        levels.append(price - trend)
        del changes[0]
        changes.append(price - previous)
        yield levels + changes

lag=4
def shm_market_gen():
    """Return a new feature generator over noise-free harmonic prices.

    Wraps a fresh shm_gen (dt=1/1000, coef=100, amplitude 2 around a flat
    trend of 100, no noise, no damping) in market_gen. The window length is
    read from the notebook-level `lag` at call time.
    """
    price_source = shm_gen(dt=1/1000,
                           coef=100,     # coef = k/m
                           amplitude=2,
                           start_trend=100,
                           trend_per_tick=0,
                           noise=0.0,
                           damping=0.0,
                           verbose=False)
    return market_gen(gen=price_source, lag=lag)
                      


In [14]:
z = shm_market_gen()
next(z)

[1.999400019999996,
 1.9988000999979931,
 1.9980002999859892,
 1.997000699943996,
 -0.00039997999999741296,
 -0.0005999200020028184,
 -0.0007998000120039706,
 -0.000999600041993176]

In [15]:
next(z)

[1.9988000999979931,
 1.9980002999859892,
 1.997000699943996,
 1.995801399832004,
 -0.0005999200020028184,
 -0.0007998000120039706,
 -0.000999600041993176,
 -0.0011993001119918745]

In [16]:
# fresh lagged-feature generator; rebinds the notebook-level name `gen`
gen = shm_market_gen()

time_series=[]
stock_series=[]
# 1256 ticks at dt=1/1000 is roughly two periods of the ~0.628 period printed earlier
for i in range(1256):
    z = next(gen)
    time_series.append(i)
    # index lag-1 is the newest deviation-from-trend in the feature vector
    stock_series.append(z[lag-1])

# rebinds the notebook-level name `df`
df = pd.DataFrame({'dateindex': time_series, 'stock': stock_series})

make_figure(df.index, df['stock'],
            title='Simulated Stock Price Data',
            xtitle='Timesteps',
            ytitle='Value'
           )

In [17]:
# Stock environment for n stocks.
# Initialize with a generator factory and the number of stocks.
# The observation is an array with one row of lagged features per stock.
# step() takes an integer action and maps it to a position as (action - 1):
#   0 -> short (-1), 1 -> flat (0), 2 -> long (+1)
#   it advances every stock by one tick,
#     returns the new observation
#     and the reward, which is the position times the latest price change

class Market:
    """Follows OpenAI gym environment convention basically
    init with generator and number of stocks
    reset() - generate and return first state
    step() - generate next state and reward

    gen is a zero-argument callable returning a generator of feature lists
    of length 2*lag (lag levels followed by lag changes); it is called once
    per stock on every reset().

    NOTE(review): action_size is 2, so an agent that samples from
    range(action_size) only ever takes actions 0 and 1 (short / flat) and
    never goes long - confirm this is intended.
    """
    def __init__(self, gen, lag=16, nstocks=1, episode_length=300):
        self.genfunc = gen
        self.nstocks = nstocks
        self.episode_length = episode_length
        self.t = 0
        self.total_reward = 0
        self.lag = lag
        # placeholder array whose length equals the flattened state size
        self.observation_space = np.asarray([1] * nstocks * lag * 2,)
        self.state_size = nstocks * lag * 2
        self.action_size = 2

    def reset(self):
        """Start a new episode: rebuild the generators and return the first state."""
        self.t = 0
        self.total_reward = 0
        self.gen = [self.genfunc() for _ in range(self.nstocks)]
        # one row per stock, shape (nstocks, 2*lag)
        self.state = np.asarray([next(g) for g in self.gen])
        return self.state

    def render(self):
        """Print one element of the first stock's current feature row."""
        # use the instance's own stock count; the notebook-level `nstocks`
        # this used to read is not guaranteed to exist
        print(self.state[0, self.nstocks-1])

    def step(self, action):
        """Advance one tick and return (state, reward, done, market_price).

        NOTE(review): the scalar action is wrapped in a length-1 array, so
        the reward dot product presumably only works for nstocks == 1.
        """
        action = np.asarray([action])
        self.state = np.asarray([next(g) for g in self.gen])
        # last element is most recent change
        stock_delta = self.state[:, -1]
        # element at lag-1 is most recent deviation; +100 converts it back
        # to a price level (hard-coded trend of 100)
        market_price = self.state[:, self.lag-1] + 100
        # map actions 0 1 2 to positions -1, 0, 1
        position = action - 1
        reward = position @ stock_delta
        self.total_reward += reward
        self.t += 1
        # a falsy episode_length means the episode never ends on its own
        done = bool(self.episode_length and self.t >= self.episode_length)
        # state, reward, done, info
        return self.state, reward, done, market_price

    def close(self):
        """No resources to release; present for gym-style API parity."""
        pass


env = Market(shm_market_gen, lag=4, nstocks=1, episode_length=10)


In [18]:
DISCOUNT_RATE = 0
# WIN_REWARD = 10
EPSILON_DECAY = 0.995
SAMPLE_SIZE = 256
RENDER = False
OUTPUT_DIR = 'model_output/trading/'

class DQN_Agent:
    """Deep Q-network agent with epsilon-greedy exploration and replay memory.

    A Keras MLP maps a state vector to one value per action. Transitions
    (state, action, next_state, reward, done) are stored in a pandas
    DataFrame; after each episode train() samples SAMPLE_SIZE of them and
    fits the network toward reward + discount_rate * max Q(next_state).
    run_episode() interacts with the notebook-level `env`.
    """

    def __init__(self, state_size, action_size, filename="dqn",
                 discount_rate=DISCOUNT_RATE,
                 learning_rate=0.001,
                 epsilon=1.0,
                 epsilon_decay=EPSILON_DECAY,
                 epsilon_min=0.01):

        self.state_size = state_size
        self.action_size = action_size
        self.filename = filename
        self.discount_rate = discount_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        # must be set before build_model(), which hands it to the optimizer
        self.learning_rate = learning_rate

        self.model = self.build_model()
        self.memory = pd.DataFrame(columns=["state", "action", "next_state",
                                            "reward", "done"])
        self.memory_size = 100000
        self.results = []
        self.train_batch_size = 1
        self.timestep = 0
        # initialised here so score_episode() works before the first reset()
        self.total_reward = 0
        self.save_interval = 10

    def build_model(self,
                    n_hidden_layers=2,
                    hidden_layer_size=16,
                    activation='relu',
                    reg_penalty=0.001,
                    dropout=0.0675,
                    verbose=True
                   ):
        """return keras NN model per inputs
        input is a state - array of size state_size
        output is an array of action values - array of size action_size
        """

        inputs = Input(shape=(self.state_size,), name="Input")
        last_layer = inputs

        for i in range(n_hidden_layers):
            if verbose:
                formatstr = "layer %d size %d, %s, reg_penalty %.8f, dropout %.3f"
                print(formatstr % (i + 1,
                                   hidden_layer_size,
                                   activation,
                                   reg_penalty,
                                   dropout))
            # add dropout, but not on inputs, only between hidden layers
            if i and dropout:
                last_layer = Dropout(dropout, name="Dropout%02d" % i)(last_layer)

            last_layer = Dense(units=hidden_layer_size,
                               activation=activation,
                               kernel_initializer=glorot_uniform(),
                               kernel_regularizer=l2(reg_penalty),
                               name="Dense%02d" % i)(last_layer)

        outputs = Dense(self.action_size, activation='linear', name="Output")(last_layer)

        model = Model(inputs=inputs, outputs=outputs)

        if verbose:
            # summary() prints by itself and returns None, so don't wrap it in print()
            model.summary()

        # honour the learning_rate given to the constructor
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self):
        """store the states and rewards needed to fit the model"""
        # append in place; relies on the index being 0..n-1, which train()
        # restores whenever it truncates the memory
        self.memory.loc[self.memory.shape[0]] = [self.state,
                                                 self.action,
                                                 self.next_state,
                                                 self.reward,
                                                 self.done]

    def train(self):
        """train the model on experience stored by remember"""

        # need at least SAMPLE_SIZE observations
        if self.memory.shape[0] < SAMPLE_SIZE:
            return

        # truncate memory to the newest memory_size rows and renumber them;
        # without the renumbering remember() would overwrite the row whose
        # label equals the row count instead of appending
        if self.memory.shape[0] > self.memory_size:
            self.memory = self.memory.iloc[-self.memory_size:].reset_index(drop=True)

        # sample SAMPLE_SIZE observations from memory (copy so the scratch
        # column added below never touches the memory itself)
        minibatch = self.memory.sample(n=SAMPLE_SIZE).copy()

        # start from the model's current estimate of every action's value
        X_fit = np.concatenate(minibatch['state'].values)
        X_fit = X_fit.reshape((SAMPLE_SIZE, self.state_size))
        Y_pred = self.model.predict(X_fit)

        # fitting the model against its own prediction would teach it nothing,
        # so the target for the action actually taken is replaced with
        # observed reward + discounted predicted value of the observed next state
        minibatch['target_observed'] = minibatch['reward']
        # terminal transitions keep the bare reward; the others add
        # discount_rate * max Q-value predicted for the observed next state
        not_done = minibatch['done'] == False
        if not_done.any():
            X_observed = np.concatenate(minibatch.loc[not_done, 'next_state'].values)
            X_observed = X_observed.reshape((-1, self.state_size))
            # run all predictions at once
            # iterates faster but does not train after each prediction
            y_observed_pred = np.amax(self.model.predict(X_observed), axis=1)
            minibatch.loc[not_done, 'target_observed'] \
                += self.discount_rate * y_observed_pred
        # vectorized vlookup - update col specified by action with target_observed
        np.put_along_axis(Y_pred,
                          minibatch['action'].astype(int).values.reshape(SAMPLE_SIZE, 1),
                          minibatch['target_observed'].astype(float).values.reshape(SAMPLE_SIZE, 1),
                          axis=1)
        # fit model against improved target, train_batch_size rows per gradient step
        self.model.fit(X_fit, Y_pred,
                       epochs=1,
                       batch_size=self.train_batch_size,
                       verbose=0)

        # explore a little less after every training pass
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """pick an action using model"""
        # explore: uniformly random action with probability epsilon
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # exploit: action with the highest predicted value
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def save(self):
        "save agent: pickle self and use Keras native save model"
        fullname = "%s%s%05d" % (OUTPUT_DIR, self.filename, len(self.results))
        self.model.save("%s.h5" % fullname)
        # context manager so the file handle is closed even if pickling fails
        with open("%s.p" % fullname, "wb") as outfile:
            pickle.dump(self, outfile)

    @staticmethod
    def load(filename):
        "load saved agent"
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # this notebook wrote itself
        with open("%s.p" % filename, "rb") as infile:
            new = pickle.load(infile)
        new.model = load_model("%s.h5" % filename)
        print("loaded %d results, %d rows of memory, epsilon %.4f" % (len(new.results),
                                                                      len(new.memory),
                                                                      new.epsilon))
        return new

    def reset(self):
        """reset agent for start of episode"""
        self.timestep = 0
        self.total_reward = 0

    def increment_time(self):
        """increment timestep counter"""
        self.timestep += 1

    def save_score(self):
        """save score of each episode"""
        self.results.append(self.total_reward)

    def score_episode(self, episode_num, n_episodes):
        """record the episode's total reward and print a one-line progress report

        episode_num is accepted for API symmetry but not used; the episode
        count shown is len(self.results).
        """
        self.save_score()
        avglen = min(len(self.results), self.save_interval)
        formatstr = "{} episode {}/{}:, score: {}, {}-episode avg: {:.1f} Memory: {}        "
        print(formatstr.format(time.strftime("%H:%M:%S"), len(self.results),
                               n_episodes, self.total_reward, avglen,
                               sum(self.results[-avglen:])/avglen, memusage()),
              end="\r", flush=False)

    def run_episode(self, render=RENDER):
        """run a full episode against the notebook-level env, then train once"""
        global env

        self.reset()
        self.state = env.reset()
        self.done = False

        while not self.done:
            if render:
                env.render()
            self.action = self.act(self.state.reshape([1, self.state_size]))
            self.next_state, self.reward, self.done, _ = env.step(self.action)
            self.total_reward += self.reward
            self.remember()
            self.state = self.next_state
            self.increment_time()

        if render:
            env.render()

        self.train()

    def rlplot(self, title='Agent Training Progress'):
        """plot training progress"""
        # the 'timesteps' column holds each episode's total reward
        df = pd.DataFrame({'timesteps': self.results})
        df['avg'] = df['timesteps'].rolling(10).mean()

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['timesteps'],
                                 mode='markers',
                                 name='timesteps',
                                 marker=dict(
                                     color='mediumblue',
                                     size=4,
                                 ),
                                ))

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['avg'],
                                 mode='lines',
                                 line_width=3,
                                 name='moving average'))

        fig.update_layout(
            title=dict(text=title,
                       x=0.5,
                       xanchor='center'),
            xaxis=dict(
                title="Episodes",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            yaxis=dict(
                title="Total Reward per Episode",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            legend=go.layout.Legend(
                x=0.01,
                y=0.99,
                traceorder="normal",
                font=dict(
                    family="sans-serif",
                    size=12,
                    color="black"
                ),
                #bgcolor="LightSteelBlue",
                bordercolor="Black",
                borderwidth=1,
            ),
        )

        return fig.show()
    

In [19]:
# very slow, don't even bother

# N_EPISODES = 10000
# ticks_per_episode = 1256
# nstocks = 1


# env = Market(shm_market_gen,
#              nstocks=1, 
#              episode_length=ticks_per_episode)

# agent = DQN_Agent(state_size=nstocks*32,
#                   action_size=2,
#                  )

# start_time = time.time()

# for e in range(N_EPISODES):
#     agent.run_episode()
#     agent.score_episode(e, N_EPISODES)
    
#     if e and (e+1) % agent.save_interval == 0:
#         agent.save()

# elapsed_time = time.time() - start_time
# print("Train time: ", elapsed_time)        
# agent.rlplot()

In [20]:
RENDER = False
OUTPUT_DIR = 'model_output/trading/'

class Agent:
    """abstract base class for agents

    Provides the episode loop, score bookkeeping and progress plot shared by
    concrete agents. __init__ always raises NotImplementedError, so subclasses
    must define their own __init__ (setting at least state_size and results)
    and implement build_model, remember, train, act, save and load.
    run_episode() and view() use the notebook-level `env`.
    """

    def __init__(self, state_size, action_size, filename="model",
                 *args, **kwargs):
        self.state_size = state_size
        self.action_size = action_size
        self.filename = filename
        self.timestep = 0
        self.total_reward = 0
        self.save_interval = 10
        # per-episode total rewards, appended by save_score()
        self.results = []

        raise NotImplementedError

    def build_model(self, *args, **kwargs):
        """build the relevant model"""
        raise NotImplementedError

    def reset(self):
        """reset agent for start of episode"""
        self.timestep = 0
        self.total_reward = 0

    def increment_time(self):
        """increment timestep counter"""
        self.timestep += 1

    def remember(self, *args, **kwargs):
        """store the states and rewards needed to fit the model"""
        raise NotImplementedError

    def train(self, *args, **kwargs):
        """train the model on experience stored by remember"""
        raise NotImplementedError

    def act(self, *args, **kwargs):
        """pick an action using model"""
        raise NotImplementedError

    def save_score(self):
        """save score of each episode"""
        self.results.append(self.total_reward)

    def score_episode(self, episode_num, n_episodes):
        """record the episode's total reward and print a one-line progress report

        episode_num is accepted but not used; the episode count shown is
        len(self.results).
        """
        self.save_score()
        avglen = min(len(self.results), self.save_interval)
        formatstr = "{} episode {}/{}:, score: {}, {}-episode avg: {:.1f} Memory: {}        "
        print(formatstr.format(time.strftime("%H:%M:%S"), len(self.results),
                               n_episodes, self.total_reward, avglen,
                               sum(self.results[-avglen:])/avglen, memusage()),
              end="\r", flush=False)

    def run_episode(self, render=RENDER):
        """run a full episode against the notebook-level env, then train once"""
        global env

        self.reset()
        self.state = env.reset()
        self.done = False

        while not self.done:
            if render:
                env.render()
            self.action = self.act(self.state.reshape([1, self.state_size]))
            self.next_state, self.reward, self.done, _ = env.step(self.action)
            self.total_reward += self.reward

            self.remember()
            self.state = self.next_state
            self.increment_time()

        if render:
            env.render()

        self.train()

    def save(self, *args, **kwargs):
        """save agent to disk"""
        raise NotImplementedError

    @staticmethod
    def load(*args, **kwargs):
        """load agent from disk"""
        raise NotImplementedError

    def view(self):
        """Run an episode without training, with rendering

        Prints one line per step and the episode's total reward, then
        returns the number of steps taken.
        """
        state = env.reset()
        state = np.reshape(state, [1, self.state_size])
        done = False

        # column of the state reported as "the market"; taken from the
        # environment instead of an undefined notebook-level `nstocks`
        market_col = getattr(env, 'nstocks', 1) - 1

        # run an episode
        self.timestep = 0
        r = 0
        while not done:
            env.render()
            action = self.act(state)
            # read from the local state for this episode; self.state is only
            # updated by run_episode and would be stale (or missing) here
            lastmarket = state[0, market_col]
            state, reward, done, _ = env.step(action)
            newmarket = state[0, market_col]
            print("prev mkt: %.4f action: %d, new mkt %f, reward %f" % (lastmarket, action, newmarket, reward))
            r += reward
            state = np.reshape(state, [1, self.state_size])
            self.timestep += 1
        env.render()
        print(r)
        env.close()
        return self.timestep

    def rlplot(self, title='Trading Agent Training Progress'):
        """plot training progress"""
        # the 'timesteps' column holds each episode's total reward
        df = pd.DataFrame({'timesteps': self.results})
        df['avg'] = df['timesteps'].rolling(10).mean()

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['timesteps'],
                                 mode='markers',
                                 name='timesteps',
                                 marker=dict(
                                     color='mediumblue',
                                     size=4,
                                 ),
                                ))

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['avg'],
                                 mode='lines',
                                 line_width=3,
                                 name='moving average'))

        fig.update_layout(
            title=dict(text=title,
                       x=0.5,
                       xanchor='center'),
            xaxis=dict(
                title="Episodes",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            yaxis=dict(
                title="Total Reward per Episode",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            legend=go.layout.Legend(
                x=0.01,
                y=0.99,
                traceorder="normal",
                font=dict(
                    family="sans-serif",
                    size=12,
                    color="black"
                ),
                #bgcolor="LightSteelBlue",
                bordercolor="Black",
                borderwidth=1,
            ),
        )

        return fig.show()


In [21]:
class REINFORCE_Agent(Agent):
    """REINFORCE policy gradient method using deep Keras NN.

    Two Keras models are built over the same layers:

    * ``predict_model`` maps a state to softmax action probabilities and is
      used by ``act``.
    * ``train_model`` additionally takes the per-step discounted reward as a
      second input so the custom loss can weight the log-likelihood of each
      action that was taken.

    Experience for one episode is accumulated in ``state_memory``,
    ``action_memory`` and ``reward_memory`` and consumed by ``train``.
    """
    def __init__(self, state_size=4, action_size=2, learning_rate=0.0005,
                 discount_rate=DISCOUNT_RATE, n_hidden_layers=2, hidden_layer_size=16,
                 activation='relu', reg_penalty=0, dropout=0, filename="kreinforce",
                 verbose=True):
        """Store hyperparameters, build the models and reset episode state.

        Args:
            state_size: Length of the state vector fed to the network.
            action_size: Number of discrete actions (softmax width).
            learning_rate: Adam learning rate.
            discount_rate: Per-step discount applied to future rewards in ``train``.
            n_hidden_layers: Number of hidden Dense layers.
            hidden_layer_size: Units in each hidden layer.
            activation: Activation used in the hidden layers.
            reg_penalty: L2 penalty applied to hidden-layer kernels.
            dropout: Dropout rate between hidden layers (0 disables it).
            filename: Base name used by ``save``.
            verbose: If true, print the layer configuration and model summary.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.action_space = list(range(action_size))
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate

        self.n_hidden_layers = n_hidden_layers
        self.hidden_layer_size = hidden_layer_size
        self.activation = activation
        self.reg_penalty = reg_penalty
        self.dropout = dropout
        self.verbose = verbose
        self.filename = filename

        self.train_model, self.predict_model = self.policy_model()
        self.results = []
        self.save_interval = 10
        self.reset()

    def reset(self):
        """reset agent for start of episode"""
        self.timestep = 0
        # truncate memory
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.total_reward = 0

    def policy_model(self):
        """set up NN model for policy.
        predict returns probs of actions to sample from.
        train needs discounted rewards for the episode, so we define custom loss.
        when training use train_model with custom loss and multi input of training data and rewards.
        when predicting use predict_model with single input.

        Returns:
            Tuple ``(train_model, predict_model)``; both share the same layers.
        """

        def custom_loss(y_true, y_pred):
            # y_true is the one-hot action taken, so only the chosen action's
            # log-probability survives the product; clipping avoids log(0).
            y_pred_clip = K.clip(y_pred, 1e-8, 1-1e-8)
            log_likelihood = y_true*K.log(y_pred_clip)
            # discounted_rewards is the second model input defined below and
            # is captured by closure.
            return K.sum(-log_likelihood*discounted_rewards)

        inputs = Input(shape=(self.state_size,), name="Input")
        discounted_rewards = Input(shape=(1,), name="Discounted_rewards")
        last_layer = inputs

        for i in range(self.n_hidden_layers):
            if self.verbose:
                formatstr = "layer %d size %d, %s, reg_penalty %.8f, dropout %.3f"
                print(formatstr % (i + 1,
                                   self.hidden_layer_size,
                                   self.activation,
                                   self.reg_penalty,
                                   self.dropout,
                                   ))
            # add dropout, but not on inputs, only between hidden layers
            if i and self.dropout:
                last_layer = Dropout(self.dropout, name="Dropout%02d" % i)(last_layer)

            last_layer = Dense(units=self.hidden_layer_size,
                               activation=self.activation,
                               kernel_initializer=glorot_uniform(),
                               kernel_regularizer=keras.regularizers.l2(self.reg_penalty),
                               name="Dense%02d" % i)(last_layer)

        outputs = Dense(self.action_size, activation='softmax', name="Output")(last_layer)

        train_model = Model(inputs=[inputs, discounted_rewards], outputs=[outputs])
        # `learning_rate` is the documented argument name; `lr` is a deprecated alias
        train_model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss=custom_loss)

        predict_model = Model(inputs=[inputs], outputs=[outputs])

        if self.verbose:
            # summary() prints the table itself and returns None, so it must
            # not be wrapped in print() (that emitted a stray "None").
            predict_model.summary()

        return train_model, predict_model

    def act(self, state):
        """pick an action by sampling from the probabilities given by predict_model"""
        probabilities = self.predict_model.predict(state)
        action = np.random.choice(self.action_space, p=probabilities[0])
        return action

    def remember(self):
        """at each step save state, action, reward for future training

        Reads ``self.state``, ``self.action`` and ``self.reward``; these are
        not set anywhere in this class, so they are presumably assigned by the
        episode loop of the base class before this is called.
        """
        self.state_memory.append(self.state)
        self.action_memory.append(self.action)
        self.reward_memory.append(self.reward)

    def train(self):
        """train the model on experience stored by remember

        Returns:
            The value returned by ``train_model.train_on_batch``.
        """
        state_memory = np.array(self.state_memory)
        state_memory = state_memory.reshape((len(self.state_memory), self.state_size))
        action_memory = np.array(self.action_memory)
        reward_memory = np.array(self.reward_memory)

        # one-hot actions
        actions = np.zeros([len(action_memory), self.action_size])
        actions[np.arange(len(action_memory)), action_memory] = 1

        # Discounted return from each step to the end of the episode.
        # Force a float array: with integer rewards zeros_like would produce
        # an integer array, truncating the returns and making the in-place
        # standardization below raise a casting error.
        disc_rewards = np.zeros_like(reward_memory, dtype=np.float64)
        cumulative_rewards = 0
        for i in reversed(range(len(reward_memory))):
            cumulative_rewards = cumulative_rewards * self.discount_rate + reward_memory[i]
            disc_rewards[i] = cumulative_rewards

        # standardize (skip the division when all returns are equal)
        disc_rewards -= np.mean(disc_rewards)
        reward_std = np.std(disc_rewards)
        if reward_std > 0:
            disc_rewards /= reward_std

        # train states v. actions, (complemented by disc_rewards_std)
        cost = self.train_model.train_on_batch([state_memory, disc_rewards], actions)

        return cost

    def view(self):
        """Run an episode without training, printing every step.

        Uses the module-level ``env``.

        Returns:
            List of ``(timestep, action, lastmarket, newmarket, reward)``
            tuples, one per step.
        """
        state = env.reset()
        state = np.reshape(state, [1, self.state_size])
        done = False

        # run an episode
        self.timestep = 0
        r = 0
        retarray = []
        while not done:
            action = self.act(state)
            # last entry of the first half of the state vector; presumably the
            # most recent market level -- confirm against the env's state layout
            lastmarket = state[0, self.state_size//2-1]
            state, reward, done, _ = env.step(action)
            newmarket = state[0, self.state_size//2-1]
            print("prev mkt: %.4f action: %d, new mkt %.4f, reward %f" % (lastmarket, action, newmarket, reward))
            r += reward
            state = np.reshape(state, [1, self.state_size])
            self.timestep += 1
            retarray.append((self.timestep, action, lastmarket, newmarket, reward))
        print(r)
        env.close()
        return retarray

    def save(self):
        """save agent: pickle self and use Keras native save model

        Files are named from the module-level ``OUTPUT_DIR``, ``self.filename``
        and the number of results recorded so far.
        """
        fullname = "%s%s%05d" % (OUTPUT_DIR, self.filename, len(self.results))
        self.predict_model.save("%s_predict.h5" % fullname)
        # can't save / load train model due to custom loss
        # NOTE(review): pickling self also pickles the Keras model attributes;
        # confirm this works with the TensorFlow version in use.
        with open("%s.p" % fullname, "wb") as outfile:
            pickle.dump(self, outfile)

    @staticmethod
    def load(filename, memory=True):
        """load saved agent

        Args:
            filename: Path prefix as used by ``save`` (without ``.p`` or
                ``_predict.h5``).
            memory: Unused; kept so existing calls keep working.

        Returns:
            The restored agent.
        """
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load files written by this notebook.
        with open("%s.p" % filename, "rb") as infile:
            self = pickle.load(infile)
        # NOTE(review): only predict_model is replaced here; train_model is
        # whatever came out of the pickle and may no longer share layers with
        # it -- verify before training a loaded agent.
        self.predict_model = load_model("%s_predict.h5" % filename)
        # This agent keeps per-episode lists rather than a replay memory with
        # an epsilon, so report what actually exists on the object.
        print("loaded %d results, %d rows of memory" % (len(self.results),
                                                        len(self.state_memory)))
        return self



In [22]:
# Switch TensorFlow to TF1-style graph mode. Presumably needed because the
# REINFORCE custom loss closes over a symbolic Keras Input (the discounted
# rewards) -- confirm.
# NOTE(review): TensorFlow documents this call as one to make at program
# startup, and `tf` is already imported in the first cell; consider moving the
# call there so a Restart & Run All builds every model in the same mode.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

In [23]:
# Number of training episodes for the loop below.
N_EPISODES = 2000
# Episode length in ticks. With coef = k/m = 100 (omega = 10) the period is
# 2*pi/10 ~ 0.628; at dt = 1/1000 that is ~628 ticks, so 1256 is about two
# periods -- assuming shm_gen advances time by dt per tick.
ticks_per_episode = 1256
nstocks = 1
# Number of lags handed to market_gen and Market; the agent's state_size is
# nstocks*lag*2.
lag = 1

def shm_market_gen():
    """Create a new market feature generator over a harmonic price series.

    The underlying shm_gen is configured with no noise, no damping and no
    trend drift. The number of lags is taken from the module-level `lag`
    at the moment this function is called.
    """
    oscillator = shm_gen(
        dt=1/1000,
        coef=100,     # coef = k/m
        amplitude=2,
        start_trend=100,
        trend_per_tick=0,
        noise=0.0,
        damping=0.0,
        verbose=False,
    )
    return market_gen(gen=oscillator, lag=lag)
   
# Build the environment from the same nstocks/lag values that size the agent
# below, so the two cannot drift apart when either constant is changed.
env = Market(shm_market_gen,
             lag=lag,
             nstocks=nstocks,
             episode_length=ticks_per_episode)

# state_size is nstocks*lag*2; the three actions are plotted below as
# 0 = short, 1 = flat, 2 = long.
agent = REINFORCE_Agent(state_size=nstocks*lag*2,
                        action_size=3,
                       )

start_time = time.time()
print("Start: %s" % (time.strftime("%H:%M:%S")))

for e in range(N_EPISODES):
    agent.run_episode()
    agent.score_episode(e, N_EPISODES)

    # periodic checkpointing is currently disabled
#     if e and (e+1) % agent.save_interval == 0:
#         agent.save()

elapsed_time = time.time() - start_time
print("\nTrain time: ", elapsed_time)


layer 1 size 16, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 16, relu, reg_penalty 0.00000000, dropout 0.000
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 2)]               0         
_________________________________________________________________
Dense00 (Dense)              (None, 16)                48        
_________________________________________________________________
Dense01 (Dense)              (None, 16)                272       
_________________________________________________________________
Output (Dense)               (None, 3)                 51        
Total params: 371
Trainable params: 371
Non-trainable params: 0
_________________________________________________________________
None
Start: 12:34:20
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatic

In [24]:
# Plot the per-episode results recorded in agent.results and their moving average.
agent.rlplot("Training Progress: Simple Harmonic Motion")

In [25]:
env.reset()
z = agent.view()

# One row per step of the viewed episode.
df = pd.DataFrame(z, columns=["timestep", "action", "lastmarket", "newmarket", "reward"])

# Shift both market columns by 100 (presumably to restore the start_trend=100
# price level used by the generator -- confirm).
df['lastmarket'] = df['lastmarket'] + 100
df['newmarket'] = df['newmarket'] + 100

# One column per position: the new price where that action was taken, NaN
# elsewhere, so each position can be plotted as its own marker series.
df['short'] = df['newmarket'].where(df['action'] == 0)
df['flat'] = df['newmarket'].where(df['action'] == 1)
df['long'] = df['newmarket'].where(df['action'] == 2)

# Running total of the per-step rewards.
df['totalreward'] = df['reward'].cumsum()
df.to_csv('df.csv')
df

prev mkt: 1.9994 action: 2, new mkt 1.9988, reward -0.000600
prev mkt: 1.9988 action: 2, new mkt 1.9980, reward -0.000800
prev mkt: 1.9980 action: 2, new mkt 1.9970, reward -0.001000
prev mkt: 1.9970 action: 0, new mkt 1.9958, reward 0.001199
prev mkt: 1.9958 action: 0, new mkt 1.9944, reward 0.001399
prev mkt: 1.9944 action: 0, new mkt 1.9928, reward 0.001598
prev mkt: 1.9928 action: 0, new mkt 1.9910, reward 0.001798
prev mkt: 1.9910 action: 0, new mkt 1.9890, reward 0.001997
prev mkt: 1.9890 action: 0, new mkt 1.9868, reward 0.002196
prev mkt: 1.9868 action: 0, new mkt 1.9844, reward 0.002394
prev mkt: 1.9844 action: 0, new mkt 1.9818, reward 0.002593
prev mkt: 1.9818 action: 0, new mkt 1.9790, reward 0.002791
prev mkt: 1.9790 action: 2, new mkt 1.9760, reward -0.002989
prev mkt: 1.9760 action: 2, new mkt 1.9729, reward -0.003186
prev mkt: 1.9729 action: 0, new mkt 1.9695, reward 0.003384
prev mkt: 1.9695 action: 0, new mkt 1.9659, reward 0.003581
prev mkt: 1.9659 action: 0, new mkt

prev mkt: -0.7497 action: 0, new mkt -0.7682, reward 0.018504
prev mkt: -0.7682 action: 0, new mkt -0.7866, reward 0.018428
prev mkt: -0.7866 action: 0, new mkt -0.8049, reward 0.018349
prev mkt: -0.8049 action: 0, new mkt -0.8232, reward 0.018268
prev mkt: -0.8232 action: 0, new mkt -0.8414, reward 0.018186
prev mkt: -0.8414 action: 0, new mkt -0.8595, reward 0.018102
prev mkt: -0.8595 action: 0, new mkt -0.8775, reward 0.018016
prev mkt: -0.8775 action: 0, new mkt -0.8954, reward 0.017928
prev mkt: -0.8954 action: 0, new mkt -0.9133, reward 0.017839
prev mkt: -0.9133 action: 0, new mkt -0.9310, reward 0.017747
prev mkt: -0.9310 action: 0, new mkt -0.9487, reward 0.017654
prev mkt: -0.9487 action: 0, new mkt -0.9662, reward 0.017559
prev mkt: -0.9662 action: 0, new mkt -0.9837, reward 0.017463
prev mkt: -0.9837 action: 0, new mkt -1.0011, reward 0.017364
prev mkt: -1.0011 action: 0, new mkt -1.0183, reward 0.017264
prev mkt: -1.0183 action: 0, new mkt -1.0355, reward 0.017162
prev mkt

prev mkt: -0.9542 action: 2, new mkt -0.9366, reward 0.017625
prev mkt: -0.9366 action: 2, new mkt -0.9189, reward 0.017718
prev mkt: -0.9189 action: 2, new mkt -0.9011, reward 0.017810
prev mkt: -0.9011 action: 2, new mkt -0.8832, reward 0.017900
prev mkt: -0.8832 action: 2, new mkt -0.8652, reward 0.017988
prev mkt: -0.8652 action: 2, new mkt -0.8471, reward 0.018075
prev mkt: -0.8471 action: 2, new mkt -0.8290, reward 0.018160
prev mkt: -0.8290 action: 2, new mkt -0.8107, reward 0.018243
prev mkt: -0.8107 action: 2, new mkt -0.7924, reward 0.018324
prev mkt: -0.7924 action: 2, new mkt -0.7740, reward 0.018403
prev mkt: -0.7740 action: 2, new mkt -0.7555, reward 0.018480
prev mkt: -0.7555 action: 2, new mkt -0.7370, reward 0.018556
prev mkt: -0.7370 action: 2, new mkt -0.7183, reward 0.018630
prev mkt: -0.7183 action: 2, new mkt -0.6996, reward 0.018701
prev mkt: -0.6996 action: 2, new mkt -0.6809, reward 0.018771
prev mkt: -0.6809 action: 2, new mkt -0.6620, reward 0.018839
prev mkt

prev mkt: 1.9594 action: 0, new mkt 1.9553, reward 0.004107
prev mkt: 1.9553 action: 0, new mkt 1.9510, reward 0.004303
prev mkt: 1.9510 action: 0, new mkt 1.9465, reward 0.004498
prev mkt: 1.9465 action: 0, new mkt 1.9418, reward 0.004693
prev mkt: 1.9418 action: 0, new mkt 1.9369, reward 0.004887
prev mkt: 1.9369 action: 0, new mkt 1.9319, reward 0.005081
prev mkt: 1.9319 action: 0, new mkt 1.9266, reward 0.005274
prev mkt: 1.9266 action: 0, new mkt 1.9211, reward 0.005466
prev mkt: 1.9211 action: 0, new mkt 1.9155, reward 0.005659
prev mkt: 1.9155 action: 0, new mkt 1.9096, reward 0.005850
prev mkt: 1.9096 action: 0, new mkt 1.9036, reward 0.006041
prev mkt: 1.9036 action: 0, new mkt 1.8973, reward 0.006231
prev mkt: 1.8973 action: 0, new mkt 1.8909, reward 0.006421
prev mkt: 1.8909 action: 0, new mkt 1.8843, reward 0.006610
prev mkt: 1.8843 action: 0, new mkt 1.8775, reward 0.006799
prev mkt: 1.8775 action: 0, new mkt 1.8705, reward 0.006986
prev mkt: 1.8705 action: 0, new mkt 1.86

prev mkt: -1.4773 action: 0, new mkt -1.4907, reward 0.013408
prev mkt: -1.4907 action: 0, new mkt -1.5040, reward 0.013259
prev mkt: -1.5040 action: 0, new mkt -1.5171, reward 0.013109
prev mkt: -1.5171 action: 0, new mkt -1.5300, reward 0.012957
prev mkt: -1.5300 action: 0, new mkt -1.5428, reward 0.012804
prev mkt: -1.5428 action: 0, new mkt -1.5555, reward 0.012650
prev mkt: -1.5555 action: 0, new mkt -1.5680, reward 0.012494
prev mkt: -1.5680 action: 0, new mkt -1.5803, reward 0.012337
prev mkt: -1.5803 action: 0, new mkt -1.5925, reward 0.012179
prev mkt: -1.5925 action: 0, new mkt -1.6045, reward 0.012020
prev mkt: -1.6045 action: 0, new mkt -1.6164, reward 0.011860
prev mkt: -1.6164 action: 0, new mkt -1.6281, reward 0.011698
prev mkt: -1.6281 action: 0, new mkt -1.6396, reward 0.011535
prev mkt: -1.6396 action: 0, new mkt -1.6510, reward 0.011371
prev mkt: -1.6510 action: 0, new mkt -1.6622, reward 0.011206
prev mkt: -1.6622 action: 0, new mkt -1.6732, reward 0.011040
prev mkt

prev mkt: -0.2007 action: 2, new mkt -0.1808, reward 0.019909
prev mkt: -0.1808 action: 2, new mkt -0.1609, reward 0.019927
prev mkt: -0.1609 action: 2, new mkt -0.1409, reward 0.019943
prev mkt: -0.1409 action: 2, new mkt -0.1210, reward 0.019957
prev mkt: -0.1210 action: 2, new mkt -0.1010, reward 0.019969
prev mkt: -0.1010 action: 2, new mkt -0.0810, reward 0.019980
prev mkt: -0.0810 action: 2, new mkt -0.0610, reward 0.019988
prev mkt: -0.0610 action: 2, new mkt -0.0411, reward 0.019994
prev mkt: -0.0411 action: 2, new mkt -0.0211, reward 0.019998
prev mkt: -0.0211 action: 2, new mkt -0.0011, reward 0.020000
prev mkt: -0.0011 action: 2, new mkt 0.0189, reward 0.020000
prev mkt: 0.0189 action: 2, new mkt 0.0389, reward 0.019998
prev mkt: 0.0389 action: 2, new mkt 0.0589, reward 0.019994
prev mkt: 0.0589 action: 2, new mkt 0.0789, reward 0.019988
prev mkt: 0.0789 action: 2, new mkt 0.0989, reward 0.019980
prev mkt: 0.0989 action: 2, new mkt 0.1189, reward 0.019971
prev mkt: 0.1189 ac

Unnamed: 0,timestep,action,lastmarket,newmarket,reward,short,flat,long,totalreward
0,1,2,101.999400,101.998800,-0.000600,,,101.998800,-0.000600
1,2,2,101.998800,101.998000,-0.000800,,,101.998000,-0.001400
2,3,2,101.998000,101.997001,-0.001000,,,101.997001,-0.002399
3,4,0,101.997001,101.995801,0.001199,101.995801,,,-0.001200
4,5,0,101.995801,101.994403,0.001399,101.994403,,,0.000199
...,...,...,...,...,...,...,...,...,...
1251,1252,2,101.999044,101.999571,0.000526,,,101.999571,15.359077
1252,1253,2,101.999571,101.999897,0.000326,,,101.999897,15.359403
1253,1254,0,101.999897,102.000023,-0.000126,102.000023,,,15.359277
1254,1255,2,102.000023,101.999950,-0.000074,,,101.999950,15.359203


In [26]:
def tradesim_chart(df, title="Trading Simulation"):
    """Plot the positions taken against price, with cumulative reward on a second axis.

    Args:
        df: DataFrame with columns 'timestep', 'short', 'flat', 'long' and
            'totalreward'. The three position columns are drawn on the left
            axis and 'totalreward' on the right axis.
        title: Text shown as the figure title.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    markersize = 4

    # x axis
    x = df['timestep']

    # (column, legend label, marker colour, drawn on the secondary y axis?)
    series = [
        ('short', 'Short (left axis)', 'rgba(192, 32, 32, 0.75)', False),
        ('flat', 'Flat (left axis)', 'rgba(32, 32, 192, 0.75)', False),
        ('long', 'Long (left axis)', 'rgba(0, 204, 0, 0.75)', False),
        ('totalreward', 'Total reward (right)', 'rgba(32, 32, 32, 0.75)', True),
    ]

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    for column, label, color, on_secondary in series:
        fig.add_trace(go.Scatter(y=df[column],
                                 x=x,
                                 name=label,
                                 mode='markers',
                                 marker=dict(size=markersize,
                                             color=color),
                                ),
                      secondary_y=on_secondary,
                     )

    # plot attributes
    fig.update_layout(
        title=dict(text=title,
                   x=0.5,
                   xanchor='center'),
        xaxis=dict(
            title="Timesteps",
            linecolor='black',
            linewidth=1,
            mirror=True
        ),
        yaxis=dict(
            title="Price",
            linecolor='black',
            linewidth=1,
            mirror=True
        ),
        showlegend=True,
        legend=dict(x=0.738, y=0.05)
    )

    fig.update_yaxes(title_text="Total reward", secondary_y=True)

    fig.show()

In [27]:
# Chart the viewed episode: position taken at each timestep (left axis) and cumulative reward (right axis).
tradesim_chart(df, title="Trading Simulation: Simple Harmonic Motion")

### Simulation: simple harmonic motion plus noise plus damping

In [28]:
# coef = k/m

def shmplusgen():
    """Create a new market feature generator over a noisy, damped harmonic price series.

    Unlike the module-level `lag` used elsewhere, the number of lags is fixed
    at 16 inside this function.
    """
    noisy_oscillator = shm_gen(
        dt=1/1000,
        coef=100,     # coef = k/m
        amplitude=1,
        start_trend=100,
        trend_per_tick=0,
        noise=0.2,
        damping=0.002,
        verbose=False,
    )
    return market_gen(gen=noisy_oscillator, lag=16)
gen = shmplusgen()

# Draw 1256 feature vectors and keep element 15 of each, i.e. index lag-1 for
# the lag=16 that shmplusgen hard-codes.
time_series = list(range(1256))
stock_series = []
for i in time_series:
    z = next(gen)
    stock_series.append(z[15])

df = pd.DataFrame(dict(dateindex=time_series, stock=stock_series))

make_figure(
    df['dateindex'],
    df['stock'],
    title='Simulated Stock Price Data: Simple Harmonic Motion + Noise + Damping',
    xtitle='Timesteps',
    ytitle='Value',
)

In [29]:
N_EPISODES = 2000
ticks_per_episode = 1256
nstocks = 1
# Must match the lag=16 hard-coded inside shmplusgen.
lag = 16
print("Using %d lags" % lag)

# Build the environment from the same nstocks/lag values that size the agent
# below, so the two cannot drift apart when either constant is changed.
env = Market(shmplusgen,
             lag=lag,
             nstocks=nstocks,
             episode_length=ticks_per_episode)

# state_size is nstocks*lag*2; three actions are available to the agent.
agent = REINFORCE_Agent(state_size=nstocks*lag*2,
                        action_size=3,
                       )

start_time = time.time()
print("Start: %s" % (time.strftime("%H:%M:%S")))

for e in range(N_EPISODES):
    agent.run_episode()
    agent.score_episode(e, N_EPISODES)

    # periodic checkpointing is currently disabled
#     if e and (e+1) % agent.save_interval == 0:
#         agent.save()

elapsed_time = time.time() - start_time
print("\nTrain time: ", elapsed_time)


Using 16 lags
layer 1 size 16, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 16, relu, reg_penalty 0.00000000, dropout 0.000
Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 32)]              0         
_________________________________________________________________
Dense00 (Dense)              (None, 16)                528       
_________________________________________________________________
Dense01 (Dense)              (None, 16)                272       
_________________________________________________________________
Output (Dense)               (None, 3)                 51        
Total params: 851
Trainable params: 851
Non-trainable params: 0
_________________________________________________________________
None
Start: 13:05:56
13:36:46 episode 2000/2000:, score: 4.625596503264163, 10-episode avg: 4.9 Memory: 338.2 MB      

In [30]:
# Plot the per-episode results recorded in agent.results and their moving average.
agent.rlplot("Training: Simple Harmonic Motion + Noise + Damping")

In [31]:
z = agent.view()

# One row per step of the viewed episode.
df = pd.DataFrame(z, columns=["timestep", "action", "lastmarket", "newmarket", "reward"])

# Shift both market columns by 100 (presumably to restore the start_trend=100
# price level used by the generator -- confirm).
df['lastmarket'] = df['lastmarket'] + 100
df['newmarket'] = df['newmarket'] + 100

# One column per position: the new price where that action was taken, NaN
# elsewhere, so each position can be plotted as its own marker series.
df['short'] = df['newmarket'].where(df['action'] == 0)
df['flat'] = df['newmarket'].where(df['action'] == 1)
df['long'] = df['newmarket'].where(df['action'] == 2)

# Running total of the per-step rewards.
df['totalreward'] = df['reward'].cumsum()
df.to_csv('df.csv')
df

prev mkt: 1.5993 action: 2, new mkt 1.7345, reward 0.135162
prev mkt: 1.7345 action: 2, new mkt 1.5850, reward -0.149535
prev mkt: 1.5850 action: 2, new mkt 1.4938, reward -0.091146
prev mkt: 1.4938 action: 0, new mkt 1.6575, reward -0.163712
prev mkt: 1.6575 action: 0, new mkt 1.7484, reward -0.090850
prev mkt: 1.7484 action: 2, new mkt 1.7951, reward 0.046778
prev mkt: 1.7951 action: 0, new mkt 1.7944, reward 0.000745
prev mkt: 1.7944 action: 0, new mkt 1.6860, reward 0.108378
prev mkt: 1.6860 action: 0, new mkt 1.5926, reward 0.093440
prev mkt: 1.5926 action: 0, new mkt 1.4546, reward 0.137939
prev mkt: 1.4546 action: 0, new mkt 1.5839, reward -0.129249
prev mkt: 1.5839 action: 0, new mkt 1.5535, reward 0.030373
prev mkt: 1.5535 action: 0, new mkt 1.5932, reward -0.039701
prev mkt: 1.5932 action: 0, new mkt 1.5078, reward 0.085459
prev mkt: 1.5078 action: 0, new mkt 1.5945, reward -0.086747
prev mkt: 1.5945 action: 0, new mkt 1.6404, reward -0.045861
prev mkt: 1.6404 action: 0, new 

prev mkt: -0.0815 action: 0, new mkt -0.1674, reward 0.085870
prev mkt: -0.1674 action: 0, new mkt -0.2285, reward 0.061112
prev mkt: -0.2285 action: 0, new mkt -0.4837, reward 0.255217
prev mkt: -0.4837 action: 0, new mkt -0.4979, reward 0.014187
prev mkt: -0.4979 action: 0, new mkt -0.3642, reward -0.133658
prev mkt: -0.3642 action: 0, new mkt -0.6577, reward 0.293434
prev mkt: -0.6577 action: 0, new mkt -0.8112, reward 0.153522
prev mkt: -0.8112 action: 0, new mkt -0.7584, reward -0.052838
prev mkt: -0.7584 action: 0, new mkt -0.7269, reward -0.031454
prev mkt: -0.7269 action: 0, new mkt -0.8960, reward 0.169126
prev mkt: -0.8960 action: 0, new mkt -0.8845, reward -0.011579
prev mkt: -0.8845 action: 0, new mkt -0.9518, reward 0.067335
prev mkt: -0.9518 action: 0, new mkt -1.0183, reward 0.066478
prev mkt: -1.0183 action: 0, new mkt -0.9569, reward -0.061327
prev mkt: -0.9569 action: 0, new mkt -0.9770, reward 0.020108
prev mkt: -0.9770 action: 0, new mkt -0.9518, reward -0.025221
pr

prev mkt: -0.6434 action: 2, new mkt -0.5778, reward 0.065629
prev mkt: -0.5778 action: 2, new mkt -0.6093, reward -0.031521
prev mkt: -0.6093 action: 2, new mkt -0.8370, reward -0.227627
prev mkt: -0.8370 action: 2, new mkt -0.8985, reward -0.061517
prev mkt: -0.8985 action: 2, new mkt -1.0456, reward -0.147126
prev mkt: -1.0456 action: 2, new mkt -0.7979, reward 0.247735
prev mkt: -0.7979 action: 2, new mkt -0.7034, reward 0.094427
prev mkt: -0.7034 action: 2, new mkt -0.5503, reward 0.153157
prev mkt: -0.5503 action: 2, new mkt -0.5468, reward 0.003515
prev mkt: -0.5468 action: 2, new mkt -0.6232, reward -0.076406
prev mkt: -0.6232 action: 2, new mkt -0.7806, reward -0.157436
prev mkt: -0.7806 action: 2, new mkt -0.6655, reward 0.115157
prev mkt: -0.6655 action: 2, new mkt -0.6541, reward 0.011316
prev mkt: -0.6541 action: 2, new mkt -0.7477, reward -0.093552
prev mkt: -0.7477 action: 2, new mkt -0.9327, reward -0.185025
prev mkt: -0.9327 action: 0, new mkt -1.0050, reward 0.072241


prev mkt: 1.2739 action: 2, new mkt 1.1969, reward -0.076992
prev mkt: 1.1969 action: 2, new mkt 1.3266, reward 0.129725
prev mkt: 1.3266 action: 2, new mkt 1.2924, reward -0.034227
prev mkt: 1.2924 action: 2, new mkt 1.3147, reward 0.022333
prev mkt: 1.3147 action: 2, new mkt 1.5751, reward 0.260381
prev mkt: 1.5751 action: 2, new mkt 1.6324, reward 0.057310
prev mkt: 1.6324 action: 2, new mkt 1.6060, reward -0.026444
prev mkt: 1.6060 action: 2, new mkt 1.7621, reward 0.156152
prev mkt: 1.7621 action: 2, new mkt 1.7748, reward 0.012656
prev mkt: 1.7748 action: 2, new mkt 1.7505, reward -0.024242
prev mkt: 1.7505 action: 2, new mkt 1.6473, reward -0.103246
prev mkt: 1.6473 action: 2, new mkt 1.5717, reward -0.075537
prev mkt: 1.5717 action: 0, new mkt 1.7748, reward -0.203108
prev mkt: 1.7748 action: 2, new mkt 1.8148, reward 0.039947
prev mkt: 1.8148 action: 0, new mkt 1.9229, reward -0.108137
prev mkt: 1.9229 action: 2, new mkt 1.9994, reward 0.076516
prev mkt: 1.9994 action: 0, new 

prev mkt: -0.7753 action: 0, new mkt -0.8844, reward 0.109134
prev mkt: -0.8844 action: 0, new mkt -0.8247, reward -0.059728
prev mkt: -0.8247 action: 0, new mkt -0.7901, reward -0.034561
prev mkt: -0.7901 action: 0, new mkt -0.8385, reward 0.048373
prev mkt: -0.8385 action: 0, new mkt -0.7201, reward -0.118401
prev mkt: -0.7201 action: 0, new mkt -0.6888, reward -0.031263
prev mkt: -0.6888 action: 2, new mkt -0.7685, reward -0.079619
prev mkt: -0.7685 action: 0, new mkt -0.7677, reward -0.000738
prev mkt: -0.7677 action: 0, new mkt -0.6999, reward -0.067827
prev mkt: -0.6999 action: 0, new mkt -0.8192, reward 0.119271
prev mkt: -0.8192 action: 0, new mkt -0.9365, reward 0.117362
prev mkt: -0.9365 action: 0, new mkt -0.8779, reward -0.058593
prev mkt: -0.8779 action: 0, new mkt -0.8638, reward -0.014190
prev mkt: -0.8638 action: 0, new mkt -0.9122, reward 0.048434
prev mkt: -0.9122 action: 0, new mkt -0.8989, reward -0.013274
prev mkt: -0.8989 action: 2, new mkt -0.8736, reward 0.02532

prev mkt: 1.9651 action: 2, new mkt 1.8697, reward -0.095478
prev mkt: 1.8697 action: 0, new mkt 1.7493, reward 0.120329
prev mkt: 1.7493 action: 0, new mkt 1.6891, reward 0.060264
prev mkt: 1.6891 action: 0, new mkt 1.4750, reward 0.214052
prev mkt: 1.4750 action: 0, new mkt 1.4805, reward -0.005486
prev mkt: 1.4805 action: 0, new mkt 1.4978, reward -0.017329
prev mkt: 1.4978 action: 0, new mkt 1.4039, reward 0.093923
prev mkt: 1.4039 action: 0, new mkt 1.3628, reward 0.041140
prev mkt: 1.3628 action: 0, new mkt 1.3181, reward 0.044671
prev mkt: 1.3181 action: 0, new mkt 1.2708, reward 0.047314
prev mkt: 1.2708 action: 0, new mkt 1.1321, reward 0.138672
prev mkt: 1.1321 action: 0, new mkt 1.1801, reward -0.047957
prev mkt: 1.1801 action: 0, new mkt 1.0345, reward 0.145539
prev mkt: 1.0345 action: 0, new mkt 0.9603, reward 0.074228
prev mkt: 0.9603 action: 0, new mkt 1.0909, reward -0.130570
prev mkt: 1.0909 action: 0, new mkt 0.8640, reward 0.226849
prev mkt: 0.8640 action: 0, new mkt

Unnamed: 0,timestep,action,lastmarket,newmarket,reward,short,flat,long,totalreward
0,1,2,101.599324,101.734486,0.135162,,,101.734486,0.135162
1,2,2,101.734486,101.584951,-0.149535,,,101.584951,-0.014373
2,3,2,101.584951,101.493804,-0.091146,,,101.493804,-0.105520
3,4,0,101.493804,101.657516,-0.163712,101.657516,,,-0.269231
4,5,0,101.657516,101.748366,-0.090850,101.748366,,,-0.360081
...,...,...,...,...,...,...,...,...,...
1251,1252,1,99.773987,99.775659,0.000000,,99.775659,,11.175430
1252,1253,2,99.775659,99.896589,0.120930,,,99.896589,11.296360
1253,1254,0,99.896589,100.050471,-0.153882,100.050471,,,11.142478
1254,1255,2,100.050471,99.989811,-0.060660,,,99.989811,11.081818


In [32]:
# Chart the viewed episode: position taken at each timestep (left axis) and cumulative reward (right axis).
tradesim_chart(df, title="Trading Simulation: Simple Harmonic Motion + Noise + Damping")

In [33]:
# Plot the 'newmarket' price column of the viewed episode against its row index.
make_figure(df.index, df['newmarket'],
            title='Simulated Stock Price Data: Simple Harmonic Motion + Noise + Damping',
            xtitle='Timesteps',
            ytitle='Value'
           )

### Simulation: Ornstein-Uhlenbeck (OU) process

In [34]:
def market_gen(gen, lag=16):
    """Turn a raw price generator into a stream of lagged feature vectors.

    `gen` must yield 3-tuples; only the second element (the price level) is
    used. Every item yielded here is a new list of ``2*lag`` numbers: the
    last `lag` levels minus the module-level `mu`, followed by the last `lag`
    one-step changes.

    The first value drawn from `gen` only seeds the first change, so
    ``lag + 1`` values are consumed before the first yield.

    NOTE(review): earlier cells already call a function named `market_gen`,
    so this definition replaces it for the rest of the session -- confirm
    that is intended.
    """
    # seed value: used only as the "previous" price of the first change
    _, current, _ = next(gen)

    levels = []
    changes = []

    # warm-up: collect `lag` centred levels and their changes
    for _step in range(lag):
        previous = current
        _, current, _ = next(gen)
        levels.append(current - mu)
        changes.append(current - previous)

    # first full window
    yield levels + changes

    # afterwards slide both windows forward by one observation per item
    while True:
        previous = current
        _, current, _ = next(gen)
        del levels[0]
        levels.append(current - mu)
        del changes[0]
        changes.append(current - previous)
        yield levels + changes

T = 1.  # Total time.
dt = 0.001
# NOTE(review): `ticks` is not used in this cell; the sampling loop below
# draws a literal 1256 values.
ticks = int(T / dt)  # Number of time steps.

# Parameters passed to ou_gen by ou_market_gen below.
sigma = 1.0
# Also read as a global by market_gen above, which subtracts it from every level.
mu = 100.0
tau = 0.05
# NOTE(review): ou_market_gen below passes verbose=1 literally rather than
# reading this variable.
verbose=1

# Number of lags handed to market_gen by ou_market_gen below.
lag=1
print("Using %d lags" % lag)

def ou_market_gen():
    """Build a fresh lagged-feature generator over a new OU price stream.

    All settings (`dt`, `sigma`, `mu`, `tau`, `verbose`, `lag`) are read from
    the notebook-level variables at call time, so each call reflects their
    current values.

    Returns:
        The generator produced by ``market_gen`` wrapped around ``ou_gen``.
    """
    # Pass the `verbose` setting through rather than a hard-coded 1, so the
    # configuration value defined with the other OU settings takes effect.
    return market_gen(gen=ou_gen(dt=dt,
                                 sigma=sigma,
                                 mu=mu,
                                 tau=tau,
                                 verbose=verbose
                                ),
                      lag=lag)

# Draw one sample path of 1256 windows. Entry `lag - 1` of each window is the
# newest level with `mu` already subtracted by market_gen, so the plotted
# values are deviations from `mu`, not raw prices.
gen = ou_market_gen()

time_series = list(range(1256))
stock_series = [next(gen)[lag - 1] for _ in time_series]

df = pd.DataFrame({'dateindex': time_series, 'stock': stock_series})

make_figure(df['dateindex'], df['stock'],
            title='Simulated Stock Price Data: Ornstein-Uhlenbeck Process',
            xtitle='Timesteps',
            ytitle='Value'
           )

Using 1 lags


In [35]:
# Training run: REINFORCE agent on the Ornstein-Uhlenbeck market.
N_EPISODES = 2000
ticks_per_episode = 1256
nstocks = 1
lag = 1
print("Using %d lags" % lag)

# Use the `nstocks` variable (not a literal) so the environment and the
# agent's state_size below cannot drift apart when it is changed.
env = Market(ou_market_gen,
             lag=lag,
             nstocks=nstocks,
             episode_length=ticks_per_episode)

# market_gen yields `lag` levels plus `lag` changes per stock, hence the *2.
# NOTE(review): `env` is not passed to the agent; presumably REINFORCE_Agent
# reads the notebook-level `env` — confirm.
agent = REINFORCE_Agent(state_size=nstocks*lag*2,
                        action_size=3,
                       )

start_time = time.time()
print("Start: %s" % (time.strftime("%H:%M:%S")))

for e in range(N_EPISODES):
    agent.run_episode()
    agent.score_episode(e, N_EPISODES)

    # Periodic checkpointing is currently disabled:
#     if e and (e+1) % agent.save_interval == 0:
#         agent.save()

elapsed_time = time.time() - start_time
print("\nTrain time: ", elapsed_time)


Using 1 lags
layer 1 size 16, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 16, relu, reg_penalty 0.00000000, dropout 0.000
Model: "functional_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 2)]               0         
_________________________________________________________________
Dense00 (Dense)              (None, 16)                48        
_________________________________________________________________
Dense01 (Dense)              (None, 16)                272       
_________________________________________________________________
Output (Dense)               (None, 3)                 51        
Total params: 371
Trainable params: 371
Non-trainable params: 0
_________________________________________________________________
None
Start: 13:36:47
14:09:19 episode 2000/2000:, score: 13.354438861289609, 10-episode avg: 18.6 Memory: 338.2 MB    

In [36]:
# NOTE(review): rlplot is defined on the agent class outside this excerpt; presumably it charts the training history — confirm.
agent.rlplot()

In [37]:
# Replay the agent and collect one row per timestep.
z = agent.view()

df = pd.DataFrame(z)
df.columns = ["timestep", "action", "lastmarket", "newmarket", "reward"]

# market_gen hands the agent levels with `mu` subtracted; add the same `mu`
# back (instead of a hard-coded 100) so the columns are actual price levels.
for price_col in ("lastmarket", "newmarket"):
    df[price_col] += mu

# One column per action holding the new price where that action was taken and
# NaN elsewhere.
for action_code, action_col in ((0, 'short'), (1, 'flat'), (2, 'long')):
    df[action_col] = df['newmarket'].where(df['action'] == action_code)

df['totalreward'] = df['reward'].cumsum()
df.to_csv('df.csv')
df

prev mkt: -0.0059 action: 2, new mkt 0.4213, reward 0.427151
prev mkt: 0.4213 action: 0, new mkt 0.7904, reward -0.369153
prev mkt: 0.7904 action: 0, new mkt 0.8543, reward -0.063902
prev mkt: 0.8543 action: 0, new mkt 0.5367, reward 0.317645
prev mkt: 0.5367 action: 0, new mkt 0.5935, reward -0.056770
prev mkt: 0.5935 action: 0, new mkt 0.5634, reward 0.030105
prev mkt: 0.5634 action: 0, new mkt 0.4322, reward 0.131181
prev mkt: 0.4322 action: 0, new mkt 0.3835, reward 0.048675
prev mkt: 0.3835 action: 0, new mkt 0.5539, reward -0.170432
prev mkt: 0.5539 action: 0, new mkt 0.5927, reward -0.038768
prev mkt: 0.5927 action: 0, new mkt 0.6827, reward -0.089981
prev mkt: 0.6827 action: 0, new mkt 0.2062, reward 0.476442
prev mkt: 0.2062 action: 0, new mkt 0.4602, reward -0.253913
prev mkt: 0.4602 action: 0, new mkt 0.4591, reward 0.001096
prev mkt: 0.4591 action: 0, new mkt 0.3271, reward 0.131962
prev mkt: 0.3271 action: 0, new mkt 0.3612, reward -0.034110
prev mkt: 0.3612 action: 0, new

prev mkt: -1.4505 action: 2, new mkt -1.6761, reward -0.225561
prev mkt: -1.6761 action: 2, new mkt -1.5014, reward 0.174649
prev mkt: -1.5014 action: 2, new mkt -1.6059, reward -0.104442
prev mkt: -1.6059 action: 2, new mkt -1.8040, reward -0.198126
prev mkt: -1.8040 action: 2, new mkt -1.7870, reward 0.017021
prev mkt: -1.7870 action: 2, new mkt -1.6086, reward 0.178382
prev mkt: -1.6086 action: 2, new mkt -1.8060, reward -0.197439
prev mkt: -1.8060 action: 2, new mkt -1.6850, reward 0.121053
prev mkt: -1.6850 action: 2, new mkt -1.5037, reward 0.181319
prev mkt: -1.5037 action: 2, new mkt -1.3705, reward 0.133151
prev mkt: -1.3705 action: 2, new mkt -1.3026, reward 0.067900
prev mkt: -1.3026 action: 2, new mkt -1.0956, reward 0.206981
prev mkt: -1.0956 action: 2, new mkt -1.0594, reward 0.036234
prev mkt: -1.0594 action: 2, new mkt -1.3835, reward -0.324122
prev mkt: -1.3835 action: 2, new mkt -1.3335, reward 0.050013
prev mkt: -1.3335 action: 2, new mkt -1.6018, reward -0.268284
pr

prev mkt: -0.5151 action: 2, new mkt -0.4774, reward 0.037630
prev mkt: -0.4774 action: 2, new mkt -0.2741, reward 0.203357
prev mkt: -0.2741 action: 2, new mkt -0.4060, reward -0.131887
prev mkt: -0.4060 action: 2, new mkt -0.5192, reward -0.113232
prev mkt: -0.5192 action: 2, new mkt -0.5956, reward -0.076415
prev mkt: -0.5956 action: 2, new mkt -0.5834, reward 0.012213
prev mkt: -0.5834 action: 2, new mkt -0.9892, reward -0.405763
prev mkt: -0.9892 action: 2, new mkt -0.4922, reward 0.496969
prev mkt: -0.4922 action: 2, new mkt -0.3718, reward 0.120433
prev mkt: -0.3718 action: 2, new mkt -0.5445, reward -0.172745
prev mkt: -0.5445 action: 2, new mkt -0.4931, reward 0.051396
prev mkt: -0.4931 action: 2, new mkt -0.2992, reward 0.193928
prev mkt: -0.2992 action: 2, new mkt -0.2899, reward 0.009324
prev mkt: -0.2899 action: 2, new mkt -0.4178, reward -0.127956
prev mkt: -0.4178 action: 2, new mkt -0.0789, reward 0.338903
prev mkt: -0.0789 action: 2, new mkt -0.4122, reward -0.333324
p

prev mkt: -0.7830 action: 2, new mkt -0.8264, reward -0.043416
prev mkt: -0.8264 action: 2, new mkt -0.3429, reward 0.483540
prev mkt: -0.3429 action: 2, new mkt -0.7500, reward -0.407091
prev mkt: -0.7500 action: 2, new mkt -0.9760, reward -0.226021
prev mkt: -0.9760 action: 2, new mkt -1.0574, reward -0.081423
prev mkt: -1.0574 action: 2, new mkt -1.0330, reward 0.024482
prev mkt: -1.0330 action: 2, new mkt -1.0601, reward -0.027155
prev mkt: -1.0601 action: 2, new mkt -0.6909, reward 0.369244
prev mkt: -0.6909 action: 2, new mkt -0.6230, reward 0.067907
prev mkt: -0.6230 action: 2, new mkt -0.4160, reward 0.206975
prev mkt: -0.4160 action: 2, new mkt -0.6246, reward -0.208565
prev mkt: -0.6246 action: 2, new mkt -0.7587, reward -0.134101
prev mkt: -0.7587 action: 2, new mkt -0.7879, reward -0.029235
prev mkt: -0.7879 action: 2, new mkt -0.8581, reward -0.070166
prev mkt: -0.8581 action: 2, new mkt -1.0159, reward -0.157846
prev mkt: -1.0159 action: 2, new mkt -1.0897, reward -0.0738

prev mkt: -1.8585 action: 2, new mkt -1.9545, reward -0.095989
prev mkt: -1.9545 action: 2, new mkt -2.1074, reward -0.152874
prev mkt: -2.1074 action: 2, new mkt -1.8688, reward 0.238629
prev mkt: -1.8688 action: 2, new mkt -1.9554, reward -0.086609
prev mkt: -1.9554 action: 2, new mkt -2.0593, reward -0.103910
prev mkt: -2.0593 action: 2, new mkt -2.0201, reward 0.039207
prev mkt: -2.0201 action: 2, new mkt -2.2000, reward -0.179908
prev mkt: -2.2000 action: 2, new mkt -2.3176, reward -0.117642
prev mkt: -2.3176 action: 2, new mkt -2.2789, reward 0.038672
prev mkt: -2.2789 action: 2, new mkt -2.2388, reward 0.040138
prev mkt: -2.2388 action: 2, new mkt -2.3495, reward -0.110723
prev mkt: -2.3495 action: 2, new mkt -2.1364, reward 0.213150
prev mkt: -2.1364 action: 2, new mkt -2.0776, reward 0.058734
prev mkt: -2.0776 action: 2, new mkt -1.7420, reward 0.335631
prev mkt: -1.7420 action: 2, new mkt -1.6809, reward 0.061132
prev mkt: -1.6809 action: 2, new mkt -1.5854, reward 0.095509
p

prev mkt: -1.1657 action: 2, new mkt -1.0909, reward 0.074841
prev mkt: -1.0909 action: 2, new mkt -1.1599, reward -0.068971
prev mkt: -1.1599 action: 2, new mkt -1.3304, reward -0.170575
prev mkt: -1.3304 action: 2, new mkt -1.1480, reward 0.182414
prev mkt: -1.1480 action: 2, new mkt -1.2217, reward -0.073638
prev mkt: -1.2217 action: 2, new mkt -1.2215, reward 0.000188
prev mkt: -1.2215 action: 2, new mkt -1.0307, reward 0.190757
prev mkt: -1.0307 action: 2, new mkt -0.8556, reward 0.175122
prev mkt: -0.8556 action: 2, new mkt -0.8745, reward -0.018946
prev mkt: -0.8745 action: 2, new mkt -0.8281, reward 0.046427
prev mkt: -0.8281 action: 2, new mkt -0.9241, reward -0.095991
prev mkt: -0.9241 action: 2, new mkt -0.9878, reward -0.063733
prev mkt: -0.9878 action: 2, new mkt -0.8846, reward 0.103225
prev mkt: -0.8846 action: 2, new mkt -0.9004, reward -0.015769
prev mkt: -0.9004 action: 2, new mkt -0.7825, reward 0.117910
prev mkt: -0.7825 action: 2, new mkt -0.6932, reward 0.089232
p

Unnamed: 0,timestep,action,lastmarket,newmarket,reward,short,flat,long,totalreward
0,1,2,99.994131,100.421282,0.427151,,,100.421282,0.427151
1,2,0,100.421282,100.790436,-0.369153,100.790436,,,0.057998
2,3,0,100.790436,100.854338,-0.063902,100.854338,,,-0.005904
3,4,0,100.854338,100.536693,0.317645,100.536693,,,0.311741
4,5,0,100.536693,100.593463,-0.056770,100.593463,,,0.254971
...,...,...,...,...,...,...,...,...,...
1251,1252,0,100.423953,100.564446,-0.140494,100.564446,,,22.440661
1252,1253,0,100.564446,100.481542,0.082905,100.481542,,,22.523566
1253,1254,0,100.481542,100.904155,-0.422614,100.904155,,,22.100952
1254,1255,0,100.904155,101.141709,-0.237554,101.141709,,,21.863398


In [38]:
# Preview the first rows of the replay frame.
df.head()

Unnamed: 0,timestep,action,lastmarket,newmarket,reward,short,flat,long,totalreward
0,1,2,99.994131,100.421282,0.427151,,,100.421282,0.427151
1,2,0,100.421282,100.790436,-0.369153,100.790436,,,0.057998
2,3,0,100.790436,100.854338,-0.063902,100.854338,,,-0.005904
3,4,0,100.854338,100.536693,0.317645,100.536693,,,0.311741
4,5,0,100.536693,100.593463,-0.05677,100.593463,,,0.254971


In [39]:
# Plot the `newmarket` column against `timestep` for the replayed episode.
# make_figure is defined outside this excerpt.
make_figure(df['timestep'], df['newmarket'],
            title='Simulated Stock Price Data: Ornstein-Uhlenbeck Process',
            xtitle='Timesteps',
            ytitle='Value'
           )

In [40]:
# Render the trading-simulation chart for the OU replay frame `df`.
# tradesim_chart is defined outside this excerpt.
tradesim_chart(df, title="Trading Simulation: Ornstein-Uhlenbeck Process")

In [41]:
# Display the replay frame again; the rendered output elides the middle rows.
df

Unnamed: 0,timestep,action,lastmarket,newmarket,reward,short,flat,long,totalreward
0,1,2,99.994131,100.421282,0.427151,,,100.421282,0.427151
1,2,0,100.421282,100.790436,-0.369153,100.790436,,,0.057998
2,3,0,100.790436,100.854338,-0.063902,100.854338,,,-0.005904
3,4,0,100.854338,100.536693,0.317645,100.536693,,,0.311741
4,5,0,100.536693,100.593463,-0.056770,100.593463,,,0.254971
...,...,...,...,...,...,...,...,...,...
1251,1252,0,100.423953,100.564446,-0.140494,100.564446,,,22.440661
1252,1253,0,100.564446,100.481542,0.082905,100.481542,,,22.523566
1253,1254,0,100.481542,100.904155,-0.422614,100.904155,,,22.100952
1254,1255,0,100.904155,101.141709,-0.237554,101.141709,,,21.863398


In [42]:
# Count the timesteps spent in each action (0, 1, 2).
df.groupby('action')[['totalreward']].count()

Unnamed: 0_level_0,totalreward
action,Unnamed: 1_level_1
0,453
1,4
2,799


In [43]:
# Show only the timesteps where the agent chose action 1 (the `flat` column).
df[df['action'].eq(1)]

Unnamed: 0,timestep,action,lastmarket,newmarket,reward,short,flat,long,totalreward
411,412,1,99.645493,99.616857,0.0,,99.616857,,7.588866
738,739,1,99.923399,99.880296,0.0,,99.880296,,15.386409
930,931,1,100.117414,100.054936,0.0,,100.054936,,17.584409
1158,1159,1,99.56827,99.461596,0.0,,99.461596,,20.821155
