From e0345c8f1672e74c818b5c965f4416f6c303982e Mon Sep 17 00:00:00 2001
From: Tyler Renelle
Date: Fri, 9 Feb 2018 12:21:41 -0800
Subject: [PATCH] Don't early-terminate episodes, then we can fit batch_size &
 remove autoencoder. Switch from abs advantage to Sharpe & cumulative return.
 Conv step_window back-indexing

---
 README.md                |   1 +
 btc_env.py               | 165 +++++++++++++++++++--------------------
 data/data.py             |   6 +-
 hypersearch.py           |  37 +++++----
 visualize/client/App.jsx |  11 ++-
 visualize/server.py      |   8 +-
 6 files changed, 111 insertions(+), 117 deletions(-)

diff --git a/README.md b/README.md
index 92ce9e3..87a33b2 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,7 @@ This project is a [TensorForce](https://github.com/reinforceio/tensorforce)-base
 
 - [Sutton & Barto](http://amzn.to/2EWvnVf): de-facto textbook on RL basics
 - [CS 294](http://rll.berkeley.edu/deeprlcourse/): the modern deep-learning spin on ^.
+- [Machine Learning for Trading](https://www.udacity.com/course/machine-learning-for-trading--ud501): teaches you algo-trading, stock stuff, and applied RL.
 
 This project goes with Episode 26+ of [Machine Learning Guide](http://ocdevel.com/podcasts/machine-learning). Those episodes are tutorial for this project; including an intro to Deep RL, hyperparameter decisions, etc.
 
diff --git a/btc_env.py b/btc_env.py
index e9cd7ea..2ccbfea 100755
--- a/btc_env.py
+++ b/btc_env.py
@@ -10,7 +10,7 @@
 env back to Gym format. Anyone wanna give it a go?
 """
 
-import random, time, requests, pdb, gdax
+import random, time, requests, pdb, gdax, math
 from enum import Enum
 import numpy as np
 import pandas as pd
@@ -59,12 +59,10 @@ class Scaler(object):
     def __init__(self):
         self.scalers = {
             self.REWARD: RobustScaler(quantile_range=(5., 95.)),
-            self.SERIES: RobustScaler(quantile_range=(5., 95.)),
             self.STATIONARY: RobustScaler(quantile_range=(5., 95.))
         }
         self.data = {
             self.REWARD: [],
-            self.SERIES: [],
             self.STATIONARY: []
         }
         self.done = False
@@ -103,12 +101,10 @@ def transform(self, input, kind, force=False):
 # keep this globally around for all runs forever
 scalers = {}
 
-# We don't want random-seeding for reproducibilityy! We _want_ two runs to give different results, because we only
-# trust the hyper combo which consistently gives positive results!
-ALLOW_SEED = False
-
 
 class BitcoinEnv(Environment):
+    EPISODE_LEN = 5000
+
     def __init__(self, hypers, name='ppo_agent'):
         """Initialize hyperparameters (done here instead of __init__ since OpenAI-Gym controls instantiation)"""
         self.hypers = Box(hypers)
@@ -118,7 +114,7 @@ def __init__(self, hypers, name='ppo_agent'):
         # cash/val start @ about $3.5k each. You should increase/decrease depending on how much you'll put into your
         # exchange accounts to trade with. Presumably the agent will learn to work with what you've got (cash/value
         # are state inputs); but starting capital does effect the learning process.
-        self.start_cash, self.start_value = .3, .3
+        self.start_cash, self.start_value = 1., 1.
 
         # We have these "accumulator" objects, which collect values over steps, over episodes, etc. Easier to keep
         # same-named variables separate this way.
@@ -126,10 +122,11 @@ def __init__(self, hypers, name='ppo_agent'):
             episode=dict(
                 i=0,
                 total_steps=0,
-                advantages=[],
+                sharpes=[],
+                returns=[],
                 uniques=[]
             ),
-            step=dict(i=0),  # setup in reset()
+            step=dict(),  # setup in reset()
             tests=dict(
                 i=0,
                 n_tests=0
@@ -150,8 +147,8 @@ def __init__(self, hypers, name='ppo_agent'):
         # Our data is too high-dimensional for the way MemoryModel handles batched episodes.
        # Reduce it (don't like this)
         all_data = data.db_to_dataframe(self.conn, arbitrage=self.hypers.arbitrage)
-        self.all_observations, self.all_prices = self._xform_data(all_data)
-        self.all_prices_diff = self._diff(self.all_prices, percent=True)
+        self.all_observations, self.all_prices = self.xform_data(all_data)
+        self.all_prices_diff = self.diff(self.all_prices, percent=True)
 
         # Calculate a possible reward to be used as an average for repeat-punishing
         self.possible_reward = self.start_value * np.median([p for p in self.all_prices_diff if p > 0])
@@ -193,13 +190,9 @@ def states(self): return self.states_
     @property
     def actions(self): return self.actions_
 
-    def seed(self, seed=None):
-        if not ALLOW_SEED: return
-        # self.np_random, seed = seeding.np_random(seed)
-        # return [seed]
-        random.seed(seed)
-        np.random.seed(seed)
-        tf.set_random_seed(seed)
+    # We don't want random-seeding for reproducibility! We _want_ two runs to give different results, because we only
+    # trust the hyper combo which consistently gives positive results.
+    def seed(self, seed=None): return
 
     def update_btc_price(self):
         try:
@@ -207,7 +200,7 @@ def update_btc_price(self):
         except:
             self.btc_price = self.btc_price or 8000
 
-    def _diff(self, arr, percent=False):
+    def diff(self, arr, percent=False):
         series = pd.Series(arr)
         diff = series.pct_change() if percent else series.diff()
         diff.iloc[0] = 0  # always NaN, nothing to compare to
@@ -219,14 +212,14 @@ def _diff(self, arr, percent=False):
         # then forward-fill the NaNs.
         return diff.replace([np.inf, -np.inf], np.nan).ffill().bfill().values
 
-    def _xform_data(self, df):
+    def xform_data(self, df):
         columns = []
         use_indicators = self.hypers.indicators and self.hypers.indicators > 100
         tables_ = data.get_tables(self.hypers.arbitrage)
         percent = self.hypers.pct_change
         for table in tables_:
             name, cols, ohlcv = table['name'], table['cols'], table.get('ohlcv', {})
-            columns += [self._diff(df[f'{name}_{k}'], percent) for k in cols]
+            columns += [self.diff(df[f'{name}_{k}'], percent) for k in cols]
 
             # Add extra indicator columns
             if ohlcv and use_indicators:
@@ -236,10 +229,10 @@ def _xform_data(self, df):
                     ind[k] = df[f"{name}_{v}"]
                 columns += [
                     # TODO this is my naive approach, I'm not a TA expert. Could use a second pair of eyes
-                    self._diff(SMA(ind, timeperiod=self.hypers.indicators), percent),
-                    self._diff(EMA(ind, timeperiod=self.hypers.indicators), percent),
-                    self._diff(RSI(ind, timeperiod=self.hypers.indicators), percent),
-                    self._diff(ATR(ind, timeperiod=self.hypers.indicators), percent),
+                    self.diff(SMA(ind, timeperiod=self.hypers.indicators), percent),
+                    self.diff(EMA(ind, timeperiod=self.hypers.indicators), percent),
+                    self.diff(RSI(ind, timeperiod=self.hypers.indicators), percent),
+                    self.diff(ATR(ind, timeperiod=self.hypers.indicators), percent),
                 ]
 
         states = np.column_stack(columns)
@@ -258,18 +251,17 @@ def _xform_data(self, df):
         # Currently we're reducing the dimensionality of our states (OHLCV + indicators + arbitrage => 5 or 6 weights)
         # because TensorForce's memory branch changed Policy Gradient models' batching from timesteps to episodes.
        # This takes of way too much GPU RAM for us, so we had to cut back in quite a few areas (num steps to train
-        # per episode, episode batch_size, and especially this:)
-        ae = AutoEncoder()
-        states = ae.fit_transform_tied(states)
+        # per episode, episode batch_size, and especially this:
+        # ae = AutoEncoder()
+        # states = ae.fit_transform_tied(states)
 
         return states, prices
 
-    def use_dataset(self, mode, no_kill=False):
+    def use_dataset(self, mode, full_set=False):
         """Fetches, transforms, and stores the portion of data you'll be working with (ie, 80% train data, 20% test
         data, or the live database). Make sure to call this before reset()!
         """
         self.mode = mode
-        self.no_kill = no_kill
         if mode in (Mode.LIVE, Mode.TEST_LIVE):
             self.conn = data.engine_live.connect()
             # Work with 6000 timesteps up until the present (play w/ diff numbers, depends on LSTM)
@@ -285,25 +277,24 @@ def use_dataset(self, mode, no_kill=False):
             split = .9  # Using 90% training data.
             n_train, n_test = int(row_ct * split), int(row_ct * (1 - split))
             if mode == mode.TEST:
-                limit, offset = n_test, n_train
-                if no_kill is False:
-                    limit = 50000  # he's not likely to get past that, so save some RAM (=time)
+                offset = n_train
+                limit = 30000 if full_set else 8000  # should be `n_test` in full_set, getting idx errors
             else:
                 # Grab a random window from the 90% training data. The random bit is important so the agent
                 # sees a variety of data. The window-size bit is a hack: as long as the agent doesn't die (doesn't cause
                 # `terminal=True`), PPO's MemoryModel can keep filling up until it crashes TensorFlow. This ensures
                 # there's a stopping point (limit). I'd rather see how far he can get w/o dying, figure out a solution.
-                limit = 6000
-                offset = random.randint(0, n_train - limit)
+                limit = self.EPISODE_LEN
+                offset_start = 0 if not self.conv2d else self.hypers.step_window + 1
+                offset = random.randint(offset_start, n_train - self.EPISODE_LEN)
 
-        # self.observations, self.prices = self._xform_data(df)
-        # self.prices_diff = self._diff(self.prices, percent=True)
-        self.observations = self.all_observations[offset:offset+limit]
+        self.offset, self.limit = offset, limit
         self.prices = self.all_prices[offset:offset+limit]
         self.prices_diff = self.all_prices_diff[offset:offset+limit]
 
-    def _get_next_state(self, i, cash, value, repeats):
-        series = self.observations[i]
+    def get_next_state(self, i, cash, value, repeats):
+        i = i + self.offset
+        series = self.all_observations[i]
         stationary = [cash, value, repeats]
         if self.hypers.scale:
             # series already scaled in self._xform_data()
@@ -315,27 +306,24 @@ def _get_next_state(self, i, cash, value, repeats):
             # Take note of the +1 here. LSTM uses a single index [i], which grabs the list's end. Conv uses a window,
             # [-something:i], which _excludes_ the list's end (due to Python indexing). Without this +1, conv would
             # have a 1-step-behind delayed response.
-            window = self.observations[i - self.hypers.step_window + 1:i + 1]
+            window = self.all_observations[i - self.hypers.step_window + 1:i + 1]
             series = np.expand_dims(window, axis=1)
         return dict(series=series, stationary=stationary)
 
     def reset(self):
         step_acc, ep_acc = self.acc.step, self.acc.episode
-        # Cash & value are the real scores - how much we end up with at the end of an episode
+        step_acc.i = 0
         step_acc.cash, step_acc.value = self.start_cash, self.start_value
-        # But for our purposes, we care more about "how much better is what we made than if we held". We're training
-        # a trading bot, not an investing bot.
-        # So we compare these at the end, calling it "advantage"
-        step_acc.hold = Box(value=self.start_cash, cash=self.start_value)
-        start_timestep = 1
-        if self.conv2d:
-            # for conv2d, start at the end of the first window (grab a full window)
-            start_timestep = self.hypers.step_window
-        step_acc.i = start_timestep
-        step_acc.signals = [0] * start_timestep
+        step_acc.hold_value = self.start_value
+        step_acc.totals = Box(
+            trade=[self.start_cash + self.start_value],
+            hold=[self.start_cash + self.start_value]
+        )
+        step_acc.signals = []
         step_acc.repeats = 0
         ep_acc.i += 1
-        return self._get_next_state(start_timestep, self.start_cash, self.start_value, 0.)
+        return self.get_next_state(0, self.start_cash, self.start_value, 0.)
 
     def execute(self, actions):
         if self.hypers.single_action:
@@ -360,32 +348,35 @@ def execute(self, actions):
         }[EXCHANGE]
         reward = 0
         abs_sig = abs(signal)
-        before = Box(cash=step_acc.cash, value=step_acc.value, total=step_acc.cash+step_acc.value)
+        total_before = step_acc.cash + step_acc.value
 
         # Perform the trade. In training mode, we'll let it dip into negative here, but then kill and punish below.
         # In testing/live, we'll just block the trade if they can't afford it
-        if signal > 0 and not (self.no_kill and abs_sig > step_acc.cash):
+        if signal > 0 and abs_sig <= step_acc.cash:
             step_acc.value += abs_sig - abs_sig*fee
             step_acc.cash -= abs_sig
-        elif signal < 0 and not (self.no_kill and abs_sig > step_acc.value):
+        elif signal < 0 and abs_sig <= step_acc.value:
             step_acc.cash += abs_sig - abs_sig*fee
             step_acc.value -= abs_sig
 
         # next delta. [1,2,2].pct_change() == [NaN, 1, 0]
-        diff_loc = step_acc.i + 1
-        pct_change = self.prices_diff[diff_loc]
+        pct_change = self.prices_diff[step_acc.i + 1]
+
         step_acc.value += pct_change * step_acc.value
-        total = step_acc.value + step_acc.cash
-        reward += total - before.total
+        total_now = step_acc.value + step_acc.cash
+        step_acc.totals.trade.append(total_now)
+        # Reward is in dollar-change. As we build a great portfolio, the reward should get bigger and bigger (and
+        # the agent should notice this)
+        reward += (total_now - total_before)
 
         # calculate what the reward would be "if I held", to calculate the actual reward's _advantage_ over holding
-        before = step_acc.hold
-        before.value += pct_change * before.value
+        step_acc.hold_value += pct_change * step_acc.hold_value
+        step_acc.totals.hold.append(step_acc.hold_value + self.start_cash)
 
         # Collect repeated same-action count (homogeneous actions punished below)
         recent_actions = np.array(step_acc.signals[-step_acc.repeats:])
         if np.any(recent_actions > 0) and np.any(recent_actions < 0) and np.any(recent_actions == 0):
             step_acc.repeats = 0  # reset repeat counter
-        else:
+        elif self.hypers.punish_repeats < self.EPISODE_LEN:
             step_acc.repeats += 1
         # by the time we hit punish_repeats, we're doubling punishments / canceling rewards. Note: we don't want to
         # multiply by `reward` here because repeats are often 0, which means 0 penalty.
        # Hence `possible_reward`
@@ -396,16 +387,11 @@ def execute(self, actions):
         step_acc.i += 1
         ep_acc.total_steps += 1
 
-        next_state = self._get_next_state(step_acc.i, step_acc.cash, step_acc.value, step_acc.repeats)
+        next_state = self.get_next_state(step_acc.i, step_acc.cash, step_acc.value, step_acc.repeats)
         if self.hypers.scale:
             reward = self.scaler.transform([reward], Scaler.REWARD)[0]
 
-        terminal = int(step_acc.i + 1 >= len(self.observations))
-        # Kill and punish if (a) agent ran out of money; (b) is doing nothing for way too long
-        # The repeats bit isn't just for punishment, but because training can get stuck too long on losers
-        if not self.no_kill and (step_acc.cash < 0 or step_acc.value < 0 or step_acc.repeats >= self.hypers.punish_repeats):
-            reward -= 1.  # Big penalty. BTC, like $12k
-            terminal = True
+        terminal = int(step_acc.i + 1 >= self.limit)
         if terminal and self.mode in (Mode.TRAIN, Mode.TEST):
             # We're done.
             step_acc.signals.append(0)  # Add one last signal (to match length)
@@ -455,8 +441,8 @@ def execute(self, actions):
                     time.sleep(20)
                 self.last_timestamp = new_timestamp
                 self.df = pd.concat([self.df.iloc[-1000:], new_data], axis=0)  # shed some used data, add new
-                self.observations, self.prices = self._xform_data(self.df)
-                self.prices_diff = self._diff(self.prices, percent=True)
+                self.observations, self.prices = self.xform_data(self.df)
+                self.prices_diff = self.diff(self.prices, percent=True)
                 step_acc.i = self.df.shape[0] - n_new - 1
 
                 if live:
@@ -475,22 +461,31 @@ def execute(self, actions):
 
     def episode_finished(self, runner):
         step_acc, ep_acc, test_acc = self.acc.step, self.acc.episode, self.acc.tests
         signals = step_acc.signals
-
-        advantage = ((step_acc.cash + step_acc.value) - (self.start_cash + self.start_value)) - \
-                    ((step_acc.hold.value + step_acc.hold.cash) - (self.start_cash + self.start_value))
-        # per step average advantage, then bring it to a reasonable number (up from ~.0001)
-        advantage = advantage / step_acc.i * 10000
-        if advantage == 0.: advantage = -.01  # no HODLing!
-        self.acc.episode.advantages.append(advantage)
+        totals = step_acc.totals
         n_uniques = float(len(np.unique(signals)))
-        self.acc.episode.uniques.append(n_uniques)
+
+        # Calculate the Sharpe ratio.
+        diff = (pd.Series(totals.trade).pct_change() - pd.Series(totals.hold).pct_change())[1:]
+        mean, std, sharpe = diff.mean(), diff.std(), 0
+        if (std, mean) != (0, 0):
+            # Usually Sharpe has `sqrt(num_trades)` in front (or `num_trading_days`?). Experimenting being creative w/
+            # trade-diversity, etc.
+            # Give Sharpe some extra info
+            # breadth = math.sqrt(np.uniques(signals))
+            breadth = np.std([np.sign(x) for x in signals])  # get signal direction, amount not as important (and adds complications)
+            sharpe = breadth * (mean / std)
+
+        cumm_ret = (totals.trade[-1] / totals.trade[0] - 1) - (totals.hold[-1] / totals.hold[0] - 1)
+
+        ep_acc.sharpes.append(float(sharpe))
+        ep_acc.returns.append(float(cumm_ret))
+        ep_acc.uniques.append(n_uniques)
 
         # Print (limit to note-worthy)
         lt_0 = len([s for s in signals if s < 0])
         eq_0 = len([s for s in signals if s == 0])
         gt_0 = len([s for s in signals if s > 0])
         completion = int(test_acc.i / test_acc.n_tests * 100)
-        print(f"{completion}%\tSteps: {step_acc.i}\tAdvantage: {'%.3f'%advantage}\tTrades:\t{lt_0}[<0]\t{eq_0}[=0]\t{gt_0}[>0]")
+        print(f"{completion}%\tSteps: {step_acc.i}\tSharpe: {'%.3f'%sharpe}\tReturn: {'%.3f'%cumm_ret}\tTrades:\t{lt_0}[<0]\t{eq_0}[=0]\t{gt_0}[>0]")
         return True
 
     def run_deterministic(self, runner, print_results=True):
@@ -515,8 +510,8 @@ def train_and_test(self, agent, n_steps, n_tests, early_stop):
                 self.use_dataset(Mode.TEST)
                 self.run_deterministic(runner, print_results=True)
                 if early_stop > 0:
-                    advantages = np.array(self.acc.episode.advantages[-early_stop:])
-                    if test_acc.i >= early_stop and np.all(advantages > 0):
+                    sharpes = np.array(self.acc.episode.sharpes[-early_stop:])
+                    if test_acc.i >= early_stop and np.all(sharpes > 0):
                         test_acc.i = n_tests
                 test_acc.i += 1
         except KeyboardInterrupt:
@@ -527,7 +522,7 @@ def train_and_test(self, agent, n_steps, n_tests, early_stop):
 
         # On last "how would it have done IRL?" run, without getting in the way (no killing on repeats, 0-balance)
         print('Running no-kill test-set')
-        self.use_dataset(Mode.TEST, no_kill=True)
+        self.use_dataset(Mode.TEST, full_set=True)
         self.run_deterministic(runner, print_results=True)
 
     def run_live(self, agent, test=True):
@@ -544,5 +539,5 @@ def run_live(self, agent, test=True):
         print(f'Starting total: {self.start_cash + self.start_value}')
 
         runner = Runner(agent=agent, environment=self)
-        self.use_dataset(Mode.TEST_LIVE if test else Mode.LIVE, no_kill=True)
+        self.use_dataset(Mode.TEST_LIVE if test else Mode.LIVE)
         self.run_deterministic(runner, print_results=True)
diff --git a/data/data.py b/data/data.py
index 1ee5f40..8e11574 100644
--- a/data/data.py
+++ b/data/data.py
@@ -298,9 +298,9 @@ def setup_runs_table():
         (
             id serial not null,
             hypers jsonb not null,
-            advantage_avg double precision not null,
-            advantages double precision[],
-            actions double precision[],
+            sharpes double precision[],
+            returns double precision[],
+            signals double precision[],
             prices double precision[],
             uniques double precision[],
             flag varchar(16),
diff --git a/hypersearch.py b/hypersearch.py
index 94ce97e..0d2c820 100755
--- a/hypersearch.py
+++ b/hypersearch.py
@@ -95,7 +95,7 @@ def build_net_spec(hypers):
 
         # This is just my hunch from CNNs I've seen; the filter sizes are much smaller than the downstream denses
         # (like 32-64-64 -> 512-256). If anyone has better intuition...
-        size = max([8, int(net.width // 5)])
+        size = max([32, int(net.width // 4)])
         # if i == 0: size = int(size / 2)  # Most convs have their first layer smaller... right? just the first, or what?
        arr.append({
            'size': size,
@@ -223,7 +223,7 @@ def hydrate_baseline(x, flat):
     }
     hypers['memory_model'] = {
         'update_mode.unit': 'episodes',
-        'update_mode.batch_size': 8,  # {
+        'update_mode.batch_size': 4,  # {
        #     'type': 'bounded',
        #     'vals': [1, 10],
        #     'guess': 10,
@@ -231,14 +231,14 @@ def hydrate_baseline(x, flat):
        #     'pre': round
        # },
         'update_mode.frequency': {
             'type': 'bounded',
-            'vals': [1, 8],
-            'guess': 8,
+            'vals': [1, 4],
+            'guess': 4,
             'pre': round
         },
         'memory.type': 'latest',
         'memory.include_next_states': False,
-        'memory.capacity': 100000,  # { TODO does this matter?
+        'memory.capacity': BitcoinEnv.EPISODE_LEN * 4,  # {
        #     'type': 'bounded',
        #     'vals': [2000, 20000],
        #     'guess': 5000
@@ -320,7 +320,7 @@ def hydrate_baseline(x, flat):
     'indicators': {
         'type': 'bounded',
         'vals': [0, 600],
-        'guess': 600,
+        'guess': 300,
         'pre': int,
         'hydrate': min_threshold(100, False)
     },
@@ -328,7 +328,7 @@ def hydrate_baseline(x, flat):
     'net.depth_mid': {
         'type': 'bounded',
         'vals': [1, 3],
-        'guess': 2,
+        'guess': 3,
         'pre': round
     },
     # Dense layers
@@ -384,7 +384,6 @@ def hydrate_baseline(x, flat):
         'hydrate': min_ten_neg(1e-6, 0.)
     },
-
 
     # Instead of using absolute price diffs, use percent-change.
     'pct_change': {
         'type': 'bool',
@@ -406,8 +405,8 @@ def hydrate_baseline(x, flat):
     # spanking. I didn't raise no investor, I raised a TRADER
     'punish_repeats': {
         'type': 'bounded',
-        'vals': [1000, 5000],
-        'guess': 5000,
+        'vals': [1000, BitcoinEnv.EPISODE_LEN * 1.5],  # more than ep len means don't punish
+        'guess': 1000,
         'pre': int
     },
@@ -582,27 +581,27 @@ def execute(self, actions):
         env.train_and_test(agent, self.cli_args.n_steps, self.cli_args.n_tests, -1)
 
         step_acc, ep_acc = env.acc.step, env.acc.episode
-        adv_avg = utils.calculate_score(ep_acc.advantages)
+        adv_avg = utils.calculate_score(ep_acc.sharpes)
         print(flat, f"\nAdvantage={adv_avg}\n\n")
 
         sql = """
-          insert into runs (hypers, advantage_avg, advantages, uniques, prices, actions, agent, flag)
-          values (:hypers, :advantage_avg, :advantages, :uniques, :prices, :actions, :agent, :flag)
+          insert into runs (hypers, sharpes, returns, uniques, prices, signals, agent, flag)
+          values (:hypers, :sharpes, :returns, :uniques, :prices, :signals, :agent, :flag)
           returning id;
         """
         row = self.conn_runs.execute(
             text(sql),
             hypers=json.dumps(flat),
-            advantage_avg=adv_avg,
-            advantages=list(ep_acc.advantages),
+            sharpes=list(ep_acc.sharpes),
+            returns=list(ep_acc.returns),
             uniques=list(ep_acc.uniques),
             prices=list(env.prices),
-            actions=list(step_acc.signals),
+            signals=list(step_acc.signals),
             agent=self.agent,
             flag=self.cli_args.net_type
         ).fetchone()
-        if ep_acc.advantages[-1] > 0:
+        if ep_acc.sharpes[-1] > 0:
             _id = str(row[0])
             directory = os.path.join(os.getcwd(), "saves", _id)
             filestar = os.path.join(directory, _id)
@@ -762,13 +761,13 @@ def loss_fn(params):
     # Every iteration, re-fetch from the database & pre-train new model.
    # Acts same as saving/loading a model to disk,
     # but this allows to distribute across servers easily
     conn_runs = data.engine_runs.connect()
-    sql = "select hypers, advantages, advantage_avg from runs where flag=:f"
+    sql = "select hypers, sharpes from runs where flag=:f"
     runs = conn_runs.execute(text(sql), f=args.net_type).fetchall()
     conn_runs.close()
     X, Y = [], []
     for run in runs:
         X.append(hypers2vec(run.hypers))
-        Y.append([utils.calculate_score(run.advantages)])
+        Y.append([utils.calculate_score(run.sharpes)])
     boost_model = print_feature_importances(X, Y, feat_names)
 
     if args.guess != -1:
diff --git a/visualize/client/App.jsx b/visualize/client/App.jsx
index 500b410..58c163f 100644
--- a/visualize/client/App.jsx
+++ b/visualize/client/App.jsx
@@ -23,11 +23,10 @@ class App extends Component {
   componentDidMount() {
     fetch('http://localhost:5000').then(res => res.json()).then(data => {
       data.forEach(d => {
-        d.reward_avg = d.advantage_avg
         d.hypers = _.transform(d.hypers, (m,v,k) => {
           m[k.replace(/\./g, '_')] = typeof v == 'boolean' ? ~~v : v;
         });
-        d.unique_sigs = _.uniq(d.actions).length;
+        d.unique_sigs = _.uniq(d.signals).length;
       });
       this.forceRerender = true;
       this.setState({data});
@@ -146,7 +145,7 @@ class App extends Component {
     svg.select('g').remove(); // start clean
     let g = svg.append("g").attr("transform", "translate(" + margin.left + "," + margin.top + ")");
 
-    let rewards = data.map(d => d.advantages.map((v,i) => {
+    let rewards = data.map(d => d.sharpes.map((v,i) => {
      let y = v; // just human
      // let y = (d.rewards_agent[i] + v)/2; // human-agent average
      // y = _.clamp(y, -100, 100); // clamp so we don't break the graph
@@ -268,11 +267,11 @@ class App extends Component {
   mountSignals = () => {
     const {id} = this.clickedDatum;
-    fetch(`http://localhost:5000/actions/${id}`).then(res => res.json()).then(this._mountSignals);
+    fetch(`http://localhost:5000/signals/${id}`).then(res => res.json()).then(this._mountSignals);
   };
 
   _mountSignals = (data) => {
-    let {actions, prices} = data;
+    let {signals, prices} = data;
     let svg = d3.select("svg#signals");
     svg.select('g').remove(); // start fresh
 
@@ -325,7 +324,7 @@ class App extends Component {
       .enter()
       .append("circle")
       .classed('dot', true)
-      .style('fill', (d,i) => actions[i] < 0 ? 'red' : actions[i] > 0 ? 'green' : 'rgba(0,0,0,0)')
+      .style('fill', (d,i) => signals[i] < 0 ? 'red' : signals[i] > 0 ? 'green' : 'rgba(0,0,0,0)')
       .attr("r", 1)
       .attr("cx", (d,i) => x(i))
      .attr("cy", d => y(d));
diff --git a/visualize/server.py b/visualize/server.py
index cbddfa9..d9c69ca 100644
--- a/visualize/server.py
+++ b/visualize/server.py
@@ -15,9 +15,9 @@ def get_runs():
     rows = []
     conn = engine_runs.connect()
     # TODO prices/actions in separate route
-    for row in conn.execute('select id, hypers, advantage_avg, advantages, uniques from runs').fetchall():
+    for row in conn.execute('select id, hypers, sharpes, returns, uniques from runs').fetchall():
         row = dict(row.items())
-        row['advantage_avg'] = utils.calculate_score(row['advantages'])
+        row['reward_avg'] = utils.calculate_score(row['sharpes'])
         rows.append(row)
     conn.close()
 
@@ -25,10 +25,10 @@ def get_runs():
     return jsonify(rows)
 
 
-@app.route("/actions/<run_id>")
+@app.route("/signals/<run_id>")
 def get_actions(run_id):
     conn = engine_runs.connect()
-    query = 'select actions, prices from runs where id=:run_id'
+    query = 'select signals, prices from runs where id=:run_id'
     row = conn.execute(text(query), run_id=run_id).fetchone()
     conn.close()
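
Reviewer note (not part of the patch): below is a minimal standalone sketch of the new episode scoring that `episode_finished()` introduces above -- a Sharpe-style ratio of trading vs. a buy-and-hold baseline, weighted by signal "breadth", plus the excess cumulative return. It assumes only numpy/pandas; the function name and the toy inputs are illustrative, not from the repo, and the `std > 0` guard is slightly stricter than the patch's `(std, mean) != (0, 0)` check so the division can't blow up.

import numpy as np
import pandas as pd


def score_episode(trade_totals, hold_totals, signals):
    """Return (sharpe, cumulative_return) for one episode.

    trade_totals / hold_totals: total portfolio value (cash + value) at each step.
    signals: the raw trade signal emitted at each step.
    """
    trade, hold = pd.Series(trade_totals), pd.Series(hold_totals)

    # Per-step excess return of trading over holding; drop the leading NaN from pct_change()
    diff = (trade.pct_change() - hold.pct_change()).iloc[1:]
    mean, std = diff.mean(), diff.std()

    sharpe = 0.
    if std > 0:  # guard the division (the patch checks `(std, mean) != (0, 0)`)
        # "breadth" weights the ratio by how varied the signal directions were
        breadth = np.std(np.sign(signals))
        sharpe = breadth * (mean / std)

    # Excess cumulative return: trading's total return minus holding's total return
    cum_ret = (trade.iloc[-1] / trade.iloc[0] - 1) - (hold.iloc[-1] / hold.iloc[0] - 1)
    return float(sharpe), float(cum_ret)


if __name__ == '__main__':
    # Toy numbers only: trading slightly outperforms buy-and-hold here
    print(score_episode(
        trade_totals=[2.0, 2.1, 2.05, 2.3],
        hold_totals=[2.0, 2.05, 2.0, 2.2],
        signals=[0, 40, -40, 40],
    ))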