# performance
> Exploring the signal generation options


# Data
Download the data so that we have something to work with

In [None]:
#|output: false
! ../scripts/download.sh

Downloading market data (~200 MB)
--2024-03-04 11:09:46--  https://datasets.tardis.dev/v1/bybit/trades/2023/01/01/XRPUSDT.csv.gz
Resolving datasets.tardis.dev (datasets.tardis.dev)... 104.18.40.205, 172.64.147.51
Connecting to datasets.tardis.dev (datasets.tardis.dev)|104.18.40.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1338512 (1.3M) [text/csv]
Saving to: ‘marketdata/20230101.XRP.csv.gz’


2024-03-04 11:09:47 (8.24 MB/s) - ‘marketdata/20230101.XRP.csv.gz’ saved [1338512/1338512]

--2024-03-04 11:09:47--  https://datasets.tardis.dev/v1/bybit/trades/2023/01/01/OPUSDT.csv.gz
Resolving datasets.tardis.dev (datasets.tardis.dev)... 104.18.40.205, 172.64.147.51
Connecting to datasets.tardis.dev (datasets.tardis.dev)|104.18.40.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 415897 (406K) [text/csv]
Saving to: ‘marketdata/20230101.OP.csv.gz’


2024-03-04 11:09:48 (7.12 MB/s) - ‘marketdata/20230101.OP.csv.gz’ saved [415897/415897]

In [None]:
# Move the freshly downloaded marketdata/ directory up one level; later cells read from ../marketdata/.
# NOTE(review): this is not re-runnable as written - a second run presumably fails (or nests the
# directory inside ../marketdata) once the move has already happened. Confirm before Run All.
!mv marketdata/ ..

# Inline Signal Set Calculator

In [None]:
import numpy as np

class SignalSetCalculator:
    """Hand-written signal calculator that tracks the latest trade inline.

    Four signals are maintained: the last trade price and amount, and the
    change of each relative to the previous trade.
    """

    def __init__(self):
        # Nothing observed yet: the "last" values are NaN, so the first
        # update produces NaN differences.
        self.lastTradePrice: float = np.nan
        self.lastTradeAmount: float = np.nan
        self.dTradePrice: float = 0.
        self.dTradeAmount: float = 0.

    def update(self, message: dict):
        """Fold one trade message (needs 'price' and 'amount' keys) into the state."""
        price, amount = message['price'], message['amount']

        # Differences are taken against the previous trade before it is overwritten.
        self.dTradePrice = price - self.lastTradePrice
        self.dTradeAmount = amount - self.lastTradeAmount
        self.lastTradePrice, self.lastTradeAmount = price, amount

    def get(self):
        """Return the current signals as a float64 array.

        Order: lastTradePrice, lastTradeAmount, dTradePrice, dTradeAmount.
        """
        snapshot = (
            self.lastTradePrice,
            self.lastTradeAmount,
            self.dTradePrice,
            self.dTradeAmount,
        )
        return np.array(snapshot, dtype=np.float64)
        


# Ops Driven Calculator

In [None]:
import numpy as np

class InputOp:
    """Root op of the graph: holds the most recent raw input message."""

    def __init__(self):
        # A root op has no upstream inputs; its value starts as an empty dict.
        self.inputs = []
        self.dtype = dict
        self.value = self.dtype()

    def update(self, message: dict):
        """Store ``message`` as the current value (no copy is made)."""
        self.value = message

class GetOp:
    """Extracts a single field from the input message, cast to ``dtype``."""

    def __init__(self, field: str, dtype: np.dtype, inputs: list):
        self.field, self.dtype, self.inputs = field, dtype, inputs
        # Until the first update, the value is the dtype's default (dtype()).
        self.value = dtype()

    def update(self, message: dict):
        """Read ``self.field`` from ``message`` and store it converted to ``self.dtype``."""
        raw = message[self.field]
        self.value = self.dtype(raw)

class DiffOp:
    """Computes the difference between the current and the previous input value.

    For floating-point dtypes the "previous" value starts as NaN, so the first
    difference is NaN (there is no prior observation) instead of the raw input
    itself. This matches SignalSetCalculator, whose last-trade fields also start
    as NaN, and pandas' ``Series.diff()``. Non-float dtypes cannot represent NaN
    and keep the dtype's default (``dtype()``) as the starting point.

    Args:
        dtype: callable scalar type (e.g. ``np.float64``) used for defaults.
        inputs: names of the ops whose values are passed to ``update``.
    """
    def __init__(self, dtype: np.dtype, inputs: list):
        self.dtype = dtype
        self.value = dtype()
        # NaN marks "nothing seen yet" where the dtype can express it.
        self.last = dtype(np.nan) if np.issubdtype(dtype, np.floating) else dtype()
        self.inputs = inputs

    def update(self, x):
        """Set ``value`` to ``x - last`` and remember ``x`` for the next call."""
        self.value = x - self.last
        self.last = x

class SignalSetCalculatorOps:
    """Signal calculator assembled from small ops wired together by attribute name.

    Each op lists the attribute names of its inputs; ``updatesignal`` looks
    those attributes up on this object and feeds their current values to the op.
    """

    # Evaluation order (also the output order of ``get``): every op appears
    # after the ops named in its ``inputs``.
    _SIGNAL_ORDER = ("lastTradePrice", "lastTradeAmount", "dTradePrice", "dTradeAmount")

    def __init__(self):
        self.input = InputOp()
        self.lastTradePrice = GetOp("price", np.float64, ["input"])
        self.lastTradeAmount = GetOp("amount", np.float64, ["input"])
        self.dTradePrice = DiffOp(np.float64, ["lastTradePrice"])
        self.dTradeAmount = DiffOp(np.float64, ["lastTradeAmount"])

    def updatesignal(self, signal):
        """Call ``signal.update`` with the current values of its named inputs."""
        upstream_values = [getattr(self, name).value for name in signal.inputs]
        signal.update(*upstream_values)

    def update(self, message: dict):
        """Push one message through the graph: input op first, then each signal in order."""
        self.input.update(message)
        for name in self._SIGNAL_ORDER:
            self.updatesignal(getattr(self, name))

    def get(self):
        """Return the current signal values as a float64 array, in evaluation order."""
        values = [getattr(self, name).value for name in self._SIGNAL_ORDER]
        return np.array(values, dtype=np.float64)


# Comparing the two approaches

They should produce the same result. We also want to see how fast each one is.

In [None]:
import pandas as pd
import numpy as np

# Load the 2023-11-01 OP trades file into the notebook-level `data` frame that the
# benchmark functions below read; the trailing `; data` displays the frame.
# NOTE(review): the download log shown earlier only lists 20230101 files - confirm the
# script also fetches 20231101.
data=pd.read_csv("../marketdata/20231101.OP.csv.gz", compression="gzip"); data


Unnamed: 0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount
0,bybit,OPUSDT,1698796801797000,1698796801835449,6487eaa4-ab7e-534d-b5a1-6d9ee482c8a4,sell,1.3961,250.0
1,bybit,OPUSDT,1698796801804000,1698796801842170,7e7537e0-aba4-52f6-8636-90a78e1e035f,sell,1.3961,150.0
2,bybit,OPUSDT,1698796801804000,1698796801842170,bd3dd448-527f-541a-99d9-161d9599d7d3,sell,1.3961,11.7
3,bybit,OPUSDT,1698796806911000,1698796806949468,b40d10f4-1efd-5923-be61-308ab947d073,sell,1.3960,115.6
4,bybit,OPUSDT,1698796809007000,1698796809044363,99bb5885-ff82-559d-88b0-c98cb538d1c2,buy,1.3961,22.2
...,...,...,...,...,...,...,...,...
136125,bybit,OPUSDT,1698883190808000,1698883190853743,d93744d8-62b5-5d60-be9a-0e2efa12da2e,buy,1.4379,219.9
136126,bybit,OPUSDT,1698883190815000,1698883190860478,4e716785-0fa5-5ffc-be29-9a05e252d985,buy,1.4379,15.6
136127,bybit,OPUSDT,1698883190815000,1698883190860478,b4bc5d5b-4a83-5dcd-b20c-6e98f3943893,buy,1.4379,28.0
136128,bybit,OPUSDT,1698883190815000,1698883190860478,81536c25-4cca-545f-a440-137b296277dd,buy,1.4379,298.6


In [None]:
def testSignalCalculator():
    """Replay every row of ``data`` through SignalSetCalculator.

    Returns a list holding the calculator's signal array after every
    1000th row.
    """
    calculator = SignalSetCalculator()
    snapshots = []
    for count, record in enumerate(data.to_dict(orient='records'), start=1):
        calculator.update(record)
        if count % 1000:
            continue  # only sample on every 1000th message
        snapshots.append(calculator.get())
    return snapshots
%timeit testSignalCalculator()


214 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
def testSignalCalculatorOps():
    """Replay every row of ``data`` through SignalSetCalculatorOps.

    Returns a list holding the calculator's signal array after every
    1000th row.
    """
    calculator = SignalSetCalculatorOps()
    snapshots = []
    for count, record in enumerate(data.to_dict(orient='records'), start=1):
        calculator.update(record)
        if count % 1000:
            continue  # only sample on every 1000th message
        snapshots.append(calculator.get())
    return snapshots
%timeit testSignalCalculatorOps()

381 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


We see that the latter approach is almost twice as slow (381 ms vs. 214 ms). Is the added flexibility worth the slowdown?

In [None]:
def are_lists_of_arrays_close(list1, list2, rtol=1e-05, atol=1e-08):
    """Return True when two lists hold pairwise numerically-close arrays.

    Args:
        list1, list2: sequences of array-likes, compared position by position.
        rtol: relative tolerance forwarded to ``np.allclose``.
        atol: absolute tolerance forwarded to ``np.allclose``.

    Returns:
        False if the lists differ in length or any pair is not close
        (NaNs never compare as close); True otherwise, including for two
        empty lists.
    """
    if len(list1) != len(list2):
        return False
    # Forward the tolerances: previously they were accepted but ignored, so
    # callers always got np.allclose's defaults.
    return all(
        np.allclose(array1, array2, rtol=rtol, atol=atol)
        for array1, array2 in zip(list1, list2)
    )


# Cross-check: the inline and ops-driven calculators should agree on every sampled snapshot.
are_lists_of_arrays_close(testSignalCalculator(), testSignalCalculatorOps())

True

But at least both approaches are returning the same result :)

# Alternative - Using array calculations for backtesting

In [None]:
def testSignalCalculatorNumpy():
    """Compute the same four signals with vectorised pandas operations.

    Returns a list of 1-D arrays (price, amount, dTradePrice, dTradeAmount),
    one per every 1000th row of ``data``.
    """
    signal_columns = ['price', 'amount', 'dTradePrice', 'dTradeAmount']
    # assign() works on a copy, so the shared `data` frame is left untouched.
    with_diffs = data.assign(
        dTradePrice=data.price.diff(),
        dTradeAmount=data.amount.diff(),
    )
    # Positions 999, 1999, ... correspond to the 1000th, 2000th, ... rows.
    sampled = with_diffs[signal_columns].iloc[999::1000]
    return list(sampled.to_numpy())
%timeit testSignalCalculatorNumpy()


4.76 ms ± 113 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Cross-check: the vectorised version should agree with the inline calculator on every sampled snapshot.
are_lists_of_arrays_close(testSignalCalculator(), testSignalCalculatorNumpy())

True

We see that direct numpy vector operations are way faster than any iterative looping using python. 

About 45x faster than SignalSetCalculator() (214 ms vs. 4.76 ms) and about 80x faster than SignalSetCalculatorOps() (381 ms vs. 4.76 ms).

# Conclusion

For back-testing, we almost certainly want to use numpy for calculating signals.

# Faster Execution

In [None]:
%pip install numba

Collecting numba
  Obtaining dependency information for numba from https://files.pythonhosted.org/packages/ef/20/94ef7b3afee76f47f3ad2d9dfc64f5cb29a365df00e5c563e7518e761bc0/numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Using cached numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba)
  Obtaining dependency information for llvmlite<0.43,>=0.42.0dev0 from https://files.pythonhosted.org/packages/ba/3a/286d01191e62ddbe645d4a3f1e0d96106a98d3fd7f82441d20ffe93ab669/llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Using cached llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.8 kB)
Using cached numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl (2.6 MB)
Using cached llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl (28.8 MB)
Installing collected packages: llvmlite, numba
Successfully installed llvmlite-0.42.0 numba-0.59.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is av

In [None]:
from numba import njit
@njit
def testSignalCalculator2():
    """Same sampling loop as testSignalCalculator, decorated with numba's @njit.

    Deliberate negative example: calling it raises a numba TypingError (see the
    cell output), because nopython mode cannot determine a type for the plain
    Python class SignalSetCalculator.
    """
    # This global lookup is what the TypingError in the output complains about.
    fc = SignalSetCalculator()
    i=0
    result1=[]
    for row in data.to_dict(orient='records') :
        i += 1
        fc.update(row)
        # Sample the signals on every 1000th row.
        if i%1000 == 0: result1.append(fc.get())
    return result1
%timeit testSignalCalculator2()

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Untyped global name 'SignalSetCalculator': Cannot determine Numba type of <class 'type'>

File "../../../../../../var/folders/l_/l7636tpj5zzd5cx3pwzdd8k00000gn/T/ipykernel_77153/98501214.py", line 4:
<source missing, REPL/exec in use?>


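The above does not work: in nopython mode, `@njit` cannot type an ordinary Python class such as `SignalSetCalculator` (nor the pandas `DataFrame` used in the loop). Numba compiles functions over types it understands; a class would have to be declared with `numba.experimental.jitclass` and fed NumPy arrays to be usable here.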