In [1]:
%load_ext autoreload
%matplotlib inline

In [2]:
%autoreload 2

import sys 
import pandas as pd
import numpy as np
import datetime
from datetime import timezone
import pytz
import random
from pycoingecko import CoinGeckoAPI
import statsmodels.tsa.stattools as ts 


# Custom imports
sys.path.append('..')
from bin.classes.basket import Basket
from bin.classes.coin import Coin
from bin.classes.processor import Processor
from bin.classes.cgprocessor import CGProcessor
from bin.utils import stats, transforms
sys.path.pop()

ModuleNotFoundError: No module named 'utils'

In [None]:
# Window for the historical pull; the tuples are presumably (year, month, day).
start_date = (2020, 1, 1)
end_date = (2021, 10, 31)

# Quote currency handed to the CoinGecko market query (vs_currency).
NAT_CURR = 'usd'

# Candidate lookback lengths; the rolling window uses the middle entry.
LOOKBACKS = [7, 14, 30]
ROLL_WIND = LOOKBACKS[1]

In [None]:
##################
## LOCAL DRIVER ##
##################

# Single processor instance; later cells reuse it (cgp.cg, cgp.data).
cgp = CGProcessor()

# Create portfolio dataframe
# NOTE(review): PORT_IDS is not defined in any visible cell of this notebook, so this
# line raises NameError on a fresh kernel — define it alongside the other constants.
port_val = cgp.create_portfolio(PORT_IDS, start_date, end_date)
# create_portfolio presumably populates cgp.data as a side effect — TODO confirm.
data = cgp.data

# NOTE(review): `data` is bound before normalize() runs; whether it reflects the
# normalization depends on normalize() mutating cgp.data in place — confirm in CGProcessor.
cgp.normalize([], port=True)

In [None]:
# Inspect the frame taken from cgp.data after the portfolio was built.
display(data)

# Tests
Test cointegration of a coin basket $\{coins\} = \{c_1, \dots, c_n\}$,
i.e. check that the set $\{coins\}$ is cointegrated using the Engle-Granger two-step test.
This means there is some STATIONARY linear combination of $\{coins\}$.

<ol>
    <li>Check that each c_i is integrated of order 1, i.e. I(1)</li>
    <li>Create basket of coins, find linear regression.</li>
    <li>Verify spread of basket is cointegrated.</li>
    <li>Define strategy for basket trades.</li>
</ol>

#### Concerns
<ul>
    <li>Should we normalize data before running our regression?</li>
</ul>

#### Conclusions
<ul>
    <li>OLS and Linear Regression yield the same coefficients</li>
    <li>While we usually assume stock data is I(1), it sometimes isn't (e.g. Litecoin, Bitcoin Cash)</li>
    <li>Normalization might help us find a regression line that does cointegrate the portfolio</li>
    <li>We should run this test on different coins in different sized baskets.</li>
</ul>

In [None]:
############
## TEST 1 ##
############

# print("Normalizing...")
# scaler = MinMaxScaler()
# scaler.fit(X, y)
# X_scaled = scaler.transform(X) # TODO: Finish normalization
# X_scaled = pd.DataFrame(X_scaled/X_scaled[0], index=X.index)
# X_scaled.columns = X.columns

# Step 1 of Engle-Granger: keep only the coins whose series pass the I(1) check.
print("Verifying coin price stationarity...")
coins = []
for coin_name in data:
    coin = Coin(coin_name)
    result = coin.is_good(data[coin_name], start_date, end_date)
    if result:
        coins.append(coin)
    else:
        print(coin_name, "is not I(1)")

# The coins kept are the ones that passed the I(1) check. An I(1) series is
# non-stationary in levels, so the earlier "are stationary" wording contradicted
# the failure message above; report them as I(1) instead.
print([coin.name_ for coin in coins], " are I(1) \n")

# Steps 2-3: regress the basket against the target coin and test the spread.
# NOTE(review): if 'bitcoin' itself failed the I(1) check it is absent from `coins`;
# confirm how Basket handles a target that is not among its coins.
basket = Basket(coins, target='bitcoin', processor=cgp)
basket.fit(data)
spread = basket.find_spread(data)
is_cointegrated = basket.is_coint(spread)
print("\n")
print("Cointegration test returned:", is_cointegrated)

# Step 4: derive the entry signals from the spread.
short_entry, long_entry = basket.strat(spread)

# Explore Exchanges on CoinGecko

In [None]:
# Fetch the exchange listing through cgp.cg (presumably the CoinGeckoAPI client).
exchanges = cgp.cg.get_exchanges_list()
print("There are", len(exchanges), "exchanges.")

# Rank a copy by 24h BTC-denominated trade volume, largest first.
by_24hr_vol = list(exchanges)
by_24hr_vol.sort(key=lambda d: d['trade_volume_24h_btc'], reverse=True)
top_volume_ids = [exchange['id'] for exchange in by_24hr_vol[:3]]
print("Top 3 by 24 trading volume:", top_volume_ids)

# Rank a second copy by trust-score rank, ascending.
by_trust_score = list(exchanges)
by_trust_score.sort(key=lambda d: d['trust_score_rank'])
top_trust_ids = [exchange['id'] for exchange in by_trust_score[:3]]
print("Top 3 by trust score:", top_trust_ids)

For now, just use the 100 coins with the highest market cap on CoinGecko.

In [None]:
# Fields kept from each market record returned by the API.
COLS = [
    'id',
    'current_price',
    'high_24h',
    'low_24h',
    'market_cap',
    'total_volume',
]

# Market snapshot quoted in NAT_CURR, tabulated and keyed by coin id.
coin_market = cgp.cg.get_coins_markets(vs_currency=NAT_CURR)
df_market = pd.DataFrame(coin_market, columns=COLS).set_index('id')

In [None]:
# Show the market table, which is indexed by coin id.
df_market

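Since every subset of the 100 coins is a candidate basket, there are $2^{100}$ possible baskets (by the binomial theorem, $\sum_{k} \binom{100}{k} = 2^{100}$), so we need a way of filtering them.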

<b>There are also too many coins to pull all of them at once! We might have to create and host a database that contains updated data. We can pull around 50 or so coins at once.</b>

### Strategy
Take Pearson (Spearman?) correlation of dataframe to filter out some coins.

Idea:
We could train a model that takes price data and outputs which coins are most likely to be cointegrated. I think this could be some kind of … (TODO: this sentence was left unfinished — name the model type).

In [None]:
%%time
# Build the portfolio from the first 50 ids of the market table; the notes above
# this cell say only about 50 coins can be pulled in one request.
port_val = cgp.create_portfolio(
    df_market.index[:50],
    start_date,
    end_date,
)
data = cgp.data
# TODO: Write to CSV
data

In [None]:
# Pairwise correlation between columns (pandas DataFrame.corr defaults to Pearson).
corr = data.corr()
corr

In [None]:
# Keep only the coins whose series pass the I(1) check; `coins` is later read
# as a global by simulate().
print("Verifying coin price stationarity...")
coins = []
for coin_name in data:
    coin = Coin(coin_name)
    result = coin.is_good(data[coin_name], start_date, end_date)
    if result:
        coins.append(coin)
    else:
        print(coin_name, "is not I(1)")

# Narrow the frame to the surviving coins.
data = data[[coin.name_ for coin in coins]]
# The surviving columns passed the I(1) check. I(1) series are non-stationary in
# levels, so the earlier "are stationary" wording contradicted the failure message.
print(list(data.columns), " are I(1) \n")

In [None]:
def test(results, coin_set, data, start_date=start_date, end_date=end_date):
    """Test one basket for cointegration and record it in ``results`` if it passes.

    The basket must pass both the project's ``Basket.is_coint`` check on the
    fitted spread and the statsmodels Engle-Granger test (``ts.coint``), using
    the first (1%) critical value.

    Parameters
    ----------
    results : set
        Accumulator of ``(basket, t_stat)`` tuples. It is mutated in place when
        the basket is cointegrated and is always the object returned.
    coin_set : iterable of Coin
        Coins forming the candidate basket; the first one is the regression target.
    data : DataFrame-like
        Frame indexable by a list of coin names.
    start_date, end_date
        Forwarded to ``Coin.is_good``.

    Returns
    -------
    set
        The same ``results`` object, whether or not anything was added.
        Failures are printed (best effort) and never replace the accumulator.
    """
    try:
        cgp = CGProcessor()
        coin_names = [coin.name_ for coin in coin_set]

        print("Testing cointegration for", coin_names)

        # Create portfolio dataframe
        basket_data = data[coin_names]

        print("Verifying coin price stationarity... \n")
        coins = []
        for coin_name in basket_data:
            coin = Coin(coin_name)
            if not coin.is_good(basket_data[coin_name], start_date, end_date):
                # Actually abandon the basket, as the message says. The earlier
                # `continue` only skipped the coin, so the basket was still
                # fitted on data that included the coin that failed.
                print(coin_name, "is not I(1), scratching basket.")
                return results
            coins.append(coin)

        target = coin_names[0]
        basket = Basket(coins, target=target, processor=cgp)
        basket.fit(basket_data)
        spread = basket.find_spread(basket_data)
        is_cointegrated = basket.is_coint(spread)

        # Use a separate name here: rebinding `results` to this tuple used to
        # hide the accumulator set and made `results.add` fail.
        coint_result = ts.coint(
            basket_data[target],
            basket_data.drop([target], axis=1),
            return_results=True,
        )

        # Check that statsmodels cointegration test also passes
        t_stat = coint_result[0]
        t_crit = coint_result[2][0]  # first critical value (1% level)

        print(t_stat, t_crit)

        if t_stat >= t_crit:
            print("Did not pass statsmodels test.")
            is_cointegrated = False

        print("\n")
        print("Cointegration test returned:", is_cointegrated)

        if is_cointegrated:
            # NOTE(review): this requires Basket instances to be hashable.
            results.add((basket, t_stat))

    except Exception as e:
        # Best effort: report the failure but keep the accumulator intact.
        print(e)

    return results

In [None]:
# Fixed Random Sampler
# num_baskets = 10
# potential_sets = dict()

# for basket_size in range(2, 10):
#     print("Creating baskets of size", basket_size)
#     potential_sets_sample = set()
#     tries = 0
#     while len(potential_sets_sample) < num_baskets:
#         potential_sets_sample.add(tuple(sorted(random.sample(coins, basket_size))))
#         tries += 1
#     potential_sets[basket_size] = potential_sets_sample

# Monte Carlo for the boys
# For now just use it to find cointegrated baskets
# Goal: 
# 1) Randomly sample to find cointegrated basket
# 2) When you do, backtest it against some metric (like PnL)
# 3) Stash results
# 4) Use this to create profit expectation
def simulate(data, n_trials, seed=None):
    """Randomly sample baskets and collect the ones that test as cointegrated.

    Each trial picks a basket size between 2 and 6, draws that many coins from
    the notebook-level ``coins`` list and passes them to ``test``.

    Parameters
    ----------
    data : DataFrame-like
        Price frame forwarded to ``test``.
    n_trials : int
        Number of random baskets to try.
    seed : int, optional
        When given, sampling uses a dedicated ``random.Random(seed)`` so a run
        can be reproduced. When omitted, the global ``random`` state is used.

    Returns
    -------
    dict
        Maps each basket size (2..6) to the set accumulated for that size.
    """
    rng = random.Random(seed) if seed is not None else random

    all_sizes = range(2, 7)
    cointegrated = {size: set() for size in all_sizes}

    # A basket cannot hold more coins than exist; sampling a larger one raises ValueError.
    feasible_sizes = [size for size in all_sizes if size <= len(coins)]
    if not feasible_sizes:
        return cointegrated

    for _ in range(n_trials):
        basket_size = rng.choice(feasible_sizes)
        coin_set = rng.sample(coins, basket_size)
        outcome = test(cointegrated[basket_size], coin_set, data)
        # Only accept a real accumulator back; anything else (None, a tuple)
        # would destroy the results gathered so far.
        if isinstance(outcome, set):
            cointegrated[basket_size] = outcome

    display(cointegrated)
    return cointegrated

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports live in the top cell.
from IPython.utils import io

# Run 100 trials while capturing what they print/display so the cell output is not
# flooded; the captured output stays available on `captured`.
with io.capture_output() as captured:
    cointegrated = simulate(data, 100)

In [None]:
# Inspect the simulation result: a dict keyed by basket size.
cointegrated

In [None]:
# `cointegrated` is a dict keyed by basket size, so iterating it directly yields
# ints. Walk the per-size collections and unpack each (basket, statistic) entry.
for basket_size, found in cointegrated.items():
    if not isinstance(found, set):
        # Skip anything that is not a result set for this size.
        continue
    for basket, t_stat in found:
        print("Stat:", t_stat)
        spread = basket.find_spread(data[[c.name_ for c in basket.coins_]])
        # Count the observations where the spread exceeds 3 * basket.std_.
        print(np.where(spread > 3*basket.std_, True, False).sum())
        basket.strat(spread)