In [None]:
from ai.genetic import Genetic
from ai.interval import Interval
from ai.trainer import Trainer
from ai.extrema import Extrema
from scraper.cryptoreq import CryptoReq
from scraper.investreq import InvestReq
from scraper.trendreq import TrendReq
from scraper.tweetreq import TweetReq
from utils.database import Database
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
from ai.minmax import Minmax
from statsmodels.tsa.stattools import adfuller

# Before you begin

Make sure to fill in the cryptocurrency names in the gui on http://localhost:5000

# Test database connection

In [None]:
# Check that the database connection works by calling the Database helper's test method.
Database().test()

# Getting names

When accessing the table with cryptocurrency names, you can fetch either just the names or, with `symbols=True`, the names together with their symbols.

In [None]:
# With symbols=True every returned entry unpacks into a (name, symbol) pair.
names = Database().get_names(symbols=True)
for name, symbol in names:
    print(name, symbol)

# Blank line separating the two listings.
print()

# Without arguments only the names are returned; print one per line.
names = Database().get_names()
for name in names:
    print(name)

# Cryptocurrency price values

The initial approach to getting cryptocurrency price values was to send requests to the coindesk.com server.
This approach was later discarded as there was no way of accessing data older than 24 hours.
Downloading cryptocurrency data this way no longer works due to the provider changing the format and access method of the data.

- `timeout` determines the time it takes to complete one cycle of calls
- `short_timeout` determines how long to wait after each call
- `scraping_window` sets the interval between `dt.now()` and the starting timestamp

In [None]:
# Start the coindesk scraping loop with the parameters described above.
# NOTE: according to the text above, this data source no longer works.
CryptoReq(timeout=30, short_timeout=30, scraping_window=7200).loop()

# Scraping data

The data is downloaded from the investing.com servers. The user needs to add a custom symbol in the GUI under which the cryptocurrency is accessible. This symbol can easily be found by tracking the site's network communication. With this approach you can download data further back in history, which allows the app to start making predictions within a few minutes of the initial start.

- `timeout` will determine the time it takes to complete one cycle of calls
- `short_timeout` determines how long to wait after each call.

In [None]:
# Start the investing.com scraping loop.
# NOTE(review): scrape_days is not described in the list above; presumably it is the
# number of days of history to download, as for TrendReq — confirm.
InvestReq(timeout=60, short_timeout=5, scrape_days=30).loop()

# Downloading the data from Google Search

We are able to download the popularity of a given keyword. Popularity is defined as the number of searches for the keyword, expressed as a percentage of all searches in the selected timeframe.

- `timeout` determines the time it takes to complete one cycle
- `short_timeout` determines how long to wait after each query name is downloaded
- `trend_timeout` sets the wait time after each call
- `scrape_days` sets the default number of days that are downloaded from timestamp `dt.now() - td(days=scrape_days)`

In [None]:
# Start the Google Search trends scraping loop with the parameters described above.
TrendReq(timeout=60, short_timeout=25, trend_timeout=5, scrape_days=30).loop()

# Downloading tweets

Initially there were attempts to include tweets in the prediction of the value of cryptocurrencies. This approach was later abandoned in favor of the data from Google Search, as scraping tweets requires more CPU and has less informational value. Tweets are not saved; they are only printed out during scraping. The scraping window is set in tweetreq.py to the last two hours (from two hours ago until now).

- `timeout` will determine the time it takes to complete one cycle
- `short_timeout` determines how long to wait after each call.

In [None]:
# Start the tweet scraping loop; per the text above, tweets are only printed, not stored.
TweetReq(timeout=60, short_timeout=30).loop()

# Inserting data from backup csv

We are able to load backup data after specifying cryptocurrency names in the gui. The saved cryptocurrencies are BTC, ETH, LTC, NEO and XRP.
This approach is slower due to the architecture of the insert methods, which are designed to prevent duplicate and inconsistent data.

In [None]:
# Cryptocurrency names stored in the database; used by the import loops below.
names = Database().get_names()

def insert(name, type_name_from, type_name_to):
    """Load one backup CSV and write its values into a history table.

    The file ``/workspace/data/{type_name_from}_{name.lower()}.csv`` is read,
    its ``count`` column is copied into a column called ``name``, the ``date``
    column is parsed and used as the index, and the resulting single-column
    frame is handed to ``Database().insert_into_history_table``.

    Args:
        name: Series name; lower-cased for the file name and used as the
            column name of the inserted frame.
        type_name_from: Prefix of the CSV file name.
        type_name_to: First argument passed to ``insert_into_history_table``.
    """
    csv_path = f'/workspace/data/{type_name_from}_{name.lower()}.csv'
    backup = pd.read_csv(csv_path)

    # Expose the values under the series name before re-indexing by timestamp.
    backup[name] = backup['count']
    backup['date'] = pd.to_datetime(backup['date'], format='%Y-%m-%d %H:%M:%S')

    history = pd.DataFrame(backup.set_index('date')[name])
    Database().insert_into_history_table(type_name_to, history)

# Import the crypto price backups. This stays best-effort (one failing file
# must not stop the remaining imports), but the failure is now reported
# instead of silently discarded, and Ctrl-C is no longer swallowed.
for name in names:
    try:
        insert(name, 'crypto', 'crypto')
    except Exception as exc:
        print(f'Skipping crypto backup for {name}: {exc!r}')

# These three imports are not guarded: a failure here raises.
insert('gold', 'gold', 'gold')
insert('vix', 'vix', 'vix')
insert('sap', 'sap', 'sap')

# Import the Google trends backups, again best-effort per name.
for name in names:
    try:
        insert(name, 'trends', 'trends')
    except Exception as exc:
        print(f'Skipping trends backup for {name}: {exc!r}')

# Saving data as csv

We are able to create a backup csv in the same way we are able to load it back into the database. The following cell fixes missing values and saves the resulting datasets in the data directory.

In [None]:
names = Database().get_names()

# Export the three market indicator series. The stray trailing commas that
# wrapped each call in a throw-away one-element tuple have been removed.
Database().select_all_from_and_fix_missing('vix',    'vix',    'ffill', replace_null=True, limit_area='inside', save_to_csv=True)
Database().select_all_from_and_fix_missing('sap',    'sap',    'ffill', replace_null=True, limit_area='inside', save_to_csv=True)
Database().select_all_from_and_fix_missing('gold',   'gold',   'ffill', replace_null=True, limit_area='inside', save_to_csv=True)

# Export the trends and price series of every cryptocurrency. Note that the
# trends call passes replace_null=False while the price call passes True.
for name in names:
    Database().select_all_from_and_fix_missing(name, 'trends', 'linear', replace_null=False, limit_area='inside', save_to_csv=True)
    Database().select_all_from_and_fix_missing(name, 'crypto', 'linear', replace_null=True, limit_area='inside', save_to_csv=True)

# Training model

When training model we can specify how many epochs will be trained at most. The training class will automatically end training if the error of the validation set does not improve over 128 epochs.

- `names` - names of cryptocurrencies
- `window` - number of timestamps per one prediction window
- `cycles_t` - number of cycles in training the model for time prediction
- `load_model_t` - use saved weights of the model for time prediction (True) or overwrite with new data (False)
- `cycles_e` - number of cycles in training the model for value prediction
- `load_model_e` - use saved weights of the model for value prediction (True) or overwrite with new data (False)

In [None]:
# Names of the cryptocurrencies to build models for.
names = Database().get_names()

# Parameters are explained in the list above.
# NOTE(review): with cycles_t=0 / cycles_e=0 and load_model_*=True this presumably only
# loads previously saved weights without further training — confirm against Trainer.
Trainer().create_models(names=names, window=50, cycles_t=0, load_model_t=True, cycles_e=0, load_model_e=True)

# Creating data for genetic algorithm

After training models the Interval class will create prediction for the downloaded values in dataset and those will serve as an input to genetic algorithm, which will find the best strategy for trading based on predictions.

In [None]:
names = Database().get_names()

# Generate the interval predictions for every cryptocurrency, one call per name.
# NOTE(review): the visualisation cell further down reads
# /workspace/data/interval_<name>.json; presumably this call writes those files — confirm.
for name in names:
    Interval().generate_intervals(name)

# Genetic algorithm

- `population_count` - maximum population count in each generation
- `top_count` - number of surviving individuals from each generation
- `generations_before_exit` - how many generations there will be in total
- `load_configs` - in each generation the top individuals are saved in a config file; if this is `True`, they are loaded at start

In [None]:
names = Database().get_names()

# Run the genetic search over all names with the parameters described above;
# load_configs=False is passed, so no saved configuration is requested at start.
Genetic(names).get_purchase_config(population_count=100, top_count=25, generations_before_exit=1000, load_configs=False)

# Visualizing transactions of the found configurations

Following cell can be used after the genetic algorithm finds the best configurations. It will plot the financial gain after each transaction for all monitored cryptocurrencies so that user can manually choose which configuration should be used.

In [None]:
# NOTE: this variable is not read in the rest of this cell; the datasets below are
# loaded from a fresh Database().get_names() call.
names = Database().get_names()

def cash_up(config, predictions):
    """Replay a trading configuration over predicted intervals.

    Args:
        config: Indexable with six thresholds, in order: rising growth,
            sinking growth, buy-duration (rising), buy-duration (sinking),
            sell-duration (rising), sell-duration (sinking).
        predictions: One sequence of intervals per cryptocurrency; each
            interval is indexable as (growth, duration, current_price).

    Returns:
        A tuple ``(gain, (transactions, stat), purchases_arr)`` where
        ``gain`` is the summed profit over all cryptocurrencies (reset to 0
        if any single rounded profit is negative), ``transactions`` is the
        number of completed sells, ``stat`` is a tuple of per-cryptocurrency
        profits rounded to 4 places, and ``purchases_arr`` lists the cash
        balance after every sell, per cryptocurrency.
    """
    rising, sinking = config[0], config[1]
    time_buy_rising, time_buy_sinking = config[2], config[3]
    time_sell_rising, time_sell_sinking = config[4], config[5]

    def advise(growth, duration):
        # The checks run in a fixed order and later matches override earlier
        # ones, so 'sell' wins over 'buy' and the sinking rule wins overall.
        verdict = 'wait'
        if growth > rising:
            if duration > time_buy_rising:
                verdict = 'buy'
            if duration <= time_sell_rising:
                verdict = 'sell'
        if growth <= sinking:
            if duration < time_buy_sinking:
                verdict = 'buy'
            if duration >= time_sell_sinking:
                verdict = 'sell'
        return verdict

    transactions = 0.0
    gain = 0.0
    stat = []
    purchases_arr = []

    for prediction in predictions:
        # Every cryptocurrency starts with one unit of cash and no position.
        cash = 1.0
        holding = False
        entry_price = 0.0
        purchases = []

        for interval in prediction:
            growth, duration, current_price = interval[0], interval[1], interval[2]
            advice = advise(growth, duration)

            if advice == 'buy' and not holding:
                holding = True
                entry_price = current_price
                deposited = cash
            elif advice == 'sell' and holding:
                holding = False
                transactions += 1
                # Scale the deposit by the price change since the purchase.
                cash = (current_price / entry_price) * deposited
                purchases.append(cash)

        purchases_arr.append(purchases)
        profit = cash - 1
        gain += profit
        stat.append(round(profit, 4))

    # A configuration that loses on any cryptocurrency is worth nothing.
    if any(value < 0 for value in stat):
        gain = 0

    return gain, (transactions, tuple(stat)), purchases_arr

def get_dataset(name):
    """Load the interval file of one cryptocurrency as a float array.

    Reads ``/workspace/data/interval_{name.lower()}.json`` and converts the
    parsed JSON content to a NumPy array with ``dtype=float``.
    """
    path = f'/workspace/data/interval_{name.lower()}.json'
    with open(path, 'r') as handle:
        raw = json.load(handle)
    return np.array(raw, dtype=float)

def get_datasets(names):
    """Return the interval datasets for ``names`` as a list, in input order."""
    datasets = []
    for name in names:
        datasets.append(get_dataset(name))
    return datasets

def plot_config(config, predictions):
    """Plot the cash balance after each transaction for one configuration.

    Evaluates ``config`` with ``cash_up``, draws one subplot per
    cryptocurrency showing the balance after every completed sell, prints a
    one-line summary and shows the figure.

    Args:
        config: Trading configuration forwarded to ``cash_up``.
        predictions: Per-cryptocurrency interval predictions forwarded to
            ``cash_up``.
    """
    gain, stat, evolutions = cash_up(config, predictions)

    # squeeze=False always returns a 2-D array of axes. Without it a single
    # monitored cryptocurrency yields a bare Axes object and indexing fails.
    fig, axs = plt.subplots(len(evolutions), figsize=(18.5, 18.5), squeeze=False)
    fig.text(0.5, 0.04, 'Transakce', ha='center')
    fig.text(0.04, 0.5, 'Zisk', va='center', rotation='vertical')

    # One column of subplots: pair each row's axis with its balance history.
    for axis, evolution in zip(axs[:, 0], evolutions):
        axis.plot(evolution)

    print(f'GAIN: {gain}, TRANSACTIONS: {int(stat[0])}, GAIN FOR EACH CRYPTO: {stat[1]}, CONFIG: {config}')

    plt.show()

# Load the interval dataset of every cryptocurrency currently in the database.
predictions = get_datasets(Database().get_names())

# Read the saved configurations in reverse file order and keep the first 40 of the
# reversed list; element [1] of each saved entry is taken as the configuration vector.
with open(f'/workspace/data/config.json', 'r') as file:
    configs = np.array([ np.array(x[1], dtype=float) for x in json.load(file)[::-1][:40]])

# One figure and one printed summary line per configuration.
for config in configs:
    plot_config(config, predictions)

# Example of local extremes

This is an example of finding local extremes in our data. Note that the user needs to specify the cryptocurrency name and symbol in the GUI. If the scrape docker container is not running, the user needs to execute `InvestReq(60, 5).loop()` so that data for the visualization can be downloaded.

In [None]:
# Cryptocurrency names; the first one is used for the example below.
names = Database().get_names()

def plot(df, extremes, g_a, selected_name):
    """Plot a value series, a second curve and the detected extremes.

    Args:
        df: Values to plot; wrapped in ``pd.DataFrame``. The first column
            supplies the y-coordinates of the extremes.
        extremes: Positional indices of the extremes within the series.
        g_a: Second curve drawn over the data, as returned next to the
            extremes by ``Minmax().find_extremes``.
        selected_name: Text used as the figure title.
    """
    frame = pd.DataFrame(df)

    x = extremes
    # Take the marker heights from the data passed in rather than from a
    # global variable, so the function works for any caller.
    y = frame.iloc[:, 0].values[x]

    plt.figure(figsize=(18.5, 10.5))
    plt.xlabel('Minuty')
    plt.ylabel('Hodnota')
    plt.title(selected_name)
    plt.plot(frame)
    plt.plot(g_a)

    # zorder=2 keeps the markers on top of both lines.
    plt.scatter(x, y, s=70, color='green', zorder=2)

    plt.show()

# Use the first cryptocurrency as the example.
selected_name = names[0]
# Of the first 5000 rows returned, keep the last 1000.
df = Database().select_all_from_and_fix_missing(selected_name, 'crypto', 'linear', limit_area='inside').head(5000).tail(1000)
# Raw values of the 'count' column.
df_open = df['count'].values
arr = df_open
# Re-wrap the bare values so the frame has a single column 0 and a fresh default index.
df = pd.DataFrame(arr)
# find_extremes returns the extremes and a second series that is plotted alongside the data.
# NOTE(review): the meaning of the second positional argument (False) is not visible here.
extremes, g_a = Minmax().find_extremes(np.array(df[0], dtype='float'), False)
plot(df, extremes, g_a, selected_name)

# Stationarity of the data

The data needs to be stationary for better prediction accuracy. In the following cells we conduct visual and algorithmic check for stationarity of the data.

In [None]:
# Cryptocurrency names; only the first one is described below.
names = Database().get_names()

def describe_train_set(names):
    """Draw density and line plots of the prediction set for each name.

    For every name the first element returned by
    ``Extrema().get_prediction_set`` is cast to float and plotted twice, one
    subplot per column: as kernel density estimates in a 7x3 grid, and as
    line plots.
    """
    # Keyword arguments common to both figures.
    shared = dict(subplots=True, figsize=(20, 40), sharex=False, grid=True)

    for name in names:
        prediction_set, _ = Extrema().get_prediction_set(name)
        frame = prediction_set.astype(float)
        frame.plot(kind='kde', layout=(7, 3), title=f'{name} extremes density', **shared)
        frame.plot(kind='line', title=f'{name} extremes in time', **shared)

# Describe only the first cryptocurrency; pass more names to plot more.
describe_train_set([names[0]])

In [None]:
names = Database().get_names()

# Print the two hypotheses of the stationarity test once, before the per-column results.
print('Null Hypothesis H0 = If failed to be rejected, it suggests the time series has a unit root, meaning it is non-stationary')
print('Alternative Hypothesis H1 = The null hypothesis is rejected and suggests the time series does not have a unit root, meaning it is stationary')
print()

def ADF_Cal(x, name):
    """Run the Augmented Dickey-Fuller test on ``x`` and print a verdict.

    The critical values are checked from the strictest significance level
    to the loosest. Each one is printed as it is examined, and the first
    one that the ADF statistic falls below determines the reported
    certainty. If none is passed, a "less than 90%" message is printed.

    Args:
        x: One-dimensional series of values passed to ``adfuller``.
        name: Label printed in the header line.
    """
    print(f'======================={name}=======================')
    result = adfuller(x)
    ADF_stat = result[0]
    p = result[1]
    print(f'ADF Statistic: {ADF_stat}')
    print(f'p-value: {p}')
    print('Critical Values')

    # The critical values are keyed by percentage strings ('1%', '5%', '10%').
    # The level is derived from the key itself, so the result no longer
    # depends on a separate list being in the same order as the dict. The
    # old list was in the opposite order, which reported 90% for the
    # strictest threshold and 99% for the loosest one.
    by_strictness = sorted(result[4].items(), key=lambda item: float(item[0].rstrip('%')))
    for key, value in by_strictness:
        print(f'{key}: {value}')
        level = float(key.rstrip('%')) / 100
        hyp = p < level
        if ADF_stat < value:
            cert = (1 - level) * 100
            print(f'{cert}% certain this is staionary')
            print(f'Reject H0: {hyp}')
            break
    else:
        # No critical value was passed; hyp refers to the loosest level.
        print(f'Less than 90% certain that data is stationary')
        print(f'Reject H0: {hyp}')
    print()


df, _ = Extrema().get_prediction_set(names[0])
# Cast once; the conversion does not depend on the loop variable.
df = df.astype(float)
for name in df.columns:
    # Test the column that is being reported. Previously every iteration
    # tested 'interval_60' and only the printed label changed.
    ADF_Cal(df[name].values, name)