In [212]:
import numpy as np
import pandas as pd
from pathlib import Path
# from matplotlib import pyplot as plt
from PIL import Image
# import scipy.misc as smp

In [213]:
# Location of the daily HSI index data (CSV with a 'date' column)
dataPath = '../../../dataset/market_data/output/hsi/INDEX_DAILY_HSI.csv'

# Load the file, drop every row containing a null, then order chronologically
df = (
    pd.read_csv(dataPath, parse_dates=['date'])
    .dropna(how='any')
    .sort_values(by=['date'], ascending=True)
)
df.head(3)

Unnamed: 0,adj_close,close,date,high,low,open,volume
0,17072.820313,17072.820313,2000-01-04,17303.0,16933.519531,17303.0,0.0
1,15846.719727,15846.719727,2000-01-05,16608.550781,15688.490234,16608.550781,0.0
2,15153.230469,15153.230469,2000-01-06,15971.030273,14763.969727,15942.070313,0.0


In [214]:
# config: pixel heights of the pieces that make up one chart image

# -- 5-day window --
gap_height_5d = 2             # blank rows between the ohlc chart and the volume bars
ohlc_image_height_5d = 15     # rows used by the ohlc chart
volume_image_height_5d = 10   # rows used by the volume bars (could be a ratio of the ohlc height)

# -- 20-day window --
gap_height_20d = 2            # blank rows between the ohlc chart and the volume bars
ohlc_image_height_20d = 60    # rows used by the ohlc chart
volume_image_height_20d = 40  # rows used by the volume bars (could be a ratio of the ohlc height)

In [215]:
# Function to generate daily ohlc chart
# https://stackoverflow.com/questions/434583/what-is-the-fastest-way-to-draw-an-image-from-discrete-pixel-values-in-python
# https://stackoverflow.com/questions/57545125/attributeerror-module-scipy-misc-has-no-attribute-toimage

def generate_daily_ohlc_chart(
        size: int,          # height of the bitmap
        min: float,         # the minimum value in that period (5d etc)
        max: float,         # the maximum value in that period (5d etc)
        o: float,           # open
        h: float,           # high
        l: float,           # low
        c: float,           # close
        color = (255, 255, 255),
        bgColor = (0, 0, 0)
    ):
    """Render one day's OHLC bar as a ``size`` x 3 pixel RGB bitmap.

    The three pixel columns are, from left to right: a single pixel at the
    open, a vertical line from low to high, and a single pixel at the close.
    Prices are scaled linearly so that ``min`` maps to the bottom row and
    ``max`` to the top row of the returned image.

    Args:
        size: height of the bitmap in pixels.
        min: lowest price of the whole period being drawn.
        max: highest price of the whole period being drawn.
        o, h, l, c: the day's open, high, low and close.
        color: RGB value used for the drawn pixels.
        bgColor: RGB value used for every other pixel.

    Returns:
        A ``(size, 3, 3)`` uint8 array (rows, columns, RGB), with row 0 at
        the top of the image.

    Notes:
        * When ``max == min`` there is no price scale; every value is drawn
          on the middle row instead of dividing by zero.
        * Values outside ``[min, max]`` are clamped to the bottom/top row
          rather than wrapping around through negative indexing.
    """
    # start from a canvas filled with the background color
    data = np.empty((size, 3, 3), dtype=np.uint8)
    data[:, :] = bgColor

    # price range covered by a single pixel row
    step = (max - min) / size

    def _row(value):
        # Map a price to a row index counted from the bottom of the chart.
        if step == 0:
            # flat period: no scale available, use the middle row
            return size // 2
        row = int((value - min) / step)
        # `min`/`max` are shadowed by the parameters, so clamp by hand;
        # this also maps value == max (row == size) onto the top row.
        if row < 0:
            return 0
        if row > size - 1:
            return size - 1
        return row

    # left column: open
    data[_row(o), 0] = color
    # middle column: low-to-high line (inclusive on both ends)
    data[_row(l):_row(h) + 1, 1] = color
    # right column: close
    data[_row(c), 2] = color

    # rows were filled bottom-up; flip so that row 0 is the top of the image
    return np.flip(data, 0)


In [216]:
# Function to generate daily volume barchart
def generate_daily_volume_chart(
        size: int,          # height of the bitmap
        min: float,         # the minimum value in that period (5d etc)
        max: float,         # the maximum value in that period (5d etc)
        v: float,           # volume of the day
        color = (255, 255, 255),
        bgColor = (0, 0, 0)
    ):
    """Render one day's volume as a ``size`` x 3 pixel RGB bar.

    The bar is drawn in the middle pixel column and grows from the bottom of
    the image. Its height is scaled linearly between ``min`` (no bar) and
    ``max`` (full height).

    Args:
        size: height of the bitmap in pixels.
        min: lowest volume of the whole period being drawn.
        max: highest volume of the whole period being drawn.
        v: the day's volume.
        color: RGB value used for the bar.
        bgColor: RGB value used for every other pixel.

    Returns:
        A ``(size, 3, 3)`` uint8 array (rows, columns, RGB), with row 0 at
        the top of the image. When ``max == min`` (e.g. no volume data) the
        image contains only the background color.
    """
    # start from a canvas filled with the background color
    data = np.empty((size, 3, 3), dtype=np.uint8)
    data[:, :] = bgColor

    # volume covered by a single pixel row
    step = (max - min) / size
    # no volume data: nothing to scale against, return the empty canvas
    if step == 0:
        return data

    # bar height in pixels (int() truncates toward zero)
    bar = int((v - min) / step)
    # Clamp to the canvas: a negative height would otherwise become a
    # negative slice end and paint most of the column. `min`/`max` are
    # shadowed by the parameters, so clamp by hand.
    if bar < 0:
        bar = 0
    elif bar > size:
        bar = size
    data[0:bar, 1] = color

    # rows were filled bottom-up; flip so that row 0 is the top of the image
    return np.flip(data, 0)

In [217]:
# Function to generate a gap
def generate_gap(
        height: int,          # height of the bitmap
        width: int,           # width of the bitmap
        bgColor = (0, 0, 0)
    ):
    """Return a ``(height, width, 3)`` uint8 bitmap filled with ``bgColor``."""
    return np.full((height, width, 3), bgColor, dtype=np.uint8)

In [218]:
# Function to generate one image
def generate_image(
        dataset: pd.DataFrame,
        ohlc_height: int,
        volume_height: int, # set this to 0 if we don't want volume barchart
        gag_height: int, # height of the blank gap between ohlc and volume; ignored when volume_height is 0
    ):
    """Build one chart image from the rows of ``dataset``.

    Each row becomes a 3-pixel-wide strip: the OHLC bar on top and, when
    ``volume_height`` is not 0, a blank gap followed by the volume bar below
    it. Strips are placed side by side in row order. Prices are scaled
    between the lowest ``low`` and the highest ``high`` in ``dataset``;
    volumes between the smallest and largest ``volume`` in ``dataset``.

    Args:
        dataset: rows to draw; reads the columns ``open``, ``high``, ``low``,
            ``close`` and (only when volume is drawn) ``volume``.
        ohlc_height: pixel height of the OHLC part.
        volume_height: pixel height of the volume part, 0 to leave it out.
        gag_height: pixel height of the gap between the two parts.

    Returns:
        A PIL image that is ``3 * len(dataset)`` pixels wide.

    Raises:
        ValueError: if ``dataset`` has no rows.
    """
    if len(dataset) == 0:
        raise ValueError('dataset must contain at least one row')

    # The scaling bounds are the same for every row: compute them once
    # instead of taking min/max of the whole frame on each iteration.
    price_min = float(dataset['low'].min())
    price_max = float(dataset['high'].max())

    with_volume = volume_height != 0
    if with_volume:
        volume_min = float(dataset['volume'].min())
        volume_max = float(dataset['volume'].max())
        # the gap is identical for every day; np.concatenate copies it
        gap = generate_gap(gag_height, 3)

    strips = []
    for _, row in dataset.iterrows():
        strip = generate_daily_ohlc_chart(
            ohlc_height,
            price_min,
            price_max,
            float(row['open']),
            float(row['high']),
            float(row['low']),
            float(row['close']),
        )
        if with_volume:
            volume = generate_daily_volume_chart(
                volume_height,
                volume_min,
                volume_max,
                float(row['volume']),
            )
            # stack vertically: ohlc on top, then gap, then volume
            strip = np.concatenate([strip, gap, volume], axis=0)
        strips.append(strip)

    # place the daily strips next to each other, left to right
    return Image.fromarray(np.concatenate(strips, axis=1))

In [219]:
# example: 5-day window, ohlc chart only (volume and gap heights set to 0)
subset = df.iloc[500:505]
img = generate_image(subset, ohlc_image_height_5d, 0, 0)
# uncomment to preview the result:
# img.show()

In [220]:
# example: 5-day window, ohlc chart with volume bars underneath
subset = df.iloc[500:505]
img = generate_image(
    subset,
    ohlc_image_height_5d,
    volume_image_height_5d,
    gap_height_5d,
)
# uncomment to preview the result:
# img.show()

In [221]:
# example: 20-day window, ohlc chart only (volume and gap heights set to 0)
subset = df.iloc[480:500]
img = generate_image(subset, ohlc_image_height_20d, 0, 0)
# uncomment to preview the result:
# img.show()

In [222]:
# example: 20-day window, ohlc chart with volume bars underneath
subset = df.iloc[480:500]
img = generate_image(
    subset,
    ohlc_image_height_20d,
    volume_image_height_20d,
    gap_height_20d,
)
# uncomment to preview the result:
# img.show()

In [223]:
# example: 120-day window, ohlc chart with volume bars underneath
subset = df.iloc[1000:1120]
img = generate_image(
    subset,
    360,  # ohlc height
    240,  # volume height
    3,    # gap height
)
# uncomment to preview the result:
# img.show()

In [224]:
# generate images dataset with labels

def feature_engineering(
        dataset: pd.DataFrame, # expect to load the whole dataset (clean data, no null)
        lookback_days: int, # how many days are included in the image (eg: 5 or 20)
        return_days: int, # how many calendar days after the last image day the return is measured
        ohlc_image_height: int,
        volume_image_height: int, # put 0 if we don't want volume barchart
        gap_height: int, # height of the gap between ohlc and volume; ignored without volume barchart
        path: str,
    ):
    """Write one chart image per lookback window plus a ``meta.csv`` of labels.

    For every window of ``lookback_days`` consecutive rows of ``dataset``
    (starting at row 0, 1, 2, ...) an image ``<i>.png`` is saved in ``path``,
    where ``i`` is the position of the window's first row. The label of a
    window is the return between the close of its last row and the close of
    the first row dated strictly later than that last row's date plus
    ``return_days`` calendar days. Generation stops at the first window for
    which no such later row exists.

    Args:
        dataset: rows ordered by date, with the columns ``date`` and
            ``close`` plus whatever ``generate_image`` reads.
        lookback_days: number of rows drawn in each image.
        return_days: return horizon in calendar days.
        ohlc_image_height: pixel height of the OHLC part of each image.
        volume_image_height: pixel height of the volume part, 0 for none.
        gap_height: pixel height of the gap between the two parts.
        path: existing directory that receives the images and ``meta.csv``.

    Returns:
        None. ``meta.csv`` holds the columns ``id`` and ``return_ratio``.
    """
    out_dir = Path(path)
    meta = {
        'id': [],
        'return_ratio': []
    }
    horizon = np.timedelta64(return_days, 'D')
    # Filter on the frame that was passed in, not on a notebook-level global,
    # so the function also works for subsets and outside this notebook.
    dates = dataset['date']

    # only start positions that leave room for a full lookback window
    for i in range(len(dataset) - lookback_days + 1):
        subset = dataset.iloc[i:i + lookback_days]
        last_day = subset.iloc[-1]

        return_date = last_day['date'] + horizon
        later = dataset.loc[dates > return_date]
        # stop once the data no longer reaches past the return horizon
        if len(later) < 1:
            break

        img = generate_image(subset, ohlc_image_height, volume_image_height, gap_height)
        img.save(out_dir / f'{i}.png')

        # earliest row after the horizon provides the exit price
        return_data = later.sort_values(by=['date']).iloc[0]
        return_ratio = return_data['close'] / last_day['close'] - 1
        meta['id'].append(i)
        meta['return_ratio'].append(return_ratio)

    meta_df = pd.DataFrame(meta)
    meta_df.to_csv(out_dir / "meta.csv")
    return

In [225]:
# create demo images and their meta.csv from a 500-row slice of the data
path = '../../../images/demo'
Path(path).mkdir(parents=True, exist_ok=True)

test_set = df.iloc[2000:2500]
feature_engineering(
    test_set,
    34,  # lookback_days
    20,  # return_days
    ohlc_image_height_20d,
    volume_image_height_20d,
    gap_height_20d,
    path,
)