In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
# from matplotlib import pyplot as plt
from PIL import Image
# import scipy.misc as smp
from utils.feature_engineering import (
    add_return_data,
    add_simple_moving_average,
    generate_image_data,
)


In [None]:
# Daily price CSV for a single Nikkei 225 constituent (ticker 9984.T per the file name).
dataPath = '../../../dataset/market_data/output/nikkei_225/CONSTITUENTS_DAILY_9984.T.csv'

# Read, discard every row containing a null, and order chronologically in one chain.
df = (
    pd.read_csv(dataPath, parse_dates=['date'])
    .dropna(how='any')
    .sort_values(by=['date'], ascending=True)
)
df.head(3)

In [None]:
# Image layout configuration, one line per lookback window.
# Each tuple is (total image height, volume bar-chart height, gap height) in pixels.
# The "HxW" notes are the intended full image sizes.

# 5d - 32x15
image_height_5d, volume_image_height_5d, gap_height_5d = 32, 6, 2

# 20d - 64x60
image_height_20d, volume_image_height_20d, gap_height_20d = 64, 13, 1

# 60d - 96x180
image_height_60d, volume_image_height_60d, gap_height_60d = 96, 19, 1

# 252d - 256x756
image_height_252d, volume_image_height_252d, gap_height_252d = 256, 51, 1


In [None]:
# Enrich the price frame with forward-return and moving-average columns.
# feature_engineering (defined further down) reads `return_<n>_value` and
# `ma_<n>d`; those columns are presumably produced by these helpers —
# confirm against utils.feature_engineering.

for horizon in (5, 20, 60):  # return horizons, in days
    df = add_return_data(df, horizon)

for window in (50, 100, 200):  # simple moving-average windows, in days
    df = add_simple_moving_average(df, window)

# Persist the enriched frame for inspection, then preview a few random rows.
df.to_csv("enhanced_df.csv")
df.sample(5)

In [None]:
# Example: OHLC chart with the 50-day moving average over a 252-row window.
# Argument order follows the call made inside feature_engineering:
# (data, ohlc height, volume height, gap height, ma50, ma100, ma200).
subset = df.iloc[1000:1252]
imgData = generate_image_data(
    subset,
    image_height_252d,  # no volume panel, so the price chart gets the full height
    0,                  # volume height
    0,                  # gap height
    True,               # include 50d moving average
    False,              # include 100d moving average
    False,              # include 200d moving average
)
img = Image.fromarray(imgData)
img.show()
# img.save('demo-252d-ohlc-ma50d.png')

In [None]:
# Example: OHLC chart + volume bars + 100-day moving average over a 252-row window.
# The price chart height is whatever remains after reserving the volume panel and gap.
subset = df.iloc[1000:1252]
ohlc_height_252d = image_height_252d - volume_image_height_252d - gap_height_252d
imgData = generate_image_data(
    subset,
    ohlc_height_252d,
    volume_image_height_252d,
    gap_height_252d,
    False,  # include 50d moving average
    True,   # include 100d moving average
    False,  # include 200d moving average
)
img = Image.fromarray(imgData)
img.show()
# img.save('demo-252d-ohlc-volume-ma100d.png')

In [None]:
# generate images dataset with labels

def feature_engineering(
        dataset: pd.DataFrame, # expect to load the whole dataset (clean data, no null)
        lookback_days: int, # how many days are included in the image (eg: 5, 20, 60, 252)
        return_days: int, # which return value is used for the label (5 or 20 or 60 only)
        ohlc_image_height: int,
        volume_image_height: int, # put 0 if we don't want volume barchart
        gap_height: int, # put 0 if we don't want volume barchart
        include_ma_50d: bool,
        include_ma_100d: bool,
        include_ma_200d: bool,
        label_negitive_margin: float,
        label_positive_margin: float,
        save_image: bool, # True: will generate all the images and save the meta.csv in the path folder
        save_meta: bool,
        path: str,
    ):
    """Slide a ``lookback_days``-row window over ``dataset`` and build labelled images.

    For every window start ``i`` an image is rendered with ``generate_image_data``
    and labelled from the ``return_<return_days>_value`` of the window's last row.
    Iteration stops at the first window that is shorter than ``lookback_days`` or
    whose last-row return value is NaN.

    Labels (``v`` is the return value):
        0 -> v < label_negitive_margin
        1 -> label_negitive_margin <= v < 0
        2 -> 0 <= v < label_positive_margin
        3 -> v >= label_positive_margin

    Args:
        dataset: Price frame, expected sorted by date. Must contain ``date``,
            ``return_<return_days>_value`` and, for each enabled moving average,
            the matching ``ma_50d`` / ``ma_100d`` / ``ma_200d`` column.
        lookback_days: Number of rows per image; must be >= 1.
        return_days: Return horizon used for the label; one of 5, 20, 60.
        ohlc_image_height: Forwarded to ``generate_image_data``.
        volume_image_height: Forwarded to ``generate_image_data`` (0 = no volume chart).
        gap_height: Forwarded to ``generate_image_data`` (0 = no volume chart).
        include_ma_50d: Draw the 50d MA; rows with a null ``ma_50d`` are dropped first.
        include_ma_100d: Same for ``ma_100d``.
        include_ma_200d: Same for ``ma_200d``.
        label_negitive_margin: Upper bound (exclusive) of class 0.
        label_positive_margin: Upper bound (exclusive) of class 2.
        save_image: Write each image to ``<path>/<i>.png``.
        save_meta: Write the metadata (without ``img_data``) to ``<path>/meta.csv``.
        path: Output directory; created on demand when anything is saved.

    Returns:
        DataFrame with columns ``id``, ``img_data``, ``date``, ``return_value``, ``label``.

    Raises:
        ValueError: If ``return_days`` is not 5, 20 or 60, or ``lookback_days`` < 1.
    """
    if return_days not in (5, 20, 60):
        raise ValueError(f'Invalid return_days: {return_days}')
    if lookback_days < 1:
        # A zero-length window would otherwise fail later with an opaque IndexError.
        raise ValueError(f'Invalid lookback_days: {lookback_days}')

    out_dir = Path(path)
    if save_image or save_meta:
        # Make sure the target folder exists before the first write.
        out_dir.mkdir(parents=True, exist_ok=True)

    return_column = f'return_{return_days}_value'
    meta = {
        'id': [],
        'img_data': [],
        'date': [],
        'return_value': [],
        'label': []
    }

    # a simple function to map return value to label class
    def label_funt(value: float):
        if value < label_negitive_margin:
            return 0
        if value < 0:
            return 1
        if value < label_positive_margin:
            return 2
        return 3

    # remove rows without ma data if we are including the curve
    for enabled, ma_column in (
        (include_ma_50d, 'ma_50d'),
        (include_ma_100d, 'ma_100d'),
        (include_ma_200d, 'ma_200d'),
    ):
        if enabled:
            dataset = dataset.dropna(subset=[ma_column])

    for i in range(len(dataset)):
        subset = dataset.iloc[i:i + lookback_days] # retrieve lookback data
        # break if the subset does not have enough data (almost the end of the raw data)
        if len(subset) != lookback_days:
            print(f'End at i: {i} - len(subset) != lookback_days')
            break

        # The label and date both come from the last row of the window; look it up once.
        last_row = subset.iloc[-1]
        return_value = last_row[return_column]
        if np.isnan(return_value):
            print(f'End at i: {i} - return value nan')
            break

        imgData = generate_image_data(
            subset,
            ohlc_image_height,
            volume_image_height,
            gap_height,
            include_ma_50d,
            include_ma_100d,
            include_ma_200d,
        )
        if save_image:
            Image.fromarray(imgData).save(out_dir / f'{i}.png')

        meta['id'].append(i)
        meta['img_data'].append(imgData)
        meta['date'].append(last_row['date'])
        meta['return_value'].append(return_value)
        meta['label'].append(label_funt(float(return_value)))

    meta_df = pd.DataFrame(meta)
    if save_meta:
        # Raw pixel arrays are not useful in a CSV; keep only the scalar metadata.
        meta_df.drop(columns=['img_data']).to_csv(out_dir / 'meta.csv')
    return meta_df

In [None]:
# Build the demo image set: 20-row windows labelled by the 20-day return,
# with volume bars and the 50d moving average, written to ../../../images/demo.
dataset_raw = df.iloc[:4000]
path = '../../../images/demo'
Path(path).mkdir(parents=True, exist_ok=True)
dataset = feature_engineering(
    dataset=dataset_raw,
    lookback_days=20,
    return_days=20,
    # price chart gets what is left after the volume panel and the gap
    ohlc_image_height=image_height_20d - volume_image_height_20d - gap_height_20d,
    volume_image_height=volume_image_height_20d,
    gap_height=gap_height_20d,
    include_ma_50d=True,
    include_ma_100d=False,
    include_ma_200d=False,
    # NOTE(review): confirm +/-0.5 is on the same scale as the return_20_value column
    label_negitive_margin=-0.5,
    label_positive_margin=0.5,
    save_image=True,
    save_meta=True,
    path=path,
)

In [None]:
# Spot-check five random rows of the generated metadata.
dataset.sample(n=5)