In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the daily quotes CSV for 0001.HK, using its 'Date' column as the row index.
data = pd.read_csv("../../HKDailyStocksQuotes/0001.HK.csv", index_col = 'Date')
# data = pd.read_csv("0001.HK.csv", index_col = 'Date')
# The 'Date' values are parsed with the compact YYYYMMDD format so the index becomes datetime-like.
data.index = pd.to_datetime(data.index,  format="%Y%m%d")
# data.index = pd.to_datetime(data.index)

In [4]:
# Bin edges passed to pd.cut for every relative-change feature in this notebook
# (nine edges -> eight intervals, open-ended at both extremes).
bins = [-np.inf, -0.1, -0.05, -0.03, 0, 0.03, 0.05, 0.1, np.inf]
# One label per interval above; these become the one-hot column names.
# NOTE(review): ' -0.03-0' carries a leading space, unlike the other labels — confirm whether intended.
names = ['<-0.1', '-0.1--0.05', '-0.05--0.03', ' -0.03-0', '0-0.03', '0.03-0.05', '0.05-0.1', '>0.1']
# Number of leading rows sliced off every feature matrix and off the enriched market data.
no_data_to_remove = 15

# Fraction of rows (from the start of the frame) assigned to the training split.
train_ratio = 0.7


def turning_points(array):
    """ turning_points(array) -> min_indices, max_indices
    Finds the turning points within a 1D array and returns the positional indices of the
    minimum and maximum turning points in two separate lists.

    `array` may be any 1D sequence (list, NumPy array, pandas Series). It is converted to a
    NumPy array first so that every element access below is positional; indexing a Series
    directly with integers would be a label lookup (or a deprecated positional fallback)
    whenever its index is not a zero-based integer range.

    A flat run of equal values between a rise and a fall (or a fall and a rise) counts as a
    single turning point located at the middle of the run. Inputs with fewer than three
    elements have no turning points and yield two empty lists.
    """
    idx_max, idx_min = [], []
    values = np.asarray(array)
    if len(values) < 3:
        return idx_min, idx_max

    NEUTRAL, RISING, FALLING = range(3)

    def get_state(a, b):
        if a < b: return RISING
        if a > b: return FALLING
        return NEUTRAL

    ps = get_state(values[0], values[1])  # direction of the previous non-flat move
    begin = 1                             # position where the current run started
    for i in range(2, len(values)):
        s = get_state(values[i - 1], values[i])
        if s != NEUTRAL:
            if ps != NEUTRAL and ps != s:
                # direction flipped: the extremum sits in the middle of [begin, i - 1]
                if s == FALLING:
                    idx_max.append((begin + i - 1) // 2)
                else:
                    idx_min.append((begin + i - 1) // 2)
            begin = i
            ps = s
    return idx_min, idx_max

def create_turning_point_matrix_for_day_diff(data, day_diff, id_tp_array):
    """One-hot encode the price change since a turning point `day_diff` rows earlier.

    Parameters
    ----------
    data : DataFrame with an 'Adj Close' column.
    day_diff : number of rows to look back.
    id_tp_array : positional row indices of the turning points (as returned by
        `turning_points`).

    Returns
    -------
    DataFrame indexed like `data`, with one column per label in the module-level `names`.
    A row contains a single 1 only when the row `day_diff` steps earlier is a turning point
    and the relative change of 'Adj Close' since then is non-zero; every other row is all
    zeros.
    """
    # Flag each row that is a turning point. A set keeps the membership test O(1) per row
    # instead of scanning the index list once for every row.
    tp_positions = set(id_tp_array)
    tp_flag = pd.Series([int(i in tp_positions) for i in range(len(data))], index=data.index)

    # Align each row with the flag and the price from `day_diff` rows earlier.
    tp_flag_shift = tp_flag.shift(day_diff)
    px_shift = data['Adj Close'].shift(day_diff)
    # Relative change versus the earlier price, kept only where that earlier row was a
    # turning point (multiplying by the 0/1 flag zeroes everything else).
    px_diff = ((data['Adj Close'] - px_shift) / px_shift).mul(tp_flag_shift, fill_value=0)

    # binning and transform to one hot categorization
    # fibonacci_bins = [-np.inf, -0.764, -0.618, -0.5, -0.382, 0, 0.382, 0.5, 0.618, 0.764, np.inf]
    px_diff_bin = pd.cut(px_diff, bins, labels=names)
    # Pin an integer dtype: the zero assignment below is an incompatible-dtype write when
    # get_dummies produces boolean columns.
    px_diff_bin = pd.get_dummies(px_diff_bin, dtype=np.uint8)
    px_diff_bin[px_diff == 0] = 0  # px-diff==0 is matched into 1 bin, but it should be zero in all cols

    return px_diff_bin

def create_turning_point_3d_matrix(data):
    """Build the (Date, DayDiff)-indexed one-hot matrices for maximum and minimum turning points.

    For each look-back in (2, 3, 5, 10, 15) rows, the per-day one-hot matrix from
    `create_turning_point_matrix_for_day_diff` is computed, its first `no_data_to_remove`
    rows are dropped, and the pieces are stacked under a two-level index
    ['Date', 'DayDiff'] sorted by date.

    Returns
    -------
    (result_max, result_min) : tuple of DataFrames, or (None, None) when `data` is None.
    """
    if data is None:
        return None, None
    # create turning points series
    idx_min, idx_max = turning_points(data['Adj Close'])
    day_diffs = [2, 3, 5, 10, 15]

    def _stack_by_day_diff(tp_indices):
        # One one-hot frame per look-back, with the leading warm-up rows removed explicitly.
        per_diff = [
            create_turning_point_matrix_for_day_diff(data, day_diff, tp_indices)[no_data_to_remove:]
            for day_diff in day_diffs
        ]
        stacked = pd.concat(per_diff, keys=day_diffs)
        # rotate the matrix axes: concat puts the key first, we want (Date, DayDiff)
        stacked.index = stacked.index.swaplevel(1, 0)
        stacked = stacked.sort_index()
        stacked.index.names = ['Date', 'DayDiff']
        # fillna returns a new frame; the result has to be kept for the fill to take effect
        return stacked.fillna(0)

    result_max = _stack_by_day_diff(idx_max)
    result_min = _stack_by_day_diff(idx_min)
    return result_max, result_min

def create_technical_indicator_3d_matrix(data):
    """Build the (Date, Indicator)-indexed one-hot matrix of technical indicators.

    Four relative measures are computed per row:
      * 'hl_diff'   : (High - Low) / Low
      * 'ma5_diff'  : (5-row simple moving average of Close - Close) / Close
      * 'ma10_diff' : same with a 10-row average
      * 'ma20_diff' : same with a 20-row average
    Each is binned with the module-level `bins`/`names`, one-hot encoded, zeroed where the
    measure is exactly 0, and stripped of its first `no_data_to_remove` rows. The pieces
    are stacked under a two-level index ['Date', 'Indicator'] sorted by date.

    Returns the stacked DataFrame, or None when `data` is None.
    """
    if data is None:
        return None

    close = data['Close']
    indicators = [
        ('hl_diff', (data['High'] - data['Low']) / data['Low']),
        ('ma5_diff', (sma(close, 5) - close) / close),
        ('ma10_diff', (sma(close, 10) - close) / close),
        ('ma20_diff', (sma(close, 20) - close) / close),
    ]

    encoded = []
    for _, diff in indicators:
        # Pin an integer dtype: the zero assignment below is an incompatible-dtype write
        # when get_dummies produces boolean columns.
        one_hot = pd.get_dummies(pd.cut(diff, bins, labels=names), dtype=np.uint8)
        one_hot[diff == 0] = 0  # remove 0 value: an exact zero must not fall into any bin
        encoded.append(one_hot[no_data_to_remove:])

    result_matrix = pd.concat(encoded, keys=[label for label, _ in indicators])
    # rotate result matrix axes: concat puts the key first, we want (Date, Indicator)
    result_matrix.index = result_matrix.index.swaplevel(1, 0)
    result_matrix = result_matrix.sort_index()
    result_matrix.index.names = ['Date', 'Indicator']

    return result_matrix

def sma(data, period=5):
    """Simple moving average of `data` over a trailing window of `period` rows.

    The first `period - 1` positions have no full window and come back as NaN.
    """
    trailing_window = data.rolling(period)
    return trailing_window.mean()

def get_next_day(date, data):
    """Return the index label of the row that follows `date` in `data`.

    Returns None when `date` is the last row. A `date` that is not in `data.index`
    raises KeyError (from `Index.get_loc`).

    NOTE: an identical `get_next_day` is defined again later in this cell and shadows
    this one once the cell has run.
    """
    # next row in data; look the position up once and reuse it
    next_index = data.index.get_loc(date) + 1
    if next_index >= len(data.index):
        return None
    return data.index[next_index]
    
def enrich_market_data(data):
    """Attach derived price columns to `data` and return it without the warm-up rows.

    Two columns are written onto the passed frame itself:
      * 'ma5'           : 5-row simple moving average of 'Close'
      * 'rate_of_close' : row-over-row percentage change of 'Close'
    The return value is `data` minus its first `no_data_to_remove` rows, or None when
    `data` is None.
    """
    if data is not None:
        close = data['Close']
        data['ma5'] = sma(close, 5)
        data['rate_of_close'] = close.pct_change()
        return data[no_data_to_remove:]
    return None

def get_next_day(date, data):
    """Return the index label of the row that follows `date` in `data`.

    Returns None when `date` is the last row. A `date` that is not in `data.index`
    raises KeyError (from `Index.get_loc`).

    NOTE: this redefines the identical `get_next_day` that appears earlier in this cell;
    this later definition is the one in effect after the cell runs.
    """
    # next row in data; look the position up once and reuse it
    next_index = data.index.get_loc(date) + 1
    if next_index >= len(data.index):
        return None
    return data.index[next_index]
    
def clean_data(data):
    """Return a copy of `data` without any row that contains a missing value.

    Passing None yields None.
    """
    return None if data is None else data.dropna()

def split_data_set_index(data, ratio=None):
    """Split the index of `data` into a leading train part and a trailing test part.

    Parameters
    ----------
    data : object supporting `len()` and positional slicing with an `.index`
        (e.g. a DataFrame).
    ratio : fraction of rows, counted from the start, that go to the train part.
        When omitted, the module-level `train_ratio` is used, so existing
        single-argument calls behave as before.

    Returns
    -------
    (train, test) : the index of the first `int(len(data) * ratio)` rows and the index
    of the remaining rows. The split is positional, with no shuffling.
    """
    if ratio is None:
        ratio = train_ratio
    train_split = int(len(data) * ratio)

    train = data[:train_split].index
    test = data[train_split:].index
    return train, test

In [121]:
# Positional indices of the local minima / maxima of the adjusted close price.
idx_min, idx_max = turning_points(data['Adj Close'])

# Disabled scratch plot of the first 100 rows with the turning points marked.
# plt.plot(data[:100].index, pd.Series(max_array)[:100]*30)
# plt.plot(data['Adj Close'][:100], marker='o', markevery=idx_min)
# plt.plot(data['Adj Close'][:100], marker='o', markevery=idx_max)
# plt.ylabel('some numbers')
# plt.show()

In [13]:
# Drop rows with missing values before any feature is derived.
data = clean_data(data)                  
# Order matters: both matrix builders slice off the first `no_data_to_remove` rows
# themselves, and enrich_market_data returns `data` with the same rows removed, so the
# matrices must be built from the untrimmed frame for the dates to line up.
turning_point_max, turning_point_min = create_turning_point_3d_matrix(data)
technical_indicator_matrix = create_technical_indicator_3d_matrix(data)
data = enrich_market_data(data)
# result_max.groupby('DayDiff').sum()

In [75]:
# technical_indicator_matrix[technical_indicator_matrix.isnull().any(axis=1)]
# Sanity check: the market data and all three feature matrices must cover the same number
# of dates (level 0 of each matrix index is 'Date').
assert len(data) == len(turning_point_max.index.levels[0])
assert len(turning_point_max.index.levels[0]) == len(turning_point_min.index.levels[0])
assert len(turning_point_min.index.levels[0]) == len(technical_indicator_matrix.index.levels[0])
# Only the last expression of a cell is displayed, so this value is computed and discarded.
len(turning_point_max.index.levels[0])
len(data)

4782

In [154]:
# Check whether any row has a 20-row moving average exactly equal to its close
# (such rows are zeroed out of the one-hot indicator matrix); the cell displays those rows.
ma20_diff = (sma(data['Close'], 20) - data['Close']) / data['Close']
ma20_diff[ma20_diff==0]
# max_matrix2 = create_turning_point_matrix_for_day_diff(data, 2, idx_max)
# max_matrix2 = max_matrix2[15:]
# max_matrix2

Series([], Name: Close, dtype: float64)

In [116]:
class State:
    """Pairs a date with the value computed for it.

    `get_buy_signal_states_by_date` stores the flattened feature vector for the date
    in `value`.
    """

    def __init__(self, date, value):
        self.date, self.value = date, value
def get_buy_signal_states_by_date(date):
    """Build the flattened feature State for `date` from the precomputed matrices.

    Reads the module-level `turning_point_max`, `turning_point_min` and
    `technical_indicator_matrix` frames, takes the rows stored under `date` in each,
    flattens them and concatenates them in that order into one 1D array.

    Returns a `State(date, vector)`, or None when `date` is missing from any of the
    matrices (the failure is reported on stdout rather than raised).
    """
    try:
        # The indicator matrix is bound as `technical_indicator_matrix` where it is built;
        # the previous spelling `tech_indicator_matrix` was never defined and raised a
        # NameError, which the KeyError handler below does not catch.
        tp_max = turning_point_max.loc[date]
        tp_min = turning_point_min.loc[date]
        tech_indicator = technical_indicator_matrix.loc[date]
    except KeyError:
        print("ERROR getting buy signal state for date " + str(date))
        return None

    s = np.concatenate((tp_max.values.flatten(), tp_min.values.flatten(), tech_indicator.values.flatten()),
                       axis=0)
    state = State(date, s)
    print("generated buySignalStates, date " + str(date))
    return state

In [5]:
# data.loc[data.sample().index.values[0]]
# Fit the encoder on the seven integer codes 0..6, one sample per row in a single feature column.
action_codes = np.arange(7).reshape(-1, 1)
one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder.fit(action_codes)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=False)

In [10]:
# len(one_hot_encoder.active_features_)
# one_hot_encoder.transform([[1]])
# one_hot_encoder.transform([[0]])
# Round-trip check: encode code 6 to its one-hot row, then decode it back.
action = one_hot_encoder.transform([[6]])
inverse_action = one_hot_encoder.inverse_transform(action)
print(action)
print(inverse_action)

[[0. 0. 0. 0. 0. 0. 1.]]
[[6.]]


In [22]:
# Map each signed action in [-3, 3] to a non-negative code in [0, 6] (action + 3).
# NOTE(review): presumably these codes are the integers the one-hot encoder was fitted on — confirm.
action_map = {action: action + 3 for action in range(-3, 4)}
# Draw one action uniformly at random (unseeded, so the displayed value varies per run).
random.choice(list(action_map))

0

In [48]:
# data = enrich_market_data(data)
# data[data.isnull().any(axis=1)]

In [6]:
# Positional split of the data's index: leading `train_ratio` share for training, the rest for testing.
train, test = split_data_set_index(data)

In [18]:
# data.index.get_loc(test[0])
# date.index[0]
# Pick one training date at random (unseeded) and look up the row that follows it in `data`.
day = pd.Series(train).sample().values[0]
print(day)
get_next_day(day,data)

2016-01-12T00:00:00.000000000


Timestamp('2016-01-13 00:00:00')