In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import datetime as dt
import os
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from dtw import dtw
from sklearn.metrics import pairwise_distances
import pickle
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
# Presumably the filename prefix of the per-day CSV files: load_data reads the
# date from filename characters 12-19, i.e. right after a prefix of this
# length -- TODO confirm against the dataset layout.
PRE_NAME = "onemin_ohlc_"
# Inclusive bounds of the intraday window; load_data compares them (as strings)
# against the CSV "time" column to select the feature rows.
BEGIN_TIME = "09:00:00"
END_TIME = "10:30:00"
# Number of clusters requested from AgglomerativeClustering and the size of
# every per-cluster accumulator array.
NUM_CLUSTER = 50
# load_data keeps every TIME_STEP-th row of the feature window.
TIME_STEP = 5
def load_data(date_begin='20180612', date_end = '20200301', split_date_begin = '20200302', split_date_end = '20200915', train = True):
    """Load per-day return series from ``dataset/<sid>/<file>``.

    For every CSV whose date (8 characters of the filename, starting right
    after a prefix of ``len(PRE_NAME)`` characters) falls inside the selected
    date range, the "return" column is split in two parts using the "time"
    column:

    * rows with ``BEGIN_TIME <= time <= END_TIME``, down-sampled to every
      ``TIME_STEP``-th row, become one row of ``X``;
    * all remaining rows become the matching row of ``Y``.

    Days whose down-sampled window does not have the full expected length are
    dropped.

    Parameters
    ----------
    date_begin, date_end : str
        Inclusive ``YYYYMMDD`` bounds used when ``train`` is true.
    split_date_begin, split_date_end : str
        Inclusive ``YYYYMMDD`` bounds used when ``train`` is false.
    train : bool
        Selects which of the two date ranges is loaded.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` and ``Y``, built with ``np.array`` from the per-day arrays.
    """
    X = []
    Y = []

    # Expected number of samples in the window, derived from the configured
    # bounds instead of a hard-coded 90 minutes.
    h1, m1, s1 = [int(x) for x in BEGIN_TIME.split(":")]
    h2, m2, s2 = [int(x) for x in END_TIME.split(":")]
    window_start = dt.timedelta(hours=h1, minutes=m1, seconds=s1)
    window_end = dt.timedelta(hours=h2, minutes=m2, seconds=s2)
    total_mins = int((window_end - window_start).total_seconds() // 60)
    expected_len = total_mins // TIME_STEP + 1

    if train:
        first_day, last_day = date_begin, date_end
    else:
        first_day, last_day = split_date_begin, split_date_end

    # Assumes files are named PRE_NAME + YYYYMMDD + ... -- TODO confirm.
    date_start = len(PRE_NAME)
    root = 'dataset'

    # sorted(): os.listdir order is arbitrary; sorting makes the row order of
    # X/Y (and everything derived from it) reproducible between runs.
    for sid in tqdm(sorted(os.listdir(root))):
        sid_dir = os.path.join(root, sid)
        if not os.path.isdir(sid_dir):
            # Stray files next to the per-symbol folders would make listdir fail.
            continue
        for file in sorted(os.listdir(sid_dir)):
            curday = file[date_start:date_start + 8]
            if not (first_day <= curday <= last_day):
                continue
            df = pd.read_csv(os.path.join(sid_dir, file))
            mask = (df["time"] >= BEGIN_TIME) & (df["time"] <= END_TIME)
            front = df.loc[mask, "return"].iloc[::TIME_STEP]
            rest = df.loc[~mask, "return"]
            if len(front) == expected_len:
                X.append(front.to_numpy())
                Y.append(rest.to_numpy())

    # NOTE(review): nothing here forces the rows of Y to share one length; if
    # they differ, np.array cannot build a regular 2-D array -- confirm the data.
    return np.array(X), np.array(Y)

def dtw_d(X, Y):
    """Return the DTW distance between ``X`` and ``Y``.

    The point-wise cost handed to ``dtw`` is the absolute difference; of the
    four values ``dtw`` returns, only the first one is kept.
    """
    def _abs_diff(a, b):
        return np.abs(a - b)

    distance, _cost, _acc_cost, _path = dtw(X, Y, dist=_abs_diff)
    return distance

def dtw_affinity(X):
    """Pairwise distance matrix over the rows of ``X`` with ``dtw_d`` as metric."""
    distance_matrix = pairwise_distances(X, metric=dtw_d)
    return distance_matrix

def fastdtw_d(X, Y):
    """Return the first element of ``fastdtw``'s result for ``X`` and ``Y``.

    ``fastdtw`` is called with scipy's ``euclidean`` as point-wise distance.
    """
    result = fastdtw(X, Y, dist=euclidean)
    return result[0]

def fastdtw_affinity(X):
    """Pairwise distance matrix over the rows of ``X`` with ``fastdtw_d`` as metric."""
    distance_matrix = pairwise_distances(X, metric=fastdtw_d)
    return distance_matrix


# Training split: load_data defaults to train=True, i.e. files dated within
# its default date_begin..date_end range.
X, Y = load_data()


  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:01<00:14,  1.19s/it][A
 15%|█▌        | 2/13 [00:02<00:13,  1.19s/it][A
 23%|██▎       | 3/13 [00:03<00:11,  1.18s/it][A
 31%|███       | 4/13 [00:04<00:10,  1.15s/it][A
 38%|███▊      | 5/13 [00:05<00:09,  1.14s/it][A
 46%|████▌     | 6/13 [00:06<00:07,  1.13s/it][A
 54%|█████▍    | 7/13 [00:07<00:06,  1.13s/it][A
 62%|██████▏   | 8/13 [00:09<00:05,  1.13s/it][A
 69%|██████▉   | 9/13 [00:10<00:04,  1.12s/it][A
 77%|███████▋  | 10/13 [00:11<00:03,  1.12s/it][A
 85%|████████▍ | 11/13 [00:12<00:02,  1.12s/it][A
 92%|█████████▏| 12/13 [00:13<00:01,  1.12s/it][A
100%|██████████| 13/13 [00:14<00:00,  1.13s/it][A


In [3]:
# Held-out split: train=False selects files dated within load_data's default
# split_date_begin..split_date_end range.
X_test, Y_test = load_data(train=False)


  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:00<00:04,  2.64it/s][A
 15%|█▌        | 2/13 [00:00<00:04,  2.64it/s][A
 23%|██▎       | 3/13 [00:01<00:03,  2.61it/s][A
 31%|███       | 4/13 [00:01<00:03,  2.61it/s][A
 38%|███▊      | 5/13 [00:01<00:03,  2.61it/s][A
 46%|████▌     | 6/13 [00:02<00:02,  2.63it/s][A
 54%|█████▍    | 7/13 [00:02<00:02,  2.65it/s][A
 62%|██████▏   | 8/13 [00:03<00:01,  2.59it/s][A
 69%|██████▉   | 9/13 [00:03<00:01,  2.62it/s][A
 77%|███████▋  | 10/13 [00:03<00:01,  2.64it/s][A
 85%|████████▍ | 11/13 [00:04<00:00,  2.66it/s][A
 92%|█████████▏| 12/13 [00:04<00:00,  2.68it/s][A
100%|██████████| 13/13 [00:04<00:00,  2.64it/s][A


In [4]:
print(X.shape)

(2990, 19)


In [5]:
print(X_test.shape)

(915, 19)


# Wall-clock timing of the full pairwise distance matrix over X, first with
# the fastdtw-based metric, then with the dtw-based one. The matrices
# themselves are discarded; only the elapsed seconds are printed.
t1 = time.time()
fastdtw_affinity(X)
t2 = time.time()
print("time:", t2-t1)
t1 = time.time()
dtw_affinity(X)
t2 = time.time()
print("time:", t2-t1)

In [None]:
# Three timed complete-linkage fits on X. Each fit rebinds both `ac` and
# `X_label`, so after this cell they hold the result of the LAST fit only
# (the one without a custom affinity).
# NOTE(review): newer scikit-learn releases rename the `affinity` argument to
# `metric` -- confirm against the installed version.

# 1) fastdtw-based pairwise distances.
t1 = time.time()
ac = AgglomerativeClustering(n_clusters = NUM_CLUSTER,
                             affinity = fastdtw_affinity,
                             linkage = 'complete')
X_label = ac.fit_predict(X)
t2 = time.time()
print(t2-t1)

# 2) dtw-based pairwise distances.
t1 = time.time()
ac = AgglomerativeClustering(n_clusters = NUM_CLUSTER,
                             affinity = dtw_affinity,
                             linkage = 'complete')
X_label = ac.fit_predict(X)
t2 = time.time()
print(t2-t1)

# 3) Library-default affinity; this is the labelling that survives the cell.
t1 = time.time()
ac = AgglomerativeClustering(n_clusters = NUM_CLUSTER,
                             linkage = 'complete')
X_label = ac.fit_predict(X)
t2 = time.time()
print(t2-t1)

In [None]:
X_label

In [None]:
# Persist the training features and targets; each file receives the object its
# name refers to (X -> X.pk, Y -> Y.pk).
# X_table is not written here: it does not exist yet at this point of a
# top-to-bottom run, and the cell that builds it with make_table already
# writes it to dataset_pk/X_tabel.pk.
os.makedirs('dataset_pk', exist_ok=True)
with open(os.path.join('dataset_pk',"X.pk"), "wb") as x_file:
    pickle.dump(X, x_file)
with open(os.path.join('dataset_pk',"Y.pk"), "wb") as y_file:
    pickle.dump(Y, y_file)

In [None]:
# One figure per cluster id, overlaying every row of X labelled with that id.
for n in range(NUM_CLUSTER):
    plt.title(n)
    for series in X[X_label == n]:
        plt.plot(series)
    plt.show()

In [None]:
def make_long_simple(y, cost):
    """Maximum of ``y`` minus its first value, minus ``cost``."""
    entry = y[0]
    peak = y.max()
    return peak - entry - cost
def make_short_simple(y, cost):
    """First value of ``y`` minus its minimum, minus ``cost``."""
    entry = y[0]
    trough = y.min()
    return entry - trough - cost
def make_long_max_lost(y, cost):
    """Minimum of ``y`` minus its first value, minus ``cost``."""
    entry = y[0]
    trough = y.min()
    return trough - entry - cost
def make_short_max_lost(y, cost):
    """First value of ``y`` minus its maximum, minus ``cost``."""
    entry = y[0]
    peak = y.max()
    return entry - peak - cost
def make_long(y, cost, exp_profit):
    """Result of a long position opened at ``y[0]`` with a profit target.

    Returns ``exp_profit`` if at any point ``y - y[0] - cost`` reaches
    ``exp_profit``; otherwise returns ``y[-1] - y[0] - cost``.
    """
    entry = y[0]
    net_gain = y - entry - cost
    if not (net_gain >= exp_profit).any():
        # Target never reached: settle at the last value.
        return y[-1] - entry - cost
    return exp_profit

def make_short(y, cost, exp_profit):
    """Result of a short position opened at ``y[0]`` with a profit target.

    Returns ``exp_profit`` if at any point ``y[0] - y - cost`` reaches
    ``exp_profit``; otherwise returns ``y[0] - y[-1] - cost``.
    """
    entry = y[0]
    net_gain = entry - y - cost
    if not (net_gain >= exp_profit).any():
        # Target never reached: settle at the last value.
        return -y[-1] + entry - cost
    return exp_profit

In [None]:
def make_table(X, X_label, n_clusters=None):
    """Build the table of per-cluster mean series.

    Row ``k`` of the result is the element-wise mean of every row of ``X``
    whose label equals ``k``, so the table can be indexed directly by cluster
    id.

    Parameters
    ----------
    X : array-like, 2-D
        One series per row.
    X_label : array-like of int
        Cluster id of each row of ``X``.
    n_clusters : int, optional
        Number of rows of the table; defaults to the module-level
        ``NUM_CLUSTER``.

    Returns
    -------
    numpy.ndarray
        Array with ``n_clusters`` rows and as many columns as ``X``. Rows of
        clusters without any member are filled with NaN.
    """
    if n_clusters is None:
        n_clusters = NUM_CLUSTER
    X = np.asarray(X)
    X_label = np.asarray(X_label)

    X_mean = np.full((n_clusters, X.shape[1]), np.nan)
    for k in range(n_clusters):
        members = X[X_label == k]
        if len(members):
            # Store under the cluster id itself, not under the label of the
            # k-th sample.
            X_mean[k] = members.mean(axis=0)
    return X_mean
# Per-cluster mean series of the training set, persisted for later reuse.
X_table = make_table(X, X_label)
# Context manager so the file is flushed and closed right after the dump.
with open(os.path.join('dataset_pk',"X_tabel.pk"), "wb") as table_file:
    pickle.dump(X_table, table_file)

In [None]:
len(X)

In [None]:
Y.shape

In [None]:
# Reset the per-cluster accumulators (one slot per cluster id).
profit_long_array, profit_short_array, lost_long_array, lost_short_array = (
    np.zeros(NUM_CLUSTER) for _ in range(4)
)
num_long_array = np.zeros(NUM_CLUSTER, dtype="int")
num_short_array = np.zeros(NUM_CLUSTER, dtype="int")

# Add every training sample's outcome (cost 0.002) to the slot of its cluster.
for sample in range(len(X)):
    cluster = X_label[sample]
    y_series = Y[sample]
    profit_long_array[cluster] += make_long_simple(y_series, 0.002)
    profit_short_array[cluster] += make_short_simple(y_series, 0.002)
    lost_long_array[cluster] += make_long_max_lost(y_series, 0.002)
    lost_short_array[cluster] += make_short_max_lost(y_series, 0.002)
    num_long_array[cluster] += 1
    num_short_array[cluster] += 1

# Raw per-cluster totals followed by the per-cluster sample counts.
for totals in (profit_long_array, profit_short_array,
               lost_long_array, lost_short_array,
               num_long_array, num_short_array):
    print(totals)

In [None]:
# Persist the fitted clustering model; the context manager closes the file so
# the pickle is fully flushed to disk.
with open("ac_model2.pc", 'wb') as model_file:
    pickle.dump(ac, model_file)

In [None]:
# Restore the clustering model saved earlier.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files you created yourself.
with open("ac_model2.pc", 'rb') as model_file:
    ac = pickle.load(model_file)

In [None]:
# Reset the per-cluster accumulators (one slot per cluster id).
profit_long_array, profit_short_array, lost_long_array, lost_short_array = (
    np.zeros(NUM_CLUSTER) for _ in range(4)
)
num_long_array = np.zeros(NUM_CLUSTER, dtype="int")
num_short_array = np.zeros(NUM_CLUSTER, dtype="int")

# Add every training sample's outcome (cost 0.002) to the slot of its cluster.
for sample in range(len(X)):
    cluster = X_label[sample]
    y_series = Y[sample]
    profit_long_array[cluster] += make_long_simple(y_series, 0.002)
    profit_short_array[cluster] += make_short_simple(y_series, 0.002)
    lost_long_array[cluster] += make_long_max_lost(y_series, 0.002)
    lost_short_array[cluster] += make_short_max_lost(y_series, 0.002)
    num_long_array[cluster] += 1
    num_short_array[cluster] += 1

# Per-cluster averages; each line shows the ten LARGEST values, descending.
# NOTE(review): for the two "worst" lines the ten largest values are the ones
# closest to zero -- confirm this is the intended ordering.
avg_profit_long = profit_long_array / num_long_array
avg_profit_short = profit_short_array / num_short_array
avg_lost_long = lost_long_array / num_long_array
avg_lost_short = lost_short_array / num_short_array
print('best avg long return', sorted(avg_profit_long)[::-1][:10])
print('best avg short return', sorted(avg_profit_short)[::-1][:10])
print('worst avg long return', sorted(avg_lost_long)[::-1][:10])
print('worst avg short return', sorted(avg_lost_short)[::-1][:10])
print(num_long_array)
print(num_short_array)

In [None]:
def find_cluster(X, X_table):
    """Index of the ``X_table`` row closest to ``X`` under ``dtw_d``.

    Rows ``0 .. NUM_CLUSTER - 1`` are compared; when several rows are equally
    close, the highest index wins.
    """
    best_index = 0
    best_distance = dtw_d(X, X_table[0])
    for candidate in range(1, NUM_CLUSTER):
        distance = dtw_d(X, X_table[candidate])
        if distance <= best_distance:
            best_index, best_distance = candidate, distance
    return best_index

In [None]:
def find_cluster_L2(X, X_table, n_clusters=None):
    """Index of the ``X_table`` row closest to ``X`` in squared L2 distance.

    Parameters
    ----------
    X : numpy.ndarray
        Query series, same length as the rows of ``X_table``.
    X_table : numpy.ndarray
        One reference series per row.
    n_clusters : int, optional
        Number of leading rows of ``X_table`` to compare against; defaults to
        the module-level ``NUM_CLUSTER``.

    Returns
    -------
    int
        Index of the closest row; when several rows are equally close, the
        highest index wins.
    """
    if n_clusters is None:
        n_clusters = NUM_CLUSTER

    # The starting distance uses the same squared-L2 metric as the loop, so
    # row 0 is compared on equal terms with every other row.
    best_index = 0
    best_distance = np.sum((X - X_table[0]) ** 2)
    for candidate in range(1, n_clusters):
        distance = np.sum((X - X_table[candidate]) ** 2)
        if best_distance >= distance:
            best_distance = distance
            best_index = candidate
    return best_index

In [None]:
find_cluster_L2(X_test[0], X_table)

In [None]:
# Reset the per-cluster accumulators (one slot per cluster id).
profit_long_array, profit_short_array, lost_long_array, lost_short_array = (
    np.zeros(NUM_CLUSTER) for _ in range(4)
)
num_long_array = np.zeros(NUM_CLUSTER, dtype="int")
num_short_array = np.zeros(NUM_CLUSTER, dtype="int")

# Take-profit strategies on the training set: cost 0.002, profit target 0.025.
for sample in range(len(X)):
    cluster = X_label[sample]
    y_series = Y[sample]
    profit_long_array[cluster] += make_long(y_series, 0.002, 0.025)
    profit_short_array[cluster] += make_short(y_series, 0.002, 0.025)
    num_long_array[cluster] += 1
    num_short_array[cluster] += 1

# Ten largest per-cluster averages, then the per-cluster sample counts.
print('avg long return', sorted(profit_long_array / num_long_array)[::-1][:10])
print('avg short return', sorted(profit_short_array / num_short_array)[::-1][:10])
print(num_long_array)
print(num_short_array)

# Cluster ids ordered from highest to lowest average long return.
best_long_cluster = np.argsort(profit_long_array / num_long_array)[::-1]

In [None]:
best_long_cluster

In [None]:
def evaluate(X_test, Y_test, X_table, exp_profit=0.025):
    """Replay the take-profit strategies on a held-out set, grouped by cluster.

    Each row of ``X_test`` is assigned to a cluster with ``find_cluster`` and
    the results of ``make_long`` / ``make_short`` on the matching row of
    ``Y_test`` (cost 0.002, target ``exp_profit``) are added to that cluster.

    Parameters
    ----------
    X_test : sequence of series
        Series used to pick the cluster.
    Y_test : sequence of series
        Series the strategies are evaluated on; same order as ``X_test``.
    X_table : numpy.ndarray
        Reference series per cluster, as consumed by ``find_cluster``.
    exp_profit : float
        Profit target passed to ``make_long`` and ``make_short``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, numpy.ndarray)
        Summed long results, summed short results and number of samples, each
        indexed by cluster id (length ``NUM_CLUSTER``).
    """
    profit_long_array = np.zeros(NUM_CLUSTER)
    profit_short_array = np.zeros(NUM_CLUSTER)
    num_exchange_array = np.zeros(NUM_CLUSTER, dtype="int")
    for i in tqdm(range(len(X_test))):
        cluster = find_cluster(X_test[i], X_table)
        # The long side is accumulated as well: the caller unpacks and prints it.
        profit_long_array[cluster] += make_long(Y_test[i], 0.002, exp_profit)
        profit_short_array[cluster] += make_short(Y_test[i], 0.002, exp_profit)
        num_exchange_array[cluster] += 1
    # The caller unpacks exactly these three arrays.
    return profit_long_array, profit_short_array, num_exchange_array

In [None]:
# Expects evaluate to return three per-cluster arrays for the held-out set.
# NOTE: this rebinds profit_long_array, profit_short_array and num_long_array,
# which earlier cells filled from the training set.
profit_long_array, profit_short_array, num_long_array  = evaluate(X_test, Y_test, X_table)

In [None]:
# Walk the clusters in best_long_cluster order and print, per rank, the
# cluster's entry of profit_long_array and of num_long_array.
for rank, cluster in enumerate(best_long_cluster):
    print(rank, profit_long_array[cluster], num_long_array[cluster])

In [None]:
# Average long return per cluster. Clusters with a zero count get -inf so they
# rank last; a plain division would give NaN, which argsort places at the end
# and the reversal would then move to the front.
avg_long_return = np.full(len(profit_long_array), -np.inf)
np.divide(profit_long_array, num_long_array,
          out=avg_long_return, where=num_long_array > 0)
best_long_cluster = np.argsort(avg_long_return)[::-1]

# Plot the rows of X belonging to each of the ten best-ranked clusters (the
# ranking computed above is what selects the clusters, not the ids 0-9).
for n in best_long_cluster[:10]:
    plt.title(n)
    for series in X[X_label == n]:
        plt.plot(series)
    plt.show()

In [None]:
print()