In [26]:
import collections
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical
%matplotlib inline 

# Maps a quantized price move (-1, 0, 1) to its position in the 3-element
# row produced by buildConsecutiveRunRow.
LABEL_TO_INDEX = {-1: 0, 0: 1, 1: 2}

# Parameters
# History window length: every sample is built from d consecutive quantized
# moves, and the move that follows the window is the label.
d = 5  # recommendation is < log3(T / 100), where T is number of available time units
# Dead-band used by normalize(): a difference above theta maps to 1, below
# -theta maps to -1, anything else maps to 0.
theta = 0.0
# Probability threshold used by scoreIgnoreZeros: a sample is scored only when
# the first or the third class probability exceeds it.
confidence = 0.1
# Date ranges used to slice the loaded frame into train and test sets.
startTrain = '2016-01-01'
endTrain = '2016-03-30'
startTest = '2016-05-01'
endTest = '2016-05-15'

def loadData(path='btcnCNY_1-min_data_2012-01-01_to_2017-05-31.csv'):
    """Load the price CSV into a DataFrame indexed by timestamp.

    Only columns 0 and 4 of the file are read. Column 0 must be named
    'Timestamp' and hold epoch seconds; column 4 is presumably the close
    price (other code in this notebook reads `.Close`) -- TODO confirm
    against the file header.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the data file this notebook
        was written against.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the parsed 'Timestamp' column, with missing values
        forward-filled from the previous row.
    """
    df = pd.read_csv(path, usecols=[0, 4])
    # DataFrame.ffill() replaces fillna(method='ffill'); the `method` keyword
    # of fillna is deprecated in current pandas releases.
    df = df.ffill()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    return df.set_index('Timestamp')

def normalize(x):
    """Quantize a row's Close value into a movement label.

    Returns 1 when x.Close is greater than the global theta, -1 when it is
    less than -theta, and 0 in every other case.
    """
    move = x.Close
    if move > theta:
        return 1
    return -1 if move < -theta else 0

# feature vector [last quantized move, count of -1, 0, 1, trailing run of -1, 0, 1]
def extractFeaturesAndLabels(prices):
    """Build a 7-column feature matrix and one-hot labels from a price frame.

    The prices are differenced and quantized row by row with normalize().
    Each sample covers a window of d consecutive quantized moves; the label
    is the move immediately after the window.

    Columns of X:
      0    last quantized move in the window
      1-3  how many -1 / 0 / 1 moves the window contains
      4-6  length of the run of identical moves that ends at the last move
           of the window, stored in the slot of that move's label (the
           other two slots are 0)

    Parameters
    ----------
    prices : pandas.DataFrame
        Frame that normalize() can be applied to row-wise after .diff().

    Returns
    -------
    (X, y)
        X is an integer array of shape (len(prices) - d, 7); y is the
        to_categorical encoding of (move + 1) with 3 classes.

    Raises
    ------
    ValueError
        If prices has d rows or fewer, so no complete sample exists.
    """
    # history of quantized price differences
    quantized = prices.diff().apply(normalize, 'columns')
    # Use a plain array for positional access: integer keys on a Series
    # whose index is not integer-typed are deprecated as positions in pandas.
    h = np.asarray(quantized, dtype=int)

    numSamples = len(h) - d
    if numSamples <= 0:
        raise ValueError(
            'need more than d=%d rows to build a sample, got %d' % (d, len(h)))

    # Python ints index the Counter and LABEL_TO_INDEX lookups directly.
    moves = h.tolist()

    # feature 1 - last price movement (quantized value)
    # TODO: the original author noted this should be the actual price diff.
    X = h[d-1:len(h)-1].reshape((numSamples, 1))

    # feature 2 - tally counts, maintained as a sliding window
    cnt, cnts = collections.Counter(moves[:d]), []
    for i in range(numSamples):
        cnts.append([cnt[-1], cnt[0], cnt[1]])
        cnt[moves[i]] -= 1       # oldest move leaves the window
        cnt[moves[i+d]] += 1     # next move enters the window
    X = np.append(X, cnts, axis=1)

    # feature 3 - run of identical moves ending at the window's last move
    runs = []
    for i in range(d-1, numSamples+d-1):
        run, label = 0, moves[i]
        for j in range(d):
            if moves[i-j] != label:
                break
            run += 1
        runs.append(buildConsecutiveRunRow(label, run))
    X = np.append(X, runs, axis=1)

    # Fix the class count so the label matrix always has 3 columns, even
    # when the slice happens to contain no +1 move.
    return X, to_categorical(h[d:] + 1, num_classes=3)

def extractFeaturesAndLabels2(prices):
    """Build raw-window features: each row holds d consecutive quantized moves.

    The prices are differenced and quantized row by row with normalize().
    Row i of X is the window of moves i .. i+d-1, and the label is the move
    at position i+d.

    Parameters
    ----------
    prices : pandas.DataFrame
        Frame that normalize() can be applied to row-wise after .diff().

    Returns
    -------
    (X, y)
        X is a float array of shape (len(prices) - d, d); y is the
        to_categorical encoding of (move + 1) with 3 classes.

    Raises
    ------
    ValueError
        If prices has d rows or fewer, so no complete sample exists.
    """
    q = np.asarray(prices.diff().apply(normalize, 'columns'), dtype=int)
    numSamples = len(q) - d
    if numSamples <= 0:
        raise ValueError(
            'need more than d=%d rows to build a sample, got %d' % (d, len(q)))
    X = np.empty((numSamples, d))
    for i in range(numSamples):
        X[i, :] = q[i:i+d]
    # Fix the class count so the label matrix always has 3 columns, even
    # when the slice happens to contain no +1 move.
    return X, to_categorical(q[d:] + 1, num_classes=3)

def buildConsecutiveRunRow(label, run):
    """Return a 3-element list that is zero except for `run` in the label's slot.

    The slot is looked up in LABEL_TO_INDEX, so a label that is not a key of
    that mapping raises KeyError.
    """
    row = [0] * 3
    slot = LABEL_TO_INDEX[label]
    row[slot] = run
    return row

def scoreIgnoreZeros(X, y, model=None, threshold=None):
    """Count correct directional calls, skipping samples without a confident one.

    A sample is scored only when the first or the third class probability
    exceeds the threshold. The predicted direction is whichever of the two
    is larger; a tie resolves to -1. Previously -1 was chosen whenever its
    probability cleared the threshold, even if +1 was far more probable.

    Parameters
    ----------
    X : array-like
        Samples passed to the model's predict_proba.
    y : sequence
        True labels (-1, 0 or 1), indexed positionally as y[i].
    model : object, optional
        Anything exposing predict_proba(X). Defaults to the notebook-level
        global `clf`, which must already exist when this is omitted.
    threshold : float, optional
        Minimum probability for a directional call. Defaults to the
        notebook-level global `confidence`.

    Returns
    -------
    (correct, total) : tuple of int
        Number of correct directional calls and number of calls scored.
    """
    # Fall back to the notebook globals so existing two-argument calls
    # behave as before.
    if model is None:
        model = clf
    if threshold is None:
        threshold = confidence

    total, correct = 0, 0
    for i, yp in enumerate(model.predict_proba(X)):
        # assumes predict_proba columns are ordered as classes [-1, 0, 1]
        # -- TODO confirm against the fitted model's classes_
        down, up = yp[0], yp[2]
        if down > threshold or up > threshold:
            # At least one side cleared the threshold, so the larger one did.
            p = -1 if down >= up else 1
            if p == y[i]:
                correct += 1
            total += 1
    return correct, total

In [3]:
################## Load and preprocess data ##############
# Read the price CSV through loadData(); the resulting frame is sliced by
# date range in the feature-extraction cell.
df = loadData()

In [27]:
################# Extract features and labels ############
# Slice the frame to the configured training date range and build the
# feature matrix and one-hot labels from it.
X_train,y_train = extractFeaturesAndLabels(df[startTrain:endTrain])

# Same extraction over the configured test date range.
X_test, y_test = extractFeaturesAndLabels(df[startTest:endTest])

In [30]:
################## Train Model ############################
# Fully-connected classifier: one hidden ReLU layer, then a 3-way softmax
# over the movement classes.
model = Sequential()
# Take the input width from the data so the network stays in sync with the
# feature extractor (7 columns for extractFeaturesAndLabels).
model.add(Dense(64, input_dim=X_train.shape[1]))
model.add(Activation('relu'))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32)
# NOTE(review): this scores the training set; X_test / y_test are built by
# the feature-extraction cell but not used here -- confirm whether a
# held-out evaluation was intended.
loss_and_metrics = model.evaluate(X_train, y_train, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [35]:
# Result of model.evaluate: the loss followed by the metrics passed to
# model.compile (accuracy).
print(loss_and_metrics)

[0.80944486872282173, 0.52404027932210584]
