In [56]:
import collections
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt
%matplotlib inline 

# Slot of each quantized label (-1, 0, 1) inside a 3-element row.
LABEL_TO_INDEX = {label: slot for slot, label in enumerate((-1, 0, 1))}

# Parameters
# d: number of consecutive price differences per feature window / context.
d = 500  # recommendation is < log3(T / 100), where T is number of available time units
# theta: differences within [-theta, theta] are quantized to 0 by normalize().
theta = 0.0
# confidence: probability a -1/1 prediction must exceed before it is acted on.
confidence = 0.5
# NOTE(review): the test window is identical to the training window, so every
# score computed on the "test" slice is measured on training data -- confirm intended.
startTrain, endTrain = '2016-01-01', '2016-01-10'
startTest, endTest = '2016-01-01', '2016-01-10'

def loadData(path='btcnCNY_1-min_data_2012-01-01_to_2017-05-31.csv'):
    """Load the price CSV into a DataFrame indexed by timestamp.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the 1-minute btcnCNY export this
        notebook was written against.

    Returns
    -------
    pandas.DataFrame
        Only CSV columns 0 and 4 are kept. Column 0 must be named
        ``Timestamp`` (epoch seconds); it becomes the DatetimeIndex.
        Column 4 is presumably the ``Close`` price that the rest of the
        notebook reads -- verify against the CSV header.
        Missing values are forward-filled from the previous row.
    """
    df = pd.read_csv(path, usecols=[0, 4])
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df = df.ffill()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    return df.set_index('Timestamp')

def normalize(x):
    """Quantize one row of price differences to -1, 0 or 1.

    Compares ``x.Close`` with the module-level ``theta``: a value above
    ``theta`` maps to 1, a value below ``-theta`` maps to -1, and anything
    else maps to 0. NaN fails both comparisons and therefore maps to 0.
    """
    change = x.Close
    if change > theta:
        return 1
    return -1 if change < -theta else 0

# Each feature vector is a window of d consecutive raw price differences;
# its label is the quantized move of the step right after that window.
def extractFeaturesAndLabels(prices):
    """Build sliding-window features and next-step labels from ``prices``.

    Reads the module-level window length ``d`` and quantizes differences
    with ``normalize``.

    Returns
    -------
    (X, y)
        ``X`` is a list of flat lists, one per window of ``d`` consecutive
        rows of ``prices.diff()``. ``y`` is an array holding the quantized
        difference of the row that immediately follows each window.
    """
    deltas = prices.diff()
    quantized = deltas.apply(normalize, axis='columns')
    delta_values = deltas.values

    # Windows start at row 1 because row 0 of diff() is NaN; the last start
    # leaves one more row after the window to serve as its label.
    window_starts = range(1, len(quantized) - d)
    X = [delta_values[start:start + d].flatten().tolist() for start in window_starts]

    return X, quantized[d+1:].values

def buildConsecutiveRunRow(label, run):
    """Return a 3-slot row of zeros with ``run`` stored at ``label``'s slot.

    The slot is looked up in the module-level ``LABEL_TO_INDEX``; an unknown
    label raises ``KeyError``.
    """
    slot = LABEL_TO_INDEX[label]
    row = [0] * 3
    row[slot] = run
    return row

def scoreIgnoreZeros(X, y):
    """Score only the samples where the model confidently predicts -1 or 1.

    Uses the module-level classifier ``clf`` and threshold ``confidence``.
    A sample counts as a "trade" when the predicted probability of class -1
    or class 1 exceeds ``confidence``; samples that do not are skipped.

    Parameters
    ----------
    X : sequence of feature vectors passed to ``clf.predict_proba``.
    y : indexable sequence of true labels aligned with ``X``.

    Returns
    -------
    (correct, total) : tuple of int
        Number of confident predictions that matched ``y`` and the number
        of confident predictions made.
    """
    probabilities = clf.predict_proba(X)

    # predict_proba columns follow clf.classes_. Look the columns up instead
    # of assuming positions 0 and 2, so a training set that lacks one of the
    # classes does not shift or truncate the columns being read.
    classes = list(clf.classes_)
    down_col = classes.index(-1) if -1 in classes else None
    up_col = classes.index(1) if 1 in classes else None

    total, correct = 0, 0
    for i, row in enumerate(probabilities):
        p_down = row[down_col] if down_col is not None else 0.0
        p_up = row[up_col] if up_col is not None else 0.0
        if p_down > confidence or p_up > confidence:
            # When confidence < 0.5 both directions can pass the threshold;
            # take the more probable one (ties go to -1).
            predicted = -1 if p_down >= p_up else 1
            if predicted == y[i]:
                correct += 1
            total += 1
    return correct, total

def scoreNoIgnore(X, y):
    """Count exact matches between ``clf.predict(X)`` and ``y``.

    Uses the module-level classifier ``clf``. Every sample is scored,
    including those predicted as 0.

    Returns
    -------
    (correct, total) : tuple of int
        Number of matching predictions and ``len(y)``.
    """
    predictions = clf.predict(X)
    hits = sum(1 for idx in range(len(y)) if y[idx] == predictions[idx])
    return hits, len(y)

# converts an array of quantized values (ex. [-1, 0, 1, 1, -1]) to a number
def toIndex(a):
    """Encode a sequence of -1/0/1 values as a base-3 integer.

    Each value is shifted by 1 to become a digit in {0, 1, 2}; the first
    element is the most significant digit. An empty sequence encodes to 0.

    Every element is coerced to a Python ``int`` so the accumulator keeps
    arbitrary precision. Without the coercion, numpy integer inputs turn
    the accumulator into a fixed-width ``np.int64`` that silently wraps
    once the sequence has roughly 40 or more elements.
    """
    acc = 0
    for x in a:
        acc = acc * 3 + (int(x) + 1)  # shift quantized values by 1
    return acc
    
def buildProbabilityMap(prices):
    """Count which quantized move follows each length-``d`` context.

    The price differences of ``prices`` are quantized with ``normalize``.
    For every position ``e >= d`` the preceding ``d`` labels are encoded
    with ``toIndex`` and the label at ``e`` is tallied for that context.

    Returns
    -------
    collections.defaultdict
        Maps a context index to ``[count of -1, count of 0, count of 1]``.
        Only contexts that were observed are stored. Indexing an unseen
        context yields ``[0, 0, 0]``, so ``ec[n]`` behaves like the dense
        ``3**d``-row table this replaces, without allocating ``3**d`` rows
        (which is infeasible for large ``d``).
    """
    quantized = prices.diff().apply(normalize, axis='columns')
    # Work on a plain list of Python ints: positions are unambiguous whatever
    # index the frame carries, and slices feed toIndex() without pandas overhead.
    labels = np.asarray(quantized).ravel().tolist()

    ec = collections.defaultdict(lambda: [0, 0, 0])
    for e in range(d, len(labels)):
        n = toIndex(labels[e - d:e])
        ec[n][labels[e] + 1] += 1
    return ec

def predictEC(n, ec):
    """Predict the next move for context ``n`` from the count table ``ec``.

    ``ec[n]`` holds the counts for labels -1, 0 and 1 in that order.
    Returns -1 or 1 when that label's share of the counts exceeds the
    module-level ``confidence`` (-1 is checked first); returns 0 otherwise,
    including when the context has no observations.
    """
    counts = ec[n]
    observed = sum(counts)
    if observed == 0:
        return 0
    if counts[0] / observed > confidence:
        return -1
    return 1 if counts[2] / observed > confidence else 0

def scoreEC(prices, ec):
    """Score the count-table model ``ec`` on ``prices``.

    The price differences are quantized with ``normalize``. For each
    position ``e >= d`` the preceding ``d`` labels are encoded with
    ``toIndex`` and passed to ``predictEC``. Predictions of 0 are skipped.

    Returns
    -------
    (correct, total) : tuple of int
        Number of non-zero predictions that matched the actual label and
        the number of non-zero predictions made.
    """
    quantized = prices.diff().apply(normalize, axis='columns')
    # A plain list makes labels[e] positional regardless of the frame's index;
    # Series[int] is label-based on integer indexes and deprecated otherwise.
    labels = np.asarray(quantized).ravel().tolist()

    correct = total = 0
    for e in range(d, len(labels)):
        yp = predictEC(toIndex(labels[e - d:e]), ec)
        if yp != 0:
            total += 1
            if yp == labels[e]:
                correct += 1
    return correct, total

In [2]:
################## Load and preprocess data ##############
# Read the price CSV once; the cells below slice this frame by date range.
df = loadData()

In [57]:
################# Extract features and labels ############
# NOTE(review): startTest/endTest are set to the same dates as
# startTrain/endTrain, so the "test" features below come from the same rows
# as the training features -- confirm this is intended.
X_train,y_train = extractFeaturesAndLabels(df[startTrain:endTrain])

X_test, y_test = extractFeaturesAndLabels(df[startTest:endTest])

In [253]:
################ Add Polynomial features #################
# poly = PolynomialFeatures(2, interaction_only = True)
# X_train_poly = poly.fit_transform(X_train)
# X_test_poly = poly.fit_transform(X_test)

In [None]:
# Scratch check: the first five price differences of the training window as a
# flat list (the first entry is NaN because diff() has no previous row).
df[startTrain:endTrain].diff().values[0:5].flatten().tolist()

In [None]:
# Scratch check: print the first ten feature windows and their labels.
# print(len(X_train))
# print(len(X_test))
# diff = df[startTrain:endTrain].diff()
# h = diff.apply(normalize,'columns')
# print(h[0:10])
# print(diff[0:10])
print(X_train[0:10])
print(y_train[0:10])

In [58]:
################## Train Model ############################
# Alternative estimators kept for quick comparison runs:
#clf = RandomForestClassifier()
#clf = DummyClassifier()
clf = LogisticRegression(solver='newton-cg', multi_class='multinomial')
clf.fit(X_train, y_train)

# scoreIgnoreZeros reads the module-level `clf` fitted above and only counts
# samples where P(-1) or P(1) exceeds `confidence`.
correct, total = scoreIgnoreZeros(X_test, y_test)
# correctNoIgnore, totalNoIgnore = scoreNoIgnore(X_test, y_test)

print(clf)
# Guard against a run in which no prediction cleared the threshold.
if total != 0:
    print("Accuracy: ", correct/total)
else:
    print("Accuracy: ", 0)
print("# Trades: ", total)
print("clf.score: ", clf.score(X_test, y_test))

# print("Accuracy (no ignore): ",correctNoIgnore/totalNoIgnore)
# y_predict = clf.predict(X_train)
# for i in range(500):
#     print(y_predict[i])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy:  0.5977565368512521
# Trades:  12659
clf.score:  0.588315706166


In [97]:
################# Use the EC Model #######################
# Doesn't give any better results
# ec = buildProbabilityMap(df[startTrain:endTrain])

# corr, total = scoreEC(df[startTest:endTest], ec)
# print(corr, total)

0 0
