In [83]:
%load_ext zipline

The zipline extension is already loaded. To reload it, use:
  %reload_ext zipline


In [84]:
import pickle 
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

def labelData(T = 5, profit_taking = 0.07, loss_stopping= -0.07,
              history_file = '../data/history.pickle'):
    """Label each point of a pickled history by which threshold is crossed first.

    For every index ``i`` that has at least ``T`` later observations, the
    window ``history[i:i+T]`` is scanned in order and the relative change of
    each value versus ``history[i]`` is compared with the two thresholds.

    Parameters
    ----------
    T : int
        Length of the scanned window; also the number of trailing
        observations that receive no label.
    profit_taking : float
        A relative change strictly greater than this counts as a profit hit.
    loss_stopping : float
        A relative change strictly smaller than this counts as a loss hit.
    history_file : str
        Path of the pickle holding the history sequence. Only load files you
        trust: unpickling can execute arbitrary code.

    Returns
    -------
    tuple of np.ndarray
        ``X`` holds the first ``len(history) - T`` observations and ``Y`` the
        matching labels: ``1`` when the profit threshold is crossed first,
        ``-1`` when the loss threshold is crossed first (or both are crossed
        on the same step), ``0`` when neither is crossed inside the window.
    """
    with open(history_file, 'rb') as f:
        history = pickle.load(f)

    N = len(history)
    # Slice with an explicit stop: history[:-T] would be empty for T == 0
    # and leave X and Y with different lengths.
    n_samples = max(N - T, 0)
    X = list(history[:n_samples])
    Y = [0] * n_samples

    for i in range(n_samples):
        curr = history[i]
        # Offsets (within the window) of the first profit / loss hit.
        profit_step, loss_step = None, None
        # NOTE(review): the window starts at i itself, so step 0 always has a
        # zero change and only T-1 later observations are really inspected.
        # Confirm whether history[i+1:i+T+1] was intended.
        for j, v in enumerate(history[i:i+T]):
            delta = (v - curr)/curr
            if profit_step is None and delta > profit_taking:
                profit_step = j
            if loss_step is None and delta < loss_stopping:
                loss_step = j

        # Compare against None explicitly: a hit at step 0 is a valid hit and
        # must not be treated as "no hit" through truthiness.
        if profit_step is None and loss_step is None:
            label = 0
        elif profit_step is None:
            label = -1
        elif loss_step is None:
            label = 1
        else:
            label = 1 if profit_step < loss_step else -1
        Y[i] = label

    return np.array(X), np.array(Y)

def preprocessData(X, Y, win=7):
    """Turn a sequence into overlapping windows paired with the label at each window's end.

    For every position ``end`` from ``win - 1`` up to ``len(Y) - 1`` the
    slice ``X[end + 1 - win:end + 1]`` becomes one row and ``Y[end]`` its
    target. Both collections are returned as numpy arrays.
    """
    pairs = [
        (X[end + 1 - win:end + 1], Y[end])
        for end in range(win - 1, len(Y))
    ]
    windows = [window for window, _ in pairs]
    targets = [target for _, target in pairs]
    return np.array(windows), np.array(targets)
        
        
    


def trainModel(Xtrain, Ytrain, Xtest, Ytest, random_state=None):
    """Fit a random-forest classifier and print its hold-out metrics.

    Parameters
    ----------
    Xtrain, Ytrain
        Training features and labels passed to ``model.fit``.
    Xtest, Ytest
        Hold-out features and labels used for the printed metrics.
    random_state : int or None
        Seed forwarded to the classifier. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for repeatable fits.

    Returns
    -------
    The fitted classifier.

    Prints the train/test shapes, the model's classes, the accuracy (as a
    percentage) and the weighted precision and recall on the test set.
    """
    model = RF(random_state=random_state)

    info = 'Training Sample: {}   '.format(Xtrain.shape) + \
    'Testing Sample: {}'.format(Xtest.shape)
    print(info)

    model.fit(Xtrain, Ytrain)

    # Only hard labels are needed for the metrics below; the previously
    # computed class probabilities were never read, so that call is dropped.
    YpredictLabels = model.predict(Xtest)
    avg_mode = 'weighted'  # average per-class scores weighted by class support
    acc = model.score(Xtest, Ytest) * 100
    precision = precision_score(Ytest, YpredictLabels,average=avg_mode)
    recall = recall_score(Ytest, YpredictLabels,average=avg_mode)

    acc_res = 'Accuracy={:.2f}%.'.format(acc)
    pre_res = 'Precision={:.2f}'.format(precision)
    recall_res = 'Recall={:.2f}'.format(recall)
    info = 'model classes:{}'.format(model.classes_)

    print(info)
    print(acc_res)
    print(pre_res)
    print(recall_res)

    return model
        
       
# Load the pickled dataset; it is indexed by 'x' and 'y'.
perf = pd.read_pickle("../data/data.pickle")
X_raw, Y_raw = perf['x'], perf['y']

# Build sliding windows of `win` consecutive values, labelled by the window's
# last position. X/Y keep their names for any later cell that uses them.
win = 7
X, Y = preprocessData(X_raw, Y_raw, win)

# Seed the split so the train/test partition (and the printed metrics' data)
# is the same on every Restart & Run All. The classifier built inside
# trainModel is not seeded from here.
# NOTE(review): consecutive windows share win-1 values, so a shuffled split
# places near-duplicate rows on both sides; a chronological split would give
# a more honest score. Confirm the intent.
split_seed = 42
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.33, shuffle=True, random_state=split_seed)

model = trainModel(Xtrain, Ytrain, Xtest, Ytest)

# NOTE(review): the file name says "LR" although trainModel fits a
# RandomForestClassifier; the path is kept so existing loaders keep working.
output_file = '../model/LR.sav'

with open(output_file, 'wb') as f:
    pickle.dump(model, f)




Training Sample: (838, 7)   Testing Sample: (414, 7)
model classes:[-1  0  1]
Accuracy=81.40%.
Precision=0.77
Recall=0.81
