In [None]:
%matplotlib inline

import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn import ensemble

def get_last_days(dates, ref_date, num):
    """Return the `num` entries of `dates` immediately preceding `ref_date`.

    Args:
        dates: Sequence of mutually comparable date keys in ascending order
            (the caller passes sorted "YYYYMMDD" strings).
        ref_date: Reference date. Entries comparing ``>= ref_date`` are never
            part of the result.
        num: Number of trailing entries to return.

    Returns:
        The slice of `dates` holding the `num` entries right before the first
        entry that is ``>= ref_date`` (the last `num` entries overall when
        every entry is earlier than `ref_date`). When fewer than `num`
        earlier entries exist, a message is printed and ``[]`` is returned.
    """
    # Position of the first entry at or after ref_date; len(dates) when every
    # entry is earlier. Still a linear scan.
    ind = next((i for i, d in enumerate(dates) if d >= ref_date), len(dates))

    if ind < num:
        print("not enough historical data to get the last {0} days from {1}"
              .format(num, ref_date))
        return []

    # Slice relative to `ind` (not a loop index, which is off by one when no
    # entry reaches ref_date) and stop at `ind` exclusive so that exactly
    # `num` entries come back, including the one just before ref_date.
    return dates[ind - num:ind]
    
    
# NOTE(review): unit scale factors, presumably microseconds per second/minute
# and micro-dollars per dollar -- confirm. Not referenced in the visible part
# of this file.
SEC = 1000000
MIN = 60 * SEC
DOLLAR = 1000000
DATA_PATH = "data/"

ticker = "IVV"
ref_date = "20150204"
# Number of days before ref_date to load.
num_days = 15

# Data files are named "<ticker>_<date>_obp.txt". Only this ticker's files
# are considered, because the loader below opens "<ticker>_<date>_*.txt" and
# a date taken from another ticker's file may not exist for this one. The set
# drops any repeated date so no day is loaded twice.
obp_prefix = ticker + "_"
files = os.listdir(DATA_PATH)
dates = sorted(set(
    f.split('_')[1]
    for f in files
    if f.startswith(obp_prefix) and f.endswith("_obp.txt")
))

last_dates = get_last_days(dates, ref_date, num_days)

# Load one "_obp" array (used as the feature rows below) and one "_sig" array
# (used as the labels below) per selected day, then stack the days in date
# order. The per-day arrays are collected first and concatenated once,
# instead of re-copying the growing array on every iteration.
daily_data = []
daily_sig = []
for date in last_dates:
    data_filename = "{0}{1}_{2}_obp.txt".format(DATA_PATH, ticker, date)
    sig_filename = "{0}{1}_{2}_sig.txt".format(DATA_PATH, ticker, date)
    daily_data.append(np.loadtxt(data_filename))
    daily_sig.append(np.loadtxt(sig_filename))

if not daily_data:
    # Without this check the script fails further down with an unrelated
    # "too many indices" IndexError when slicing an empty 1-D array.
    raise ValueError(
        "no data loaded for {0}: no dates selected before {1}".format(ticker, ref_date))

data = np.concatenate(daily_data, axis=0)
sig = np.concatenate(daily_sig, axis=0)

# Train

# Hold-out split by row position: the first TRAIN_ROWS rows fit the model and
# all later rows are kept for validation. Rows are stacked in date order, so
# the training rows come from the earliest loaded days.
TRAIN_ROWS = 4000
train_data = data[:TRAIN_ROWS, :]
cross_data = data[TRAIN_ROWS:, :]
train_sig = sig[:TRAIN_ROWS]
cross_sig = sig[TRAIN_ROWS:]

# Only the forest size is pinned. The other arguments that used to be spelled
# out were the library defaults at the time, and newer scikit-learn releases
# reject two of them (min_impurity_split, max_features='auto').
rf = ensemble.RandomForestClassifier(n_estimators=10)
rf.fit(train_data, train_sig)
cross_pred = rf.predict(cross_data)

# Sum of absolute label values, predicted vs. actual. With -1/0/1 labels (as
# the correctness section below assumes) this is the count of non-zero labels.
print(np.sum(np.absolute(cross_pred)))
print(np.sum(np.absolute(cross_sig)))

# Histogram

# Overlay the actual validation labels (light blue) and the predicted labels
# (salmon). The bin edges give one unit-wide bin centred on each of -1, 0, 1.
label_bins = [-1.5, -0.5, 0.5, 1.5]
fig, ax = plt.subplots()
# The validation labels are held in `cross_sig`; the name `sig_cross` is never
# defined anywhere and raised a NameError here.
ax.hist(cross_sig, color='lightblue', alpha=0.5, bins=label_bins)
ax.hist(cross_pred, color='salmon', alpha=0.5, bins=label_bins)
# NOTE(review): hist() is plotting raw counts, while the y-label says
# percentages -- confirm which one is intended.
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()

# Important features
# (Disabled diagnostic: prints and plots the forest's feature importances.)

# importances = rf.feature_importances_
# std = np.std([tree.feature_importances_ for tree in rf.estimators_],
#              axis=0)
# indices = np.argsort(importances)[::-1]

# # Print the feature ranking
# print("Feature ranking:")

# for f in range(train_data.shape[1]):
#     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# # Plot the feature importances of the forest
# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(train_data.shape[1]), importances[indices],
#        color="r", yerr=std[indices], align="center")
# plt.xticks(range(data.shape[1]), indices)
# plt.xlim([-1, train_data.shape[1]])
# plt.show()

# Correctness

# Element-wise hit mask over the validation rows, then the same mask
# restricted to rows whose true label is +1, 0 and -1 respectively.
correct = (cross_pred == cross_sig)
correct_p, correct_c, correct_m = [
    correct[cross_sig == label] for label in (1, 0, -1)
]

# Hit rate overall, then per true label in the order +1, 0, -1.
for hits in (correct, correct_p, correct_c, correct_m):
    print(np.mean(hits))
