In [None]:
import numpy as np
import os
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
import random

def get_last_days(dates, ref_date, num):
    """Return the `num` entries of `dates` that immediately precede `ref_date`.

    Args:
        dates: Sequence of comparable date keys, expected to be sorted in
            ascending order (the lookup stops at the first entry that is
            greater than or equal to `ref_date`).
        ref_date: Reference date; it is itself excluded from the result.
        num: Number of preceding entries to return.

    Returns:
        The slice of `dates` holding the `num` entries right before
        `ref_date`, or an empty list (after printing a message) when fewer
        than `num` entries precede it.
    """
    # Index of the first date on or after ref_date; len(dates) when every
    # date is earlier than ref_date (or when dates is empty).
    ind = next(
        (pos for pos, day in enumerate(dates) if day >= ref_date),
        len(dates),
    )

    if ind < num:
        print("not enough historical data to get the last {0} days from {1}".format(num, ref_date))
        return []

    # Slice ends are exclusive, so [ind - num, ind) is exactly `num` entries
    # ending with the day just before ref_date.
    return dates[ind - num:ind]
    
    
# Experiment configuration shared by test_sgd() and run().
_DATA_PATH = "data/"
_TICKER = "IVV"
_REF_DATE = "20150204"
_NUM_DAYS = 15      # number of days of history requested before _REF_DATE
_TRAIN_ROWS = 4000  # rows used for training; the remainder is held out
_SGD_EPOCHS = 500   # passes over the training data per fit


def _load_dataset():
    """Load and stack the feature/signal files for the configured window.

    Returns:
        A `(data, sig)` tuple with the per-day `*_obp.txt` arrays and
        `*_sig.txt` arrays concatenated along axis 0, in date order.

    Raises:
        ValueError: if get_last_days() yields no dates to load.
    """
    # The date is taken from the second '_'-separated field of each
    # "*_obp.txt" file name found in the data directory.
    dates = sorted(
        name.split('_')[1]
        for name in os.listdir(_DATA_PATH)
        if name.endswith("_obp.txt")
    )

    last_dates = get_last_days(dates, _REF_DATE, _NUM_DAYS)
    if not last_dates:
        # Without this guard the failure surfaces later as an opaque
        # IndexError when an empty 1-D array is sliced with two indices.
        raise ValueError(
            "no data files available for {0} before {1}".format(_TICKER, _REF_DATE))

    data_parts = []
    sig_parts = []
    for date in last_dates:
        data_parts.append(
            np.loadtxt("{0}{1}_{2}_obp.txt".format(_DATA_PATH, _TICKER, date)))
        sig_parts.append(
            np.loadtxt("{0}{1}_{2}_sig.txt".format(_DATA_PATH, _TICKER, date)))

    return np.concatenate(data_parts, axis=0), np.concatenate(sig_parts, axis=0)


def _make_sgd_classifier(epochs):
    """Build an SGDClassifier that performs `epochs` passes over the data."""
    try:
        # Newer scikit-learn replaced `n_iter` with `max_iter`; tol=None
        # disables early stopping so every epoch is run, as `n_iter` did.
        return linear_model.SGDClassifier(max_iter=epochs, tol=None)
    except TypeError:
        # Older scikit-learn releases only accept `n_iter`.
        return linear_model.SGDClassifier(n_iter=epochs)


def test_sgd(correct, correct_p, correct_n, correct_m, i, dataset=None):
    """Fit an SGD classifier and record its hold-out accuracy in slot `i`.

    The first `_TRAIN_ROWS` rows are used for training and the remaining
    rows for evaluation.

    Args:
        correct: Array receiving the overall hold-out accuracy at index `i`.
        correct_p: Array receiving the accuracy on rows whose signal is 1.
        correct_n: Array receiving the accuracy on rows whose signal is 0.
        correct_m: Array receiving the accuracy on rows whose signal is -1.
        i: Index of the slot to fill in the four arrays above.
        dataset: Optional `(data, sig)` tuple as returned by
            `_load_dataset()`. When omitted the files are read from disk,
            which matches the original behaviour; passing it lets a caller
            load the data once for many runs.

    Returns:
        A `(sig_cross, cross_pred)` tuple: the hold-out signals and the
        classifier's predictions for them.
    """
    if dataset is None:
        dataset = _load_dataset()
    data, sig = dataset

    data_train = data[:_TRAIN_ROWS, :]
    data_cross = data[_TRAIN_ROWS:, :]
    sig_train = sig[:_TRAIN_ROWS]
    sig_cross = sig[_TRAIN_ROWS:]

    sgd = _make_sgd_classifier(_SGD_EPOCHS)
    sgd.fit(data_train, sig_train)

    cross_pred = sgd.predict(data_cross)

    # Element-wise hit mask; the per-class accuracies are its mean over the
    # rows belonging to each true class.
    hits = (cross_pred == sig_cross)
    correct[i] = np.mean(hits)
    correct_p[i] = np.mean(hits[sig_cross == 1])
    correct_n[i] = np.mean(hits[sig_cross == 0])
    correct_m[i] = np.mean(hits[sig_cross == -1])

    return sig_cross, cross_pred


def run():
    """Repeat the SGD experiment, print mean accuracies and plot a histogram.

    The histogram compares the hold-out signals with the predictions of the
    final run only.
    """
    num = 100
    correct = np.zeros(num)
    correct_p = np.zeros(num)
    correct_n = np.zeros(num)
    correct_m = np.zeros(num)

    # The input files are identical for every run, so read them once.
    dataset = _load_dataset()
    for i in range(num):
        sig_cross, cross_pred = test_sgd(
            correct, correct_p, correct_n, correct_m, i, dataset)

    print("The percentage of total correct predictions is {}%".format(100*np.mean(correct)))
    print("The percentage of total correct positive predictions is {}%".format(100*np.mean(correct_p)))
    print("The percentage of total correct neutral predictions is {}%".format(100*np.mean(correct_n)))
    print("The percentage of total correct negative predictions is {}%".format(100*np.mean(correct_m)))

    # NOTE(review): hist() plots raw counts here while the y-label says '%';
    # confirm which one is intended.
    fig, ax = plt.subplots()
    ax.hist(sig_cross, color='lightblue', alpha=0.5, bins=[-1.5, -0.5, 0.5, 1.5])
    ax.hist(cross_pred, color='salmon', alpha=0.5, bins=[-1.5, -0.5, 0.5, 1.5])
    ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
    ax.margins(0.05)
    ax.set_ylim(bottom=0)
    plt.show()


# Execute the experiment as soon as this cell/script is run.
run()