In [23]:
import math

import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from extractors import *

import warnings
warnings.filterwarnings("ignore")

In [40]:
# Algorithm Parameters
# Bounds on the data for each part of the algorithm
training_bound = 0.8
fit_bound = 0.9

In [41]:
# select which feature extractor to use
extractor = KNearestNeighborsExtractor()

# select which data to train and test on
file_name = "15min_manyFeatures.csv"

In [42]:
# Read in our data
print "Opening file..."
df = pd.read_csv(file_name, parse_dates=True) 
# remove datapoints without all features defined
df = df.dropna()
# reset our index after dropping invalid datapoints
df = df.reset_index(drop=True)
print "File opened."

Opening file...
File opened.


In [43]:
# create list of labels - 'increase' or 'decrease' for each datapoint
print "Creating label list..."
all_labels = []
for i in range(len(df["weighted_price"]) - 1):
    if df["weighted_price"][i] < df["weighted_price"][i+1]:
        all_labels.append("decrease")
    else:
        all_labels.append("increase")
print "Label list created."

Creating label list...
Label list created.


In [44]:
# create feature vectors using the given extractor
print "Creating feature list..."
feature_vectors = []
for index, row in df.iterrows():
    feature_vectors.append(extractor.getFeatures(row))
print "Feature list created."

Creating feature list...
Feature list created.


In [45]:
# remove last datapoint, as we don't have a label for it
feature_vectors = feature_vectors[:-1]
assert len(feature_vectors) == len(all_labels)
len(feature_vectors)

95237

In [30]:
# Remove the first half of the data set
all_labels = all_labels[len(all_labels) / 2:]
feature_vectors = feature_vectors[len(feature_vectors) / 2:]
assert len(feature_vectors) == len(all_labels)

In [31]:
# normalize the feature vectors
# feature_vectors = preprocessing.scale(feature_vectors)

In [46]:
# calculate number of datapoints to use for training
training_datapoints = int(math.floor(len(all_labels) * training_bound))

In [47]:
# create list of training features and labels
train_labels = all_labels[:training_datapoints]
train_features = feature_vectors[:training_datapoints]

# normalize training data
scaler = StandardScaler()
scaler.fit(train_features)
train_features = scaler.transform(train_features)

In [48]:
# set up the classifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), verbose=10, random_state=1)

In [53]:
# fit the classifier to the data
clf = clf.fit(train_features, train_labels)

In [50]:
# optimize hyperparameters

# calculate number of datapoints to use for optimizing
fit_datapoints = int(math.floor(len(all_labels) * fit_bound))
# create list of fit features and labels
fit_labels = all_labels[training_datapoints:fit_datapoints]
fit_features = feature_vectors[training_datapoints:fit_datapoints]
assert len(fit_labels) == len(fit_features)

In [15]:
potential_alphas = 10.0 ** -np.arange(1, 7)
print potential_alphas

best_alpha = potential_alphas[0]
best_correct = 0
for test_alpha in potential_alphas:
    clf.set_params(alpha=test_alpha)
    # test current k value
    total = 0
    correct = 0
    for i in range(len(fit_labels)):
        if clf.predict(fit_features[i]) == fit_labels[i]:
            correct += 1
        total += 1
    if correct > best_correct:
        best_alpha = test_alpha
        best_correct = correct
    print "Accuracy: {}".format(100.0 * correct / total)
print best_alpha

[  1.00000000e-01   1.00000000e-02   1.00000000e-03   1.00000000e-04
   1.00000000e-05   1.00000000e-06]
Accuracy: 51.9739605208
Accuracy: 51.9739605208
Accuracy: 51.9739605208
Accuracy: 51.9739605208
Accuracy: 51.9739605208
Accuracy: 51.9739605208
0.1


In [54]:
# now, we will use the best k value for our testing
#clf.set_params(alpha=best_alpha)

# create list of testing labels and feature vectors
test_labels = all_labels[fit_datapoints:]
test_features = feature_vectors[fit_datapoints:]
assert len(test_labels) == len(test_features)
test_features = scaler.transform(test_features)

In [55]:
correct = 0
total = 0
for i in range(len(test_labels)):
    if clf.predict(test_features[i]) == test_labels[i]:
        correct += 1
    total += 1
100.0 * correct / total

52.45695086098278

In [18]:
100.0 * correct / total

49.20201595968081

In [28]:
best_k

14

In [41]:
10.0 ** -np.arange(1, 7)

array([  1.00000000e-01,   1.00000000e-02,   1.00000000e-03,
         1.00000000e-04,   1.00000000e-05,   1.00000000e-06])