In [24]:
import math

import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from extractors import *

import warnings
warnings.filterwarnings("ignore")

In [25]:
# Algorithm Parameters
# Bounds on the data for each part of the algorithm
training_bound = 0.8
fit_bound = 0.9

In [26]:
# select which feature extractor to use
extractor = KNearestNeighborsExtractor()

# select which data to train and test on
file_name = "15min_manyFeatures.csv"

In [27]:
# Read in our data
print "Opening file..."
df = pd.read_csv(file_name, parse_dates=True) 
# remove datapoints without all features defined
df = df.dropna()
# reset our index after dropping invalid datapoints
df = df.reset_index(drop=True)
print "File opened."

Opening file...
File opened.


In [28]:
# create list of labels - 'increase' or 'decrease' for each datapoint
print "Creating label list.."
all_labels = []
for i in range(len(df["weighted_price"]) - 1):
    if df["weighted_price"][i] < df["weighted_price"][i+1]:
        all_labels.append("decrease")
    else:
        all_labels.append("increase")
print "Label list created."

Creating label list..
Label list created.


In [29]:
# create feature vectors using the given extractor
print "Creating feature list..."
feature_vectors = []
for index, row in df.iterrows():
    feature_vectors.append(extractor.getFeatures(row))
print "Feature list created."

Creating feature list...
Feature list created.


In [30]:
# remove last datapoint, as we don't have a label for it
feature_vectors = feature_vectors[:-1]
assert len(feature_vectors) == len(all_labels)

In [31]:
# Remove the first half of the data set
all_labels = all_labels[len(all_labels) / 2:]
feature_vectors = feature_vectors[len(feature_vectors) / 2:]
assert len(feature_vectors) == len(all_labels)

In [32]:
# calculate number of datapoints to use for training
training_datapoints = int(math.floor(len(all_labels) * training_bound))

In [33]:
# create list of training features and labels
train_labels = all_labels[:training_datapoints]
train_features = feature_vectors[:training_datapoints]

In [34]:
# set up the classifier
neigh = KNeighborsClassifier()

In [35]:
# fit the classifier to the data
neigh = neigh.fit(train_features, train_labels)

In [36]:
# optimize hyperparameters

# calculate number of datapoints to use for optimizing
fit_datapoints = int(math.floor(len(all_labels) * fit_bound))
# create list of fit features and labels
fit_labels = all_labels[training_datapoints:fit_datapoints]
fit_features = feature_vectors[training_datapoints:fit_datapoints]
assert len(fit_labels) == len(fit_features)

In [37]:
best_k = 1
best_correct = 0
for test_k in range(1,15):
    print "Test k = {}...".format(test_k)
    neigh.set_params(n_neighbors=test_k)
    correct = 0
    total = 0
    # test current k value
    for i in range(len(fit_labels)):
        if neigh.predict(fit_features[i]) == fit_labels[i]:
            correct += 1
        total += 1
    if correct > best_correct:
        best_k = test_k
        best_correct = correct
    print "Accuracy: {}".format(1.0 * correct / total)

Test k = 1...
Accuracy: 0.488030239395
Test k = 2...
Accuracy: 0.484880302394
Test k = 3...
Accuracy: 0.490760184796
Test k = 4...
Accuracy: 0.491390172197
Test k = 5...
Accuracy: 0.491180176396
Test k = 6...
Accuracy: 0.492860142797
Test k = 7...
Accuracy: 0.493490130197
Test k = 8...
Accuracy: 0.493490130197
Test k = 9...
Accuracy: 0.490130197396
Test k = 10...
Accuracy: 0.490760184796
Test k = 11...
Accuracy: 0.489080218396
Test k = 12...
Accuracy: 0.489080218396
Test k = 13...
Accuracy: 0.486770264595
Test k = 14...
Accuracy: 0.487400251995


In [38]:
# now, we will use the best k value for our testing
neigh.set_params(n_neighbors=best_k)

# create list of testing labels and feature vectors
test_labels = all_labels[fit_datapoints:]
test_features = feature_vectors[fit_datapoints:]
assert len(test_labels) == len(test_features)

In [40]:
correct = 0
total = 0
for i in range(len(test_labels)):
    if neigh.predict(test_features[i]) == test_labels[i]:
        correct += 1
    total += 1

In [41]:
1.0 * correct / total

0.4794204115917682

In [39]:
best_k

7