In [1]:
"""
A random forest classifier aimed at determining whether a stock will be higher or lower after some given amount of days.
Replication of Khaidem, Saha, & Roy Dey (2016)

Documentation on function:
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier as make_forest
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score as acc
import numpy as np

'''
### Outline ###
We have a bunch of columns of different length target values
We drop all target values except the ones we want to analyze (or else when we remove NA we will remove too much data)
We then input the data and features in to the first .fit parameter, and the labels in the second
'''

num_features = 5
prediction_window = 14
full_data = pd.read_csv('data_preprocessed.csv')

In [2]:
# drop all target columns not to be analyzed
headers = full_data.columns.values
headers = headers[13:] # should return just the headers of the target values
headers = headers[headers!='Target({})'.format(prediction_window)]
selected_data = full_data.drop(headers, axis=1)
selected_data = selected_data.dropna(axis=0, how='any') # using the subset parameter might allow us to skip dropping other targets?

In [35]:
print(selected_data.head(10000))

        Close     Volume        RSI      PROC          SO          WR  \
14      29.45  2581167.0  29.847495 -0.059105  100.000000 -100.000000   
15      29.29  1929019.0  31.065760 -0.053941   90.404040  -85.858586   
16      29.17  3120255.0  30.995475 -0.054457   84.848485  -93.939394   
17      28.69  3532089.0  28.189300 -0.068809   82.828283 -100.000000   
18      28.03  4236738.0  24.863884 -0.089935   85.772358 -100.000000   
19      29.13  4001809.0  37.177542 -0.054835   89.423077 -100.000000   
20      29.50  4209397.0  42.792109 -0.031199   77.564103  -64.743590   
21      29.54  4353100.0  41.001565 -0.037471   85.256410  -52.884615   
22      29.07  4744169.0  33.750000 -0.066774  100.000000  -51.602564   
23      29.17  6828734.0  39.100346 -0.041407   86.642599  -62.454874   
24      29.20  4952879.0  35.294118 -0.051948  100.000000  -58.844765   
25      29.41  3684273.0  38.868613 -0.039830  100.000000  -55.000000   
26      29.25  2328789.0  38.517179 -0.041612  100.

In [4]:
### Drop useless labels ###
selected_data.drop(["Unnamed: 0"], axis = 1, inplace = True)
selected_data.drop(["Date"], axis = 1, inplace = True)
selected_data.drop(["Symbol","Open","High","Low"], axis = 1, inplace = True)


In [5]:
print(selected_data.head())

    Close     Volume        RSI      PROC          SO          WR       EWMA  \
14  29.45  2581167.0  29.847495 -0.059105  100.000000 -100.000000  29.512076   
15  29.29  1929019.0  31.065760 -0.053941   90.404040  -85.858586  29.364025   
16  29.17  3120255.0  30.995475 -0.054457   84.848485  -93.939394  29.234675   
17  28.69  3532089.0  28.189300 -0.068809   82.828283 -100.000000  28.871558   
18  28.03  4236738.0  24.863884 -0.089935   85.772358 -100.000000  28.310519   

    Target(14)  
14        -1.0  
15        -1.0  
16        -1.0  
17        -1.0  
18        -1.0  


In [30]:
# Split in Test-x and Test-y #
train_y = selected_data['Target({})'.format(prediction_window)][:10000].as_matrix()
test_y = selected_data['Target({})'.format(prediction_window)][10000:15000].as_matrix()

selected_train = selected_data.drop(['Target({})'.format(prediction_window)], axis = 1)
train_x = selected_train[:10000].as_matrix()
test_x = selected_train[10000:15000].as_matrix()

In [22]:
print(len(full_data))

813632


In [8]:
print(train_x)

[[  2.94500000e+01   2.58116700e+06   2.98474946e+01 ...,  -1.00000000e+02
    2.95120762e+01  -1.00000000e+00]
 [  2.92900000e+01   1.92901900e+06   3.10657596e+01 ...,  -8.58585859e+01
    2.93640254e+01  -1.00000000e+00]
 [  2.91700000e+01   3.12025500e+06   3.09954751e+01 ...,  -9.39393939e+01
    2.92346751e+01  -1.00000000e+00]
 ..., 
 [  4.16800000e+01   1.09916060e+07   5.79317269e+01 ...,  -4.27777778e+01
    4.19369928e+01   1.00000000e+00]
 [  4.02900000e+01   1.59412710e+07   5.01340483e+01 ...,  -5.87209302e+01
    4.08389976e+01   1.00000000e+00]
 [  3.95300000e+01   1.07190680e+07   3.80254154e+01 ...,  -1.00000000e+02
    3.99663325e+01  -1.00000000e+00]]


In [31]:
# Ignore the warning. Works anyway
Random_Forest = make_forest(n_estimators=65, max_features=num_features, bootstrap=True, oob_score=True, verbose=1)
Random_Forest.fit(train_x, train_y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [29]:
print("TEST: TREE VS ORIGINAL")
res = Random_Forest.predict(test_x)
print("ACCURACY: "+str(acc(test_y, res)))

TEST: TREE VS ORIGINAL
ACCURACY: 1.0


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  65 out of  65 | elapsed:    0.0s finished


In [21]:

print(test_y)

[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
