In [1]:
"""
A random forest classifier aimed at determining whether a stock will be higher or lower after some given amount of days.
Replication of Khaidem, Saha, & Roy Dey (2016)

Documentation on function:
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier as make_forest
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score as acc
import numpy as np

'''
### Outline ###
We have a bunch of columns of different length target values
We drop all target values except the ones we want to analyze (or else when we remove NA we will remove too much data)
We then input the data and features in to the first .fit parameter, and the labels in the second
'''

num_features = 5
prediction_window = 14
full_data = pd.read_csv('data_preprocessed.csv')

In [4]:
# drop all target columns not to be analyzed
headers = full_data.columns.values
headers = headers[13:] # should return just the headers of the target values
headers = headers[headers!='Target({})'.format(prediction_window)]
selected_data = full_data.drop(headers, axis=1)
selected_data = selected_data.dropna(axis=0, how='any') # using the subset parameter might allow us to skip dropping other targets?

In [5]:
print(selected_data.head())

    Unnamed: 0        Date   Open   High    Low  Close     Volume Symbol  \
14          14  2010-01-25  29.42  29.65  29.23  29.45  2581167.0      A   
15          15  2010-01-26  29.37  29.50  28.98  29.29  1929019.0      A   
16          16  2010-01-27  29.20  29.29  28.73  29.17  3120255.0      A   
17          17  2010-01-28  29.13  29.37  28.49  28.69  3532089.0      A   
18          18  2010-01-29  28.93  29.17  27.92  28.03  4236738.0      A   

          RSI      PROC          SO          WR       EWMA  Target(14)  
14  29.847495 -0.059105  100.000000 -100.000000  29.512076        -1.0  
15  31.065760 -0.053941   90.404040  -85.858586  29.364025        -1.0  
16  30.995475 -0.054457   84.848485  -93.939394  29.234675        -1.0  
17  28.189300 -0.068809   82.828283 -100.000000  28.871558        -1.0  
18  24.863884 -0.089935   85.772358 -100.000000  28.310519        -1.0  


In [6]:
### Drop useless labels ###
selected_data.drop(["Unnamed: 0"], axis = 1, inplace = True)
selected_data.drop(["Date"], axis = 1, inplace = True)
selected_data.drop(["Symbol"], axis = 1, inplace = True)

In [7]:
print(selected_data.head())

     Open   High    Low  Close     Volume        RSI      PROC          SO  \
14  29.42  29.65  29.23  29.45  2581167.0  29.847495 -0.059105  100.000000   
15  29.37  29.50  28.98  29.29  1929019.0  31.065760 -0.053941   90.404040   
16  29.20  29.29  28.73  29.17  3120255.0  30.995475 -0.054457   84.848485   
17  29.13  29.37  28.49  28.69  3532089.0  28.189300 -0.068809   82.828283   
18  28.93  29.17  27.92  28.03  4236738.0  24.863884 -0.089935   85.772358   

            WR       EWMA  Target(14)  
14 -100.000000  29.512076        -1.0  
15  -85.858586  29.364025        -1.0  
16  -93.939394  29.234675        -1.0  
17 -100.000000  28.871558        -1.0  
18 -100.000000  28.310519        -1.0  


In [8]:
# Split in Test-x and Test-y #
train_y = selected_data['Target({})'.format(prediction_window)][:2000].as_matrix()
test_y = selected_data['Target({})'.format(prediction_window)][2000:2100].as_matrix()

selected_data.drop(['Target({})'.format(prediction_window)], axis = 1, inplace = True)
train_x = selected_data[:2000].as_matrix()
test_x = selected_data[2000:2100].as_matrix()

In [10]:
# Ignore the warning. Works anyway
Random_Forest = make_forest(max_features=num_features, bootstrap=True, oob_score=True, verbose=1)
Random_Forest.fit(train_x, train_y)

print("TEST: TREE VS ORIGINAL")
res = Random_Forest.predict(test_x)
print("ACCURACY: "+str(acc(test_y, res)))

TEST: TREE VS ORIGINAL
ACCURACY: 1.0


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
