In [13]:
import sys
sys.path.append('C:/ProgramData/Anaconda3/envs/tf-gpu/Lib/site-packages')
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score

import keras
from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense, Dropout, Activation, LeakyReLU, BatchNormalization
from keras import losses

#from tensorflow.python.keras.callbacks import TensorBoard
#from time import time
#tensorboard = TensorBoard(log_dir="logs/{}".format(time()))


#Get constants
#Execute the CONSTANTS notebook so the names it defines can be used here
#(presumably this is where GET_PERIODS comes from - confirm in CONSTANTS.ipynb)
%run ./CONSTANTS.ipynb
#PERIODS is the window length used by the cells below: each expanded feature row
#holds PERIODS consecutive periods, and the first PERIODS - 1 targets are dropped to match
PERIODS = GET_PERIODS()

In [2]:
def train_given_symbol(symb):
    
    #Get the dataframe from our function Request.ipynb
    %run ./Request.ipynb
    raw_data_df = get_symbol_data_df(symb)
    
    #Pull out the goals and percent change
    goals_np_arr = raw_data_df['goal'].values
    pc_np_arr = raw_data_df['percent_change'].values
    
    #Pull out the input data (don't include the percent change or the goals) as a numpy array
    inputs_np_arr = raw_data_df.copy().drop(columns=['goal', 'percent_change']).values
    
    #Add previous periods onto each row and normalize
    inputs_np_arr = period_expand_np_arr(inputs_np_arr)
    inputs_np_arr = normalize_np_arr(inputs_np_arr)
    
    #We have to make sure the goal column matches the rows still!
    goals_np_arr = goals_np_arr[PERIODS - 1:]
    pc_np_arr = pc_np_arr[PERIODS - 1:]
    
    #We have to put the goals back for the test/train split to work
    full_data_np_arr = np.append(inputs_np_arr, np.reshape(goals_np_arr, (-1, 1)), 1)
    full_data_np_arr = np.append(full_data_np_arr, np.reshape(pc_np_arr, (-1, 1)), 1)
    
    #Train the models
    dnn_model = train_dnn_classifier(full_data_np_arr)
    rf_model = train_rf_classifier(full_data_np_arr)
    
    return dnn_model, rf_model
    

In [3]:
#Add previous periods onto each row

def period_expand_np_arr(raw_np_arr, periods=None):
    """Append the previous periods onto each row of a 2-D array.

    Output row ``k`` corresponds to input row ``k + periods - 1`` and is laid
    out as ``[row, row - 1, ..., row - (periods - 1)]``, i.e. the current
    period first followed by progressively older ones. The first
    ``periods - 1`` input rows do not have enough history and therefore
    produce no output row.

    Args:
        raw_np_arr: 2-D array-like of shape (n_rows, n_cols).
        periods: Number of consecutive periods per output row. Defaults to
            the notebook-level ``PERIODS`` constant when omitted.

    Returns:
        A float64 array of shape
        ``(max(n_rows - periods + 1, 0), n_cols * periods)``. When the input
        has too few rows to form a single window, an empty array with the
        correct number of columns is returned.

    Raises:
        ValueError: If ``periods`` is smaller than 1.
    """
    if periods is None:
        periods = PERIODS
    if periods < 1:
        raise ValueError('periods must be at least 1, got {}'.format(periods))

    raw_np_arr = np.asarray(raw_np_arr)
    n_rows, n_cols = raw_np_arr.shape[0], raw_np_arr.shape[1]

    #Clamp at zero so a too-short input yields an empty result instead of
    #asking numpy for an array with a negative dimension
    n_windows = max(n_rows - periods + 1, 0)
    full_np_arr = np.empty((n_windows, n_cols * periods))

    #Fill one block of columns per lag: lag 0 is the current period, lag 1 the
    #one before it, and so on. One slice assignment per lag replaces the
    #per-row np.append calls, which copied the growing row every time.
    for lag in range(periods):
        first_source_row = periods - 1 - lag
        full_np_arr[:, lag * n_cols:(lag + 1) * n_cols] = \
            raw_np_arr[first_source_row:first_source_row + n_windows]

    return full_np_arr
    

In [4]:
#Normalize a numpy array

def normalize_np_arr(np_arr):
    """Return np_arr after L2 normalization along axis 0.

    This is a thin wrapper that delegates to keras.utils.normalize with
    axis=0 and order=2; the input is passed through unchanged otherwise.
    """
    return keras.utils.normalize(np_arr, axis=0, order=2)

In [5]:
def train_dnn_classifier(data_np_arr):
    """Train and evaluate a dense binary classifier on the prepared matrix.

    The last two columns of data_np_arr are targets: the second-to-last
    column is used as the label and the last column is left out of both the
    features and the labels. Rows are split 75/25 into train and test sets
    with a fixed random_state of 13.

    Args:
        data_np_arr: 2-D array whose columns are features followed by the
            two target columns.

    Returns:
        The fitted Keras Sequential model. The result of model.evaluate on
        the test split is printed, not returned.
    """
    train_rows, test_rows = train_test_split(data_np_arr, test_size=0.25, random_state=13)

    #Everything except the final two columns is a feature; column -2 is the label
    x_train, y_train = train_rows[:, :-2], train_rows[:, -2]
    x_test, y_test = test_rows[:, :-2], test_rows[:, -2]
    feature_count = x_train.shape[1]

    #NN - Binary classifier: an input-width layer, five shrinking hidden
    #layers (each followed by 50% dropout) and a single sigmoid output
    model = Sequential()
    model.add(Dense(feature_count, input_dim=feature_count, activation='relu'))
    model.add(Dropout(0.5))
    for hidden_units in (1000, 500, 250, 125, 75):
        model.add(Dense(hidden_units, activation='relu'))
        model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=100, batch_size=200)

    #evaluate returns the loss followed by the compiled metrics
    score = model.evaluate(x_test, y_test, batch_size=128)
    print('The NN model has accuracy stats:', score)

    #Saving is currently disabled: model.save('dnn_classifier.h5')

    return model

In [6]:
def train_rf_classifier(data_np_arr, model_path='rf_model.sav'):
    """Train, evaluate and persist a random forest classifier.

    The last two columns of data_np_arr are targets: the second-to-last
    column is used as the label and the last column is left out of both the
    features and the labels. Rows are split 75/25 into train and test sets
    with a fixed random_state of 13, the same split arguments that
    train_dnn_classifier uses.

    Args:
        data_np_arr: 2-D array whose columns are features followed by the
            two target columns.
        model_path: File the fitted model is pickled to. Defaults to
            'rf_model.sav' in the current working directory.

    Returns:
        The fitted RandomForestClassifier. The test-split accuracy is
        printed, not returned.
    """
    (train, test) = train_test_split(data_np_arr, test_size=0.25, random_state=13)

    #Everything except the final two columns is a feature; column -2 is the label
    x_train = train[:, :-2]
    x_test = test[:, :-2]

    y_train = train[:, -2]
    y_test = test[:, -2]

    parameters = {'bootstrap': True,
                  'min_samples_leaf': 3,
                  'n_estimators': 500,
                  'min_samples_split': 5,
                  'max_features': 'sqrt',
                  'max_depth': 500,
                  'max_leaf_nodes': None}

    RF_model = RandomForestClassifier(**parameters)

    RF_model.fit(x_train, y_train)

    #Test the accuracy on the held-out rows
    RF_predictions = RF_model.predict(x_test)
    score = accuracy_score(y_test, RF_predictions)
    print('The RF model has accuracy stats:', score)

    #SAVE MODEL
    #The with-block closes the file even if pickling fails, so the handle is
    #no longer left open for the lifetime of the kernel
    with open(model_path, 'wb') as model_file:
        pickle.dump(RF_model, model_file)

    return RF_model

In [7]:
#Train both classifiers for the symbol 'A'; the returned (dnn_model, rf_model)
#tuple is not assigned, so it is shown as this cell's output
train_given_symbol('A')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
The NN model has accuracy stats: [0.963755630480543, 0.516129031715756]
The RF model has accuracy stats: 0.5045492142266336


(<keras.engine.sequential.Sequential at 0x14a6c0617f0>,
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=500, max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=3, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False))