In [1]:
import quandl as quandl
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
def get_scaled_data_value_arrays(all_data_table):
        data_value_arrays = all_data_table.values
        data_value_arrays = data_value_arrays.astype('float32')
        # normalize features
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_data_value_arrays = scaler.fit_transform(data_value_arrays)
        return scaled_data_value_arrays

def get_performance_for_labels(sequence_df_start, sequence_df_end):
        price_perf = sequence_df_end['close_price_next_quarter'] - sequence_df_start['close_price'] / sequence_df_start[
            'close_price']
        baseline_perf = sequence_df_end['baseline_price_next_quarter'] - sequence_df_start['baseline_price'] / \
                        sequence_df_start['baseline_price']
        perf = price_perf - baseline_perf / baseline_perf
        return perf
    
def to_sequence_data(data_arrays, seq_num=1):
        return data_arrays.reshape((data_arrays.shape[0] // seq_num, seq_num, data_arrays.shape[1]))
    
def get_dataset_from_intrinio_for_RNN():
        print('---- From intrinio ----')
        all_data_table = pd.DataFrame()
        all_labels = []

        # Read data from files
        resources_path = '../resources'
        files = os.listdir(resources_path)
        for file in files:
            print(file)
            df = pd.read_csv(os.path.join(resources_path, file))
            all_data_table = all_data_table.append(df, ignore_index=True)
            print(all_data_table.shape)

        all_data_table = all_data_table.replace('nm', np.nan)
        threshold = int(0.9 * all_data_table.shape[0])
        all_data_table = all_data_table.dropna(axis='columns', thresh=threshold)
        all_data_table = all_data_table.dropna(axis='rows', how='any')
        all_data_table.sort_values('date', ascending=True, inplace=True)
        print(all_data_table.shape)

        sequence_df = pd.DataFrame()
        sequence_length = 4  # 4 quarters as a sequence
        unique_tickers = all_data_table['ticker'].unique().tolist()
        for ticker in unique_tickers:
            ticker_df = all_data_table[all_data_table['ticker'] == ticker]
            idx = 0
            while idx < ticker_df.shape[0] - sequence_length:
                sequence_df = sequence_df.append(ticker_df[idx:idx + sequence_length], ignore_index=True)
                labels_perf = get_performance_for_labels(sequence_df.iloc[idx, :],
                                                         sequence_df.iloc[idx + sequence_length-1, :])
                all_labels.append(labels_perf)
                idx += sequence_length

        sequence_df.drop(['ticker', 'date'], axis=1, inplace=True)
        # all_data_table.drop(['close_price_next_quarter', 'baseline_price_next_quarter'], axis=1)

        print(sequence_df.shape)

        # while all_data_table.shape[0] % sequence_length != 0:
        #     all_data_table.drop(all_data_table.head(1).index, inplace=True)

        scaled_data_value_arrays = get_scaled_data_value_arrays(sequence_df)
        return to_sequence_data(scaled_data_value_arrays, int(sequence_length)), all_labels

In [3]:
dataset, labels = get_dataset_from_intrinio_for_RNN()

---- From intrinio ----
fundamental_EA.csv
(36, 194)
fundamental_ADBE.csv
(72, 203)
fundamental_ORCL.csv
(108, 207)
fundamental_CRM.csv
(144, 212)
fundamental_GOOGL.csv
(180, 216)
fundamental_INTC.csv
(216, 216)
fundamental_MSFT.csv
(252, 216)
fundamental_AAPL.csv
(288, 217)
fundamental_IBM.csv
(324, 221)
(227, 142)
(200, 140)


In [4]:
    def split_tran_test_data(data_value_arrays, labels):
        train_number = int(0.7 * data_value_arrays.shape[0])
        train_X, train_y = data_value_arrays[:train_number, :], labels[:train_number]
        test_X, test_y = data_value_arrays[train_number:, :], labels[train_number:]
        print('---- split ----')
        print(train_number)
        print(labels)
        return test_X, test_y, train_X, train_y

In [5]:
type(dataset)

numpy.ndarray

In [6]:
    # dataset, labels = get_dataset_from_intrinio_for_RNN()
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_labels = scaler.fit_transform(labels)

    # Split into train and test sets 7-3
    test_X, test_Y, train_X, train_Y = split_tran_test_data(dataset, scaled_labels)   

---- split ----
35
[0.058642   0.         0.10780981 0.18997395 0.45247449 0.59662927
 0.058642   0.         0.10780981 0.18997395 0.45247449 0.058642
 0.         0.10780981 0.18997395 0.45247449 0.59662927 1.
 0.058642   0.         0.10780981 0.18997395 0.45247449 0.59662927
 1.         0.058642   0.         0.10780981 0.18997395 0.45247449
 0.59662927 1.         0.058642   0.         0.10780981 0.18997395
 0.45247449 0.058642   0.         0.10780981 0.18997395 0.45247449
 0.058642   0.         0.10780981 0.18997395 0.45247449 0.058642
 0.         0.10780981]




In [7]:
print('train_X shape', train_X.shape)
print('train_Y shape', train_Y.shape)
print('test_X shape', test_X.shape)
print('test_Y shape', test_Y.shape)

('train_X shape', (35, 4, 140))
('train_Y shape', (35,))
('test_X shape', (15, 4, 140))
('test_Y shape', (15,))


In [8]:
from keras.layers import Dense, Input, Dropout
from keras.layers.recurrent import LSTM
from keras.optimizers import SGD
from keras.models import Model
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
import os

Using TensorFlow backend.


In [9]:
train_X.shape[2]

140

In [11]:
input_layer = Input(shape=(train_X.shape[1],train_X.shape[2]), dtype='float32')

lstm_layer1 = LSTM(64, input_shape=(train_X.shape[1], train_X.shape[2]), return_sequences=True)(input_layer)
lstm_layer2 = LSTM(32, input_shape=(train_X.shape[1],64), return_sequences=False)(lstm_layer1)

dropout_layer = Dropout(0.2)(lstm_layer2)
output_layer = Dense(1, activation='linear')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='mean_absolute_error', optimizer='adam')
model.summary()

print('------- Model Summary -------')
print(model.summary())

# fit the network
save_weights_at = os.path.join('../keras_models',
                                       'Intrinio_Stock_Fundamental_LSTM_weights.hdf5')
save_best = ModelCheckpoint(save_weights_at, monitor='val_loss', verbose=0,
                                    save_best_only=True, save_weights_only=False, mode='min',
                                    period=1)
model.fit(train_X, train_Y, epochs=20, batch_size=16, validation_data=(test_X, test_Y), verbose=2,
                            shuffle=True, callbacks=[save_best])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 4, 140)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 4, 64)             52480     
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 64,929
Trainable params: 64,929
Non-trainable params: 0
_________________________________________________________________
------- Model Summary -------
_________________________________________________________________
Layer (type)                 Output Shape 

ImportError: `save_model` requires h5py.