In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.models import load_model

In [2]:
def loadDataFiles():
    market_df = pickle.load(open('Market_train',"rb"))
    news_df = pickle.load(open("News_train", "rb"))
    print('Finished loading datafiles!')
    return market_df, news_df


In [3]:
def preprocess_data(mkt_df, news_df):
    mkt_df['time'] = pd.to_datetime(mkt_df['time'])
    news_df['time'] = pd.to_datetime(news_df['time'])
    mkt_df['time'] = mkt_df['time'].dt.date
    news_df['time'] = news_df['time'].dt.date
    assetCodes = []
    index = 0
    for x in news_df['assetCodes']:
        x = x.split(',')[0].split("'")[1]
        assetCodes.append(x)
    news_df['assetCode'] = np.asarray(assetCodes)
    irrelevantColumns = ['sourceTimestamp', 'firstCreated', 'sourceId', 
                         'headline', 'provider', 'subjects', 'audiences',
                        'headlineTag', 'marketCommentary', 'assetCodes', 'assetName']
    news_df.drop(irrelevantColumns, axis=1, inplace=True)
    mkt_df.drop(['assetName'], axis=1, inplace=True)
    modifiednews = news_df.groupby(['time','assetCode'], sort=False).aggregate(np.mean).reset_index()
    
    # join news reports to market data, note many assets will have many days without news data
    merged = pd.merge(mkt_df, modifiednews, how='left', on=['time', 'assetCode'], copy=False) 
    merged = merged.fillna(0)
    print('Finished preprocessing data!')
    return merged


In [4]:
market_data, news_data = loadDataFiles()


Finished loading datafiles!


In [5]:
X = preprocess_data(market_data, news_data)


Finished preprocessing data!


In [6]:
def normalizeY(ydf):
    ydf = (ydf + 1) / 2
    return ydf


In [7]:
X = X[X['returnsOpenNextMktres10'] >= -1]
X = X[X['returnsOpenNextMktres10'] <= 1]

y = X['returnsOpenNextMktres10']

X.drop(['returnsOpenNextMktres10'], axis=1, inplace=True)
y = normalizeY(y)
assetCodesAndTime = X.iloc[:, :2]
X = X.iloc[:, 2:]


In [8]:
def getNNModel(numhiddenlayers=2, nodes=4): # returns NN given hidden layers and nodes
    layers = []
    layers.append(keras.layers.Flatten(input_shape=(35,)))

    for x in range(numhiddenlayers):
        layers.append(keras.layers.Dense(nodes, activation=tf.nn.relu, use_bias=True))

    layers.append(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model = keras.Sequential(layers)
    sgd = keras.optimizers.SGD(lr=.3)
    model.compile(optimizer=sgd,
              loss='mean_absolute_error',
              metrics=['accuracy'])
    return model


In [10]:
def regularize(df):
    for column in df:
        colmin = np.amin(df[column])
        colmax = np.amax(df[column])
        df[column] = (df[column] - colmin) / (colmax - colmin)
    return df

In [11]:
def splitDataset(X, y, split):
    index = int(split*len(y.index))
    y_train, y_test = np.split(y, [index])
    X_train, X_test = X.iloc[:index, :], X.iloc[index:, :]
    return X_train, y_train, X_test, y_test

In [12]:
X = regularize(X)

In [13]:
def saveModel(model, model_name):
    model.save(model_name + '.h5')

In [14]:
def loadModel(filename):
    model = load_model(filename)
    return model

In [14]:
nnmodel = getNNModel(3, 15)
nnmodel.fit(X, y, epochs=1, verbose=1, batch_size=100000, validation_split=.7)

Train on 1221660 samples, validate on 2850543 samples
Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x137c46d68>

In [15]:
def lossMatrix(X, y):
    layers = [3, 4, 5]
    nodes = [10, 15, 20]
    lossmatrix = []
    X_train, y_train, X_test, y_test = splitDataset(X, y, .7)
    for layer in layers:
        lossforlayer = []
        for node in nodes:
            nnmodel2 = getNNModel(layer, node)
            nnmodel2.fit(X_train , y_train, epochs=1, verbose=1, batch_size=1000000)
            loss, acc = nnmodel2.evaluate(X_test, y_test)
            lossforlayer.append(loss)
        lossmatrix.append(lossforlayer)
    for x in lossmatrix:
        print(x)

In [32]:
# lossMatrix(X, y)

4072203
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
[0.02924319398984977, 0.02407273787866651, 0.02244628967740677]
[0.022060430283365993, 0.02228930966185683, 0.021880768763167115]
[0.021853938172650998, 0.021630632141529542, 0.02181874975266783]


In [19]:
lrmodel = getLinearRegressionModel(len(X.columns.values))
lrmodel.fit(X,y, batch_size=1000000, epochs=30, validation_split=.3)

Train on 2850542 samples, validate on 1221661 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x297a4de80>

In [20]:
# xplot = list(range(len(y_test)))
# plt.plot(xplot, lrpredictions)
# plt.show()

NameError: name 'y_test' is not defined

In [None]:
# plt.plot(xplot, y_test)
# plt.show()

In [21]:
newnnmodel = getNNModel(3,15)
hist = keras.callbacks.History()
num_epochs = 10
newnnmodel.fit(X, y, epochs=num_epochs, batch_size=1000000, callbacks=[hist], validation_split=.3)

Train on 2850542 samples, validate on 1221661 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x125ea1c88>