In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.datasets import imdb

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Input, Model
from keras.layers import Dense, Dropout
from keras.layers import SimpleRNN, LSTM
from keras.layers.embeddings import Embedding

from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Source of data: https://www.kaggle.com/aaron7sun/stocknews

In [2]:
DJIA_df = pd.read_csv('DJIA_table.csv')
news_df = pd.read_csv('Combined_News_DJIA.csv')

In [3]:
DJIA_df.reindex(index=DJIA_df.index[::-1])
DJIA_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234


In [4]:
DJIA_df['Change'] = DJIA_df['Close'].diff()
DJIA_df['Change'] = DJIA_df['Change'].shift(-1)
DJIA_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,Change
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141,-19.378907
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234,-235.310546
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688,-284.958985
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703,-269.480469
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234,260.509766


In [5]:
DJIA_df.dropna(inplace=True)
DJIA_df.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Adj Close    0
Change       0
dtype: int64

In [6]:
from sklearn.preprocessing import MinMaxScaler

# how do you normalize these? How do you not data snoop?
Data_df = DJIA_df[[col for col in list(DJIA_df) if (col != 'Change' and col != 'Date')]]
Target_df = DJIA_df['Change']

# now the normalization
min_max_scaler = MinMaxScaler()
X_data = min_max_scaler.fit_transform(Data_df.values)
Y = Target_df.values

In [7]:
def make_datasets(X, Y, length=60):
    if len(X) != len(Y):
        print('you done messed up')
    X_data_list = []
    Y_data_list = []
    for i in range(length, len(X)):
        X_data_list.append(X[i-length:i,:])
        Y_data_list.append(Y[i])
    X_data = np.array(X_data_list)
    Y_data = np.array(Y_data_list)
    return(X_data, Y_data)

In [8]:
X_DJIA, Y = make_datasets(X_data, Y)

In [9]:
news_df.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


In [10]:
news_df.drop(columns=['Date','Label'],inplace=True)

In [11]:
news_df['All'] = ''
for col in news_df:
    if col!='All':
        news_df['All'] += news_df[col]
news_df = news_df.astype(str)

In [12]:
NUM_TOP_WORDS = None
MAX_ART_LEN = 50 # maximum and minimum number of words

X_news = []

#to make sure each news is MAX_ART_LEN, thus seperatable
tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
tokenizer.fit_on_texts(news_df['All'])

for col in news_df:
    if col!='All':
        sequences = tokenizer.texts_to_sequences(news_df[col])
        X_tmp = pad_sequences(sequences, maxlen=MAX_ART_LEN,padding='pre')
        X_news += [X_tmp]


word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(word_index),top_words))

Found 67507 unique tokens. Distilled to 67507 top words.


In [13]:
X_news = [news[60:-1] for news in X_news]

In [14]:
X_final = X_news + [X_DJIA]
Idx = np.array([i for i in range(len(X_DJIA))])
Idx_train, Idx_test = train_test_split(Idx, test_size=0.1,random_state=42)
X_train = [_[Idx_train] for _ in X_final]
X_test = [_[Idx_test] for _ in X_final]
Y_train = Y[Idx_train]
Y_test = Y[Idx_test]

In [15]:
from keras.models import Sequential, Input, Model
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers.embeddings import Embedding
from keras.layers import concatenate

NEW_NUM = 5

EMBED_SIZE = 51

all_inputs = []

#RNN for all top 25 news share same embeding and RNN layers, defined as below
news_embeding_layer = Embedding(top_words, # input dimension (max int of OHE)
                    EMBED_SIZE, # output dimension size
                    input_length=MAX_ART_LEN)
news_rnn_layer = SimpleRNN(25,dropout=0.2, recurrent_dropout=0.2)


#first news pass through the RNN
news_input = Input(shape=(MAX_ART_LEN, ))
all_inputs.append(news_input)

input_embed = news_embeding_layer(news_input)
print(input_embed.shape)
x_tmp = news_rnn_layer(input_embed)
x_news = x_tmp


#all rest news pass through the RNN, and concanate them, also need to specify the input
for start in range(1,25):
    news_input = Input(shape=(MAX_ART_LEN, ))
    all_inputs.append(news_input)
    input_embed = news_embeding_layer(news_input)
    x_tmp = news_rnn_layer(input_embed)
    x_news = concatenate([x_news, x_tmp])


x_news = Dense(NEW_NUM, activation='sigmoid')(x_news)

news_model=Model(inputs=all_inputs,outputs=x_news)

news_model.compile(loss='binary_crossentropy', 
              optimizer='rmsprop', 
              metrics=['accuracy'])
#news_model.summary()

(?, 50, 51)


In [16]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# you will need to install pydot properly on your machine to get this running
#SVG(model_to_dot(news_model).create(prog='dot', format='svg'))

In [17]:
BATCH_SIZE = 2
TIME_STEPS = 60
feature_num = 6

DIJA_input=Input(shape=(TIME_STEPS,feature_num))
DJIA_rnn_layer=LSTM(10)
x_DIJA = DJIA_rnn_layer(DIJA_input)
DJIA_model = Model(inputs=DIJA_input,outputs=x_DIJA)

all_inputs.append(DIJA_input)

DJIA_model.compile(loss='binary_crossentropy', 
              optimizer='rmsprop', 
              metrics=['accuracy'])
DJIA_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_26 (InputLayer)        (None, 60, 6)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10)                680       
Total params: 680
Trainable params: 680
Non-trainable params: 0
_________________________________________________________________


In [18]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# you will need to install pydot properly on your machine to get this running
#SVG(model_to_dot(DJIA_model).create(prog='dot', format='svg'))

In [19]:
x_final = concatenate([x_news, x_DIJA])
x = Dense(1, activation='relu')(x_final)

final_model=Model(inputs=all_inputs,outputs=x)

final_model.compile(loss='mse', 
              optimizer='adam', 
              metrics=['accuracy'])
#final_model.summary()

In [20]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# you will need to install pydot properly on your machine to get this running
#SVG(model_to_dot(final_model).create(prog='dot', format='svg'))

In [27]:
def wraped_model(rnn_news_type=SimpleRNN,rnn_DJIA_type=LSTM):
    all_inputs = []
        
    NEWS_NUM = 2
    EMBED_SIZE = 51
    
    news_embeding_layer = Embedding(top_words,
                        EMBED_SIZE,
                        input_length=MAX_ART_LEN)
    news_rnn_layer = rnn_news_type(10,dropout=0.2, recurrent_dropout=0.2)
    
    news_input = Input(shape=(MAX_ART_LEN, ))
    all_inputs.append(news_input)
    input_embed = news_embeding_layer(news_input)
    x_tmp = news_rnn_layer(input_embed)
    x_news = x_tmp
    for start in range(1,25):
        news_input = Input(shape=(MAX_ART_LEN, ))
        all_inputs.append(news_input)
        input_embed = news_embeding_layer(news_input)
        x_tmp = news_rnn_layer(input_embed)
        x_news = concatenate([x_news, x_tmp])
    x_news = Dense(NEWS_NUM, activation='relu')(x_news)
    
    
    
    BATCH_SIZE = 2
    TIME_STEPS = 60
    feature_num = 6
    
    DIJA_input=Input(shape=(TIME_STEPS,feature_num))
    DJIA_rnn_layer=rnn_DJIA_type(2,dropout=0.4, recurrent_dropout=0.2)
    x_DIJA = DJIA_rnn_layer(DIJA_input)
    all_inputs.append(DIJA_input)
    
    x_final = concatenate([x_news, x_DIJA])
    x = Dense(10, activation='relu')(x_final)
    x = Dense(1, activation='linear')(x)

    final_model=Model(inputs=all_inputs,outputs=x)

    final_model.compile(loss='mse', 
                  optimizer='adam')
    return final_model

In [28]:
model = wraped_model(LSTM,LSTM)
#model.summary()

In [29]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# you will need to install pydot properly on your machine to get this running
#SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [30]:
model.fit(X_train,Y_train,batch_size=128,epochs=20,verbose=1,validation_data=(X_test,Y_test))

Train on 1735 samples, validate on 193 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ffd420fd978>

In [31]:
model.predict(X_test)

array([[ 3.48568459e+01],
       [ 8.55746918e+01],
       [ 3.64781914e+01],
       [ 4.48306618e+01],
       [ 6.57605827e-02],
       [ 1.10593767e+01],
       [ 2.97610416e+01],
       [ 6.50932846e+01],
       [ 1.81920776e+01],
       [ 3.74066544e+01],
       [ 9.50244713e+00],
       [ 1.10367441e+01],
       [ 6.99500275e+01],
       [ 4.78590393e+01],
       [ 6.13443851e+00],
       [ 4.75432777e+01],
       [ 3.81529846e+01],
       [ 6.49200821e+01],
       [ 9.23124599e+00],
       [ 4.89969864e+01],
       [-6.37224257e-01],
       [ 3.73332939e+01],
       [ 2.56723728e+01],
       [ 3.06055870e+01],
       [ 3.16947389e+00],
       [ 2.67301979e+01],
       [ 5.51972847e+01],
       [ 2.50404720e+01],
       [ 7.28668823e+01],
       [ 1.69202309e+01],
       [ 3.84074020e+01],
       [-6.08408332e-01],
       [-6.44036233e-01],
       [ 2.49839573e+01],
       [ 2.86155319e+01],
       [ 4.50891991e+01],
       [ 3.30012321e+01],
       [ 3.01145763e+01],
       [ 4.1

In [32]:
print(Y_test)

[ 2.07000000e+02 -2.13896490e+01  7.32695310e+01  8.31699220e+01
 -6.31894530e+01 -6.31894530e+01 -1.06629883e+02 -3.90488280e+01
 -1.52500000e+02 -3.24316410e+01  1.70689453e+02  2.62998040e+01
  1.49804690e+01 -1.38691410e+01 -1.83379883e+02  2.68370117e+02
  2.54150391e+02  5.31298830e+01 -6.61035200e+00 -5.36806640e+01
 -5.05498050e+01  5.04480468e+02  2.64101570e+01 -5.33007800e+00
  1.01370117e+02  2.13270508e+02  6.26806640e+01  1.15150391e+02
 -2.15140625e+02  2.48419922e+02 -3.36308600e+01 -9.12617190e+01
  3.93798830e+01 -1.10240234e+02  5.04394530e+01  4.84492190e+01
  1.62870118e+02  6.65800780e+01  5.57802730e+01 -1.06409668e+02
 -7.67099610e+01  9.38691400e+01 -1.03839843e+02  1.42201172e+02
  1.57140625e+02  3.57011710e+01  4.19629883e+02  1.39879882e+02
  1.59709961e+02  1.71103510e+01  8.70019600e+00  2.11894530e+01
  1.39939453e+02 -3.31992190e+01  2.42120117e+02  3.98105470e+01
  1.21410157e+02 -1.06290039e+02 -1.64790039e+02 -1.12130859e+02
 -1.49414000e-01 -5.24501

US inflation rate: https://www.multpl.com/inflation/table/by-month
To be included.