In [252]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import st_utils as ut
import datetime as dt
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

%matplotlib inline

In [3]:
# initialize date range
start_date = '2018-01-01'
end_date = '2018-8-31'

In [63]:
# grab news.  nnly need to do this once
keywords = ['tesla'] #these three stocks move largely base on news
news_source = 'wsj.com, bloomberg.com, cnbc.com'
df = ut.get_news(keywords, start_date, end_date, news_source)
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1798, 'articles': []}
total number of articles  1798
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18
download complete


In [201]:
# get Tesla's prices
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
df_stock = ut.get_data(file, dates)

In [202]:
# calculate return 
df_stock['Return'] = df_stock['Adj Close'].pct_change()
df_stock.drop(df_stock.index[0], inplace=True)

In [203]:
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")

In [204]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    dt = parser.parse(published_date[:-1]) #-1 to ignore the Z, which is GMT.
    dt = dt - timedelta(hours=20) #shift back by 20 hrs.
    return pd.Timestamp(year=dt.year, month=dt.month, day=dt.day)

In [205]:
# map news to close date
df_news['Published'] = df_news['Published'].apply(map_to_close_date)

In [206]:
# combine title and body into single text
df_news['Text'] = df_news['Title'] + ' ' + df_news['Body']

In [207]:
# just keep published, and the combined text
df_news = df_news[['Published','Text']]

In [None]:
# merge news and stock
df_stock['Published'] = df_stock.index
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')
df_merged = df_merged.dropna()
df_merged.sort_values(by='Published')

In [None]:
df_stock['Published'] = df_stock.index
df_merged = df_merged.dropna()
df_merged.sort_values(by='Published')

In [210]:
def tokenize_news(text):
    if(type(text)==float):
        return []
    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    translator = str.maketrans('', '', string.punctuation)
    text = (text.translate(translator))
    text = "".join(c for c in text if ord(c)<128) #strip no n ascii characters
    tokens = nltk.tokenize.word_tokenize(text)
    t = []
    minlen =  4
    maxlen = 20 
    for token in tokens:
        if len(token) < minlen or len(token) > maxlen or token.isnumeric() or token in stopwords.words('english'):
            pass
        else:
            token = lmtzr.lemmatize(token)
            token = stmr.stem(token)
            t.append(token)
    return t

In [212]:
df_merged['Text'] = df_merged['Text'].apply(tokenize_news)

In [None]:
df_merged

In [214]:
#construct vocabulary and frequency.  frequency is not really needed, just nice to have for understanding the concept.
vocab = {}
frequency = {}
ignore = ['bloomberg', 'journal']
index = 0
for title in df_merged['Text']:
    for word in title:
        if word in ignore:
            continue
        if word not in vocab:
            vocab[word] = index
            index += 1
            frequency[word] = 1
        else:
            frequency[word] += 1

In [None]:
#visualize words frequency
for i in sorted(frequency.items(), key=lambda x:x[1], reverse=True):
    print(i)

In [None]:
vocab_len = len(vocab)
df_merged

In [233]:
def to_one_hot(text):
    #return one hot vector of shape (vocab_len, 1)
    array = np.zeros((vocab_len, 1))
    for word in text:
        if word in vocab:
            index = vocab[word]
            array[index] = 1
    return array
    

In [234]:
# here is the exciting part. here we are going to create the training data!
X = np.empty((vocab_len,0))
for text in df_merged['Text']:
    array = to_one_hot(text)
    X = np.hstack((X,array))

In [246]:
# get labels
Y = df_merged['Return'].values

In [250]:
#verify shape
print(X.shape)
print(Y.shape)

(4535, 1227)
(1227,)


In [266]:
threshold = .001
negative_pos = (Y < -threshold)
positive_pos = (Y > threshold)
q = np.vstack((Y > -threshold, Y < threshold))
q

array([[False, False, False, ..., False,  True,  True],
       [ True,  True,  True, ...,  True, False, False]])

In [253]:
model = Sequential()
model.add(Dense(20, input_dim=3, init='uniform', activation='relu'))
model.add(Dense(20, input_dim=3, init='uniform', activation='relu'))
model.add(Dense(20, init='uniform', activation='sigmoid'))