In [93]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import st_utils as ut
import trader_utils as tu
import string

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date

from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

from sklearn.model_selection import train_test_split

%matplotlib inline

In [94]:
# initialize date range
start_date = '2018-01-01'
end_date = '2018-8-31'

In [95]:
# grab news.  nnly need to do this once
keywords = ['tesla'] #these three stocks move largely base on news
news_source = 'wsj.com, bloomberg.com, cnbc.com'
df = ut.get_news(keywords, start_date, end_date, news_source)
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1875, 'articles': []}
total number of articles  1875
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18
downloading page:  19
download complete


In [96]:
# get Tesla's prices
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
df_stock = ut.get_data(file, dates)
df_stock.head()

Unnamed: 0,Adj Close
2018-01-02,320.529999
2018-01-03,317.25
2018-01-04,314.619995
2018-01-05,316.579987
2018-01-08,336.410004


In [97]:
# calculate return 
df_stock['Return'] = df_stock['Adj Close'].pct_change()
df_stock.drop(df_stock.index[0], inplace=True)

In [98]:
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")
df_news.head()

Unnamed: 0,Published,Title,Body,Keyword,Source
0,2018-04-03T13:05:17Z,"Tesla Ends 1Q Making 2,020 Model 3 Sedans per ...","Tesla 1Q deliveries totaled 29,980 vehicles, B...",tesla,Bloomberg
1,2018-07-24T07:48:09Z,Tesla Wants Its Money Back,Elon Musk is asking for donations to save Tesl...,tesla,The Wall Street Journal
2,2018-07-01T22:13:00Z,What Tesla can do to keep the bears at bay dur...,"Buckle up, because Tesla could be in for a bum...",tesla,CNBC
3,2018-05-09T15:00:00Z,Two Florida teenagers killed in Tesla crash,Tesla said it is cooperating with authorities ...,tesla,CNBC
4,2018-05-10T19:00:00Z,NHTSA joins NTSB in looking into fatal Tesla c...,Tesla said it is cooperating with authorities ...,tesla,CNBC


In [99]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    dt = parser.parse(published_date[:-1]) #-1 to ignore the Z, which is GMT.
    dt = dt - timedelta(hours=20) #shift back by 20 hrs.
    return pd.Timestamp(year=dt.year, month=dt.month, day=dt.day)

In [100]:
# map news to close date
df_news['Published'] = df_news['Published'].apply(map_to_close_date)

In [101]:
# combine title and body into single text
df_news['Text'] = df_news['Title'] + ' ' + df_news['Body']

In [102]:
# just keep published, and the combined text
df_news = df_news[['Published','Text']]

In [103]:
# merge news and stock
df_stock['Published'] = df_stock.index
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')
df_merged = df_merged.dropna()
df_merged.sort_values(by='Published')
df_merged.head()

Unnamed: 0,Published,Text,Adj Close,Return
0,2018-04-02,"Tesla Ends 1Q Making 2,020 Model 3 Sedans per ...",252.479996,-0.051291
1,2018-07-23,Tesla Wants Its Money Back Elon Musk is asking...,303.200012,-0.033102
3,2018-05-08,Two Florida teenagers killed in Tesla crash Te...,301.970001,-0.002642
4,2018-05-09,NHTSA joins NTSB in looking into fatal Tesla c...,306.850006,0.016161
5,2018-04-12,"Tesla, Investigators Feud Over a Crash Tesla, ...",294.079987,-0.022763


In [104]:
def tokenize_news(text):
    if(type(text)==float):
        return []
    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    translator = str.maketrans('', '', string.punctuation)
    text = (text.translate(translator))
    text = "".join(c for c in text if ord(c)<128) #strip no n ascii characters
    tokens = nltk.tokenize.word_tokenize(text)
    t = []
    minlen =  4
    maxlen = 20 
    for token in tokens:
        if len(token) < minlen or len(token) > maxlen or token.isnumeric() or token in stopwords.words('english'):
            pass
        else:
            token = lmtzr.lemmatize(token)
            token = stmr.stem(token)
            t.append(token)
    return t

In [105]:
df_merged['Text'] = df_merged['Text'].apply(tokenize_news)

In [106]:
df_merged.head()

Unnamed: 0,Published,Text,Adj Close,Return
0,2018-04-02,"[tesla, end, make, model, sedan, week, tesla, ...",252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back, elon, musk, ask, do...",303.200012,-0.033102
3,2018-05-08,"[florida, teenag, kill, tesla, crash, tesla, s...",301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...",306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, over, crash, tesla, in...",294.079987,-0.022763


In [107]:
#construct vocabulary and frequency.  frequency is not really needed, just nice to have for understanding the concept.
vocab = {}
frequency = {}
ignore = ['bloomberg', 'journal']
index = 0
for title in df_merged['Text']:
    for word in title:
        if word in ignore:
            continue
        if word not in vocab:
            vocab[word] = index
            index += 1
            frequency[word] = 1
        else:
            frequency[word] += 1

In [108]:
#visualize words frequency
for i, freq in enumerate(sorted(frequency.items(), key=lambda x:x[1], reverse=True)):
    if i == 10:
        break
    print(freq)

('tesla', 1097)
('musk', 461)
('elon', 366)
('compani', 249)
('stock', 246)
('model', 241)
('say', 182)
('china', 160)
('share', 159)
('market', 159)


In [109]:
vocab_len = len(vocab)

In [110]:
def to_one_hot(text):
    #return one hot vector of shape (vocab_len, 1)
    array = np.zeros((vocab_len, 1))
    for word in text:
        if word in vocab:
            index = vocab[word]
            array[index] = 1
    return array
    

In [111]:
# here is the exciting part. here we are going to create the training data!
X = np.empty((vocab_len,0))
for text in df_merged['Text']:
    array = to_one_hot(text)
    X = np.hstack((X,array))
X = X.T

In [112]:
# get labels
Y = df_merged['Return'].values.copy()

In [113]:
negative_idx = (Y <= 0)
positive_idx = (Y > 0)
Y[negative_idx] = 0
Y[positive_idx] = 1

In [118]:
print("Input Feature Size (num_examples X vocab_size): ",X.shape)
print("Label Size (num_examples): ", Y.shape)

Input Feature Size (num_examples X vocab_size):  (1228, 4534)
Label Size (num_examples):  (1228,)


In [122]:
#train test split
train_size = .9
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(1105, 4534)
(123, 4534)
(1105,)
(123,)


In [123]:
#train the model
model = Sequential()
model.add(Dense(1, input_dim=X_train.shape[1], activation='relu', kernel_initializer='uniform'))
# model.add(Dense(12, activation='relu', kernel_initializer='uniform'))
# model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

optimizer = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.00001, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=50, batch_size=32, verbose=0)

<keras.callbacks.History at 0x1b3c2409a20>

In [124]:
#view weights
weights, biases = model.layers[0].get_weights()
for i, k in enumerate(vocab.keys()):
    if i == 10:
        break
    print(k, weights[i])

tesla [0.02952633]
end [-0.24720483]
make [0.03284777]
model [0.01605833]
sedan [0.04877686]
week [0.00196528]
deliveri [-0.03385526]
total [0.08856238]
vehicl [0.05465103]
new [0.02539405]


In [125]:
#assess in-sample accuracy
predictions = model.predict_classes(X_train)
predictions = predictions.reshape((predictions.shape[0],))
np.sum(predictions == Y_train) / Y_train.shape[0]


0.9683257918552036

In [126]:
#assess out-of-sample accuracy
predictions = model.predict_classes(X_test)
predictions = predictions.reshape((predictions.shape[0],))
np.sum(predictions == Y_test) / Y_test.shape[0]

0.5284552845528455