In [359]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import st_utils as ut
import datetime as dt
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date

from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

%matplotlib inline

In [3]:
# initialize date range
start_date = '2018-01-01'
end_date = '2018-8-31'

In [63]:
# grab news.  nnly need to do this once
keywords = ['tesla'] #these three stocks move largely base on news
news_source = 'wsj.com, bloomberg.com, cnbc.com'
df = ut.get_news(keywords, start_date, end_date, news_source)
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1798, 'articles': []}
total number of articles  1798
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18
download complete


In [287]:
# get Tesla's prices
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
df_stock = ut.get_data(file, dates)

In [288]:
# calculate return 
df_stock['Return'] = df_stock['Adj Close'].pct_change()
df_stock.drop(df_stock.index[0], inplace=True)

In [289]:
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")

In [290]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    dt = parser.parse(published_date[:-1]) #-1 to ignore the Z, which is GMT.
    dt = dt - timedelta(hours=20) #shift back by 20 hrs.
    return pd.Timestamp(year=dt.year, month=dt.month, day=dt.day)

In [291]:
# map news to close date
df_news['Published'] = df_news['Published'].apply(map_to_close_date)

In [292]:
# combine title and body into single text
df_news['Text'] = df_news['Title'] + ' ' + df_news['Body']

In [293]:
# just keep published, and the combined text
df_news = df_news[['Published','Text']]

In [294]:
# merge news and stock
df_stock['Published'] = df_stock.index
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')
df_merged = df_merged.dropna()
df_merged.sort_values(by='Published')

Unnamed: 0,Published,Text,Adj Close,Return
391,2018-01-03,Tesla's Lackluster Model 3 Sales Miss Lowered ...,317.250000,-0.010233
348,2018-01-03,Why Are You Even Reading That Tesla Announceme...,317.250000,-0.010233
1086,2018-01-03,"Dow breaks above 25,000 for the first time eve...",317.250000,-0.010233
1461,2018-01-03,Stocks making the biggest moves in premarket t...,317.250000,-0.010233
42,2018-01-03,"Stocks to Watch: Tesla, AMD, Macys, Walgreens...",317.250000,-0.010233
29,2018-01-03,"Tesla delivers 1,550 Model 3 sedans and 29,870...",317.250000,-0.010233
22,2018-01-03,Tesla Model 3 production snarls have Wall Stre...,317.250000,-0.010233
549,2018-01-03,"Ex-Google, Tesla Driverless Car Czars Partner ...",317.250000,-0.010233
953,2018-01-03,Five Things You Need to Know to Start Your Day...,317.250000,-0.010233
236,2018-01-03,Tesla's Fourth Quarter Model 3 Deliveries Fall...,317.250000,-0.010233


In [295]:
df_stock['Published'] = df_stock.index
df_merged = df_merged.dropna()
df_merged.sort_values(by='Published')

Unnamed: 0,Published,Text,Adj Close,Return
391,2018-01-03,Tesla's Lackluster Model 3 Sales Miss Lowered ...,317.250000,-0.010233
348,2018-01-03,Why Are You Even Reading That Tesla Announceme...,317.250000,-0.010233
1086,2018-01-03,"Dow breaks above 25,000 for the first time eve...",317.250000,-0.010233
1461,2018-01-03,Stocks making the biggest moves in premarket t...,317.250000,-0.010233
42,2018-01-03,"Stocks to Watch: Tesla, AMD, Macys, Walgreens...",317.250000,-0.010233
29,2018-01-03,"Tesla delivers 1,550 Model 3 sedans and 29,870...",317.250000,-0.010233
22,2018-01-03,Tesla Model 3 production snarls have Wall Stre...,317.250000,-0.010233
549,2018-01-03,"Ex-Google, Tesla Driverless Car Czars Partner ...",317.250000,-0.010233
953,2018-01-03,Five Things You Need to Know to Start Your Day...,317.250000,-0.010233
236,2018-01-03,Tesla's Fourth Quarter Model 3 Deliveries Fall...,317.250000,-0.010233


In [296]:
def tokenize_news(text):
    if(type(text)==float):
        return []
    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    translator = str.maketrans('', '', string.punctuation)
    text = (text.translate(translator))
    text = "".join(c for c in text if ord(c)<128) #strip no n ascii characters
    tokens = nltk.tokenize.word_tokenize(text)
    t = []
    minlen =  4
    maxlen = 20 
    for token in tokens:
        if len(token) < minlen or len(token) > maxlen or token.isnumeric() or token in stopwords.words('english'):
            pass
        else:
            token = lmtzr.lemmatize(token)
            token = stmr.stem(token)
            t.append(token)
    return t

In [297]:
df_merged['Text'] = df_merged['Text'].apply(tokenize_news)

In [298]:
df_merged

Unnamed: 0,Published,Text,Adj Close,Return
0,2018-04-02,"[tesla, end, make, model, sedan, week, tesla, ...",252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back, elon, musk, ask, do...",303.200012,-0.033102
3,2018-05-08,"[florida, teenag, kill, tesla, crash, tesla, s...",301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...",306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, over, crash, tesla, in...",294.079987,-0.022763
6,2018-04-16,"[tesla, halt, model, product, again, wall, str...",291.209991,-0.030399
7,2018-06-29,"[elon, musk, tweet, featur, tesla, pickup, tru...",342.950012,-0.019947
8,2018-03-27,"[tesla, defend, autopilot, record, fed, launch...",279.179993,-0.082188
9,2018-03-29,"[tesla, recal, model, car, over, bolt, issu, w...",266.130005,0.032392
10,2018-04-26,"[kany, cant, save, tesla, from, chipotl, long,...",285.480011,0.017065


In [299]:
#construct vocabulary and frequency.  frequency is not really needed, just nice to have for understanding the concept.
vocab = {}
frequency = {}
ignore = ['bloomberg', 'journal']
index = 0
for title in df_merged['Text']:
    for word in title:
        if word in ignore:
            continue
        if word not in vocab:
            vocab[word] = index
            index += 1
            frequency[word] = 1
        else:
            frequency[word] += 1

In [300]:
#visualize words frequency
for i in sorted(frequency.items(), key=lambda x:x[1], reverse=True):
    print(i)

('tesla', 1095)
('musk', 458)
('elon', 363)
('compani', 251)
('stock', 245)
('model', 241)
('say', 182)
('share', 159)
('china', 159)
('market', 158)
('electr', 156)
('product', 140)
('trade', 137)
('make', 123)
('car', 121)
('year', 112)
('tech', 112)
('vehicl', 107)
('move', 105)
('report', 103)
('citi', 101)
('wall', 96)
('street', 96)
('investor', 96)
('maker', 87)
('auto', 86)
('trump', 85)
('said', 83)
('busi', 81)
('first', 81)
('analyst', 81)
('crash', 80)
('could', 80)
('billion', 79)
('execut', 78)
('call', 75)
('today', 72)
('biggest', 71)
('plan', 70)
('time', 70)
('here', 69)
('need', 68)
('expect', 66)
('like', 65)
('world', 65)
('chief', 64)
('technolog', 64)
('presid', 64)
('factori', 63)
('week', 61)
('batteri', 61)
('earn', 60)
('spacex', 60)
('cash', 57)
('money', 55)
('even', 55)
('industri', 54)
('take', 53)
('with', 53)
('million', 53)
('rais', 52)
('chines', 52)
('start', 51)
('accord', 51)
('post', 50)
('facebook', 50)
('employe', 49)
('appl', 49)
('futur', 48)


('thought', 2)
('surpris', 2)
('token', 2)
('adjust', 2)
('comeback', 2)
('riddl', 2)
('outsel', 2)
('doe', 2)
('favor', 2)
('gross', 2)
('floor', 2)
('rearend', 2)
('micro', 2)
('voluntarili', 2)
('faulti', 2)
('forev', 2)
('evercor', 2)
('revolt', 2)
('thesi', 2)
('entitl', 2)
('compet', 2)
('astronom', 2)
('flurri', 2)
('restructur', 2)
('residenti', 2)
('perceiv', 2)
('spiegel', 2)
('guard', 2)
('mount', 2)
('premium', 2)
('handout', 2)
('necessari', 2)
('basic', 2)
('incom', 2)
('august', 2)
('allergan', 2)
('wild', 2)
('weight', 2)
('leapfrog', 2)
('empir', 2)
('clue', 2)
('bounc', 2)
('sunrun', 2)
('inexperienc', 2)
('gigafactori', 2)
('havent', 2)
('infrastructur', 2)
('narrow', 2)
('timelin', 2)
('height', 2)
('construct', 2)
('calosha', 2)
('famou', 2)
('nikola', 2)
('infring', 2)
('testifi', 2)
('beneath', 2)
('supercharg', 2)
('loung', 2)
('western', 2)
('significantli', 2)
('mechan', 2)
('appar', 2)
('leak', 2)
('soro', 2)
('taunt', 2)
('mainli', 2)
('doubter', 2)
('rude',

('parad', 1)
('midweek', 1)
('konglist', 1)
('realest', 1)
('humbl', 1)
('stateown', 1)
('meat', 1)
('processor', 1)
('accessibleand', 1)
('pundit', 1)
('youll', 1)
('tactic', 1)
('prognost', 1)
('bomb', 1)
('pose', 1)
('destabil', 1)
('regret', 1)
('enact', 1)
('werent', 1)
('optic', 1)
('twofront', 1)
('yuan', 1)
('heart', 1)
('resist', 1)
('taught', 1)
('liber', 1)
('bastion', 1)
('authoritarian', 1)
('mclaren', 1)
('corvett', 1)
('nope', 1)
('infusionenergi', 1)
('magnesium', 1)
('asbesto', 1)
('trove', 1)
('visa', 1)
('dilemma', 1)
('coordin', 1)
('expuls', 1)
('parcel', 1)
('chauffeur', 1)
('realworld', 1)
('breather', 1)
('toptier', 1)
('regularli', 1)
('pursuit', 1)
('hollywoodworthi', 1)
('pompeo', 1)
('contagion', 1)
('slight', 1)
('decoupl', 1)
('whiff', 1)
('surrend', 1)
('todo', 1)
('mandat', 1)
('unbound', 1)
('clair', 1)
('mccaskil', 1)
('showdown', 1)
('unemploy', 1)
('17year', 1)
('middleincom', 1)
('prosper', 1)
('cop', 1)
('hovnanian', 1)
('demo', 1)
('dorseyand', 1)

In [301]:
vocab_len = len(vocab)
df_merged

Unnamed: 0,Published,Text,Adj Close,Return
0,2018-04-02,"[tesla, end, make, model, sedan, week, tesla, ...",252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back, elon, musk, ask, do...",303.200012,-0.033102
3,2018-05-08,"[florida, teenag, kill, tesla, crash, tesla, s...",301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...",306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, over, crash, tesla, in...",294.079987,-0.022763
6,2018-04-16,"[tesla, halt, model, product, again, wall, str...",291.209991,-0.030399
7,2018-06-29,"[elon, musk, tweet, featur, tesla, pickup, tru...",342.950012,-0.019947
8,2018-03-27,"[tesla, defend, autopilot, record, fed, launch...",279.179993,-0.082188
9,2018-03-29,"[tesla, recal, model, car, over, bolt, issu, w...",266.130005,0.032392
10,2018-04-26,"[kany, cant, save, tesla, from, chipotl, long,...",285.480011,0.017065


In [302]:
def to_one_hot(text):
    #return one hot vector of shape (vocab_len, 1)
    array = np.zeros((vocab_len, 1))
    for word in text:
        if word in vocab:
            index = vocab[word]
            array[index] = 1
    return array
    

In [401]:
# here is the exciting part. here we are going to create the training data!
X = np.empty((vocab_len,0))
for text in df_merged['Text']:
    array = to_one_hot(text)
    X = np.hstack((X,array))
X = X.T

In [376]:
# get labels
Y = df_merged['Return'].values.copy()

In [389]:
# threshold = .000
# negative_idx = (Y < -threshold)
# positive_idx = (Y > threshold)
# neutral_idx = np.vstack((Y > -threshold, Y < threshold))
# neutral_idx = np.all(neutral_idx, axis=0)
# Y[negative_idx] = -1
# Y[positive_idx] = 1
# Y[neutral_idx] = 0

negative_idx = (Y <= 0)
positive_idx = (Y > 0)
Y[negative_idx] = 0
Y[positive_idx] = 1


In [402]:
print(X.shape)
print(Y.shape)

(1227, 4535)
(1227,)


In [409]:
train_size = .9
split = int(X.shape[0] * train_size)
X_train = X[:split,:]
X_test = X[split:,:]
Y_train = Y[:split]
Y_test = Y[split:]

In [410]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(1104, 4535)
(123, 4535)
(1104,)
(123,)


In [411]:
model = Sequential()
model.add(Dense(12, input_dim=X_train.shape[1], activation='relu', kernel_initializer='uniform'))
model.add(Dense(12, activation='relu', kernel_initializer='uniform'))
model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

optimizer = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.00001, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=20, batch_size=32, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1b4a7b51d30>

In [424]:
#assess out of sample accuracy

predictions = model.predict_classes(X_test)
predictions.reshape((predictions.shape[0],))
prediciton
# predictions = model.predict_classes(X_test)
# for i, prediction in enumerate(predictions):
#     print(prediction, Y_test[i])

(123, 123)