In [229]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import st_utils as sut
import trader_utils as tut
import string

from scipy.stats import mode

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date

from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

from sklearn.model_selection import train_test_split

%matplotlib inline

In [3]:
# initialize date range
start_date = '2018-01-01'
end_date = '2018-8-31'

In [95]:
# grab news.  only need to do this once
keywords = ['tesla'] #these three stocks move largely base on news
news_source = 'wsj.com, bloomberg.com, cnbc.com'
df = sut.get_news(keywords, start_date, end_date, news_source)
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1875, 'articles': []}
total number of articles  1875
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18
downloading page:  19
download complete


In [95]:
# get Tesla's prices
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
df_stock = sut.get_data(file, dates)
df_stock.head()

Unnamed: 0,Adj Close
2018-01-02,320.529999
2018-01-03,317.25
2018-01-04,314.619995
2018-01-05,316.579987
2018-01-08,336.410004


In [96]:
# calculate return 
df_stock['Return'] = df_stock['Adj Close'].pct_change()
df_stock.drop(df_stock.index[0], inplace=True)

In [97]:
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")
df_news.head()

Unnamed: 0,Published,Title,Body,Keyword,Source
0,2018-04-03T13:05:17Z,"Tesla Ends 1Q Making 2,020 Model 3 Sedans per ...","Tesla 1Q deliveries totaled 29,980 vehicles, B...",tesla,Bloomberg
1,2018-07-24T07:48:09Z,Tesla Wants Its Money Back,Elon Musk is asking for donations to save Tesl...,tesla,The Wall Street Journal
2,2018-07-01T22:13:00Z,What Tesla can do to keep the bears at bay dur...,"Buckle up, because Tesla could be in for a bum...",tesla,CNBC
3,2018-05-09T15:00:00Z,Two Florida teenagers killed in Tesla crash,Tesla said it is cooperating with authorities ...,tesla,CNBC
4,2018-05-10T19:00:00Z,NHTSA joins NTSB in looking into fatal Tesla c...,Tesla said it is cooperating with authorities ...,tesla,CNBC


In [98]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    dt = parser.parse(published_date[:-1]) #-1 to ignore the Z, which is GMT.
    dt = dt - timedelta(hours=20) #shift back by 20 hrs.
    return pd.Timestamp(year=dt.year, month=dt.month, day=dt.day)

In [99]:
# map news to close date
df_news['Published'] = df_news['Published'].apply(map_to_close_date)

In [100]:
# combine title and body into single text
df_news['Text'] = df_news['Title'] + ' ' + df_news['Body']

In [101]:
# just keep published, and the combined text
df_news = df_news[['Published','Text']]

In [102]:
# merge news and stock
df_stock['Published'] = df_stock.index
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')
df_merged = df_merged.dropna()
df_merged = df_merged.sort_values(by='Published')
df_merged.head()

Unnamed: 0,Published,Text,Adj Close,Return
1002,2018-01-03,Five Things You Need to Know to Start Your Day...,317.25,-0.010233
609,2018-01-03,What's News: Business & Finance Whats News: B...,317.25,-0.010233
29,2018-01-03,"Tesla delivers 1,550 Model 3 sedans and 29,870...",317.25,-0.010233
1531,2018-01-03,Stocks making the biggest moves in premarket t...,317.25,-0.010233
764,2018-01-03,"After-hours buzz: INTC, TSLA & more See which ...",317.25,-0.010233


In [103]:
def tokenize_news(text):
    if(type(text)==float):
        return []
    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    translator = str.maketrans('', '', string.punctuation)
    text = (text.translate(translator))
    text = "".join(c for c in text if ord(c)<128) #strip no n ascii characters
    tokens = word_tokenize(text)
    t = []
    minlen =  4
    maxlen = 20 
    for token in tokens:
        if len(token) < minlen or len(token) > maxlen or token.isnumeric() or token in stopwords.words('english'):
            pass
        else:
            token = lmtzr.lemmatize(token)
            token = stmr.stem(token)
            t.append(token)
    return t

In [104]:
df_merged['Text'] = df_merged['Text'].apply(tokenize_news)

In [105]:
df_merged.head()

Unnamed: 0,Published,Text,Adj Close,Return
1002,2018-01-03,"[five, thing, need, know, start, your, caught,...",317.25,-0.010233
609,2018-01-03,"[what, new, busi, financ, what, new, busi, fin...",317.25,-0.010233
29,2018-01-03,"[tesla, deliv, model, sedan, total, vehicl, fo...",317.25,-0.010233
1531,2018-01-03,"[stock, make, biggest, move, premarket, trade,...",317.25,-0.010233
764,2018-01-03,"[afterhour, buzz, intc, tsla, stock, post, mov...",317.25,-0.010233


In [109]:
df_merged

Unnamed: 0,Published,Text,Adj Close,Return
1002,2018-01-03,"[five, thing, need, know, start, your, caught,...",317.250000,-0.010233
609,2018-01-03,"[what, new, busi, financ, what, new, busi, fin...",317.250000,-0.010233
29,2018-01-03,"[tesla, deliv, model, sedan, total, vehicl, fo...",317.250000,-0.010233
1531,2018-01-03,"[stock, make, biggest, move, premarket, trade,...",317.250000,-0.010233
764,2018-01-03,"[afterhour, buzz, intc, tsla, stock, post, mov...",317.250000,-0.010233
351,2018-01-03,"[even, read, that, tesla, announc, investor, k...",317.250000,-0.010233
395,2018-01-03,"[tesla, lacklust, model, sale, miss, lower, wa...",317.250000,-0.010233
42,2018-01-03,"[stock, watch, tesla, maci, walgreen, boot, al...",317.250000,-0.010233
592,2018-01-03,"[tesla, model, stori, get, compani, news, deli...",317.250000,-0.010233
587,2018-01-03,"[hyundai, turn, driverlesscar, startup, silico...",317.250000,-0.010233


In [107]:
#construct vocabulary and frequency.  frequency is not really needed, just nice to have for understanding the concept.
vocab = {}
frequency = {}
ignore = ['bloomberg', 'journal']
index = 0
for title in df_merged['Text']:
    for word in title:
        if word in ignore:
            continue
        if word not in vocab:
            vocab[word] = index
            index += 1
            frequency[word] = 1
        else:
            frequency[word] += 1

In [167]:
#visualize words frequency
for i, freq in enumerate(sorted(frequency.items(), key=lambda x:x[1], reverse=True)):
    if i == 10:
        break
    print(freq)

('tesla', 1097)
('musk', 461)
('elon', 366)
('compani', 249)
('stock', 246)
('model', 241)
('say', 182)
('china', 160)
('market', 159)
('share', 159)


In [47]:
vocab_len = len(vocab)

In [129]:
in_sd = df_merged['Published'].iloc[0]
in_ed = pd.datetime(2018,6,29)
out_sd = pd.datetime(2018,7,2)
out_ed = df_merged['Published'].iloc[-1]

df_training_data = df_merged[np.logical_and(df_merged['Published'] >= in_sd, df_merged['Published'] <= in_ed)]
df_testing_data = df_merged[np.logical_and(df_merged['Published'] >= out_sd, df_merged['Published'] <= out_ed)]

In [130]:
df_training_data.iloc[:20]

Unnamed: 0,Published,Text,Adj Close,Return
1002,2018-01-03,"[five, thing, need, know, start, your, caught,...",317.25,-0.010233
609,2018-01-03,"[what, new, busi, financ, what, new, busi, fin...",317.25,-0.010233
29,2018-01-03,"[tesla, deliv, model, sedan, total, vehicl, fo...",317.25,-0.010233
1531,2018-01-03,"[stock, make, biggest, move, premarket, trade,...",317.25,-0.010233
764,2018-01-03,"[afterhour, buzz, intc, tsla, stock, post, mov...",317.25,-0.010233
351,2018-01-03,"[even, read, that, tesla, announc, investor, k...",317.25,-0.010233
395,2018-01-03,"[tesla, lacklust, model, sale, miss, lower, wa...",317.25,-0.010233
42,2018-01-03,"[stock, watch, tesla, maci, walgreen, boot, al...",317.25,-0.010233
592,2018-01-03,"[tesla, model, stori, get, compani, news, deli...",317.25,-0.010233
587,2018-01-03,"[hyundai, turn, driverlesscar, startup, silico...",317.25,-0.010233


In [131]:
df_testing_data.iloc[:20]

Unnamed: 0,Published,Text,Adj Close,Return
536,2018-07-02,"[tesla, stop, brake, roll, test, push, model, ...",335.070007,-0.022977
324,2018-07-02,"[tesla, watcher, predict, what, star, tesla, g...",335.070007,-0.022977
983,2018-07-02,"[tech, ralli, long, cnbc, cnbc, tech, ralli, l...",335.070007,-0.022977
1597,2018-07-02,"[progress, dont, hate, trump, even, berni, san...",335.070007,-0.022977
368,2018-07-02,"[wall, street, impress, tesla, model, mileston...",335.070007,-0.022977
315,2018-07-02,"[tesla, profit, septemb, say, gene, munster, l...",335.070007,-0.022977
714,2018-07-02,"[german, leader, say, migrat, clash, with, mer...",335.070007,-0.022977
1600,2018-07-02,"[glencor, target, corrupt, probeenergi, journa...",335.070007,-0.022977
330,2018-07-02,"[tesla, chief, engin, tesla, inc, engin, doug,...",335.070007,-0.022977
485,2018-07-02,"[tesla, share, close, despit, model, goal, ann...",335.070007,-0.022977


In [132]:
def to_one_hot(text):
    #return one hot vector of shape (vocab_len, 1)
    array = np.zeros((vocab_len, 1))
    for word in text:
        if word in vocab:
            index = vocab[word]
            array[index] = 1
    return array

In [143]:
def get_X(df_text):
    X = np.empty((vocab_len,0))
    for text in df_text:
        array = to_one_hot(text)
        X = np.hstack((X,array))
    X = X.T
    return X

In [183]:
# here is the exciting part. here we are going to create the training data!
train_x = get_X(df_training_data['Text'])
test_x = get_X(df_testing_data['Text'])

In [184]:
# get labels
train_y = df_training_data['Return'].values.copy()
test_y = df_testing_data['Return'].values.copy()

In [185]:
train_y[train_y <= 0] = 0
train_y[train_y > 0] = 1

test_y[test_y <= 0] = 0
test_y[test_y > 0] = 1

In [186]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(1098, 4534)
(1098,)
(130, 4534)
(130,)


In [None]:
#train the model
model = Sequential()
model.add(Dense(1, input_dim=X_train.shape[1], activation='relu', kernel_initializer='uniform'))
# model.add(Dense(12, activation='relu', kernel_initializer='uniform'))
# model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

optimizer = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.00001, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(train_x, train_y, epochs=50, batch_size=32, verbose=1)

In [188]:
#view weights
weights, biases = model.layers[0].get_weights()
for i, k in enumerate(vocab.keys()):
    if i == 10:
        break
    print(k, weights[i])

five [-0.05141117]
thing [-0.04538951]
need [0.07935308]
know [0.07184695]
start [0.07729388]
your [0.03023542]
caught [0.05877448]
what [0.05702874]
move [0.04279188]
market [-0.02427554]


In [193]:
#assess in-sample accuracy
predictions = model.predict_classes(train_x)
predictions = predictions.reshape((predictions.shape[0],))
np.sum(predictions == train_y) / train_y.shape[0]

0.982695810564663

In [194]:
#assess out-of-sample accuracy
predictions = model.predict_classes(test_x)
predictions = predictions.reshape((predictions.shape[0],))
np.sum(predictions == test_y) / test_y.shape[0]

0.5153846153846153

In [241]:
a = mode([0,1,2,3,3])
a[0][0]

3

In [236]:
sd = df_training_data['Published'].iloc[0]
ed = df_training_data['Published'].iloc[-1]
trades = tut.get_prices('TSLA', sd, ed)
trades[:] = 0
curr_date = sd
votes = []
for date, text in zip(df_training_data['Published'], df_training_data['Text']):
    print(votes)
    if date != curr_date:
        trades.loc[date] = mode(votes)[0][0]
        votes = []
        curr_date = date
    text_vector = to_one_hot(text)
    prediction = model.predict_classes(text_vector.T)
    votes.append(prediction[0][0])
trades

[]
[0]
[0, 0]
[0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


ValueError: cannot copy sequence with size 2 to array axis with dimension 1

In [None]:
#generate trades
def generate_trades(df):
    sd = df_training_data['Published'].iloc[0]
    ed = df_training_data['Published'].iloc[-1]
    for date in df_training_data['Published']
    test_x, _ = get_xy(self.ticker, start_date, end_date, self.rolling_window, self.n)
    test_x = self.scaler.transform(test_x)
    predictions = self.model.predict_classes(test_x)
    actual_start = self.rolling_window - 1
    trades = predictions_to_trades(self.ticker, predictions, start_date, end_date, self.n, actual_start)
    return trades

In [None]:
starting_cash = 100000
holding_limit = 1000
btr = BackTester(starting_cash, holding_limit)

In [31]:
df_merged

Unnamed: 0,Published,Text,Adj Close,Return
0,2018-04-02,"[tesla, end, make, model, sedan, week, tesla, ...",252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back, elon, musk, ask, do...",303.200012,-0.033102
3,2018-05-08,"[florida, teenag, kill, tesla, crash, tesla, s...",301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...",306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, over, crash, tesla, in...",294.079987,-0.022763
6,2018-04-16,"[tesla, halt, model, product, again, wall, str...",291.209991,-0.030399
7,2018-06-29,"[elon, musk, tweet, featur, tesla, pickup, tru...",342.950012,-0.019947
8,2018-03-27,"[tesla, defend, autopilot, record, fed, launch...",279.179993,-0.082188
9,2018-03-29,"[tesla, recal, model, car, over, bolt, issu, w...",266.130005,0.032392
10,2018-04-26,"[kany, cant, save, tesla, from, chipotl, long,...",285.480011,0.017065
