In [2]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import st_utils as ut
import datetime as dt
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date
%matplotlib inline

In [3]:
# Initialize date range
start_date = '2018-01-01'
end_date = '2018-8-31'

In [63]:
# Grab news.  Only need to do this once
keywords = ['tesla'] #these three stocks move largely base on news
news_source = 'wsj.com, bloomberg.com, cnbc.com'
df = ut.get_news(keywords, start_date, end_date, news_source)
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1798, 'articles': []}
total number of articles  1798
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18
download complete


In [175]:
# Get Tesla's prices
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
df_stock = ut.get_data(file, dates)

In [176]:
# Calculate return 
df_stock['Return'] = df_stock['Adj Close'].pct_change()
df_stock.drop(df_stock.index[0], inplace=True)

In [181]:
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")

In [182]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    dt = parser.parse(published_date[:-1]) #-1 to ignore the Z, which is GMT.
    dt = dt - timedelta(hours=20) #shift back by 20 hrs.
    return pd.Timestamp(year=dt.year, month=dt.month, day=dt.day)

In [183]:
# map news to close date
df_news['Published'] = df_news['Published'].apply(map_to_close_date)

In [197]:
# combine title and body into single text
df_news['Text'] = df_news['Title'] + ' ' + df_news['Body']

In [199]:
# just keep published, and the combined text
df_news = df_news[['Published','Text']]

In [171]:
# merge news and stock
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')

In [172]:
df_stock['Published'] = df_stock.index
df_merged = df_merged.dropna()
df_merged.sort_values(by='Published')

Unnamed: 0,Published,Title,Body,Keyword,Source,Adj Close,Return
1767,2018-01-05,A User's Guide To Driverless Cars,The wait for the self-driving future is coming...,tesla,Bloomberg,316.579987,0.006230
1088,2018-01-05,Takata Recalls Another 3.3 Million Air Bags Un...,"Takata Corp., the parts supplier that filed fo...",tesla,Bloomberg,316.579987,0.006230
1087,2018-01-05,"Top VC deals: Apple buys Buddybuild, Google ba...",A weekly recap of some of the most interesting...,tesla,CNBC,316.579987,0.006230
623,2018-01-08,"Tesla is a 'threat,' but we're 'more than up t...",Nio CFO Louis Hsieh discussed Tesla's potentia...,tesla,CNBC,336.410004,0.062638
1613,2018-01-08,"Don't Worry, Petrolheads. Driverless Cars Are ...",Carmakers and tech companies have fallen over ...,tesla,Bloomberg,336.410004,0.062638
1474,2018-01-08,"SpaceX-Launched Satellite Isn't Seen in Orbit,...",A military satellite launched by Elon Musks S...,tesla,Bloomberg,336.410004,0.062638
256,2018-01-08,Tesla Powers Up New York Gigafactory Solar Roo...,Article URL: https://www.bloomberg.com/news/ar...,tesla,Bloomberg,336.410004,0.062638
278,2018-01-08,Cramer Remix: Teslas stock is up for one simp...,Jim Cramer lifted the hood on Teslas move hig...,tesla,CNBC,336.410004,0.062638
1010,2018-01-08,'It might raise national security issues': Chi...,It might not be easy for foreign firms looking...,tesla,CNBC,336.410004,0.062638
908,2018-01-08,Cramer says 4 thingsincluding the tax overhau...,"Jim Cramer explained how index funds, stock sh...",tesla,CNBC,336.410004,0.062638


In [155]:
def tokenize_news(text):
    if(type(text)==float):
        return []
    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    translator = str.maketrans('', '', string.punctuation)
    text = (text.translate(translator))
    text = "".join(c for c in text if ord(c)<128) #strip no n ascii characters
    tokens = nltk.tokenize.word_tokenize(text)
    t = []
    minlen =  4
    maxlen = 20 
    for token in tokens:
        if len(token) < minlen or len(token) > maxlen or token.isnumeric() or token in stopwords.words('english'):
            pass
        else:
            token = lmtzr.lemmatize(token)
            token = stmr.stem(token)
            t.append(token)
    return t

In [156]:
df_merged['Title'] = df_merged['Title'].apply(tokenize_news)


In [157]:
df_merged['Body'] = df_merged['Body'].apply(tokenize_news)

In [158]:
df_merged

Unnamed: 0,Published,Title,Body,Keyword,Source,Adj Close,Return
0,2018-04-02,"[tesla, end, make, model, sedan, week]","[tesla, deliveri, total, vehicl, bloomberg, ne...",tesla,Bloomberg,252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back]","[elon, musk, ask, donat, save, tesla, wall, st...",tesla,The Wall Street Journal,303.200012,-0.033102
3,2018-05-08,"[florida, teenag, kill, tesla, crash]","[tesla, said, cooper, author, tesla, model, cr...",tesla,CNBC,301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...","[tesla, said, cooper, author, tesla, model, cr...",tesla,CNBC,306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, over, crash]","[tesla, investig, feud, over, crash, unusu, pu...",tesla,The Wall Street Journal,294.079987,-0.022763
6,2018-04-16,"[tesla, halt, model, product, again, wall, str...","[wall, street, journal, tesla, halt, model, pr...",tesla,The Wall Street Journal,291.209991,-0.030399
7,2018-06-29,"[elon, musk, tweet, featur, tesla, pickup, tru...","[tesla, elon, musk, ask, twitter, featur, incl...",tesla,CNBC,342.950012,-0.019947
8,2018-03-27,"[tesla, defend, autopilot, record, fed, launch...","[tesla, defend, autopilot, driverassist, techn...",tesla,CNBC,279.179993,-0.082188
9,2018-03-29,"[tesla, recal, model, car, over, bolt, issu, w...","[wall, street, journal, tesla, recal, model, c...",tesla,The Wall Street Journal,266.130005,0.032392
10,2018-04-26,"[kany, cant, save, tesla]","[from, chipotl, long, road, back, indonesian, ...",tesla,Bloomberg,285.480011,0.017065


In [159]:
#construct vocabulary and frequency.  frequency is not really needed, just nice to have for understanding the concept.
vocab = {}
frequency = {}
ignore = ['bloomberg', 'journal']
index = 0
for title in df_merged['Title']:
    for word in title:
        if word in ignore:
            continue
        if word not in vocab:
            vocab[word] = index
            index += 1
            frequency[word] = 1
        else:
            frequency[word] += 1
    
for body in df_merged['Body']:
    for word in body:
        if word in ignore:
            continue
        if word not in vocab:
            vocab[word] = index
            index += 1
            frequency[word] = 1
        else:
            frequency[word] += 1

In [None]:
#visualize words frequency
for i in sorted(frequency.items(), key=lambda x:x[1], reverse=True):
    print(i)

In [None]:
vocab_len = len(vocab)

In [None]:
#here is the exciting part. here we are going to create the training data!
df['X']
df['Y']

def 
for row_c1, row_c2 in zip(df['Title'], df['Body'])
    X = np.range([])
    
