In [2]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import st_utils as ut
import datetime as dt
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date
%matplotlib inline

In [3]:
# Initialize date range
start_date = '2018-01-01'
end_date = '2018-8-31'

In [63]:
# Grab news.  Only need to do this once
keywords = ['tesla'] #these three stocks move largely base on news
news_source = 'wsj.com, bloomberg.com, cnbc.com'
df = ut.get_news(keywords, start_date, end_date, news_source)
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1798, 'articles': []}
total number of articles  1798
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18
download complete


In [92]:
# Get Tesla's prices
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
df_stock = ut.get_data(file, dates)

In [106]:
# Calculate return 
df_stock['Return'] = df_stock['Adj Close'].pct_change()
df_stock.drop(df_stock.index[0], inplace=True)

In [107]:
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")

In [108]:
# Clean Dataset
# 1. Change date format to pandas datetime
# 2. Remove any row with title less than 5 words or body less than 10 words.
# 3. Remove any rowo with entries that have more than 3 question marks.

In [109]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    dt = parser.parse(published_date[:-1]) #-1 to ignore the Z, which is GMT.
    dt = dt - timedelta(hours=20) #shift back by 20 hrs.
    return pd.Timestamp(year=dt.year, month=dt.month, day=dt.day)

In [110]:
#make common column for merge
df_news['Published'] = df_news['Published'].apply(map_to_close_date)
df_stock['Published'] = df_stock.index


In [111]:
#merge news and stock
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')

In [112]:
df_merged = df_merged.dropna()
df_merged.sort_values(by='Published')

Unnamed: 0,Published,Title,Body,Keyword,Source,Adj Close,Return
54,2018-01-04,"Fly Charts: Tesla Disappointment, Bomb-Cyclone...",From old smartphones to the consequences of Va...,tesla,Bloomberg,314.619995,-0.008290
140,2018-01-04,"""Fast Money"" final trades: EWM, EEM and more",The Fast Money traders share their final tra...,tesla,CNBC,314.619995,-0.008290
1078,2018-01-04,"Your first trade for Friday, January 5","The ""Fast Money"" traders share their first mov...",tesla,CNBC,314.619995,-0.008290
857,2018-01-04,SpaceX will launch most powerful rocket in th...,"SpaceX claims the Falcon Heavy is the ""most po...",tesla,CNBC,314.619995,-0.008290
1087,2018-01-05,"Top VC deals: Apple buys Buddybuild, Google ba...",A weekly recap of some of the most interesting...,tesla,CNBC,316.579987,0.006230
1088,2018-01-05,Takata Recalls Another 3.3 Million Air Bags Un...,"Takata Corp., the parts supplier that filed fo...",tesla,Bloomberg,316.579987,0.006230
1767,2018-01-05,A User's Guide To Driverless Cars,The wait for the self-driving future is coming...,tesla,Bloomberg,316.579987,0.006230
1010,2018-01-08,'It might raise national security issues': Chi...,It might not be easy for foreign firms looking...,tesla,CNBC,336.410004,0.062638
256,2018-01-08,Tesla Powers Up New York Gigafactory Solar Roo...,Article URL: https://www.bloomberg.com/news/ar...,tesla,Bloomberg,336.410004,0.062638
908,2018-01-08,Cramer says 4 thingsincluding the tax overhau...,"Jim Cramer explained how index funds, stock sh...",tesla,CNBC,336.410004,0.062638


In [113]:
def tokenize_news(text):
    if(type(text)==float):
        return []
    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    translator = str.maketrans('', '', string.punctuation)
    text = (text.translate(translator))
    text = "".join(c for c in text if ord(c)<128) #strip no n ascii characters
    tokens = nltk.tokenize.word_tokenize(text)
    t = []
    for token in tokens:
        if len(token) <= 3 or token.isnumeric() or token in stopwords.words('english'):
            pass
        else:
            token = lmtzr.lemmatize(token)
            token = stmr.stem(token)
            t.append(token)
    return t

In [114]:
df_merged['Title'] = df_merged['Title'].apply(tokenize_news)


In [115]:
df_merged['Body'] = df_merged['Body'].apply(tokenize_news)

In [116]:
df_merged

Unnamed: 0,Published,Title,Body,Keyword,Source,Adj Close,Return
0,2018-04-02,"[tesla, end, make, model, sedan, week]","[tesla, deliveri, total, vehicl, bloomberg, ne...",tesla,Bloomberg,252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back]","[elon, musk, ask, donat, save, tesla, wall, st...",tesla,The Wall Street Journal,303.200012,-0.033102
3,2018-05-08,"[florida, teenag, kill, tesla, crash]","[tesla, said, cooper, author, tesla, model, cr...",tesla,CNBC,301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...","[tesla, said, cooper, author, tesla, model, cr...",tesla,CNBC,306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, over, crash]","[tesla, investig, feud, over, crash, unusu, pu...",tesla,The Wall Street Journal,294.079987,-0.022763
6,2018-04-16,"[tesla, halt, model, product, again, wall, str...","[wall, street, journal, tesla, halt, model, pr...",tesla,The Wall Street Journal,291.209991,-0.030399
7,2018-06-29,"[elon, musk, tweet, featur, tesla, pickup, tru...","[tesla, elon, musk, ask, twitter, featur, incl...",tesla,CNBC,342.950012,-0.019947
8,2018-03-27,"[tesla, defend, autopilot, record, fed, launch...","[tesla, defend, autopilot, driverassist, techn...",tesla,CNBC,279.179993,-0.082188
9,2018-03-29,"[tesla, recal, model, car, over, bolt, issu, w...","[wall, street, journal, tesla, recal, model, c...",tesla,The Wall Street Journal,266.130005,0.032392
10,2018-04-26,"[kany, cant, save, tesla]","[from, chipotl, long, road, back, indonesian, ...",tesla,Bloomberg,285.480011,0.017065


In [119]:
#construct vocabulary
vocab = {}
for title in df_merged['Title']:
    for word in title:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1
    
for body in df_merged['Body']:
    for word in body:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1

In [120]:
vocab

{'tesla': 1079,
 'end': 4,
 'make': 121,
 'model': 230,
 'sedan': 38,
 'week': 61,
 'want': 38,
 'money': 55,
 'back': 45,
 'florida': 14,
 'teenag': 3,
 'kill': 12,
 'crash': 80,
 'nhtsa': 2,
 'join': 19,
 'ntsb': 9,
 'look': 33,
 'fatal': 40,
 'investig': 39,
 'feud': 6,
 'over': 22,
 'halt': 5,
 'product': 137,
 'again': 13,
 'wall': 93,
 'street': 93,
 'journal': 78,
 'elon': 362,
 'musk': 457,
 'tweet': 28,
 'featur': 8,
 'pickup': 10,
 'truck': 37,
 'take': 52,
 'suggest': 9,
 'defend': 11,
 'autopilot': 39,
 'record': 13,
 'fed': 4,
 'launch': 39,
 'recal': 11,
 'car': 120,
 'bolt': 5,
 'issu': 30,
 'kany': 5,
 'cant': 13,
 'save': 14,
 'buri': 2,
 'cut': 13,
 'memo': 2,
 'home': 30,
 'depot': 11,
 'solar': 40,
 'sale': 36,
 'panason': 10,
 'say': 182,
 'open': 47,
 'work': 27,
 'with': 52,
 'china': 159,
 'factori': 63,
 'bloomberg': 188,
 'crackup': 1,
 'foretold': 1,
 'lose': 12,
 'energi': 28,
 'leader': 16,
 'reorgan': 6,
 'norwegian': 6,
 'right': 16,
 'upset': 3,
 'servic

In [125]:
for i in sorted(vocab.items(), key=lambda x:x[1]):
    print(i)

[('crackup', 1),
 ('foretold', 1),
 ('superfan', 1),
 ('cancer', 1),
 ('billionsher', 1),
 ('roar', 1),
 ('thorniest', 1),
 ('us9000', 1),
 ('couchher', 1),
 ('defi', 1),
 ('graviti', 1),
 ('now', 1),
 ('bombcyclon', 1),
 ('cream', 1),
 ('withdrew', 1),
 ('buffalo', 1),
 ('fullselfdriv', 1),
 ('rocketpow', 1),
 ('smucker', 1),
 ('hurdl', 1),
 ('stall', 1),
 ('trilliondollar', 1),
 ('selffulfil', 1),
 ('propheci', 1),
 ('intergalact', 1),
 ('planher', 1),
 ('highlyanticip', 1),
 ('sixfold', 1),
 ('temper', 1),
 ('junctur', 1),
 ('document', 1),
 ('verizon', 1),
 ('vmware', 1),
 ('honeywel', 1),
 ('rockwel', 1),
 ('collin', 1),
 ('georg', 1),
 ('exappl', 1),
 ('immort', 1),
 ('dictat', 1),
 ('documentari', 1),
 ('pypl', 1),
 ('east', 1),
 ('conduct', 1),
 ('limelight', 1),
 ('spectr', 1),
 ('patch', 1),
 ('butterfli', 1),
 ('sting', 1),
 ('versu', 1),
 ('solardimsa', 1),
 ('signet', 1),
 ('vera', 1),
 ('bradley', 1),
 ('axov', 1),
 ('teardown', 1),
 ('uncov', 1),
 ('prowess', 1),
 ('shor