In [2]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import st_utils as ut
import datetime as dt
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date
%matplotlib inline

In [3]:
# Initialize date range
start_date = '2018-01-01'
end_date = '2018-8-31'

In [63]:
# Grab news.  Only need to do this once
keywords = ['tesla'] #these three stocks move largely base on news
news_source = 'wsj.com, bloomberg.com, cnbc.com'
df = ut.get_news(keywords, start_date, end_date, news_source)
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1798, 'articles': []}
total number of articles  1798
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18
download complete


In [64]:
# Get Tesla's prices
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
df_stock = ut.get_data(file, dates)

In [65]:
# Calculate return 
df_stock['Return'] = df_stock['Adj Close'].pct_change()
df_stock.drop(df_stock.index[0], inplace=True)

In [66]:
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")

In [67]:
# Clean Dataset
# 1. Change date format to pandas datetime
# 2. Remove any row with title less than 5 words or body less than 10 words.
# 3. Remove any rowo with entries that have more than 3 question marks.

In [68]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    dt = parser.parse(published_date[:-1]) #-1 to ignore the Z, which is GMT.
    dt = dt - timedelta(hours=20) #shift back by 20 hrs.
    return pd.Timestamp(year=dt.year, month=dt.month, day=dt.day)

In [69]:
#make common column for merge
df_news['Published'] = df_news['Published'].apply(map_to_close_date)
df_stock['Published'] = df_stock.index


In [70]:
#merge news and stock
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')

In [None]:
df_merged.sort_values(by='Published')

In [72]:
def tokenize_news(text):
    if(type(text)==float):
        return []
    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    translator = str.maketrans('', '', string.punctuation)
    text = (text.translate(translator))
    text = "".join(c for c in text if ord(c)<128) #strip no n ascii characters
    tokens = nltk.tokenize.word_tokenize(text)
    t = []
    for token in tokens:
        if len(token) <= 3 or token.isnumeric() or token in stopwords.words('english'):
            pass
        else:
            token = lmtzr.lemmatize(token)
            token = stmr.stem(token)
            t.append(token)
    return t

In [73]:
df_merged['Title'] = df_merged['Title'].apply(tokenize_news)


In [74]:
df_merged['Body'] = df_merged['Body'].apply(tokenize_news)

In [75]:
df_merged

Unnamed: 0,Published,Title,Body,Keyword,Source,Adj Close,Return
0,2018-04-02,"[tesla, end, make, model, sedan, week]","[tesla, deliveri, total, vehicl, bloomberg, ne...",tesla,Bloomberg,252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back]","[elon, musk, ask, donat, save, tesla, wall, st...",tesla,The Wall Street Journal,303.200012,-0.033102
2,2018-07-01,"[what, tesla, keep, bear, pivot, trade, week]","[buckl, tesla, could, bumpi, ride, week, tesla...",tesla,CNBC,,
3,2018-05-08,"[florida, teenag, kill, tesla, crash]","[tesla, said, cooper, author, tesla, model, cr...",tesla,CNBC,301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...","[tesla, said, cooper, author, tesla, model, cr...",tesla,CNBC,306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, over, crash]","[tesla, investig, feud, over, crash, unusu, pu...",tesla,The Wall Street Journal,294.079987,-0.022763
6,2018-04-16,"[tesla, halt, model, product, again, wall, str...","[wall, street, journal, tesla, halt, model, pr...",tesla,The Wall Street Journal,291.209991,-0.030399
7,2018-06-29,"[elon, musk, tweet, featur, tesla, pickup, tru...","[tesla, elon, musk, ask, twitter, featur, incl...",tesla,CNBC,342.950012,-0.019947
8,2018-03-27,"[tesla, defend, autopilot, record, fed, launch...","[tesla, defend, autopilot, driverassist, techn...",tesla,CNBC,279.179993,-0.082188
9,2018-03-29,"[tesla, recal, model, car, over, bolt, issu, w...","[wall, street, journal, tesla, recal, model, c...",tesla,The Wall Street Journal,266.130005,0.032392
