In [45]:
#author: James Chan © 2018
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import st_utils as ut
import datetime as dt
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from dateutil import parser
from datetime import timedelta, date
%matplotlib inline

In [4]:
# Initialize date range
# Shared by the news download and the price lookup.  Both bounds are written as
# zero-padded ISO 8601 dates (YYYY-MM-DD) so they are formatted consistently.
start_date = '2018-01-01'
end_date = '2018-08-31'

In [5]:
# Grab news.  Only need to do this once: the result is saved to
# 'news_dataset.csv' at the end of this cell.
keywords = ['tesla'] #keyword(s) to search for; picked because the stock moves largely based on news
# Comma-separated list of the news sites to pull articles from.
news_source = 'wsj.com, bloomberg.com, cnbc.com'
# ut.get_news is a project helper (st_utils); the printed log suggests it pages
# through a news API -- TODO confirm rate limits before re-running.
df = ut.get_news(keywords, start_date, end_date, news_source)
# Persist the download so later cells can work from the file.
df.to_csv('news_dataset.csv')

{'status': 'ok', 'totalResults': 1774, 'articles': []}
total number of articles  1774
downloading page:  1
downloading page:  2
downloading page:  3
downloading page:  4
downloading page:  5
downloading page:  6
downloading page:  7
downloading page:  8
downloading page:  9
downloading page:  10
downloading page:  11
downloading page:  12
downloading page:  13
downloading page:  14
downloading page:  15
downloading page:  16
downloading page:  17
downloading page:  18


In [23]:
# Get Tesla's prices
# Daily calendar range (pandas' default frequency) between the two bounds, inclusive.
dates = pd.date_range(start_date, end_date)
file = 'TSLA.csv'
# ut.get_data is a project helper (st_utils); presumably it loads the price file
# indexed by date and restricted to `dates` -- TODO confirm.
df_stock = ut.get_data(file, dates)

In [24]:
# Calculate return
# Row-over-row percentage change of the adjusted close.
df_stock['Return'] = df_stock['Adj Close'].pct_change()
# The first row has no previous price, so its return is NaN; drop it.
# NOTE: this mutates df_stock in place -- re-running the cell without reloading
# df_stock removes one more row each time.
df_stock.drop(df_stock.index[0], inplace=True)

In [25]:
# Reload the saved articles, using the first CSV column as the index.
# NOTE(review): the file is decoded as ISO-8859-1, while the cell that writes it
# calls to_csv without an explicit encoding -- confirm the two match.
df_news = pd.read_csv('news_dataset.csv', index_col=0, encoding="ISO-8859-1")

In [15]:
# Clean Dataset
# 1. Change date format to pandas datetime
# 2. Remove any row with title less than 5 words or body less than 10 words.
# 3. Remove any row with entries that have more than 3 question marks.

In [26]:
# map published time to close date.  see figure.1
def map_to_close_date(published_date):
    """Map an article's publication timestamp to the date used as the merge key.

    The timestamp is interpreted in UTC, shifted back by 20 hours and then
    truncated to midnight.

    Parameters
    ----------
    published_date : str
        ISO 8601 timestamp, e.g. '2018-07-23T15:00:00Z'.  A trailing 'Z' or
        an explicit UTC offset is honoured (the value is converted to UTC);
        a timestamp without zone information is taken as already being UTC.

    Returns
    -------
    pandas.Timestamp
        Timezone-naive timestamp at 00:00 of the shifted date.

    Raises
    ------
    TypeError
        If ``published_date`` is not a string (e.g. NaN, or a value that was
        already converted by an earlier run of this function).
    ValueError
        If the string cannot be parsed into a timestamp.
    """
    # Reject non-strings loudly so an accidental second pass over an already
    # converted column cannot silently shift the dates by another 20 hours.
    if not isinstance(published_date, str):
        raise TypeError(
            "published_date must be a str, got %s" % type(published_date).__name__
        )

    published = pd.Timestamp(published_date)
    if pd.isna(published):
        raise ValueError("could not parse published_date: %r" % (published_date,))

    # Normalise to naive UTC instead of blindly chopping the last character,
    # which was only correct when the string ended in 'Z'.
    if published.tzinfo is not None:
        published = published.tz_convert(None)

    shifted = published - pd.Timedelta(hours=20)  # shift back by 20 hrs.
    return pd.Timestamp(year=shifted.year, month=shifted.month, day=shifted.day)

In [27]:
#make common column for merge
# News side: replace each raw publish-time string with its mapped date.
# NOTE: this overwrites the column, so the cell expects a freshly loaded df_news;
# reload df_news before re-running it.
df_news['Published'] = df_news['Published'].apply(map_to_close_date)
# Stock side: copy the index into a regular column so both frames share the
# key 'Published' (assumes df_stock is indexed by date -- TODO confirm).
df_stock['Published'] = df_stock.index


In [30]:
#merge news and stock
# Left join keeps every news row; rows whose 'Published' date has no matching
# row in df_stock get NaN in the stock columns.
df_merged = pd.merge(df_news, df_stock, how='left', on='Published')

In [32]:
# Display the merged rows in date order.  sort_values returns a new frame, so
# df_merged itself keeps its original row order.
df_merged.sort_values(by='Published')

Unnamed: 0,Published,Title,Body,keyword,Keyword,Source,Adj Close,Return
1573,2018-01-01,A Plane Lesson for Driverless Cars,Lighter-touch regulation will hold back progre...,,tesla,Bloomberg,,
138,2018-01-01,Tesla Model 3 challenges and cash burn may ove...,Tesla fourth quarter delivery numbers could be...,,tesla,CNBC,,
1289,2018-01-01,Tony Robbins: This is the difference between p...,What most people call resolutions are reall...,,tesla,CNBC,,
445,2018-01-01,"With one simple tweet, Elon Musk shows a maste...","Its easy to forget, but critically important.",,tesla,CNBC,,
1290,2018-01-02,Stock Fund That Beats 95% of Peers Ignores CEO...,Reading brokerage research and meeting chief e...,,tesla,Bloomberg,,
1594,2018-01-02,Car Sales to Top 90 Million Globally for First...,Global sales of personal-use cars and trucks l...,,tesla,The Wall Street Journal,,
1605,2018-01-02,Five Things You Need to Know to Start Your Day,Get caught up on what's moving markets in Asia.,,tesla,Bloomberg,,
406,2018-01-02,"With one simple tweet, Elon Musk shows a maste...",By Catherine Clifford Billionaire tech entrepr...,,tesla,CNBC,,
1280,2018-01-02,Electric Car Drivers Are Too Smart to Own Elec...,Almost 80 percent of battery-powered vehicles ...,,tesla,Bloomberg,,
584,2018-01-03,"VW, Hyundai Turn to Driverless-Car Startup in ...","In the race to develop driverless cars, Volksw...",,tesla,The Wall Street Journal,317.250000,-0.010233


In [64]:
def tokenize_news(text):
    """Turn a piece of news text into a list of normalized word stems.

    Steps: lower-case the text, trim punctuation from both ends of the string,
    tokenize with NLTK, drop tokens of three characters or fewer and English
    stop words, then lemmatize and stem each remaining token.

    Parameters
    ----------
    text : str or float
        The text to tokenize.  A float (how pandas represents a missing value,
        including numpy float subclasses) is treated as "no text".

    Returns
    -------
    list of str
        The stemmed tokens in their original order; ``[]`` for a float input.

    Raises
    ------
    AttributeError
        If ``text`` is neither a float nor a string (for example a list of
        tokens produced by a previous run).
    """
    # Missing values arrive as float NaN; isinstance also covers float
    # subclasses such as numpy.float64, which an exact type check would miss.
    if isinstance(text, float):
        return []

    lmtzr = WordNetLemmatizer()
    stmr = PorterStemmer()
    # Load the stop-word list once per call and use a set for fast membership
    # tests, instead of re-reading the list for every token.
    stop_words = set(stopwords.words('english'))

    text = text.lower()
    # str.strip only trims punctuation at the two ends of the whole string;
    # punctuation inside the text is left for the tokenizer to split off.
    text = text.strip(string.punctuation)
    tokens = nltk.tokenize.word_tokenize(text)
    # Keep tokens longer than three characters that are not stop words.
    tokens = [t for t in tokens if len(t) > 3 and t not in stop_words]
    # Lemmatize first, then stem the lemma.
    tokens = [stmr.stem(lmtzr.lemmatize(t)) for t in tokens]
    return tokens

In [65]:
# Replace each raw title string with its list of tokens.
# NOTE: this overwrites the column, so run it once per freshly merged df_merged;
# tokenize_news expects raw text, not token lists.
df_merged['Title'] = df_merged['Title'].apply(tokenize_news)

In [66]:
# Display the merged frame to inspect the tokenized 'Title' column.
df_merged

Unnamed: 0,Published,Title,Body,keyword,Keyword,Source,Adj Close,Return
0,2018-04-02,"[tesla, end, make, 2,020, model, sedan, week]","Tesla 1Q deliveries totaled 29,980 vehicles, B...",,tesla,Bloomberg,252.479996,-0.051291
1,2018-07-23,"[tesla, want, money, back]",Elon Musk is asking for donations to save Tesl...,,tesla,The Wall Street Journal,303.200012,-0.033102
2,2018-07-01,"[tesla, keep, bear, pivot, trade, week]","Buckle up, because Tesla could be in for a bum...",,tesla,CNBC,,
3,2018-05-08,"[florida, teenag, kill, tesla, crash]",Tesla said it is cooperating with authorities ...,,tesla,CNBC,301.970001,-0.002642
4,2018-05-09,"[nhtsa, join, ntsb, look, fatal, tesla, crash,...",Tesla said it is cooperating with authorities ...,,tesla,CNBC,306.850006,0.016161
5,2018-04-12,"[tesla, investig, feud, crash]","Tesla, Investigators Feud Over a Crash An unus...",,tesla,The Wall Street Journal,294.079987,-0.022763
6,2018-04-16,"[tesla, halt, model, product, wall, street, jo...",Wall Street Journal Tesla Halts Model 3 Produc...,,tesla,The Wall Street Journal,291.209991,-0.030399
7,2018-06-29,"[elon, musk, tweet, featur, tesla, pickup, tru...",Tesla CEO Elon Musk asked Twitter which featur...,,tesla,CNBC,342.950012,-0.019947
8,2018-03-27,"[tesla, defend, autopilot, record, fed, launch...",Tesla is defending its Autopilot driver-assist...,,tesla,CNBC,279.179993,-0.082188
9,2018-03-29,"[tesla, recal, 123000, model, car, bolt, issu,...",Wall Street Journal Tesla Recalls 123000 Model...,,tesla,The Wall Street Journal,266.130005,0.032392


In [67]:
# Display df_news, whose 'Title' column still holds the raw strings, for
# comparison with the tokenized titles in df_merged.
df_news

Unnamed: 0,Published,Title,Body,keyword,Keyword,Source
0,2018-04-02,"Tesla Ends 1Q Making 2,020 Model 3 Sedans per ...","Tesla 1Q deliveries totaled 29,980 vehicles, B...",,tesla,Bloomberg
1,2018-07-23,Tesla Wants Its Money Back,Elon Musk is asking for donations to save Tesl...,,tesla,The Wall Street Journal
2,2018-07-01,What Tesla can do to keep the bears at bay dur...,"Buckle up, because Tesla could be in for a bum...",,tesla,CNBC
3,2018-05-08,Two Florida teenagers killed in Tesla crash,Tesla said it is cooperating with authorities ...,,tesla,CNBC
4,2018-05-09,NHTSA joins NTSB in looking into fatal Tesla c...,Tesla said it is cooperating with authorities ...,,tesla,CNBC
5,2018-04-12,"Tesla, Investigators Feud Over a Crash","Tesla, Investigators Feud Over a Crash An unus...",,tesla,The Wall Street Journal
6,2018-04-16,Tesla Halts Model 3 Production Again - Wall St...,Wall Street Journal Tesla Halts Model 3 Produc...,,tesla,The Wall Street Journal
7,2018-06-29,Elon Musk tweets about features for a new Tesl...,Tesla CEO Elon Musk asked Twitter which featur...,,tesla,CNBC
8,2018-03-27,Tesla defends Autopilot record after Feds laun...,Tesla is defending its Autopilot driver-assist...,,tesla,CNBC
9,2018-03-29,Tesla Recalls 123000 Model S Cars Over Bolt Is...,Wall Street Journal Tesla Recalls 123000 Model...,,tesla,The Wall Street Journal
