# Collection of stock data & news articles
Collected for all stocks and all dates in the range.

In [1]:
import os
import requests
from datetime import datetime, timedelta
import time
import warnings
import pandas as pd
import numpy as np
from pycorenlp import StanfordCoreNLP

warnings.filterwarnings('ignore')

# To start the Stanford CoreNLP server:
# cd stanford-corenlp-4.2.2
# java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000

# Finnhub API token, read from the FINNHUB_API_KEY environment variable so the
# secret never has to be written into the notebook. Falls back to an empty
# string (the previous hard-coded value) when the variable is not set.
FINNHUB_API_KEY = os.environ.get("FINNHUB_API_KEY", "")

In [2]:
# Client for the Stanford CoreNLP server. The server must already be running
# on localhost:9000 (see the launch command in the setup cell above).
nlp = StanfordCoreNLP('http://localhost:9000')

def get_sentiment(text):
    """Return the mean per-sentence sentiment value of `text`.

    The text is sent to the CoreNLP server through the module-level `nlp`
    client with the 'sentiment' annotator; every sentence's 'sentimentValue'
    is converted to int and the values are averaged with `np.mean`.

    NOTE(review): presumably the values are on CoreNLP's 0-4 scale - confirm.
    """
    request_properties = {
        'annotators': 'sentiment',
        'outputFormat': 'json',
        'timeout': 5000,
    }
    annotation = nlp.annotate(text, properties=request_properties)

    sentence_values = []
    for sentence in annotation['sentences']:
        sentence_values.append(int(sentence['sentimentValue']))

    return np.mean(sentence_values)


In [6]:
def get_news(ticker, date):
    """Fetch the Finnhub company news for `ticker` on a single day.

    Parameters
    ----------
    ticker : str
        Stock symbol, e.g. 'AAPL'.
    date : str
        Day to query; it is sent as both the `from` and the `to` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns
        ['id', 'date', 'headline', 'source', 'summary', 'url'].
        'date' is the UTC calendar day of the article's 'datetime' epoch
        timestamp, converted with `pd.to_datetime`.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    ValueError
        If the response body is not a JSON list of articles.
    """
    # Local import: the notebook's import cell only brings in datetime/timedelta.
    from datetime import timezone

    # `params` lets requests URL-encode the values; the timeout keeps a stalled
    # connection from hanging the whole collection loop.
    response = requests.get(
        'https://finnhub.io/api/v1/company-news',
        params={'symbol': ticker, 'from': date, 'to': date, 'token': FINNHUB_API_KEY},
        timeout=30,
    )
    response.raise_for_status()

    articles = response.json()
    if not isinstance(articles, list):
        # Anything other than a list of article objects cannot be tabulated below.
        raise ValueError(
            f'Unexpected company-news response for {ticker} on {date}: {articles!r}'
        )

    rows = []
    for article in articles:
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp.
        published = datetime.fromtimestamp(article['datetime'], tz=timezone.utc)
        rows.append([
            article['id'],
            published.strftime('%Y-%m-%d'),
            article['headline'],
            article['source'],
            article['summary'],
            article['url'],
        ])

    df = pd.DataFrame(rows, columns=['id', 'date', 'headline', 'source', 'summary', 'url'])
    df['date'] = pd.to_datetime(df['date'])
    return df

# Smoke test: fetch one day of AAPL company news and keep the frame in `sd`.
sd = get_news('AAPL','2021-06-25')

In [7]:
def get_news_sentiment(ticker):
    """Fetch Finnhub's news-sentiment statistics for `ticker` as a one-row frame.

    The JSON object returned by the endpoint is flattened one level: every
    value that is itself an object contributes its inner keys as columns,
    every other value becomes a column under its own key. A 'Ticker' column
    holding `ticker` is inserted at position 0.

    NOTE(review): the statistics appear to describe only a recent window
    (roughly the last week or two) - confirm against the Finnhub docs.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    response = requests.get(
        'https://finnhub.io/api/v1/news-sentiment',
        params={'symbol': ticker, 'token': FINNHUB_API_KEY},
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    # Explicit type check instead of a bare `except:` used as control flow,
    # so genuine errors are no longer silently absorbed.
    flattened = {}
    for key, value in data.items():
        if isinstance(value, dict):
            for inner_key, inner_value in value.items():
                flattened[inner_key] = [inner_value]
        else:
            flattened[key] = [value]

    df = pd.DataFrame.from_dict(flattened)
    df.insert(0, 'Ticker', ticker)

    return df

# Example call: display the flattened sentiment statistics for AAPL.
get_news_sentiment('AAPL')

Unnamed: 0,Ticker,articlesInLastWeek,buzz,weeklyAverage,companyNewsScore,sectorAverageBullishPercent,sectorAverageNewsScore,bearishPercent,bullishPercent,symbol
0,AAPL,196,0.6582,297.75,0.654,0.6596,0.5355,0.2308,0.7692,AAPL


In [8]:
# Date helper: advance a 'YYYY-MM-DD' string by one calendar day
def increment_one_day(str_date):
    """Return the day after `str_date`, both formatted as 'YYYY-MM-DD'."""
    date_format = '%Y-%m-%d'
    next_day = datetime.strptime(str_date, date_format) + timedelta(days=1)
    return next_day.strftime(date_format)

In [9]:
%%time


# Build the news corpus: for every ticker, pull company news one day at a time,
# collapse repeated headlines into a per-day count, and stack all tickers
# into `all_news`.

tickers = ['TSLA', 'GE', 'NVDA', 'AMD']

# Collection window as 'YYYY-MM-DD' strings; the stop date itself is not fetched.
_start_date = '2021-01-01'
_fdate = '2021-04-30'

# ISO dates compare chronologically as plain strings.
if _start_date >= _fdate:
    raise ValueError(f'Start date {_start_date} must be before stop date {_fdate}')

per_ticker_news = []

for ticker in tickers:
    _date = _start_date
    daily_frames = []

    # One request per day; the pause keeps us under the API limit of 60 calls
    # per minute. `<` (rather than `!=`) guarantees the loop terminates.
    while _date < _fdate:
        daily_frames.append(get_news(ticker, _date))
        time.sleep(1.1)
        _date = increment_one_day(_date)

    # Concatenate once (DataFrame.append was removed in pandas 2.0 and growing
    # a frame inside the loop is quadratic), then drop fully identical rows.
    df = pd.concat(daily_frames).drop_duplicates()

    # There are some repeat headlines on the same day, so get a daily count per
    # headline. Maybe duplicates of the same headline indicate more important news?
    dh = (df[['date', 'headline', 'id']]
          .groupby(['date', 'headline'], as_index=False)
          .count()
          .rename(columns={'id': 'headline_count'}))

    # Keep one row per (date, headline) and attach its count
    no_dups = (df.drop_duplicates(subset=['date', 'headline'])
               .merge(dh, how='left', on=['date', 'headline']))

    # Tag the rows with their ticker
    no_dups.insert(0, 'ticker', ticker)

    per_ticker_news.append(no_dups)

# Stack all tickers and renumber the rows 0..n-1
all_news = pd.concat(per_ticker_news).reset_index(drop=True)

Wall time: 9min 59s


In [14]:
# NOTE(review): the unique-sources array computed on the next line is not
# displayed - a cell only renders its last expression, which is `all_news`.
pd.unique(all_news['source'])
all_news

Unnamed: 0,ticker,id,date,headline,source,summary,url,headline_count
0,TSLA,62112651,2021-01-02,Benzinga's Final Bulls And Bears Of The Year: ...,benzinga,Benzinga has examined the prospects for many i...,https://finnhub.io/api/news?id=06eb69054d00642...,1
1,TSLA,62112644,2021-01-02,Tesla Deliveries Fall Just Short Of Musk's 500...,benzinga,"Tesla Inc delivered 499,550 vehicles in 2020, ...",https://finnhub.io/api/news?id=775391328af7bad...,1
2,TSLA,62102066,2021-01-01,Tesla Hiring Battery Engineers In North Caroli...,https://cleantechnica.com,"Job openings are not product roadmaps, but the...",https://finnhub.io/api/news?id=963902dc23d763b...,1
3,TSLA,62097972,2021-01-01,Tesla Cuts Prices In China. That Could Be Bad ...,MarketWatch,Tesla recently posted pricing for its Chinese-...,https://finnhub.io/api/news?id=e1bb5d13caf61eb...,1
4,TSLA,62098219,2021-01-01,Lidar Is Coming To XPeng as Tesla Holds Out on...,MarketWatch,Chinese electrical vehicle maker XPeng is uppi...,https://finnhub.io/api/news?id=154fdfc38e08086...,1
...,...,...,...,...,...,...,...,...
5218,AMD,67111854,2021-04-29,"The Zacks Analyst Blog Highlights: Apple, Amer...",Yahoo,"The Zacks Analyst Blog Highlights: Apple, Amer...",https://finnhub.io/api/news?id=9289f75f877ec9e...,1
5219,AMD,67114927,2021-04-29,Semiconductor Watchlist: AMD Is Dominating the...,Yahoo,"See buy or sell recommendations for Nvidia, Qu...",https://finnhub.io/api/news?id=1ab132d0d1515a3...,1
5220,AMD,67111433,2021-04-29,4 Mega-Cap Tech Stocks to Buy Now That Blew Ou...,247WallSt,"This week, four technology giants reported inc...",https://finnhub.io/api/news?id=1c40583ae96d395...,1
5221,AMD,67109439,2021-04-29,Advanced Micro Devices Stock Shows Every Sign ...,GuruFocus,GuruFocus Article or News written by GF Value ...,https://finnhub.io/api/news?id=dea5d43f9e15673...,1


In [26]:
# Collapse the news to one row per (ticker, date), gathering that day's
# headlines into a single list.
all_news_1 = (
    all_news
    .groupby(['ticker', 'date'])['headline']
    .apply(list)
    .to_frame()
)

In [27]:
all_news_1

Unnamed: 0_level_0,Unnamed: 1_level_0,source
ticker,date,Unnamed: 2_level_1
AMD,2021-01-03,[Nasdaq]
AMD,2021-01-04,"[MarketWatch, Nasdaq, MarketWatch, Benzinga, N..."
AMD,2021-01-05,"[MarketWatch, Nasdaq, Nasdaq]"
AMD,2021-01-06,"[Nasdaq, MarketWatch, Nasdaq, benzinga]"
AMD,2021-01-07,"[MarketWatch, MarketWatch]"
...,...,...
TSLA,2021-04-26,"[Yahoo, Yahoo, Yahoo, Reuters, Nasdaq, Yahoo, ..."
TSLA,2021-04-27,"[Yahoo, Reuters, Reuters, SeekingAlpha, Nasdaq..."
TSLA,2021-04-28,"[Yahoo, Yahoo, Yahoo, Benzinga, Yahoo, Yahoo, ..."
TSLA,2021-04-29,"[Nasdaq, Yahoo, CNBC, Reuters, Nasdaq, Yahoo, ..."


<hr>

# Applying sentiment scores to news article headlines

In [28]:
# Score every (ticker, date) row: run each of its headlines through
# get_sentiment and store the mean, rounded to two decimals, in `finals`
# (same order as the rows of all_news_1).
finals = []
# Iterate the 'headline' column explicitly so the cell still works after
# other columns (e.g. 'news_sentiment_score') have been added to all_news_1.
for headlines in all_news_1['headline']:
    score = []
    for headline in headlines:
        try:
            score.append(get_sentiment(headline))
        except Exception:
            # Best effort: a headline that cannot be scored gets the sentinel -1.
            # `except Exception` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working during this long loop.
            # NOTE(review): the -1 is averaged together with real scores and
            # therefore lowers the daily mean - confirm this is intended.
            score.append(-1)
    finals.append(round(np.mean(score), 2))
        


In [29]:
# Attach the per-row mean sentiment as a new column. `finals` is assigned
# positionally, so it must hold one value per row of all_news_1, in row order.
all_news_1['news_sentiment_score']=finals

In [31]:
# Turn the (ticker, date) index back into columns and record how many
# headlines each row holds
fdb = all_news_1.reset_index()
fdb['amount_of_articles'] = fdb['headline'].map(len)

# Render the dates as 'YYYY-MM-DD' strings
fdb['date'] = fdb['date'].map(lambda day: day.strftime('%Y-%m-%d'))
fdb

Unnamed: 0,ticker,date,source,news_sentiment_score,amount_of_articles
0,AMD,2021-01-03,[Nasdaq],2.00,1
1,AMD,2021-01-04,"[MarketWatch, Nasdaq, MarketWatch, Benzinga, N...",2.00,5
2,AMD,2021-01-05,"[MarketWatch, Nasdaq, Nasdaq]",2.00,3
3,AMD,2021-01-06,"[Nasdaq, MarketWatch, Nasdaq, benzinga]",2.00,4
4,AMD,2021-01-07,"[MarketWatch, MarketWatch]",2.00,2
...,...,...,...,...,...
427,TSLA,2021-04-26,"[Yahoo, Yahoo, Yahoo, Reuters, Nasdaq, Yahoo, ...",2.00,99
428,TSLA,2021-04-27,"[Yahoo, Reuters, Reuters, SeekingAlpha, Nasdaq...",1.99,142
429,TSLA,2021-04-28,"[Yahoo, Yahoo, Yahoo, Benzinga, Yahoo, Yahoo, ...",1.99,92
430,TSLA,2021-04-29,"[Nasdaq, Yahoo, CNBC, Reuters, Nasdaq, Yahoo, ...",2.00,57


In [32]:
# Read the price CSV of every ticker (./data/<ticker>.csv must exist), tag each
# frame with its symbol, and stack them into one frame.
ticker_frames = []

for i in tickers:
    ticker_df = pd.read_csv(f'./data/{i}.csv')
    ticker_df.insert(0, 'Ticker', i)
    ticker_frames.append(ticker_df)

# Concatenate once instead of re-copying the accumulated frame on every
# iteration, then drop the columns that are not used downstream.
fexp = (pd.concat(ticker_frames, ignore_index=True)
        .drop(columns=['High', 'Low', 'Dividends', 'Stock Splits']))
fexp.head()

FileNotFoundError: [Errno 2] File ./data/TSLA.csv does not exist: './data/TSLA.csv'

In [None]:
# Combine the stocks data dataframe with the sentiment_score dataframe shown above.
# NOTE(review): this rebinds `fdb`, so the cell cannot be re-run without first
# re-running the cell that builds the sentiment frame.
fdb = (pd.merge(fexp, fdb, how='left',
                left_on=['Ticker', 'Date'], right_on=['ticker', 'date'])
       .drop(columns=['ticker', 'date']))

# Direction labels: down is 0, up and equal is 1.
# Strict `>` so that Open == Close is labelled 1, as stated above.
fdb['Same_day_move'] = np.where(fdb['Open'] > fdb['Close'], 0, 1)

# Compare each open with the previous row's close of the SAME ticker; a plain
# shift(1) would leak the last close of one ticker into the first row of the
# next. The first row of every ticker has no previous close and gets 0.
# Assumes rows are in chronological order within each ticker - TODO confirm.
previous_close = fdb.groupby('Ticker')['Close'].shift(1)
fdb['from_previous_day'] = np.where(fdb['Open'] >= previous_close, 1, 0)
fdb.head()

In [None]:
# save the final dataframe as a csv
# fdb.to_csv('./data/fdb.csv',index=False)

In [None]:
# Read the final dataframe back from disk.
# NOTE(review): the only visible statement that writes ./data/fdb.csv is the
# commented-out to_csv call in the previous cell, so the file must already exist.
fdb=pd.read_csv('./data/fdb.csv')

In [None]:
# Encode the Ticker as integer labels so it can take part in the correlation heatmap
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Columns carried over unchanged from the final dataframe
feature_columns = ['Date', 'Open', 'Close', 'Volume',
                   'news_sentiment_score', 'amount_of_articles', 'Same_day_move',
                   'from_previous_day']

df1 = fdb[feature_columns].copy()
# Encoded ticker goes first, matching the column order used for the heatmap
df1.insert(0, 'Ticker', le.fit_transform(fdb['Ticker']))
df1.head()

In [None]:
# Correlation heatmap of the final dataframe (lower triangle only)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

sns.set(style="white")

# Correlate numeric columns only: the string 'Date' column makes a plain
# DataFrame.corr() raise on pandas >= 2.0.
corr = df1.select_dtypes(include=['number', 'bool']).corr()

# Hide the upper triangle and the diagonal. Uses the builtin `bool` because
# the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))

# Draw explicitly on the axes created above
sns.heatmap(corr,
            mask=mask,
            vmax=.3, center=0,
            square=True,
            annot=True,
            ax=ax)

In [38]:
# Load the dataset stored in stocks.h5.
# NOTE(review): nothing in the visible cells writes this file - confirm where it comes from.
reread = pd.read_hdf('stocks.h5')

In [39]:
reread

Unnamed: 0,ticker,date,headline,news_sentiment_score,source,url,amount_of_articles,open,close,volume,social_sentiments,mentions
0,FB,2016-08-09,[Onetime Home of Warner Bros.’ Harry Warner As...,3.00,[DowJones],[https://finnhub.io/api/news?id=7dbe5db9757dda...,1,125.340,125.06,19620967,-1.000000,-1
1,FB,2021-03-15,[Rupert Murdoch's News Corp strikes deal as Fa...,2.00,"[The Guardian, https://nypost.com, https://www...",[https://finnhub.io/api/news?id=61c0d589cb8bf9...,70,269.080,273.75,16856746,0.066288,45
2,FB,2021-03-16,[NetApp reformula a organização de vendas glob...,1.91,"[businesswire, benzinga, businesswire, busines...",[https://finnhub.io/api/news?id=6479351ac59fa2...,89,276.085,279.28,22437665,-0.339269,85
3,FB,2021-03-17,[Facebook Promises More Support For Human Righ...,1.89,"[https://www.forbes.com, businesswire, busines...",[https://finnhub.io/api/news?id=ad0559e9f8ae60...,58,275.705,284.01,21315044,-0.589213,135
4,FB,2021-03-18,[Take A Sneak Peek At The Weirdly-Shaped New P...,1.85,"[benzinga, benzinga, benzinga, businesswire, b...",[https://finnhub.io/api/news?id=e851ef47ee28e6...,77,279.870,278.62,18754853,-0.361794,534
...,...,...,...,...,...,...,...,...,...,...,...,...
521,UBER,2021-07-29,[Replay: Joby Aviation Executive Chairman and ...,1.90,"[Yahoo, Yahoo, Yahoo, Yahoo, Yahoo, Yahoo, Uni...",[https://finnhub.io/api/news?id=62bfd8bf18171d...,30,44.120,44.69,51033697,-1.000000,-1
522,UBER,2021-07-30,[Uber looks beyond ride-hailing as rebound and...,2.06,"[Yahoo, Yahoo, DowJones, CNBC, Yahoo, Yahoo, Y...",[https://finnhub.io/api/news?id=e725733cc1743a...,18,44.380,43.46,22194938,0.195272,40
523,UBER,2021-07-31,[Why No One Should Invest in Food Delivery Sto...,2.00,"[Yahoo, MarketWatch, SeekingAlpha, SeekingAlpha]",[https://finnhub.io/api/news?id=1185c1d8cba9af...,4,-1.000,-1.00,-1,-1.000000,-1
524,UBER,2021-08-01,[Videogames entered the mainstream in the pand...,2.08,"[Yahoo, Yahoo, MarketWatch, MarketWatch, Marke...",[https://finnhub.io/api/news?id=90034f7a2d7274...,13,44.100,43.49,26579889,0.148094,33
