In [1]:
import os
import requests
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta
from pathlib import Path
import json
import time
import bs4 as bs
import pandas as pd
import numpy as np
import csv

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def delta_date(start_date,end_date):
    """Function that returns the number of days between 2 dates """

    return abs((dt.strptime(start_date, "%Y-%m-%d") - dt.strptime(end_date, "%Y-%m-%d")).days)

class FinnHub():
    """Class to make API calls to FinnHub"""

    def __init__(self,start_date,end_date,tickers):

        #Initialize attributes values here
        self.max_call = 60 #maximum api calls per minute for the finhub API
        self.time_sleep = 60 #default is 60 seconds as the maximum number of API calls is per minute
        self.nb_request = 0
        self.finhub_key = 'cnn9gp9r01qq36n60c3gcnn9gp9r01qq36n60c40'
        self.news_header = ['category', 'datetime','headline','id','image','related','source','summary','url']
        self.start_date = start_date
        self.end_date = end_date
        self.tickers = tickers
        self.js_data = []
        self.req_new() #call the methods to access historical financial headlines

    def iterate_day(func):
        """ Decorator that makes the API call on FinHub each days between the `self.start_date`
        and `self.end_date` """

        def wrapper_(self):
            delta_date_ = delta_date(self.start_date,self.end_date)
            date_ = self.start_date
            date_obj =  dt.strptime(self.start_date, "%Y-%m-%d")

            for item in range(delta_date_ + 1):
                self.nb_request +=1
                func(self,date_)
                date_obj = date_obj + relativedelta(days=1)
                date_  = date_obj.strftime("%Y-%m-%d")
                if self.nb_request == (self.max_call-1):
                    time.sleep(self.time_sleep)
                    self.nb_request=0
        return wrapper_
    
    @iterate_day
    def req_new(self,date_):
        """ Method that makes news request(s) to the Finnhub API"""

        request_ = requests.get('https://finnhub.io/api/v1/company-news?symbol=' + self.tickers + '&from=' +
                                date_ + '&to=' + date_ + '&token=' + self.finhub_key)
        self.js_data += request_.json()
    def get_js_data(self):
        """ Method that returns news data """
        return self.js_data
    
## Create finhub object to get the news
finhub = FinnHub(start_date="2024-03-07", end_date="2024-03-27", tickers='TSLA') ##start and end date should be within 1 year 
data = finhub.get_js_data()

In [4]:
df = pd.DataFrame(data)
## Converting datetime object
df['datetime']=df['datetime'].apply(lambda x: dt.fromtimestamp(x).strftime('%Y-%m-%d')) 
df.head()

Unnamed: 0,category,datetime,headline,id,image,related,source,summary,url
0,company,2024-03-07,Permanent High Plateau,126392077,https://static.seekingalpha.com/cdn/s3/uploads...,TSLA,SeekingAlpha,Tech stocks surge reminiscent of the 1929 cras...,https://finnhub.io/api/news?id=9d8ef9f8a10392e...
1,company,2024-03-07,Tesla drivers lose US class action bid in batt...,126390669,https://media.zenfs.com/en/reuters-finance.com...,TSLA,Yahoo,Tesla owners who accused it of falsely adverti...,https://finnhub.io/api/news?id=f1df7e4324b31ec...
2,company,2024-03-07,Tesla (TSLA) Laps the Stock Market: Here's Why,126390671,https://media.zenfs.com/en/zacks.com/d2f7ed3d1...,TSLA,Yahoo,"In the latest trading session, Tesla (TSLA) cl...",https://finnhub.io/api/news?id=2878e3b2dd14935...
3,company,2024-03-07,"These Stocks Moved the Most Today: NYCB, Novo ...",126390672,https://s.yimg.com/ny/api/res/1.2/nmFkBThAJkeM...,TSLA,Yahoo,New York Community Bancorp jumps after getting...,https://finnhub.io/api/news?id=3a5ed39a81f3283...
4,company,2024-03-07,"Rivian Pausing New Georgia Factory, Offers New...",126390674,https://media.zenfs.com/en/bloomberg_markets_8...,TSLA,Yahoo,(Bloomberg) -- Rivian Automotive Inc. is halti...,https://finnhub.io/api/news?id=9507b7b89f644f8...


In [5]:
import nltk
# nltk.download('vader_lexicon')
## NLTK VADER for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## New words and values
new_words = {
    'crushes': 10,
    'beats': 5,
    'misses': -5,
    'trouble': -10,
    'falls': -100,
}
## Instantiate the sentiment intensity analyzer with the existing lexicon
vader = SentimentIntensityAnalyzer()
## Update the lexicon
vader.lexicon.update(new_words)

In [6]:
pd.set_option('display.max_colwidth', None) ## show columns in whole

In [7]:
## Iterate through the headlines and get the polarity scores
scores_vader = (vader.polarity_scores(headline) for headline in df['headline'])
## Convert the list of dicts into a DataFrame
scores_vader_df = pd.DataFrame(scores_vader)
## Join the DataFrames
scored_news_vader = df.join(scores_vader_df)
## Choose the relevant columns
scored_news_vader = scored_news_vader[['datetime','headline','neg','neu','pos','compound']]
scored_news_vader.head()

Unnamed: 0,datetime,headline,neg,neu,pos,compound
0,2024-03-07,Permanent High Plateau,0.0,1.0,0.0,0.0
1,2024-03-07,Tesla drivers lose US class action bid in battery range cases,0.213,0.787,0.0,-0.4019
2,2024-03-07,Tesla (TSLA) Laps the Stock Market: Here's Why,0.0,1.0,0.0,0.0
3,2024-03-07,"These Stocks Moved the Most Today: NYCB, Novo Nordisk, Ciena, Kroger, Tesla, Rivian, and More",0.0,1.0,0.0,0.0
4,2024-03-07,"Rivian Pausing New Georgia Factory, Offers New R2 and R3",0.0,1.0,0.0,0.0


In [8]:
## Write to file
with open('datasets/scraped_data_vader.csv', 'a', newline='',encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(scored_news_vader.values.tolist())

In [9]:
## Calculate the mean of days when neutral score is not 1
scored_news_vader[['datetime','headline','neg','neu','pos','compound']][scored_news_vader['neu']!=1].sort_values(['datetime']).groupby('datetime')['compound'].agg(np.mean)

datetime
2024-03-06   -0.095307
2024-03-07   -0.057204
2024-03-08    0.058100
2024-03-09    0.110900
2024-03-10    0.221565
2024-03-11    0.118012
2024-03-12   -0.030291
2024-03-13   -0.021310
2024-03-14    0.151930
2024-03-15    0.070436
2024-03-16    0.170273
2024-03-17    0.156547
2024-03-18    0.201638
2024-03-19    0.062804
2024-03-20    0.124200
2024-03-21    0.090548
2024-03-22    0.024546
2024-03-23    0.044845
2024-03-24    0.187025
2024-03-25    0.000760
2024-03-26    0.149567
2024-03-27    0.121870
Name: compound, dtype: float64