In [2]:
import requests
import pandas as pd
import os
from urllib.parse import quote_plus
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import ast
import aiohttp
import asyncio
from datetime import datetime, timedelta
from tqdm.asyncio import tqdm
import nest_asyncio
nest_asyncio.apply()

In [3]:
API_KEY = '7hzjsuigdkvndhjwhoqm8e2gbi0vnsnsdve3raaf'

### Function to generate time range
- Fine-grained search from March 15, 2019 - March 15, 2024

In [7]:
def generate_date_range(start_date, end_date):
    date_format = "%m%d%Y"
    return [(date.strftime(date_format), date.strftime(date_format)) 
            for date in (start_date + timedelta(days=x) 
            for x in range((end_date - start_date).days + 1))]

In [8]:
start_date = datetime(2019, 3, 15)
end_date = datetime(2024, 3, 15)

In [9]:
date_ranges = generate_date_range(start_date, end_date)
#date_ranges

### Use async to fetch news data with progress bar

In [10]:
async def fetch_news(session, ticker, date_pair, page, API_KEY):
    start_date, end_date = date_pair
    url = f'https://stocknewsapi.com/api/v1?tickers={ticker}&type=article&date={start_date}-{end_date}&items=50&page={page}&extra-fields=id&token={API_KEY}'
    async with session.get(url) as response:
        if response.status == 200:
            return await response.json()
        else:
            return {'data': []}

In [11]:
async def fetch_news_for_ticker(ticker, date_ranges, API_KEY):
    news_data = {}  
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_news(session, ticker, date_pair, page, API_KEY)
                 for date_pair in date_ranges
                 for page in range(1, 3)]
        for task in tqdm.as_completed(tasks, desc=f"Fetching {ticker}"):
            news_result = await task
            if news_result['data']: 
                for article in news_result['data']:
                    news_data[article['news_id']] = article
    return news_data

In [12]:
async def main(tickers, date_ranges, API_KEY):
    all_ticker_news = {}
    for ticker in tickers:
        print(f"Starting fetch for {ticker}")
        ticker_news = await fetch_news_for_ticker(ticker, date_ranges, API_KEY)
        all_ticker_news[ticker] = ticker_news
        print(f"Completed fetching for {ticker}, Total news items: {len(ticker_news)}")
    return all_ticker_news


### Sentiment score calculation
- Done at the end so we can easily modify 

In [13]:
#Sentiment Score 

def calculate_sentiment_for_news(title, text):
    analyzer = SentimentIntensityAnalyzer()
    title_text = title + " " + text
    return analyzer.polarity_scores(title_text)

def add_sentiment_to_news(all_ticker_news):
    analyzer = SentimentIntensityAnalyzer()  
    for ticker, articles in all_ticker_news.items():
        for article_id, article in articles.items():
            if 'title' in article and 'text' in article:
                title_text = article['title'] + " " + article['text']
                sentiment_scores = analyzer.polarity_scores(title_text)
                article.update({
                    'sentiment_neg': sentiment_scores['neg'],
                    'sentiment_neu': sentiment_scores['neu'],
                    'sentiment_pos': sentiment_scores['pos'],
                    'sentiment_tot': sentiment_scores['compound']
                })
    return all_ticker_news

### Main function to run

In [14]:
if __name__ == "__main__":
    API_KEY = '7hzjsuigdkvndhjwhoqm8e2gbi0vnsnsdve3raaf'
    healthcare_stocks = ["LLY", "UNH", "JNJ", "MRK", "ABBV"]
    technology_stocks = ["MSFT","AAPL","NVDA","GOOGL","AMZN"]
    finance_stocks = ["JPM","V","MA","BAC","WFC"]
    tickers = healthcare_stocks + technology_stocks + finance_stocks
    start_date = datetime(2019, 3, 15)
    end_date = datetime(2024, 3, 15)
    date_ranges = generate_date_range(start_date, end_date)
    news_data = asyncio.run(main(tickers, date_ranges, API_KEY))
    news_data_with_sentiment = add_sentiment_to_news(news_data)
    all_ticker_news = news_data_with_sentiment


Starting fetch for LLY


Fetching LLY: 100%|██████████| 3656/3656 [00:50<00:00, 73.11it/s] 


Completed fetching for LLY, Total news items: 2750
Starting fetch for UNH


Fetching UNH: 100%|██████████| 3656/3656 [00:43<00:00, 84.58it/s] 


Completed fetching for UNH, Total news items: 1979
Starting fetch for JNJ


Fetching JNJ: 100%|██████████| 3656/3656 [00:46<00:00, 79.43it/s] 


Completed fetching for JNJ, Total news items: 4409
Starting fetch for MRK


Fetching MRK: 100%|██████████| 3656/3656 [00:44<00:00, 82.31it/s]


Completed fetching for MRK, Total news items: 2994
Starting fetch for ABBV


Fetching ABBV: 100%|██████████| 3656/3656 [00:43<00:00, 83.84it/s] 


Completed fetching for ABBV, Total news items: 3067
Starting fetch for MSFT


Fetching MSFT: 100%|██████████| 3656/3656 [00:44<00:00, 81.26it/s] 


Completed fetching for MSFT, Total news items: 12551
Starting fetch for AAPL


Fetching AAPL: 100%|██████████| 3656/3656 [00:44<00:00, 82.96it/s] 


Completed fetching for AAPL, Total news items: 19979
Starting fetch for NVDA


Fetching NVDA: 100%|██████████| 3656/3656 [00:43<00:00, 84.38it/s] 


Completed fetching for NVDA, Total news items: 10051
Starting fetch for GOOGL


Fetching GOOGL: 100%|██████████| 3656/3656 [00:44<00:00, 82.87it/s] 


Completed fetching for GOOGL, Total news items: 15058
Starting fetch for AMZN


Fetching AMZN: 100%|██████████| 3656/3656 [00:45<00:00, 80.01it/s] 


Completed fetching for AMZN, Total news items: 21658
Starting fetch for JPM


Fetching JPM: 100%|██████████| 3656/3656 [00:45<00:00, 79.71it/s]


Completed fetching for JPM, Total news items: 5124
Starting fetch for V


Fetching V: 100%|██████████| 3656/3656 [00:45<00:00, 80.53it/s]


Completed fetching for V, Total news items: 3212
Starting fetch for MA


Fetching MA: 100%|██████████| 3656/3656 [00:47<00:00, 77.23it/s]


Completed fetching for MA, Total news items: 2738
Starting fetch for BAC


Fetching BAC: 100%|██████████| 3656/3656 [00:47<00:00, 77.74it/s]


Completed fetching for BAC, Total news items: 4134
Starting fetch for WFC


Fetching WFC: 100%|██████████| 3656/3656 [00:46<00:00, 78.20it/s]


Completed fetching for WFC, Total news items: 3397


'    \nif __name__ == "__main__":\n    API_KEY = \'7hzjsuigdkvndhjwhoqm8e2gbi0vnsnsdve3raaf\'\n    #healthcare_stocks = ["LLY", "UNH", "JNJ", "MRK", "ABBV"]\n    #technology_stocks = ["MSFT","AAPL","NVDA","GOOGL","AMZN"]\n    #finance_stocks = ["JPM","V","MA","BAC","WFC"]\n    #tickers = healthcare_stocks + technology_stocks + finance_stocks\n    tickers = [\'AMZN\', \'MSFT\']\n    #start_date = datetime(2019, 3, 15)\n    #end_date = datetime(2024, 3, 15)\n    start_date = datetime(2019, 3, 15)\n    end_date = datetime(2024, 3, 16)\n    date_ranges = generate_date_range(start_date, end_date)\n    asyncio.run(main(tickers, date_ranges, API_KEY))\n'

### Summary of articles fetched

In [15]:
for ticker, news_articles in all_ticker_news.items():
    print(f"{ticker}: {len(news_articles)} news articles")

LLY: 2750 news articles
UNH: 1979 news articles
JNJ: 4409 news articles
MRK: 2994 news articles
ABBV: 3067 news articles
MSFT: 12551 news articles
AAPL: 19979 news articles
NVDA: 10051 news articles
GOOGL: 15058 news articles
AMZN: 21658 news articles
JPM: 5124 news articles
V: 3212 news articles
MA: 2738 news articles
BAC: 4134 news articles
WFC: 3397 news articles


In [35]:
#all_ticker_news

### Check contents of resulting dictionary using async
- Just practicing here how to access the dictionary structure
- Displaying the first news article as an example

In [34]:
for ticker, news_articles in all_ticker_news.items():
    print(f"\nNews for {ticker}:")
    for article_id in list(news_articles)[:1]:
        article = news_articles[article_id]
        print(f"Title: {article['title']}\nText: {article['text']}\nDate: {article['date']}")


News for LLY:
Title: House Committee Questions Eli Lilly, Sanofi, Novo Nordisk On 'Unacceptably High' Insulin Prices
Text: House Energy & Commerce Committee leaders sent three letters to execs at Eli Lilly And Co (NYSE: LLY), Sanofi SA (NASDAQ: SNY), and Novo Nordisk A/S (NYSE: NVO), raising concerns that despite their supposed concern over the past two years with the price of insulin, the price still remains "unacceptably high." Committee chair Frank Pallone (D-NJ) and subcommittee chair Diana DeGette (D-CO) note in all three letters that the price of insulin in the U.S. is more than 10 times that of 33 other countries.
Date: Fri, 20 Aug 2021 14:34:33 -0400

News for UNH:
Title: The 3 Best Healthcare Stocks to Buy for June 2023
Text: Healthcare stocks have been trailing the market this year. Vanguard's Health Care ETF (NYSEARCA: VHT ) is down 5% on the year, compared with a 10% year-to-date gain in the benchmark S&P 500 index.
Date: Mon, 29 May 2023 15:13:25 -0400

News for JNJ:
Titl

### Filtering: 
- Some financial news are associate with multiple tickers at a time. E.g tickers = ['GOOGL', 'NVDA', 'AAPL']
    - e.g these type of news would be like "Top 3 stocks in tech this month", however we don't need these type of news, we want specific news to our chosen ticker
- We want all of the financial news to be only associated for a specific ticker. E.g tickers = ['AAPL']
- The code below loops over a specified target_tickers 
    - an extra requirement is done for 'GOOGL' stock because articles associated to this specific ticker comes with `['GOOG', 'GOOGL']` and I have to rename this just 'GOOGL'

In [17]:
def filter_news_by_tickers(news_dict, target_tickers):
    filtered_dict = {}
    for target_ticker in target_tickers:
        ticker_key = tuple(target_ticker) if isinstance(target_ticker, list) else target_ticker
        filtered_dict[ticker_key] = {}

        for ticker, news_articles in news_dict.items():
            if isinstance(target_ticker, list):
                filtered_items = {
                    article_id: item for article_id, item in news_articles.items() if set(item.get('tickers', [])) == set(target_ticker)
                }
            else:
                filtered_items = {
                    article_id: item for article_id, item in news_articles.items() if item.get('tickers') == [target_ticker]
                }
            filtered_dict[ticker_key].update(filtered_items)

    return filtered_dict

In [39]:
target_tickers = [
    'MSFT', 'AAPL', 'NVDA', ['GOOG', 'GOOGL'], 'AMZN', 'JPM', 'V', 'MA', 'BAC', 'WFC', 
    'LLY', 'UNH', 'JNJ', 'MRK', 'ABBV'
]

filtered_news = filter_news_by_tickers(all_ticker_news, target_tickers)

for ticker_key, articles in filtered_news.items():
    print(f"News for {ticker_key}: {len(articles)} articles")


News for MSFT: 5934 articles
News for AAPL: 12991 articles
News for NVDA: 5179 articles
News for ('GOOG', 'GOOGL'): 8316 articles
News for AMZN: 14029 articles
News for JPM: 2508 articles
News for V: 1559 articles
News for MA: 1512 articles
News for BAC: 1757 articles
News for WFC: 1810 articles
News for LLY: 1693 articles
News for UNH: 1049 articles
News for JNJ: 2213 articles
News for MRK: 1702 articles
News for ABBV: 1452 articles


In [36]:
#filtered_news

### Preparing data for dataframe structure
- Using the filtered news in the previous block, we loop over the dictionary to save the information we want
- Rename `['GOOG', 'GOOGL']` to 'GOOGL'

In [20]:
def create_dataframe(news_items):
    data = []
    for ticker, news_articles in news_items.items():
        display_ticker = 'GOOGL' if ticker == ('GOOG', 'GOOGL') else ticker
        for article_id, news_item in news_articles.items():
            data.append({
                "Date & Time": news_item['date'],
                "Headline": news_item['title'],
                "Text": news_item['text'],
                "Source": news_item['source_name'],
                "News ID": article_id,
                "URL": news_item['news_url'],
                "Ticker": display_ticker, 
                "Negative Sentiment Score": news_item.get('sentiment_neg', 0),
                "Neutral Sentiment Score": news_item.get('sentiment_neu', 0),
                "Positive Sentiment Score": news_item.get('sentiment_pos', 0),
                "Total Sentiment Score (Compound)": news_item.get('sentiment_tot', 0)
            })
    return data


In [21]:
data = create_dataframe(filtered_news)

In [37]:
#data

In [23]:
df = pd.DataFrame(data)
df_experiment = pd.DataFrame(data)

### Checking if there are duplicates with news ids
- we see there are none :)

In [24]:
duplicates = df_experiment[df_experiment.duplicated(subset='News ID', keep=False)]
print(duplicates)

Empty DataFrame
Columns: [Date & Time, Headline, Text, Source, News ID, URL, Ticker, Negative Sentiment Score, Neutral Sentiment Score, Positive Sentiment Score, Total Sentiment Score (Compound)]
Index: []


In [25]:
duplicate_count = df_experiment.duplicated(subset='News ID').sum()
print(duplicate_count)

0


### Make the date and time an index 

In [27]:
df_experiment['Date & Time'] = pd.to_datetime(df_experiment['Date & Time'], errors='coerce')
df_experiment.set_index('Date & Time', inplace=True)
df_experiment.sort_index(inplace=True)

  df_experiment['Date & Time'] = pd.to_datetime(df_experiment['Date & Time'], errors='coerce')


In [38]:
#df_experiment

### Save sorted df to a csv file

In [30]:
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, 'data')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

file_path = os.path.join(data_dir, 'news_data_2019_2024_sorted.csv')
df_experiment.to_csv(file_path, index=True)