In [64]:
# Historical News Headlines Collector for Top 20 US Stocks - MONTHLY PREMIUM VERSION
# Using Polygon.io PREMIUM API to collect 5 years of news data (2020-01-01 to 2025-01-01)
# Premium features: No rate limits, up to 1000 articles per request, news ranking

import requests
import pandas as pd
import time
from datetime import datetime, timedelta
import os
import json
from typing import List, Dict, Any
import logging
import numpy as np
from calendar import monthrange

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Polygon.io API configuration - PREMIUM ACCOUNT
POLYGON_API_KEY = "rXpcpeYvCUXaKrW2knMKeh6XGgsksIrs"  # Premium API key
BASE_URL = "https://api.polygon.io"

# Top 20 US stocks by market cap (as of 2024)
TOP_20_STOCKS = [
    "MSFT", "NVDA", "AAPL", "AMZN", "GOOG", "META", "AVGO", "BRK.B", 
    "TSLA", "JPM", "WMT", "LLY", "V", "ORCL", "NFLX", "MA", 
    "XOM", "COST", "PG", "JNJ"
]

# Date range - 5 years of data
START_DATE = "2020-06-20"
END_DATE = "2025-06-20"

print("PREMIUM Monthly Historical News Collector Initialized")
print(f"Date Range: {START_DATE} to {END_DATE} (5 years)")
print(f"Stocks: {len(TOP_20_STOCKS)} companies")
print(f"Target: All headlines per month per stock (up to 1000 per request)")
print(f"Premium API: No rate limits, unlimited requests")

PREMIUM Monthly Historical News Collector Initialized
Date Range: 2020-06-20 to 2025-06-20 (5 years)
Stocks: 20 companies
Target: All headlines per month per stock (up to 1000 per request)
Premium API: No rate limits, unlimited requests


In [65]:
def get_news_for_stock_month(ticker: str, month_start_date: str, month_end_date: str, limit: int = 1000) -> List[Dict[str, Any]]:
    """
    Fetch news headlines for a specific stock over a full month.
    Note: The Polygon API has a max limit of 1000 results per request.
    For a more robust solution on popular stocks, pagination would be required.
    
    Args:
        ticker: Stock symbol (e.g., 'AAPL')
        month_start_date: The first day of the month in YYYY-MM-DD format
        month_end_date: The last day of the month in YYYY-MM-DD format
        limit: Number of headlines to fetch (max 1000 for Polygon API)
    
    Returns:
        List of news articles
    """
    url = f"{BASE_URL}/v2/reference/news"
    
    params = {
        'ticker': ticker,
        'published_utc.gte': month_start_date,
        'published_utc.lte': month_end_date,  # Use lte for the last day of the month
        'limit': limit,  # Increased limit for a full month
        'order': 'desc',
        'sort': 'published_utc',
        'apikey': POLYGON_API_KEY
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        
        if data['status'] == 'OK' and 'results' in data:
            return data['results']
        else:
            logger.warning(f"No news found for {ticker} in month {month_start_date[:7]}")
            return []
            
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching news for {ticker} in month {month_start_date[:7]}: {e}")
        return []
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing JSON for {ticker} in month {month_start_date[:7]}: {e}")
        return []

def process_news_article(article: Dict[str, Any], ticker: str) -> Dict[str, Any]:
    """
    Process and clean a news article
    """
    # Extract the actual publication date from the API response
    published_utc = article.get('published_utc', '')
    
    # Convert to date format (M/D/YYYY) if datetime string is provided
    if published_utc:
        try:
            # Parse the ISO format datetime and convert to M/D/YYYY format
            dt = datetime.fromisoformat(published_utc.replace('Z', '+00:00'))
            date = f"{dt.month}/{dt.day}/{dt.year}"
        except Exception as e:
            # Fallback to original published_utc if parsing fails
            date = published_utc
    else:
        date = ''
    
    return {
        'ticker': ticker,
        'date': date,  # Use actual publication date in M/D/YYYY format
        'title': article.get('title', ''),
        'description': article.get('description', ''),
        'author': article.get('author', ''),
        'published_utc': published_utc,
        'article_url': article.get('article_url', ''),
        'publisher_name': article.get('publisher', {}).get('name', ''),
        'publisher_homepage_url': article.get('publisher', {}).get('homepage_url', ''),
        'tickers': ','.join(article.get('tickers', [])),
        'keywords': ','.join(article.get('keywords', []))
    }

print("News fetching functions (monthly) defined")

News fetching functions (monthly) defined


In [66]:
def generate_monthly_ranges(start_date: str, end_date: str) -> List[Dict[str, str]]:
    """
    Generate a list of start and end dates for each month in the range.
    """
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    
    monthly_ranges = []
    current_date = start
    
    while current_date < end:
        month_start = current_date.replace(day=1)
        days_in_month = monthrange(current_date.year, current_date.month)[1]
        month_end = current_date.replace(day=days_in_month)
        
        monthly_ranges.append({
            'start': month_start.strftime('%Y-%m-%d'),
            'end': month_end.strftime('%Y-%m-%d')
        })
        
        current_date = month_end + timedelta(days=1)
    
    return monthly_ranges

# Removed save_progress function as requested

# Generate the monthly date ranges
monthly_ranges = generate_monthly_ranges(START_DATE, END_DATE)
total_months = len(monthly_ranges)
total_requests = total_months * len(TOP_20_STOCKS)

print(f"Generated {total_months} months to process (5 years)")
print(f"Total API requests to be made: {total_requests:,}")
print(f"Estimated time with premium API (0.2s delay): {total_requests * 0.2 / 60:.1f} minutes")
print("Utility functions (monthly) defined")

Generated 61 months to process (5 years)
Total API requests to be made: 1,220
Estimated time with premium API (0.2s delay): 4.1 minutes
Utility functions (monthly) defined


In [67]:
# Main data collection function - MONTHLY PREMIUM VERSION
def collect_historical_news_monthly(rate_limit_delay: float = 0.2) -> List[Dict[str, Any]]:
    """
    Collect historical news data for all stocks, iterating by month.
    
    Args:
        rate_limit_delay: Delay between API calls to be safe.
    
    Returns:
        List of all collected news articles, sorted by ticker then by date.
    """
    all_news_data = []
    
    monthly_ranges = generate_monthly_ranges(START_DATE, END_DATE)
    total_combinations = len(TOP_20_STOCKS) * len(monthly_ranges)
    current_count = 0
    
    logger.info(f"Starting monthly data collection...")
    logger.info(f"Total combinations (Stock x Month): {total_combinations}")
    
    for month_range in monthly_ranges:
        month_start = month_range['start']
        month_end = month_range['end']
        
        for ticker in TOP_20_STOCKS:
            current_count += 1
            
            logger.info(f"Progress: {current_count}/{total_combinations} - Fetching {ticker} news for month {month_start[:7]}")
            
            # Fetch news for this ticker and month
            news_articles = get_news_for_stock_month(ticker, month_start, month_end, limit=1000)
            
            # Process each article
            for article in news_articles:
                processed_article = process_news_article(article, ticker)  # Use actual published date
                all_news_data.append(processed_article)
            
            logger.info(f"Found {len(news_articles)} articles for {ticker} in month {month_start[:7]}")
            
            # Minimal delay
            if rate_limit_delay > 0:
                time.sleep(rate_limit_delay)
    
    # Sort by ticker, then by date
    all_news_data.sort(key=lambda x: (x['ticker'], x['date']))
    
    # Final save
    df = pd.DataFrame(all_news_data)
    df.to_csv("historical_news_monthly_final.csv", index=False)
    logger.info(f"Data collection completed! Total articles: {len(all_news_data)}")
    logger.info(f"Data sorted by ticker, then by date and saved to: historical_news_monthly_final.csv")
    
    return all_news_data

print("Main monthly collection function defined")
print("Ready to start data collection")

Main monthly collection function defined
Ready to start data collection


## 🚀 Instructions for Monthly Collection

This notebook is configured to fetch data **monthly** instead of daily. This significantly reduces the number of API calls and speeds up the collection process.

1.  **API Key**: Ensure your Premium Polygon.io API key is set in the first cell.
2.  **Collection Method**: The script now iterates through each month from the `START_DATE` to the `END_DATE`.
3.  **API Limit**: For each month, it attempts to fetch up to **1000 articles**. Note that this is a limitation of a single API request. For highly active stocks, a month could have more than 1000 articles, and a more advanced script would need to handle pagination to retrieve all of them.
4.  **Efficiency**: This method is much faster and more efficient for collecting large historical datasets.
5.  **Output File**: The final data will be saved to `historical_news_monthly_final.csv`.

**Run the next cell to start the MONTHLY data collection process!**

In [None]:
# 🚀 START MONTHLY DATA COLLECTION
# Make sure you've set your API key in the first cell before running this!

# Verify API key is set
if 'YOUR_POLYGON_API_KEY' in POLYGON_API_KEY:
    print("ERROR: Please set your Polygon.io API key in the first cell!")
else:
    print("API key is set.")
    
    # Test API connection with a single monthly request
    print("Testing API connection with a monthly request...")
    test_news = get_news_for_stock_month("AAPL", "2024-05-01", "2024-05-31", limit=5)
    
    if test_news is not None:
        print(f"API test successful! Found {len(test_news)} article(s) for the month.")
        print("Starting full monthly data collection...")
        
        # Start the monthly data collection
        all_collected_news = collect_historical_news_monthly(
            rate_limit_delay=0.2
        )
        
        print(f"\nCollection completed!")
        print(f"Total articles collected: {len(all_collected_news)}")
        print(f"Data saved to: historical_news_monthly_final.csv")
        
    else:
        print("API test failed. Please check your API key, internet connection, and account status.")

API key is set.
Testing API connection with a monthly request...


2025-06-20 18:07:39,037 - INFO - Starting monthly data collection...
2025-06-20 18:07:39,037 - INFO - Total combinations (Stock x Month): 1220
2025-06-20 18:07:39,037 - INFO - Progress: 1/1220 - Fetching MSFT news for month 2020-06


API test successful! Found 5 article(s) for the month.
Starting full monthly data collection...


2025-06-20 18:07:39,831 - INFO - Found 2 articles for MSFT in month 2020-06
2025-06-20 18:07:40,033 - INFO - Progress: 2/1220 - Fetching NVDA news for month 2020-06
2025-06-20 18:07:40,768 - INFO - Found 0 articles for NVDA in month 2020-06
2025-06-20 18:07:40,969 - INFO - Progress: 3/1220 - Fetching AAPL news for month 2020-06
2025-06-20 18:07:41,691 - INFO - Found 6 articles for AAPL in month 2020-06
2025-06-20 18:07:41,893 - INFO - Progress: 4/1220 - Fetching AMZN news for month 2020-06
2025-06-20 18:07:42,615 - INFO - Found 3 articles for AMZN in month 2020-06
2025-06-20 18:07:42,817 - INFO - Progress: 5/1220 - Fetching GOOG news for month 2020-06
2025-06-20 18:07:43,591 - INFO - Found 1 articles for GOOG in month 2020-06
2025-06-20 18:07:43,793 - INFO - Progress: 6/1220 - Fetching META news for month 2020-06
2025-06-20 18:07:44,583 - INFO - Found 0 articles for META in month 2020-06
2025-06-20 18:07:44,785 - INFO - Progress: 7/1220 - Fetching AVGO news for month 2020-06
2025-06-20


Collection completed!
Total articles collected: 163777
Data saved to: historical_news_monthly_final.csv


In [69]:
# ANALYZE COLLECTED MONTHLY DATA
# Run this cell to get statistics about your collected data

def analyze_collected_data(filename: str = "historical_news_monthly_final.csv"):
    """
    Analyze the collected news data and show statistics.
    """
    try:
        if not os.path.exists(filename):
            print(f"File {filename} not found! Run the collection cell first.")
            return
        
        df = pd.read_csv(filename)
        
        print("DATA ANALYSIS REPORT")
        print("=" * 50)
        print(f"Total articles collected: {len(df):,}")
        print(f"Number of unique stocks: {df['ticker'].nunique()}")
        print(f"Date range: {df['date'].min()} to {df['date'].max()}")
        
        print("\nArticles by Stock:")
        stock_counts = df['ticker'].value_counts().sort_index()
        for ticker, count in stock_counts.items():
            print(f"   {ticker}: {count:,} articles")
        
        print("\nArticles by Month (based on collection start date):")
        df['month'] = pd.to_datetime(df['date']).dt.to_period('M')
        monthly_counts = df['month'].value_counts().sort_index()
        for month, count in monthly_counts.items():
            print(f"   {month}: {count:,} articles")
        
        print("\nTop 10 Publishers:")
        publisher_counts = df['publisher_name'].value_counts().head(10)
        for publisher, count in publisher_counts.items():
            print(f"   {publisher}: {count:,} articles")
        
        print("\nFile Information:")
        file_size = os.path.getsize(filename) / (1024 * 1024)  # in MB
        print(f"   File size: {file_size:.2f} MB")
        print(f"   Columns: {list(df.columns)}")
        
    except Exception as e:
        print(f"An error occurred during analysis: {e}")

# Run the analysis on the final output file
analyze_collected_data()

DATA ANALYSIS REPORT
Total articles collected: 163,777
Number of unique stocks: 20
Date range: 1/1/2022 to 9/9/2024

Articles by Stock:
   AAPL: 19,710 articles
   AMZN: 21,218 articles
   AVGO: 3,048 articles
   BRK.B: 5,191 articles
   COST: 3,635 articles
   GOOG: 10,708 articles
   JNJ: 4,216 articles
   JPM: 5,166 articles
   LLY: 2,885 articles
   MA: 2,394 articles
   META: 8,034 articles
   MSFT: 15,744 articles
   NFLX: 7,273 articles
   NVDA: 17,700 articles
   ORCL: 2,242 articles
   PG: 1,929 articles
   TSLA: 19,956 articles
   V: 3,320 articles
   WMT: 5,266 articles
   XOM: 4,142 articles

Articles by Month (based on collection start date):
   2020-06: 19 articles
   2020-07: 45 articles
   2020-08: 50 articles
   2020-09: 51 articles
   2020-10: 55 articles
   2020-11: 26 articles
   2020-12: 14 articles
   2021-01: 36 articles
   2021-02: 31 articles
   2021-03: 46 articles
   2021-04: 1,173 articles
   2021-05: 3,259 articles
   2021-06: 3,445 articles
   2021-07: 3,9