In [3]:
from datetime import date, timedelta, datetime
from gdeltdoc import GdeltDoc, Filters
from gdeltdoc.errors import RateLimitError
from urllib.parse import quote
import yfinance as yf
import pandas as pd
import time
import os
import json

START_DATE = date(year=2018, month=1, day=1)
END_DATE = datetime.now().date()
DATE_INCREMENT = timedelta(days=31)

gdd = GdeltDoc()

In [5]:
sp500 = pd.read_csv("../sp500.csv")
tickers = sorted([ticker for ticker in sp500["Symbol"]])
symbol_to_security = sp500.set_index("Symbol")["Security"].to_dict()

In [6]:
sp500_data = yf.download(tickers, start="2018-01-01", end="2025-09-01", group_by="ticker")
sp500_data.to_csv("sp500_data.csv")

  sp500_data = yf.download(tickers, start="2018-01-01", end="2025-09-01", group_by="ticker")
[*********             19%                       ]  98 of 503 completedHTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: ANSS"}}}
[*************         27%                       ]  136 of 503 completedHTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: HES"}}}
[*********************100%***********************]  503 of 503 completed

6 Failed downloads:
['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2018-01-01 -> 2025-09-01)')
['ANSS', 'HES', 'BRK.B']: YFTzMissingError('possibly delisted; no timezone found')
['PARA']: YFPricesMissingError('possibly delisted; no price data found  (1d 2018-01-01 -> 2025-09-01) (Yahoo error = "No data found, symbol may be delisted")')
['O']: Timeout('Failed to perform, curl: (28) Connection timed out aft

In [None]:
CACHE_PATH = "cached_articles"
IGNORE_PATH = os.path.join(CACHE_PATH, "ignored.json")


def is_ignored_article(ticker: str, start_date: str):
    if os.path.exists(IGNORE_PATH):
        with open(IGNORE_PATH, "r") as file:
            data = json.loads(file.read())
            return start_date in data and ticker in data[start_date]
    return False


def set_ignored_article(ticker: str, start_date: str):
    os.makedirs(CACHE_PATH, exist_ok=True)

    if os.path.exists(IGNORE_PATH):
        with open(IGNORE_PATH, "r") as file:
            data = json.loads(file.read())
    else:
        data = {}

    if start_date not in data:
        data[start_date] = []

    if ticker not in data[start_date]:
        data[start_date].append(ticker)

    with open(IGNORE_PATH, "w") as file:
        file.write(json.dumps(data, indent=4))


def save_articles_to_file(ticker: str, start_date: str, end_date: str, *, save_file: str, index: int, length: int):
    filters = Filters(keyword=quote(symbol_to_security[ticker]), start_date=start_date, end_date=end_date)
    articles = gdd.article_search(filters)

    if len(articles) > 0:
        articles = articles[articles["language"] == "English"]
        articles = articles[articles["sourcecountry"] == "United States"]

        os.makedirs(os.path.dirname(save_file), exist_ok=True)
        articles.to_csv(save_file)
        print(
            f"({index:3d}/{length}) {'\033[38;5;6m'}DOWNLOADED DATA{'\033[0m'}: {'\033[38;5;7m'}{start_date} {'\033[38;5;3m'}{ticker} ({symbol_to_security[ticker]}){'\033[0m'}"
        )
    else:
        set_ignored_article(ticker, start_date)
        print(
            f"({index:3d}/{length}) {'\033[38;5;1m'}NO DATA{'\033[0m'}: {'\033[38;5;7m'}{start_date} {'\033[38;5;3m'}{ticker} ({symbol_to_security[ticker]}){'\033[0m'}"
        )


def save_articles(ticker: str, *, index: int, length: int):
    if len(symbol_to_security[ticker]) < 5:
        print(f"({index:3d}/{length}) {'\033[38;5;1m'}BAD DATA{'\033[0m'}: {'\033[38;5;3m'}{ticker} ({symbol_to_security[ticker]}){'\033[0m'}")
        return

    start_date = START_DATE
    while start_date + DATE_INCREMENT < END_DATE:
        save_file = f"{CACHE_PATH}/{start_date}/{ticker}.csv"

        if is_ignored_article(ticker, str(start_date)):
            print(
                f"({index:3d}/{length}) {'\033[38;5;1m'}NO DATA{'\033[0m'}: {'\033[38;5;7m'}{start_date} {'\033[38;5;3m'}{ticker} ({symbol_to_security[ticker]}){'\033[0m'}"
            )
            # pass
        elif os.path.exists(save_file):
            print(f"({index:3d}/{length}) {'\033[38;5;5m'}SKIPPED DATA{'\033[0m'}: {'\033[38;5;7m'}{start_date} {'\033[38;5;3m'}{ticker}{'\033[0m'}")
            # pass
        else:
            save_articles_to_file(ticker, str(start_date), str(start_date + DATE_INCREMENT), save_file=save_file, index=index, length=length)

        start_date += DATE_INCREMENT

In [None]:
selected_tickers = tickers[0:101]

for i, ticker in enumerate(selected_tickers):
    while True:
        start = time.time()

        try:
            save_articles(ticker, index=i, length=len(selected_tickers))
            print(f"TOTAL TIME: {time.time() - start:.2f} (s)")
            break
        except RateLimitError:
            print(f"{'\033[38;5;1m'}RATE LIMIT REACHED. WAITING...{'\033[0m'}")
            time.sleep(90)
        except Exception as err:
            print(f"{'\033[38;5;1m'}FAILED TO GET ALL DATA{'\033[0m'}")
            raise err

In [None]:
print(tickers[0:101])  # indices 0–100 (DYLAN)
print(tickers[101:202])  # indices 101–201 (EFORD)
print(tickers[202:303])  # indices 202–302 (JACKY)
print(tickers[303:403])  # indices 303–402 (CALVIN)
print(tickers[403:503])  # indices 403–502 (TEJU)

In [None]:
import glob

os.makedirs("merged_articles", exist_ok=True)

for ticker in tickers:
    files = glob.glob("E:/cached_articles/**/AAPL.csv")
    dfs = [pd.read_csv(file, index_col=0) for file in files]

    merged_df = pd.concat(dfs, ignore_index=True)
    merged_df.to_csv(f"merged_articles/{ticker}.csv")
