My first instinct for making money with LLMs: scrape the FT and Drudge, scan for company names, get each company's performance on that day, fine-tune _a model to predict a company's performance based on the news, and then trade on that._

Or we could do macro-vibe trading: take the above-the-fold headlines, line them up with S&P opens and closes, fine-tune, predict.

Okay, so we need a website snapshotter that'll get past bot detection. But we also need training data, so we'll use the Wayback Machine. Time for some Python.

In [None]:
%pip install waybackpy beautifulsoup4

In [None]:
# use waybackpy to get the last year of drudge, every day at 9 AM ET.
from waybackpy import WaybackMachineCDXServerAPI
import requests
from bs4 import BeautifulSoup

url = "https://drudgereport.com/"

def get_page_links_at_date(
    url,
    year,
    month,
    day,
    hour,
    minute,
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
):
    """Return the (href, text) pairs of the labelled links in an archived page.

    Asks the Wayback Machine CDX API for the snapshot of ``url`` nearest to
    the requested moment, downloads that snapshot, and collects its ``<a>``
    tags.

    Args:
        url: Address of the page whose archived copy is wanted.
        year, month, day, hour, minute: Target moment, forwarded to
            ``WaybackMachineCDXServerAPI.near``.
        user_agent: User-Agent string handed to waybackpy. This used to be
            read from a global that is not defined at this point in the
            notebook, which raised ``NameError``; it is now an optional
            argument with a browser-like default.

    Returns:
        A list of ``(href, text)`` tuples, one per anchor whose text is
        non-empty. ``href`` is ``None`` for anchors without that attribute.

    Raises:
        requests.HTTPError: If the snapshot request returns an error status.
    """
    cdx = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
    snapshot_url = cdx.near(year=year, month=month, day=day, hour=hour, minute=minute).archive_url
    # Fail instead of hanging indefinitely if the server stops responding.
    response = requests.get(snapshot_url, timeout=60)
    # An error page must not be mistaken for a front page full of headlines.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        (anchor.get('href'), anchor.text)
        for anchor in soup.find_all('a')
        if anchor.text
    ]

# Smoke test: fetch one archived front page and show its first five links.
links = get_page_links_at_date("https://drudgereport.com/", year=2023, month=2, day=27, hour=9, minute=0)

# NOTE: this loop rebinds the module-level name `url` assigned earlier in the cell.
for url, text in links[:5]:
    print(f"Text: {text} URL: {url}")


Thank you _for your help_, GPT. _I'm going to use the Wayback Machine to get the last year of Drudge, every day at 9 AM and 5 PM ET._ (The code below actually requests the snapshots nearest 06:00 and 18:00.)


In [None]:
%pip install pandas

In [14]:
import pandas as pd

# Build one row per headline link for every day since 2022-02-01, sampling
# the archived Drudge front page twice a day.
now = pd.Timestamp.now()
dates = pd.date_range(start="2022-02-01", end=now, freq="D")

data = []

for date in dates:
    # (hour, isMorning) pairs: one morning and one evening snapshot per day.
    for hour, is_morning in ((6, True), (18, False)):
        page_links = get_page_links_at_date(
            "https://drudgereport.com/",
            year=date.year,
            month=date.month,
            day=date.day,
            hour=hour,
            minute=0,
        )
        # "isMorning" has to be a quoted key (the bare name raised NameError),
        # and list.append takes no `ignore_index` keyword (that is a
        # DataFrame.append argument), so rows are added as plain dicts.
        data.extend(
            {"date": date.isoformat(), "url": link_url, "text": link_text, "isMorning": is_morning}
            for link_url, link_text in page_links
        )

drudge_df = pd.DataFrame(data)
drudge_df.head()


ConnectionError: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /cdx/search/cdx?gzip=false&closest=202202240600&sort=closest&url=https%3A%2F%2Fdrudgereport.com%2F&showResumeKey=true&limit=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x1373f3310>: Failed to establish a new connection: [Errno 61] Connection refused'))

Oh no, they blocked me. Love you, archive.org; I didn't mean to upset you. Let's use a proxy.

In [None]:
%pip install python-dotenv tqdm

In [1]:
from waybackpy import WaybackMachineCDXServerAPI
from dotenv import load_dotenv
import requests
from tqdm import tqdm
import os
import pandas as pd
import json
from bs4 import BeautifulSoup
import time

# Presumably loads the SMARTPROXY_* variables from a local .env file into the environment.
load_dotenv()

username = os.getenv('SMARTPROXY_USERNAME')
password = os.getenv('SMARTPROXY_PASSWORD')
# NOTE(review): os.getenv returns None for an unset variable, which would put the
# literal text "None" into this URL. Confirm both variables are set.
proxy = f"https://{username}:{password}@gate.smartproxy.com:7000"

# Only an 'https' entry is given, so only https:// requests go through the proxy.
proxies = {'https': proxy}
# Browser-like User-Agent, passed to waybackpy and sent in the snapshot request headers.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
headers = {'User-Agent': user_agent}

def play_success_sound():
    """Play the "Pop" system sound through the shell as an audible progress cue."""
    pop_command = 'afplay /System/Library/Sounds/Pop.aiff'
    os.system(pop_command)

def get_drudge_links_at_date(url, year, month, day, hour, minute):
    """Return the (href, text) pairs of the labelled links in an archived page.

    Looks up the Wayback Machine snapshot of ``url`` nearest to the requested
    moment, downloads it through the configured proxy, and collects its
    ``<a>`` tags.

    Args:
        url: Address of the page whose archived copy is wanted.
        year, month, day, hour, minute: Target moment, forwarded to
            ``WaybackMachineCDXServerAPI.near``.

    Returns:
        A list of ``(href, text)`` tuples, one per anchor whose text is
        non-empty. ``href`` is ``None`` for anchors without that attribute.

    Raises:
        requests.HTTPError: If the snapshot request returns an error status.
    """
    cdx = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
    snapshot_url = cdx.near(year=year, month=month, day=day, hour=hour, minute=minute).archive_url
    # Fail instead of hanging indefinitely if the proxy or archive stops responding.
    response = requests.get(snapshot_url, headers=headers, proxies=proxies, timeout=60)
    # Without this, a rate-limit or error page would be parsed and stored as
    # that day's links; raising lets callers retry instead.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        (anchor.get('href'), anchor.text)
        for anchor in soup.find_all('a')
        if anchor.text
    ]

def scrape_drudge_with_proxy():
    """Scrape the archived Drudge front page twice a day and save one file per day.

    For every day from 2022-01-01 until now, fetches the snapshots nearest
    06:00 and 18:00 and writes their links to
    ``01-some-data/drudgereport.com/{date}.jsonl``, one JSON object per line
    with the keys ``date``, ``url``, ``text`` and ``isMorning``. Days whose
    file already exists are skipped, so an interrupted run can be resumed.
    """
    out_dir = "01-some-data/drudgereport.com"
    # Nothing else visible in this notebook creates the directory; without
    # this the first write fails with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)

    now = pd.Timestamp.now()
    dates = pd.date_range(start="2022-01-01", end=now, freq="D")

    for date in tqdm(dates, desc="Scraping progress", unit="date"):
        # The file name keeps the str(Timestamp) form so that days scraped
        # by earlier runs are still recognised.
        out_path = f"{out_dir}/{date}.jsonl"
        if os.path.exists(out_path):
            continue

        data = []
        # (hour, isMorning) pairs: morning snapshot first, then evening.
        for hour, is_morning in ((6, True), (18, False)):
            links = get_drudge_links_at_date(
                "https://drudgereport.com/",
                year=date.year,
                month=date.month,
                day=date.day,
                hour=hour,
                minute=0,
            )
            data.extend(
                {"date": date.isoformat(), "url": url, "text": text, "isMorning": is_morning}
                for url, text in links
            )

        # The file is written only after both snapshots were fetched.
        with open(out_path, "w") as f:
            f.writelines(json.dumps(row) + "\n" for row in data)
        play_success_sound()




Scraping is a muddy art, so let's let this baby run with some backoff-and-retry logic.

In [None]:
%pip install retrying

In [2]:
import os
from retrying import retry

# Exponential backoff between attempts, with the wait capped at 60 000 ms.
# NOTE(review): no stop condition is passed, and per the `retrying` docs the
# default is to retry forever on any exception. Confirm that is intended.
@retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
def scrape_drudge_with_proxy_retry():
    """Run the Drudge scrape under the retry policy configured above."""
    scrape_drudge_with_proxy()

try:
    scrape_drudge_with_proxy_retry()
except Exception as e:
    # Show what actually failed instead of discarding the exception.
    print(f"Scraping failed: {e!r}")
    os.system('say "An error occurred. Please check the issue."')


Scraping progress:   0%|          | 0/789 [00:00<?, ?date/s]

Scraping progress:  27%|██▋       | 214/789 [18:15<49:04,  5.12s/date]  
Scraping progress:   0%|          | 0/789 [00:00<?, ?date/s]

In [None]:
def print_drudgereport_stats():
    """Print summary statistics for the scraped Drudge link files.

    Reads every ``.jsonl`` file in ``01-some-data/drudgereport.com`` once and
    prints, in order: the number of files, the total number of link rows, the
    number of distinct ``url`` values, and the first row of the file whose
    name sorts last.

    Other directory entries (for example ``.DS_Store``) are ignored. An empty
    directory or an empty newest file prints a placeholder instead of raising.
    """
    data_dir = "01-some-data/drudgereport.com"
    # Sorted so that the last file processed is the one max() would have picked.
    filenames = sorted(name for name in os.listdir(data_dir) if name.endswith(".jsonl"))
    print(f"Number of days scraped: {len(filenames)}")

    total_links = 0
    unique_links = set()
    newest_data = []
    # One pass over the files gathers all three statistics.
    for filename in filenames:
        with open(os.path.join(data_dir, filename), 'r') as f:
            rows = [json.loads(line) for line in f]
        total_links += len(rows)
        unique_links.update(row['url'] for row in rows)
        newest_data = rows

    print(f"Total number of links: {total_links}")
    print(f"Total number of unique links: {len(unique_links)}")

    print("Head of the newest:")
    if newest_data:
        print(newest_data[0])
    else:
        print("(no links)")

Lovely. But there are a lot of non-news links in the scrapes. Let's keep only URLs with more than one path component and strip the archive.org prefixes.

In [2]:
import re
from urllib.parse import urlparse

def clean_up_drudge_links():
    """Strip Wayback prefixes from the scraped Drudge links and prune non-news rows.

    Rewrites every file in ``01-some-data/drudgereport.com`` in place. For
    each row the ``https://web.archive.org/web/{digits}/`` prefix is removed
    from ``url``, and the row is kept only if the URL path splits into more
    than two '/'-separated parts and the URL does not contain
    ``drudgereport.com``. Rows whose ``url`` is not a string (anchors scraped
    without an href are stored as null) are dropped instead of raising
    ``TypeError``.

    After each file the running pre/post counts and pruned percentage are
    printed.
    """
    data_dir = "01-some-data/drudgereport.com"
    # Matches 'https://web.archive.org/web/{date as integer}/'; built once for all files.
    archive_prefix = re.compile(r"https://web\.archive\.org/web/\d+/")

    preprune_count = 0
    postprune_count = 0
    for filename in os.listdir(data_dir):
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r') as f:
            data = [json.loads(line) for line in f]

        filtered = []
        for item in data:
            url = item['url']
            # A link without an href cannot be a news link.
            if not isinstance(url, str):
                continue
            url = archive_prefix.sub('', url)
            item['url'] = url
            # Drop URLs with only one path component.
            if len(urlparse(url).path.split('/')) <= 2:
                continue
            # Drop internal links to Drudge itself.
            if "drudgereport.com" in url:
                continue
            filtered.append(item)

        preprune_count += len(data)
        postprune_count += len(filtered)

        # Write the filtered data back to the file.
        with open(filepath, 'w') as f:
            for item in filtered:
                f.write(json.dumps(item) + "\n")

        pruned = preprune_count - postprune_count
        # Guard the ratio: the running total is still zero if the first files are empty.
        pruned_pct = (pruned / preprune_count) * 100 if preprune_count else 0.0
        print(f"Preprune count: {preprune_count}")
        print(f"Postprune count: {postprune_count}")
        print(f"Pruned {pruned} links or {pruned_pct:.2f}%")


In [3]:
# Rewrite every scraped Drudge file in place, keeping only the cleaned-up news links.
clean_up_drudge_links()

Old size: 469
New size: 244
Old size: 457
New size: 231
Old size: 482
New size: 257
Old size: 492
New size: 268
Old size: 495
New size: 273
Old size: 258
New size: 258
Old size: 489
New size: 262
Old size: 502
New size: 271
Old size: 484
New size: 257
Old size: 492
New size: 269
Old size: 492
New size: 258
Old size: 475
New size: 254
Old size: 480
New size: 256
Old size: 474
New size: 253
Old size: 504
New size: 277
Old size: 496
New size: 264
Old size: 492
New size: 268
Old size: 259
New size: 259
Old size: 484
New size: 265
Old size: 470
New size: 252
Old size: 474
New size: 252
Old size: 473
New size: 246
Old size: 281
New size: 281
Old size: 474
New size: 250
Old size: 489
New size: 252
Old size: 471
New size: 248
Old size: 486
New size: 258
Old size: 472
New size: 238
Old size: 485
New size: 258
Old size: 492
New size: 269
Old size: 471
New size: 251
Old size: 494
New size: 274
Old size: 483
New size: 260
Old size: 255
New size: 255
Old size: 283
New size: 283
Old size: 456
New si

Let's do the same thing for the FT, adapting the scraping functions to it.

In [8]:
from waybackpy import WaybackMachineCDXServerAPI
from dotenv import load_dotenv
import requests
from tqdm import tqdm
import os
import pandas as pd
import json
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urlparse

# Presumably loads the SMARTPROXY_* variables from a local .env file into the environment.
load_dotenv()

# Create the FT output directory together with its parent. os.mkdir raised
# FileNotFoundError when "01-some-data" itself did not exist yet.
os.makedirs("01-some-data/ft.com", exist_ok=True)

username = os.getenv('SMARTPROXY_USERNAME')
password = os.getenv('SMARTPROXY_PASSWORD')
# NOTE(review): os.getenv returns None for an unset variable, which would put the
# literal text "None" into this URL. Confirm both variables are set.
proxy = f"https://{username}:{password}@gate.smartproxy.com:7000"

# Only an 'https' entry is given, so only https:// requests go through the proxy.
proxies = {'https': proxy}
# Browser-like User-Agent, passed to waybackpy and sent in the snapshot request headers.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
headers = {'User-Agent': user_agent}

def get_ft_links_at_date(url, year, month, day, hour, minute):
    """Return the (href, text) pairs of the labelled links in an archived page.

    Looks up the Wayback Machine snapshot of ``url`` nearest to the requested
    moment, downloads it through the configured proxy, and collects its
    ``<a>`` tags.

    Args:
        url: Address of the page whose archived copy is wanted.
        year, month, day, hour, minute: Target moment, forwarded to
            ``WaybackMachineCDXServerAPI.near``.

    Returns:
        A list of ``(href, text)`` tuples, one per anchor whose text is
        non-empty. ``href`` is ``None`` for anchors without that attribute.

    Raises:
        requests.HTTPError: If the snapshot request returns an error status.
    """
    cdx = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
    snapshot_url = cdx.near(year=year, month=month, day=day, hour=hour, minute=minute).archive_url
    # Fail instead of hanging indefinitely if the proxy or archive stops responding.
    response = requests.get(snapshot_url, headers=headers, proxies=proxies, timeout=60)
    # Without this, a rate-limit or error page would be parsed and stored as
    # that day's links; raising lets callers retry instead.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # TODO: find the right selector
    return [
        (anchor.get('href'), anchor.text)
        for anchor in soup.find_all('a')
        if anchor.text
    ]

def scrape_ft_with_proxy():
    """Scrape the archived FT front page twice a day and save one file per day.

    For every day from 2022-01-01 until now, fetches the snapshots nearest
    06:00 and 18:00 and writes their links to
    ``01-some-data/ft.com/{date}.jsonl``, one JSON object per line with the
    keys ``date``, ``url``, ``text`` and ``isMorning``. Days whose file
    already exists are skipped.
    """
    every_day = pd.date_range(start="2022-01-01", end=pd.Timestamp.now(), freq="D")

    for date in tqdm(every_day, desc="Scraping progress", unit="date"):
        out_path = f"01-some-data/ft.com/{date}.jsonl"
        # Already scraped on an earlier run.
        if os.path.exists(out_path):
            continue

        rows = []
        # Morning snapshot first, then evening, each tagged with its flag.
        for hour, is_morning in ((6, True), (18, False)):
            snapshot_links = get_ft_links_at_date("https://ft.com/", year=date.year, month=date.month, day=date.day, hour=hour, minute=0)
            rows.extend(
                {"date": date.isoformat(), "url": href, "text": label, "isMorning": is_morning}
                for href, label in snapshot_links
            )

        # One JSON object per line.
        with open(out_path, "w") as f:
            f.writelines(json.dumps(row) + "\n" for row in rows)
        os.system('afplay /System/Library/Sounds/Pop.aiff')

def clean_up_ft_links():
    """Strip archive prefixes from the scraped FT links and keep only articles.

    Rewrites every file in ``01-some-data/ft.com`` in place. Each ``url`` is
    cut down to the text starting at its last ``http://`` or ``https://``,
    and a row is kept only if the resulting URL contains ``/content/``. Rows
    whose ``url`` is not a string (anchors scraped without an href are stored
    as null) are dropped instead of raising ``TypeError``.

    After each file the running pre/post counts are printed.
    """
    data_dir = "01-some-data/ft.com"
    # The greedy leading '.*' pushes the captured group to the LAST 'http(s)://'
    # in the string. The previous pattern matched from the first one, so
    # absolute 'https://web.archive.org/web/...' prefixes were never removed.
    last_url = re.compile(r".*(https?://.*)")

    preprune_count = 0
    postprune_count = 0
    for filename in os.listdir(data_dir):
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r') as f:
            data = [json.loads(line) for line in f]

        filtered = []
        for item in data:
            url = item['url']
            # A link without an href cannot be an article link.
            if not isinstance(url, str):
                continue
            match = last_url.search(url)
            if match:
                url = match.group(1)
                item['url'] = url
            # Only include urls with /content/ in them.
            if "/content/" in url:
                filtered.append(item)

        preprune_count += len(data)
        postprune_count += len(filtered)

        # Write the filtered data back to the file.
        with open(filepath, 'w') as f:
            for item in filtered:
                f.write(json.dumps(item) + "\n")

        print(f"Preprune count: {preprune_count}")
        print(f"Postprune count: {postprune_count}")


In [15]:
import os
from retrying import retry

# Exponential backoff between attempts, with the wait capped at 60 000 ms.
# NOTE(review): no stop condition is passed, and per the `retrying` docs the
# default is to retry forever on any exception. Confirm that is intended.
@retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
def scrape_ft_with_proxy_retry():
    """Run the FT scrape under the retry policy configured above."""
    scrape_ft_with_proxy()

try:
    scrape_ft_with_proxy_retry()
except Exception as e:
    # Show what actually failed instead of discarding the exception.
    print(f"Scraping failed: {e!r}")
    os.system('say "An error occurred. Please check the issue."')

Scraping progress:   0%|          | 0/789 [00:13<?, ?date/s]
Scraping progress:   0%|          | 0/789 [00:19<?, ?date/s]
Scraping progress:   0%|          | 3/789 [01:06<4:50:17, 22.16s/date]
Scraping progress:   0%|          | 0/789 [00:00<?, ?date/s]

In [9]:
# Rewrite every scraped FT file in place, keeping only the /content/ article links.
clean_up_ft_links()

Preprune count: 943
Postprune count: 270
Preprune count: 1876
Postprune count: 531
Preprune count: 2817
Postprune count: 797
