First instinct to make money with LLMs is: scrape headlines from a few sources, FT, drudge, bloomberg, scan for company names, get their performance in that day, fine tune _a model to predict the performance of a company based on the news, and then trade on that._

Or we could do macro-vibe trading, take the above the fold headlines, line them up with S&P opens and closes, finetune, predict. 

Okay, so we need a website snapshotter that'll get past bot detections. But we also need training data, so we'll use the wayback machine. Time for some python.

Let's get the sources then see what we can cook.

In [1]:
%pip install waybackpy beautifulsoup4 retrying python-dotenv tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# use waybackpy to get the last year of drudge, every day at 9 AM ET.
# from waybackpy import WaybackMachineCDXServerAPI
# import requests
# from bs4 import BeautifulSoup

# url = "https://drudgereport.com/"

# # pass url, a year, month, day, hour, and minute, returning a list of tuples of url and text
# def get_page_links_at_date(url, year, month, day, hour, minute):
#     w = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
#     url = w.near(year=year, month=month, day=day, hour=hour, minute=minute).archive_url
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     links = soup.find_all('a')
#     article_links = [(link.get('href'), link.text) for link in links]
#     article_links = [link for link in article_links if link[1]]
#     return article_links

# links = get_page_links_at_date("https://drudgereport.com/", year=2023, month=2, day=27, hour=9, minute=0)

# for url, text in links[:5]:
#     print(f"Text: {text} URL: {url}")


Thank you _for your help_ GPT. _I'm going to use the wayback machine to get the last year of drudge, every day at 9 AM ET, and 5._


In [4]:
# import pandas as pd

# now = pd.Timestamp.now()
# dates = pd.date_range(start="2022-02-01", end=now, freq="D")

# data = []

# for date in dates:
#     am_links = get_page_links_at_date("https://drudgereport.com/", year=date.year, month=date.month, day=date.day, hour=6, minute=0)
#     for url, text in am_links:
#         data.append({"date": date.isoformat(), "url": url, "text": text, isMorning: True}, ignore_index=True)
#     pm_links = get_page_links_at_date("https://drudgereport.com/", year=date.year, month=date.month, day=date.day, hour=18, minute=0)
#     for url, text in pm_links:
#         data.append({"date": date.isoformat(), "url": url, "text": text, isMorning: False}, ignore_index=True)

# drudge_df = pd.DataFrame(data)
# drudge_df.head()


Oh no they blocked me. Love you archive.org, didn't mean to upset you. Let's use a proxy.

In [6]:
from waybackpy import WaybackMachineCDXServerAPI
from dotenv import load_dotenv
import requests
from tqdm import tqdm
import os
import pandas as pd
import json
from bs4 import BeautifulSoup
import time

load_dotenv()

username = os.getenv('SMARTPROXY_USERNAME')
password = os.getenv('SMARTPROXY_PASSWORD')
proxy = f"https://{username}:{password}@gate.smartproxy.com:7000"

proxies = {'https': proxy}
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
headers = {'User-Agent': user_agent}

def play_success_sound():
    os.system('afplay /System/Library/Sounds/Pop.aiff')

def get_drudge_links_at_date(url, year, month, day, hour, minute):
    w = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
    url = w.near(year=year, month=month, day=day, hour=hour, minute=minute).archive_url
    response = requests.get(url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    article_links = [(link.get('href'), link.text) for link in links]
    article_links = [link for link in article_links if link[1]]
    return article_links

def scrape_drudge_with_proxy():
    now = pd.Timestamp.now()
    dates = pd.date_range(start="2022-01-01", end=now, freq="D")

    for date in tqdm(dates, desc="Drudge progress", unit="date"):
        # check if we already have the data
        if os.path.exists(f"01-some-data/drudgereport.com/{date}.jsonl"):
            continue
        data = []
        am_links = get_drudge_links_at_date("https://drudgereport.com/", year=date.year, month=date.month, day=date.day, hour=6, minute=0)
        for url, text in am_links:
            data.append({"date": date.isoformat(), "url": url, "text": text, "isMorning": True})
        pm_links = get_drudge_links_at_date("https://drudgereport.com/", year=date.year, month=date.month, day=date.day, hour=18, minute=0)
        for url, text in pm_links:
            data.append({"date": date.isoformat(), "url": url, "text": text, "isMorning": False})
        # write to jsonl
        with open(f"01-some-data/drudgereport.com/{date}.jsonl", "w") as f:
            for row in data:
                f.write(json.dumps(row) + "\n")
        play_success_sound()




Scraping is a muddy art, let's let this baby run with some backoff retry logic.

In [8]:
import os
from retrying import retry

@retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
def scrape_drudge_with_proxy_retry():
    scrape_drudge_with_proxy()


Lovely. But there are a lot of non-news item links in the scrapes. Let's only include urls with more than one path component and clean up the archive.org prefaces.

In [9]:
import re
from urllib.parse import urlparse

def clean_up_drudge_links():
    # iterate over each file in the directory
    preprune_count = 0
    postprune_count = 0
    for filename in os.listdir("01-some-data/drudgereport.com"):
        filepath = os.path.join("01-some-data/drudgereport.com", filename)
        with open(filepath, 'r') as f:
            data = [json.loads(line) for line in f]

        # regex pattern to match 'https://web.archive.org/web/{date as integer}'
        pattern = r"https://web\.archive\.org/web/\d+/"

        # iterate over each item in the data
        for item in data:
            # replace the matched pattern with an empty string
            item['url'] = re.sub(pattern, '', item['url'])
        # filter out urls with only one path component
        filtered = [item for item in data if len(urlparse(item['url']).path.split('/')) > 2]
        # remove internal links to drudge
        filtered = [item for item in filtered if "drudgereport.com" not in item['url']]
        # remove items with less than three words in text
        # filtered = [item for item in filtered if len(item['text'].split()) > 2]

        preprune_count += len(data)
        postprune_count += len(filtered)

        # write the filtered data back to the file
        with open(filepath, 'w') as f:
            for item in filtered:
                f.write(json.dumps(item) + "\n")

        print(f"Preprune count: {preprune_count}")
        print(f"Postprune count: {postprune_count}")
        print(f"Pruned {preprune_count - postprune_count} links or {((preprune_count - postprune_count) / preprune_count) * 100:.2f}%")


In [10]:
clean_up_drudge_links()

Preprune count: 230
Postprune count: 230
Pruned 0 links or 0.00%
Preprune count: 485
Postprune count: 485
Pruned 0 links or 0.00%
Preprune count: 746
Postprune count: 746
Pruned 0 links or 0.00%
Preprune count: 964
Postprune count: 964
Pruned 0 links or 0.00%
Preprune count: 1208
Postprune count: 1208
Pruned 0 links or 0.00%
Preprune count: 1457
Postprune count: 1457
Pruned 0 links or 0.00%
Preprune count: 1688
Postprune count: 1688
Pruned 0 links or 0.00%
Preprune count: 1945
Postprune count: 1945
Pruned 0 links or 0.00%
Preprune count: 2158
Postprune count: 2158
Pruned 0 links or 0.00%
Preprune count: 2429
Postprune count: 2429
Pruned 0 links or 0.00%
Preprune count: 2670
Postprune count: 2670
Pruned 0 links or 0.00%
Preprune count: 2938
Postprune count: 2938
Pruned 0 links or 0.00%
Preprune count: 3211
Postprune count: 3211
Pruned 0 links or 0.00%
Preprune count: 3469
Postprune count: 3469
Pruned 0 links or 0.00%
Preprune count: 3731
Postprune count: 3731
Pruned 0 links or 0.00%
Pre

Let's do the same thing for the FT. I should probably generalize this to a scraper class, but it's python baby, _we're all about the duct tape._

In [11]:
from waybackpy import WaybackMachineCDXServerAPI
from dotenv import load_dotenv
import requests
from tqdm import tqdm
import os
import pandas as pd
import json
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urlparse

load_dotenv()

# mkdir 01-some-data/ft.com
if not os.path.exists("01-some-data/ft.com"):
    os.mkdir("01-some-data/ft.com")

username = os.getenv('SMARTPROXY_USERNAME')
password = os.getenv('SMARTPROXY_PASSWORD')
proxy = f"https://{username}:{password}@gate.smartproxy.com:7000"

proxies = {'https': proxy}
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
headers = {'User-Agent': user_agent}

def get_ft_links_at_date(url, year, month, day, hour, minute):
    w = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
    url = w.near(year=year, month=month, day=day, hour=hour, minute=minute).archive_url
    response = requests.get(url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'html.parser')
    # TODO: find the right selector
    links = soup.find_all('a')
    article_links = [(link.get('href'), link.text) for link in links]
    article_links = [link for link in article_links if link[1]]
    return article_links

def scrape_ft_with_proxy():
    now = pd.Timestamp.now()
    dates = pd.date_range(start="2022-01-01", end=now, freq="D")

    for date in tqdm(dates, desc="FT progress", unit="date"):
        # check if we already have the data
        if os.path.exists(f"01-some-data/ft.com/{date}.jsonl"):
            continue
        data = []
        am_links = get_ft_links_at_date("https://ft.com/", year=date.year, month=date.month, day=date.day, hour=6, minute=0)
        for url, text in am_links:
            data.append({"date": date.isoformat(), "url": url, "text": text, "isMorning": True})
        pm_links = get_ft_links_at_date("https://ft.com/", year=date.year, month=date.month, day=date.day, hour=18, minute=0)
        for url, text in pm_links:
            data.append({"date": date.isoformat(), "url": url, "text": text, "isMorning": False})
        # write to jsonl
        with open(f"01-some-data/ft.com/{date}.jsonl", "w") as f:
            for row in data:
                f.write(json.dumps(row) + "\n")
        os.system('afplay /System/Library/Sounds/Pop.aiff')

def clean_up_ft_links():
    # iterate over each file in the directory
    preprune_count = 0
    postprune_count = 0
    for filename in os.listdir("01-some-data/ft.com"):
        filepath = os.path.join("01-some-data/ft.com", filename)
        with open(filepath, 'r') as f:
            data = [json.loads(line) for line in f]

        # regex pattern to match everything after the last http, including the http

        pattern = r"(http[s]?://.*)"

        # iterate over each item in the data
        for item in data:
            # replace the matched pattern with an empty string
            match = re.search(pattern, item['url'])
            if match:
                item['url'] = match.group(1)
        # only include urls with /content/ in them
        filtered = [item for item in data if "/content/" in item['url']]

        preprune_count += len(data)
        postprune_count += len(filtered)

        # write the filtered data back to the file
        with open(filepath, 'w') as f:
            for item in filtered:
                f.write(json.dumps(item) + "\n")

        print(f"Preprune count: {preprune_count}")
        print(f"Postprune count: {postprune_count}")


In [12]:
import os
from retrying import retry

@retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
def scrape_ft_with_proxy_retry():
    scrape_ft_with_proxy()

In [13]:
clean_up_ft_links()

Preprune count: 261
Postprune count: 261
Preprune count: 525
Postprune count: 525
Preprune count: 804
Postprune count: 804
Preprune count: 1072
Postprune count: 1072
Preprune count: 1342
Postprune count: 1342
Preprune count: 1612
Postprune count: 1612
Preprune count: 1871
Postprune count: 1871
Preprune count: 2135
Postprune count: 2135
Preprune count: 2394
Postprune count: 2394
Preprune count: 2653
Postprune count: 2653
Preprune count: 2916
Postprune count: 2916
Preprune count: 3177
Postprune count: 3177
Preprune count: 3438
Postprune count: 3438
Preprune count: 3719
Postprune count: 3719
Preprune count: 3987
Postprune count: 3987
Preprune count: 4253
Postprune count: 4253
Preprune count: 4518
Postprune count: 4518
Preprune count: 4782
Postprune count: 4782
Preprune count: 5053
Postprune count: 5053
Preprune count: 5320
Postprune count: 5320
Preprune count: 5596
Postprune count: 5596
Preprune count: 5880
Postprune count: 5880
Preprune count: 6147
Postprune count: 6147
Preprune count: 6

In [14]:
from waybackpy import WaybackMachineCDXServerAPI
from dotenv import load_dotenv
import requests
from tqdm import tqdm
import os
import pandas as pd
import json
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urlparse

load_dotenv()

# mkdir 01-some-data/bloomberg.com
if not os.path.exists("01-some-data/bloomberg.com"):
  os.mkdir("01-some-data/bloomberg.com")

username = os.getenv('SMARTPROXY_USERNAME')
password = os.getenv('SMARTPROXY_PASSWORD')
proxy = f"https://{username}:{password}@gate.smartproxy.com:7000"

proxies = {'https': proxy}
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
headers = {'User-Agent': user_agent}

def get_bloomberg_links_at_date(url, year, month, day, hour, minute):
  w = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
  url = w.near(year=year, month=month, day=day, hour=hour, minute=minute).archive_url
  response = requests.get(url, headers=headers, proxies=proxies)
  soup = BeautifulSoup(response.text, 'html.parser')
  # TODO: find the right selector
  links = soup.find_all('a')
  article_links = [(link.get('href'), link.text) for link in links]
  article_links = [link for link in article_links if link[1]]
  return article_links

def scrape_bloomberg_with_proxy():
  now = pd.Timestamp.now()
  dates = pd.date_range(start="2022-01-01", end=now, freq="D")

  for date in tqdm(dates, desc="Bloomberg progress", unit="date"):
    # check if we already have the data
    if os.path.exists(f"01-some-data/bloomberg.com/{date}.jsonl"):
      continue
    data = []
    am_links = get_bloomberg_links_at_date("https://www.bloomberg.com/markets", year=date.year, month=date.month, day=date.day, hour=6, minute=0)
    for url, text in am_links:
      data.append({"date": date.isoformat(), "url": url, "text": text, "isMorning": True})
    pm_links = get_bloomberg_links_at_date("https://www.bloomberg.com/markets", year=date.year, month=date.month, day=date.day, hour=18, minute=0)
    for url, text in pm_links:
      data.append({"date": date.isoformat(), "url": url, "text": text, "isMorning": False})
    # write to jsonl
    with open(f"01-some-data/bloomberg.com/{date}.jsonl", "w") as f:
      for row in data:
        f.write(json.dumps(row) + "\n")
    os.system('afplay /System/Library/Sounds/Pop.aiff')

def clean_up_bloomberg_links():
  # iterate over each file in the directory
  preprune_count = 0
  postprune_count = 0
  for filename in os.listdir("01-some-data/bloomberg.com"):
    filepath = os.path.join("01-some-data/bloomberg.com", filename)
    with open(filepath, 'r') as f:
      data = [json.loads(line) for line in f]

    # regex pattern to match everything after the last http, including the http

    pattern = r"(http[s]?://.*)"

    # iterate over each item in the data
    for item in data:
      # replace the matched pattern with an empty string
      match = re.search(pattern, item['url'])
      if match:
        item['url'] = match.group(1)

      # trim the text
      item['text'] = item['text'].strip()
    # only include urls with /news/articles/ in them
    filtered = [item for item in data if "/news/articles/" in item['url']]
    # remove those with text len less than 5
    filtered = [item for item in filtered if len(item['text']) > 5]


    preprune_count += len(data)
    postprune_count += len(filtered)

    # write the filtered data back to the file
    with open(filepath, 'w') as f:
      for item in filtered:
        f.write(json.dumps(item) + "\n")

    print(f"Preprune count: {preprune_count}")
    print(f"Postprune count: {postprune_count}")


In [15]:
import os
from retrying import retry

@retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
def scrape_bloomberg_with_proxy_retry():
  scrape_bloomberg_with_proxy()


In [16]:
clean_up_bloomberg_links()

Preprune count: 42
Postprune count: 42
Preprune count: 84
Postprune count: 84
Preprune count: 124
Postprune count: 124
Preprune count: 145
Postprune count: 145


They all do their thing, now I'll let this puppy run over _night and see what we get._

In [17]:
import os
from retrying import retry

@retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
def run_all_scrapers():
  # Scrape drudgereport.com
  try:
    scrape_drudge_with_proxy_retry()
  except Exception as e:
    print("An error occurred while scraping drudgereport.com:", str(e))

  # Scrape ft.com
  try:
    scrape_ft_with_proxy_retry()
  except Exception as e:
    print("An error occurred while scraping ft.com:", str(e))

  # Scrape bloomberg.com
  try:
    scrape_bloomberg_with_proxy_retry()
  except Exception as e:
    print("An error occurred while scraping bloomberg.com:", str(e))

try:
  run_all_scrapers()
  os.system('say "All scrapers have finished running."')
except Exception as e:
  print("An error occurred while running all scrapers:", str(e))


Scraping progress:  29%|██▊       | 226/789 [03:17<08:10,  1.15date/s]
Scraping progress:  32%|███▏      | 254/789 [08:01<2:47:46, 18.82s/date]