## Webscraping terms from investopedia
- locates links for each 'term' section and extracts raw html
- saves every definition by alphabetical order in a markdown file

In [2]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re

BASE_URL = "https://www.investopedia.com"
DICTIONARY_URL = f"{BASE_URL}/financial-term-dictionary-4769738"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
OUTPUT_DIR = "investopedia_terms"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_main_page():
    response = requests.get(DICTIONARY_URL, headers=HEADERS)
    response.raise_for_status()
    return response.text

main_html = get_main_page()
print(main_html)

<!DOCTYPE html>
<html id="dictionaryTemplate_1-0" class="comp dictionaryTemplate html mntl-html no-js " data-finance-resource-version="3.118.0" data-ab="99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,64" data-mm-transactional-resource-version="2.0.60" data-mm-ads-resource-version="2.1.44" data-mm-video-resource-version="2.1.18" data-mantle-resource-version="4.1.167" data-mm-digital-issues-resource-version="2.0.7" lang="en" data-tracking-container="true" data-resource-version="3.118.0" data-ddm-standard-tracking="true"><!--
<globe-environment environment="k8s-prod" application="finance" dataCenter="us-east-1"/>
-->
<head class="loc head">
<link rel="preconnect" href="//js-sec.indexww.com">
<link rel="preconnect" href="//c.amazon-adsystem.com">
<link rel="preconnect" href="//securepubads.g.doubleclick.net">
<link rel="dnsprefetch" href="//www.google-analytics.com">
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="robots" content="ma

In [4]:
def extract_section_links(html):
    """Find all valid A-Z and numeric section links."""
    soup = BeautifulSoup(html, "html.parser")
    links = {}
    for a in soup.find_all("a", href=True):
        href = a["href"]
        text = a.get_text(strip=True)

        # Match e.g. /terms-beginning-with-a-4769351 or /terms-beginning-with-num-4769352
        match = re.match(r"^/terms-beginning-with-(num|[a-z])-\d+", href)
        if match:
            key = "0-9" if match.group(1) == "num" else match.group(1).upper()
            links[key] = BASE_URL + href

    return links

section_links = extract_section_links(main_html)
print(section_links)

{}


In [26]:
def extract_section_links(html):
    soup = BeautifulSoup(html, "html.parser")
    links = {}
    for a in soup.find_all("a", href=True):
        href = a["href"]
        text = a.get_text(strip=True)
        if "terms-beginning-with" in href:
            # Use the letter or "0-9" if it's numeric
            if "num" in href:
                key = "0-9"
            else:
                match = re.search(r"terms-beginning-with-([a-z])", href)
                key = match.group(1).upper() if match else text[:1].upper()
            links[key] = href
    return links

section_links = extract_section_links(main_html)

print(section_links)

{'0-9': 'https://www.investopedia.com/terms-beginning-with-num-4769350', 'A': 'https://www.investopedia.com/terms-beginning-with-a-4769351', 'B': 'https://www.investopedia.com/terms-beginning-with-b-4769352', 'C': 'https://www.investopedia.com/terms-beginning-with-c-4769353', 'D': 'https://www.investopedia.com/terms-beginning-with-d-4769354', 'E': 'https://www.investopedia.com/terms-beginning-with-e-4769355', 'F': 'https://www.investopedia.com/terms-beginning-with-f-4769356', 'G': 'https://www.investopedia.com/terms-beginning-with-g-4769357', 'H': 'https://www.investopedia.com/terms-beginning-with-h-4769358', 'I': 'https://www.investopedia.com/terms-beginning-with-i-4769359', 'J': 'https://www.investopedia.com/terms-beginning-with-j-4769360', 'K': 'https://www.investopedia.com/terms-beginning-with-k-4769361', 'L': 'https://www.investopedia.com/terms-beginning-with-l-4769362', 'M': 'https://www.investopedia.com/terms-beginning-with-m-4769363', 'N': 'https://www.investopedia.com/terms-be

In [None]:
def extract_term_links(letter_html):
    soup = BeautifulSoup(letter_html, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/terms/") and href.endswith(".asp"):
            links.add(href)
    return list(links)


for key in sorted(section_links):
    try:
        html = requests.get(section_links[key], headers=HEADERS).text
        soup = BeautifulSoup(html, "html.parser")
        links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if "/terms/" in href and href.endswith(".asp"):
                links.add(href)
                print(href)
        term_links = extract_term_links(html)
    except Exception as e:
        print(f"Failed to process section {key}: {e}")



https://www.investopedia.com/terms/1/0x-protocol.asp
https://www.investopedia.com/terms/1/1-10net30.asp
https://www.investopedia.com/terms/1/10-k.asp
https://www.investopedia.com/terms/1/10k-wrap.asp
https://www.investopedia.com/terms/1/10q.asp
https://www.investopedia.com/terms/1/10-yeartreasury.asp
https://www.investopedia.com/terms/1/100-equities-strategy.asp
https://www.investopedia.com/terms/1/1040.asp
https://www.investopedia.com/terms/1/1040a.asp
https://www.investopedia.com/terms/1/1040ez.asp
https://www.investopedia.com/terms/1/1092.asp
https://www.investopedia.com/terms/1/cofi.asp
https://www.investopedia.com/terms/1/125_loan.asp
https://www.investopedia.com/terms/1/12b-1fees.asp
https://www.investopedia.com/terms/1/12b-1-fund.asp
https://www.investopedia.com/terms/1/12b-1plan.asp
https://www.investopedia.com/terms/1/130-30_strategy.asp
https://www.investopedia.com/terms/1/18hour-city.asp
https://www.investopedia.com/terms/1/183-day-rule.asp
https://www.investopedia.com/terms

In [36]:

def extract_term_links(letter_html):
    soup = BeautifulSoup(letter_html, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "/terms/" in href and href.endswith(".asp"):
            links.add(href)
    return list(links)

def extract_term_definition(html):
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find("h1")
    term = title.get_text(strip=True) if title else "No Title Found"

    content = soup.find("div", class_="comp mntl-sc-page mntl-block") or \
              soup.find("div", class_="article-body")

    definition = content.get_text(separator="\n", strip=True) if content else "No Definition Found"
    return term, definition

def save_to_markdown(letter, terms):
    filepath = os.path.join(OUTPUT_DIR, f"{letter}.md")
    with open(filepath, "w", encoding="utf-8") as f:
        for term, definition in terms:
            f.write(f"## {term}\n\n{definition}\n\n")

records_done = ['0-9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O']

for key in sorted(section_links):
    print(f"Processing section: {key}")
    if key not in records_done:
        try:
            html = requests.get(section_links[key], headers=HEADERS).text
            term_links = extract_term_links(html)

            terms = []
            for term_url in term_links:
                try:
                    term_html = requests.get(term_url, headers=HEADERS).text
                    term, definition = extract_term_definition(term_html)
                    terms.append((term, definition))
                    time.sleep(1)
                except Exception as e:
                    print(f"Failed to fetch term: {term_url} — {e}")
            save_to_markdown(key, terms)
        except Exception as e:
            print(f"Failed to process section {key}: {e}")


Processing section: 0-9
Processing section: A
Processing section: B
Processing section: C
Processing section: D
Processing section: E
Processing section: F
Processing section: G
Processing section: H
Processing section: I
Processing section: J
Processing section: K
Processing section: L
Processing section: M
Processing section: N
Processing section: O
Processing section: P
Processing section: Q
Processing section: R
Failed to fetch term: https://www.investopedia.com/terms/r/riskreversal.asp — HTTPSConnectionPool(host='www.investopedia.com', port=443): Read timed out. (read timeout=None)
Processing section: S
Processing section: T
Processing section: U
Processing section: V
Processing section: W
Processing section: X
Processing section: Y
Processing section: Z
