In [23]:
%pip install datasets markdownify html2text

Note: you may need to restart the kernel to use updated packages.


In [24]:
import requests
import html2text
import markdownify
import pandas as pd
import concurrent.futures
from datasets import Dataset
from bs4 import BeautifulSoup

In [25]:
# Entry point of the crawl: the Investopedia home page.
url = "https://www.investopedia.com/"

# Headers sent with every request; only a desktop-Chrome User-Agent is set.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"

In [26]:
# Fetch the home page once for the module-level `response`.
# requests has no default timeout, so pass one to keep a stalled connection
# from hanging the kernel; raise_for_status() aborts on an HTTP error instead
# of letting an error page be parsed as if it were the home page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

In [27]:
# Parse the fetched home page with the standard-library "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [28]:
top_links = []
def get_nav_links(timeout=30):
    """
    Collect navbar links from the home page and from each top-level nav page.

    Fetches ``url``, takes the ``href`` of the first anchor inside every
    ``li.header-nav__list-item``, then fetches each of those pages and takes
    the ``href`` of the first anchor inside every
    ``li.header-nav__sublist-item``. Any ``href`` containing "simulator" or
    "academy" is skipped. Results are appended to the module-level
    ``top_links`` list; nothing is returned.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.RequestException: If a request fails or times out, or the
            home page answers with an HTTP error status.
    """
    def _wanted_hrefs(page, item_class):
        """Return the hrefs of ``li`` items with ``item_class``, minus excluded sections."""
        hrefs = []
        for item in page.find_all("li", class_=item_class):
            anchor = item.a
            # A list item without an anchor (or an anchor without href) is
            # skipped rather than crashing the whole crawl.
            href = anchor.get("href") if anchor is not None else None
            if href and "simulator" not in href and "academy" not in href:
                hrefs.append(href)
        return hrefs

    response = requests.get(url, headers=headers, timeout=timeout)
    # Nothing useful can be collected from an error page, so fail loudly here.
    response.raise_for_status()
    home_page = BeautifulSoup(response.text, "html.parser")
    nav_top_links = _wanted_hrefs(home_page, "header-nav__list-item")
    top_links.extend(nav_top_links)

    for link in nav_top_links:
        response = requests.get(link, headers=headers, timeout=timeout)
        section_page = BeautifulSoup(response.text, "html.parser")
        top_links.extend(_wanted_hrefs(section_page, "header-nav__sublist-item"))
    

In [29]:
# Fill top_links with the navbar and sub-navbar links (performs HTTP requests).
get_nav_links()

In [30]:
morein_links = []
def get_morein_links(url, timeout=30):
    """
    Append the sibling-node links found on ``url`` to ``morein_links``.

    The page is fetched and every ``<a>`` whose class attribute is
    ``"mntl-taxonomysc-sibling-node mntl-text-link"`` contributes its
    ``href`` to the module-level ``morein_links`` list. Nothing is returned.

    Args:
        url: Page to scan.
        timeout: Seconds to wait for the HTTP response; without it a stalled
            connection would block its worker thread indefinitely.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    page = BeautifulSoup(response.text, "html.parser")
    # href=True restricts the match to anchors that actually carry an href,
    # so the lookup below cannot raise KeyError.
    sibling_anchors = page.find_all(
        "a", class_="mntl-taxonomysc-sibling-node mntl-text-link", href=True
    )
    morein_links.extend(anchor["href"] for anchor in sibling_anchors)

In [31]:
# Maximum number of worker threads used by the ThreadPoolExecutor cells.
num_workers = 30

In [32]:
# Executor.map() only re-raises a worker's exception when its result is
# retrieved, so an unconsumed map() hides every failure. Submit the tasks
# explicitly and report each page that could not be processed; the crawl
# itself stays best-effort and keeps going.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    pending = {executor.submit(get_morein_links, link): link for link in top_links}
    for future in concurrent.futures.as_completed(pending):
        error = future.exception()
        if error is not None:
            print(f"{pending[future]} -> {error!r}")

In [33]:
# Add the links gathered by get_morein_links to the crawl frontier (in place).
top_links.extend(morein_links)

In [34]:
# Drop duplicate URLs; the resulting order is whatever the set yields.
top_links = [*{*top_links}]

In [35]:
footer_links = []
def get_footer_links():
    """
    Append the terms-bar links of the parsed home page to ``footer_links``.

    Reads the module-level ``soup`` instead of fetching anything itself, and
    takes the ``href`` of the first anchor inside every ``<li>`` whose class
    attribute is ``"comp terms-bar__item mntl-block"``. Nothing is returned.
    """
    terms_bar_items = soup.find_all("li", class_="comp terms-bar__item mntl-block")
    footer_links.extend(item.a["href"] for item in terms_bar_items)

In [36]:
# Collect the terms-bar links from the already-parsed home page into footer_links.
get_footer_links()

In [37]:
def get_faq_links(url, timeout=30):
    """
    Return the unique FAQ feature links found on ``url``.

    Every ``<a>`` whose class attribute is
    ``"accordion-content__feature-link mntl-text-link"`` contributes its
    ``href``. Duplicates are removed and first-seen page order is kept, so
    the result is the same on every run for the same page.

    Args:
        url: Page to scan.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: De-duplicated hrefs in page order.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    page = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors that have no href attribute at all.
    faq_anchors = page.find_all(
        "a", class_="accordion-content__feature-link mntl-text-link", href=True
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(anchor["href"] for anchor in faq_anchors))

In [38]:
def get_key_term_links(url, timeout=30):
    """
    Return the unique key-term carousel links found on ``url``.

    Every ``<a>`` whose class attribute is
    ``"carousel-card__link mntl-text-link"`` contributes its ``href``.
    Duplicates are removed and first-seen page order is kept, so the result
    is the same on every run for the same page.

    Args:
        url: Page to scan.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: De-duplicated hrefs in page order.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    page = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors that have no href attribute at all.
    key_term_anchors = page.find_all(
        "a", class_="carousel-card__link mntl-text-link", href=True
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(anchor["href"] for anchor in key_term_anchors))

In [39]:
article_links = []
def get_article_links(url, timeout=30):
    """
    Append the article links found on ``url`` to ``article_links``.

    If the page contains document-card anchors, the key-term carousel links,
    the FAQ feature links and the card links (each group de-duplicated) are
    appended in that order. All three groups are read from the single parsed
    response, so the page is downloaded only once. If there are no card
    anchors, the hrefs of the ``dictionary-top300-list__list`` anchors are
    appended instead. Nothing is returned.

    Args:
        url: Listing page to scan.
        timeout: Seconds to wait for the HTTP response; without it a stalled
            connection would block its worker thread indefinitely.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    page = BeautifulSoup(response.text, "html.parser")

    def _unique_hrefs(css_class):
        """Return the de-duplicated hrefs of anchors whose class attribute is ``css_class``."""
        anchors = page.find_all("a", class_=css_class, href=True)
        return list(dict.fromkeys(anchor["href"] for anchor in anchors))

    card_links = _unique_hrefs(
        "comp mntl-card-list-items mntl-document-card mntl-card card card--no-image"
    )
    if card_links:
        key_term_links = _unique_hrefs("carousel-card__link mntl-text-link")
        faq_links = _unique_hrefs("accordion-content__feature-link mntl-text-link")
        article_links.extend(key_term_links + faq_links + card_links)
    else:
        # Fallback for pages without cards; this group is not de-duplicated here.
        dictionary_anchors = page.find_all(
            "a", class_="dictionary-top300-list__list mntl-text-link", href=True
        )
        article_links.extend(anchor["href"] for anchor in dictionary_anchors)

In [40]:
# Executor.map() only re-raises a worker's exception when its result is
# retrieved, so an unconsumed map() hides every failure. Submit the tasks
# explicitly and report each page that could not be processed; the crawl
# itself stays best-effort and keeps going.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    pending = {executor.submit(get_article_links, link): link for link in top_links}
    for future in concurrent.futures.as_completed(pending):
        error = future.exception()
        if error is not None:
            print(f"{pending[future]} -> {error!r}")

In [41]:
# De-duplicate and sort: a bare list(set(...)) yields an arbitrary order that
# changes between kernel sessions, which would make the row order of the
# final dataset irreproducible.
article_links = sorted(set(article_links))

In [42]:
titles = []
def get_article_title(url, timeout=30):
    """
    Append the title of the article at ``url`` to ``titles``.

    The text of ``<h1 class="article-heading">`` is used when present,
    otherwise the text of the page's ``<title>`` element. Exactly one entry
    is appended per call: the stripped title on success, ``None`` on any
    failure. This keeps ``titles`` the same length as the list of URLs it is
    called with, even when individual pages fail.

    Args:
        url: Article page to read.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or
            returns an HTTP error status.
        ValueError: If the page has neither an article heading nor a
            ``<title>`` element.
    """
    title_text = None
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # An error page would otherwise be recorded as the article's title.
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")
        heading = page.find("h1", class_="article-heading")
        if heading is None:
            heading = page.find("title")
        if heading is None:
            raise ValueError(f"no article heading or <title> found at {url}")
        title_text = heading.text.strip()
    finally:
        # Runs on success and on failure (the exception still propagates),
        # so the placeholder keeps titles aligned with the URL list.
        titles.append(title_text)

In [43]:
# Start from an empty list so re-running this cell does not append a second
# copy of every title.
titles.clear()
for link in article_links:
    try:
        get_article_title(link)
    except Exception as exc:
        # `except Exception` rather than a bare `except`: a kernel interrupt
        # (KeyboardInterrupt) must be able to stop this long-running loop.
        print(f"{link} -> {exc!r}")

In [None]:
html_content = []
md_content = []
clean_content = []
def get_article_content(url, timeout=30):
    """
    Append the body of the article at ``url`` to the three content lists.

    The ``<div>`` whose class attribute is
    ``"comp mntl-sc-page mntl-block article-body-content"`` is rendered
    three ways: its HTML string (``html_content``), its Markdown conversion
    with ATX headings (``md_content``), and the space-joined text of its
    direct children (``clean_content``).

    Exactly one entry is appended to each list per call: the rendered values
    on success, ``None`` in all three on any failure. The lists therefore
    always stay aligned with each other and with the list of URLs the
    function is called with.

    Args:
        url: Article page to read.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or
            returns an HTTP error status.
        ValueError: If the article body element is not present on the page.
    """
    html_text = md_text = clean_text = None
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")
        content = page.find("div", class_="comp mntl-sc-page mntl-block article-body-content")
        if content is None:
            raise ValueError(f"article body not found at {url}")
        # Build all three renderings before assigning any of them, so a
        # failure part-way through leaves every slot at None.
        rendered = (
            f"{content}",
            markdownify.markdownify(f"{content}", heading_style="ATX"),
            " ".join([c.text for c in content]),
        )
        html_text, md_text, clean_text = rendered
    finally:
        # Runs on success and on failure (the exception still propagates).
        html_content.append(html_text)
        md_content.append(md_text)
        clean_content.append(clean_text)

In [None]:
# Start from empty lists so re-running this cell does not append a second
# copy of every article.
for bucket in (html_content, md_content, clean_content):
    bucket.clear()
for link in article_links:
    try:
        get_article_content(link)
    except Exception as exc:
        # `except Exception` rather than a bare `except`: a kernel interrupt
        # (KeyboardInterrupt) must be able to stop this long-running loop.
        print(f"{link} -> {exc!r}")

In [None]:
# Column name -> list of values; every list is expected to hold one entry per article.
dataset_pd = dict(
    url=article_links,
    title=titles,
    html_content=html_content,
    md_content=md_content,
    clean_content=clean_content,
)

In [None]:
# Assemble the scraped columns into a single table.
df = pd.DataFrame(data=dataset_pd)

In [None]:
# Show the assembled DataFrame as this cell's rich output.
df

In [None]:
# dataset = Dataset.from_pandas(df)

In [None]:
# dataset.push_to_hub("openvega-simon/investopedia", private=True, token=True, max_shard_size="2GB")