In [3]:
%pip install datasets markdownify html2text huggingface_hub ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
import html2text
import markdownify
import pandas as pd
import concurrent.futures
from datasets import Dataset
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from huggingface_hub import notebook_login

In [5]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
url = "https://www.investopedia.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
}

In [7]:
response = requests.get(url, headers=headers)

In [8]:
soup = BeautifulSoup(response.text, "html.parser")

In [9]:
top_links = []
def get_nav_links():
    """
    Gets all the links from the navbar.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    nav_links = soup.find_all("li", class_="header-nav__list-item")
    nav_top_links = []
    for link in nav_links:
        if "simulator" not in link.a["href"] and "academy" not in link.a["href"]:
            nav_top_links.append(link.a["href"])
            top_links.append(link.a["href"])
    for link in nav_top_links:
        response = requests.get(link, headers=headers)
        soup_sublink = BeautifulSoup(response.text, "html.parser")
        sub_links = soup_sublink.find_all("li", class_="header-nav__sublist-item")
        for sub_link in sub_links:
            if "simulator" not in sub_link.a["href"] and "academy" not in sub_link.a["href"]:
                top_links.append(sub_link.a["href"])
    

In [10]:
get_nav_links()

In [11]:
morein_links = []
def get_morein_links(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    morein_links_bottom = soup.find_all("a", class_="mntl-taxonomysc-sibling-node mntl-text-link")
    for morein_link in morein_links_bottom:
        morein_links.append(morein_link["href"])

In [12]:
num_workers = 30

In [13]:
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    executor.map(get_morein_links, top_links)

In [14]:
top_links.extend(morein_links)

In [15]:
top_links = list(set(top_links))

In [16]:
footer_links = []
def get_footer_links():
    footer_links_btm = soup.find_all("li", class_="comp terms-bar__item mntl-block")
    for link in footer_links_btm:
        footer_links.append(link.a["href"])

In [17]:
get_footer_links()

In [18]:
def get_faq_links(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    faq_links = soup.find_all("a", class_="accordion-content__feature-link mntl-text-link")
    return list(set([link["href"] for link in faq_links]))

In [19]:
def get_key_term_links(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    key_term_links = soup.find_all("a", class_="carousel-card__link mntl-text-link")
    return list(set([link["href"] for link in key_term_links]))

In [20]:
article_links = []
def get_article_links(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all('a', class_="comp mntl-card-list-items mntl-document-card mntl-card card card--no-image")
    if links:
        faq_links = get_faq_links(url)
        key_term_links = get_key_term_links(url)
        article_links.extend(key_term_links + faq_links + list(set([link["href"] for link in links])))
    else:
        footerlinks = soup.find_all("a", class_="dictionary-top300-list__list mntl-text-link")
        article_links.extend([link["href"] for link in footerlinks])

In [21]:
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    executor.map(get_article_links, top_links)

In [22]:
article_links = list(set(article_links))

In [23]:
titles = []
def get_article_title(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.find("h1", class_="article-heading")
    if title:
        titles.append(title.text.strip())
    else:
        title = soup.find("title")
        titles.append(title.text.strip())

In [36]:
html_content = []
md_content = []
clean_content = []
def get_article_content(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    article = soup.find("div", class_="comp mntl-sc-page mntl-block article-body-content")
    review = soup.find("div", class_="article-body article-content")
    suggestion = soup.find("div", class_="article-content")
    if article:
        html_content.append(f"{article}")
        md_content.append(markdownify.markdownify(f"{article}", heading_style="ATX"))
        clean_content.append(" ".join([c.text for c in article]))
    elif review:
        html_content.append(f"{review}")
        md_content.append(markdownify.markdownify(f"{review}", heading_style="ATX"))
        clean_content.append(" ".join([c.text for c in review]))
    elif suggestion:
        html_content.append(f"{suggestion}")
        md_content.append(markdownify.markdownify(f"{suggestion}", heading_style="ATX"))
        clean_content.append(" ".join([c.text for c in suggestion]))

In [None]:
for link in tqdm(article_links):
    try:
        get_article_title(link)
        get_article_content(link)
    except Exception as e:
        print(e, link)

In [55]:
print(len(titles), len(html_content), len(md_content), len(clean_content))

4720 4 4 4


In [39]:
dataset_pd = {
    "url": article_links,
    "title": titles,
    "html_content": html_content,
    "md_content": md_content,
    "clean_content": clean_content
}

In [46]:
print(len(article_links), len(titles), len(html_content), len(md_content), len(clean_content))

4715 4715 4715 4715 4715


In [47]:
df = pd.DataFrame(dataset_pd)

In [48]:
df

Unnamed: 0,url,title,html_content,md_content,clean_content
0,https://www.investopedia.com/articles/exchange...,Building an All-ETF Portfolio,"<div class=""comp mntl-sc-page mntl-block artic...",The emergence of [exchange-traded funds](http...,The emergence of exchange-traded funds (ETFs)...
1,https://www.investopedia.com/gdp-growth-slowed...,Economic Growth Surprisingly Slowed To 1.6% In...,"<div class=""comp mntl-sc-page mntl-block artic...",### Key Takeaways\n\n\n* The U.S. GDP grew at ...,Key Takeaways\nThe U.S. GDP grew at an annual ...
2,https://www.investopedia.com/fallen-pandemic-s...,Fallen Pandemic Star Carvana Roars Back into G...,"<div class=""comp mntl-sc-page mntl-block artic...",### Key Takeaways\n\n\n* Carvana set profit an...,Key Takeaways\nCarvana set profit and sales re...
3,https://www.investopedia.com/top-cds-today-mar...,"Top CDs Today, March 27, 2024: Top Rates Range...","<div class=""comp mntl-sc-page mntl-block artic...",### Key Takeaways\n\n\n* The best nationally a...,Key Takeaways\n\nThe best nationally available...
4,https://www.investopedia.com/us-prosecutors-se...,US Prosecutors Seek Up To 50-Year Prison Sente...,"<div class=""comp mntl-sc-page mntl-block artic...",### Key Takeaways\n\n\n* Prosecutors in former...,Key Takeaways\nProsecutors in former FTX CEO S...
...,...,...,...,...,...
4710,spam,"Indirect Loan: What it is, How it Works, Examples",spam,spam,spam
4711,spam,What Is a Fiduciary Duty? Examples and Types E...,spam,spam,spam
4712,spam,Inverse Relation Between Interest Rates and Bo...,spam,spam,spam
4713,spam,Which Companies Are Winning and Losing as Infl...,spam,spam,spam


In [49]:
dataset = Dataset.from_pandas(df)

In [50]:
dataset.push_to_hub("openvega-simon/investopedia", token=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/openvega-simon/investopedia/commit/6f694d4676d8f4132502804cf6b258368ff53599', commit_message='Upload dataset', commit_description='', oid='6f694d4676d8f4132502804cf6b258368ff53599', pr_url=None, pr_revision=None, pr_num=None)

In [53]:
len(article_links)

4715