In [48]:
from bs4 import BeautifulSoup

def clean_text(text):
    """Collapse every run of whitespace (newlines included) into one space.

    Leading and trailing whitespace is dropped as a consequence of the
    split/join round trip.
    """
    # str.split() with no separator already treats "\n" like any other
    # whitespace, so a single split/join covers both normalisation steps.
    return " ".join(text.split())

def parse_section(html):
    """Extract the accordion subsections and their article links from a section page.

    Accordion headings (the ``<h2>`` inside a ``<button>``) are paired, in
    document order, with the ``<div>`` elements whose ``data-testid`` starts
    with ``accordion-content-``.

    Args:
        html: HTML markup of the section page.

    Returns:
        A list of ``{'order', 'heading', 'articles'}`` dicts. ``order`` is the
        1-based position of the subsection; ``articles`` is a list of
        ``{'title', 'link', 'order'}`` dicts where ``order`` is the 1-based
        position of the article among the links kept for that subsection.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Only buttons that wrap an <h2> are accordion headers. Dropping any other
    # button avoids a crash on `None.text` and keeps headings aligned with
    # their content panels.
    heading_tags = [
        h2 for h2 in (button.find('h2') for button in soup.find_all('button'))
        if h2 is not None
    ]
    accordion_contents = soup.find_all(
        'div', {'data-testid': lambda x: x and x.startswith('accordion-content-')}
    )

    parsed_data = []
    for i, (heading_tag, content) in enumerate(zip(heading_tags, accordion_contents), 1):
        articles = []
        for li in content.find_all('li'):
            a = li.find('a')
            # A list item without a link target cannot be scraped; skip it
            # instead of failing the whole section.
            if a is None or a.get('href') is None:
                continue
            articles.append(dict(title=clean_text(a.text), link=a['href'], order=len(articles) + 1))
        parsed_data.append(dict(order=i, heading=clean_text(heading_tag.text), articles=articles))
    return parsed_data


In [49]:
import importlib.util
import os

# Build one record per scraped section. Each section folder holds a
# `source.py` module from which `link`, `heading`, `title` and `html` are read.
data = []

for section_folder_name in [
    '01-sending-money',
    '02-managing-your-account',
    '03-holding-money',
    '04-wise-card',
    '05-receiving-money',
    '06-wise-business',
]:
    path = 'scraped-data/sections/' + section_folder_name
    source_path = path + '/source.py'

    # Import the section's source.py straight from its file path.
    module_name = os.path.splitext(os.path.basename(source_path))[0]
    spec = importlib.util.spec_from_file_location(module_name, source_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # The two-digit folder prefix is the section's display order.
    order_str = section_folder_name[:2]
    data.append({
        'path': path,
        'order': int(order_str),
        'link': module.link,
        'heading': module.heading,
        'title': module.title,
        'subsections': parse_section(module.html),
    })

In [50]:
# Preview the first 500 characters of the first section record.
str(data[0])[:500]

"{'path': 'scraped-data/sections/01-sending-money', 'order': 1, 'link': 'https://wise.com/help/topics/5bVKT0uQdBrDp6T62keyfz/sending-money', 'heading': 'Sending money', 'title': 'Setting up, paying for, editing, and cancelling transfers.', 'subsections': [{'order': 1, 'heading': 'Sending money basics', 'articles': [{'title': 'How do I send money with Wise?', 'link': '/help/articles/2977959/how-do-i-send-money-with-wise', 'order': 1}, {'title': 'How long does a transfer take?', 'link': '/help/arti"

In [51]:
# Preview the first 500 characters of the first subsection of the first section.
str(data[0]['subsections'][0])[:500]

"{'order': 1, 'heading': 'Sending money basics', 'articles': [{'title': 'How do I send money with Wise?', 'link': '/help/articles/2977959/how-do-i-send-money-with-wise', 'order': 1}, {'title': 'How long does a transfer take?', 'link': '/help/articles/2524078/how-long-does-a-transfer-take', 'order': 2}, {'title': 'Can I send exact amounts?', 'link': '/help/articles/2448314/can-i-send-exact-amounts', 'order': 3}, {'title': 'How do you notify me about a transfer?', 'link': '/help/articles/2553293/ho"

In [52]:
# Show the first article entry of the first subsection of the first section.
data[0]['subsections'][0]['articles'][0]

{'title': 'How do I send money with Wise?',
 'link': '/help/articles/2977959/how-do-i-send-money-with-wise',
 'order': 1}

In [53]:
# Article count per section, keyed by the section's order number.
sanity_check = {
    section_record['order']: sum(
        len(subsection_record['articles'])
        for subsection_record in section_record['subsections']
    )
    for section_record in data
}


In [54]:
# Total number of articles across all sections (sum of the per-section counts).
sum(sanity_check.values())

289

In [55]:
import csv
from io import StringIO
import requests
import html2text
# Shared html2text converter used by get_article_content to turn article HTML into Markdown.
converter = html2text.HTML2Text()
# Don't want to deal with inline links
converter.ignore_links = True
# Ignore images
converter.ignore_images = True
# Ignore tables: get_article_content swaps every <table> for a text marker and
# renders the table as CSV itself before the converter runs.
converter.ignore_tables = True
converter.body_width = 0  # Disable line wrapping

# Prefix prepended to the site-relative article links before they are fetched or stored.
base_url = "https://wise.com"

# Declared `global` inside get_article_content, which overwrites it with the
# CSV text of each table it processes.
table_csv_getvalue = ''


def get_article_content(url, timeout=30):
    """Download an article page and convert its body to Markdown.

    Every ``<table>`` in the article body is rendered as CSV text and spliced
    into the Markdown between ``--- CSV table begins ---`` and
    ``--- CSV table ends ---`` lines, because the shared ``converter`` is
    configured to ignore tables.

    Args:
        url: Absolute URL of the article page.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        A tuple ``(html, markdown, related_articles)``:
        ``html`` is the parsed ``div.article-content`` element (with each table
        replaced by its marker text), ``markdown`` is the stripped Markdown
        string, and ``related_articles`` is a list of ``{"title", "link"}``
        dicts (empty when the page has no related-articles list).

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page contains no ``div.article-content`` element.

    Side effects:
        Overwrites the module-level ``table_csv_getvalue`` with the CSV text of
        the last table processed, and prints a line when tables are found.
    """
    global table_csv_getvalue
    # Without a timeout a stalled connection would hang the whole crawl.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    html = soup.find("div", {"class": "article-content"})
    if html is None:
        raise ValueError(f"No article content found at {url}")

    related_articles = []
    related_articles_section = soup.find("ul", {"class": "css-1mcz8c5"})
    # The related-articles list is optional: a page without it (or without
    # this exact class name) simply yields no related articles.
    if related_articles_section is not None:
        for li in related_articles_section.find_all("li"):
            a = li.find("a")
            if a is None or a.get("href") is None:
                continue
            related_articles.append({
                "title": a.get_text().strip(),
                "link": base_url + a["href"]
            })

    # Convert tables to CSV and replace them with markers
    tables = html.find_all("table")
    csv_tables = []
    if tables:
        print(f'Found {len(tables)} tables in {url}')
    for i, table in enumerate(tables):
        table_csv = StringIO()
        csv_writer = csv.writer(table_csv, lineterminator='\n')

        for row in table.find_all("tr"):
            cells = row.find_all(["th", "td"])
            csv_writer.writerow([cell.get_text().strip() for cell in cells])
        # Two trailing spaces before each newline force a Markdown line break.
        table_csv_getvalue = table_csv.getvalue().replace("\n", "  \n")
        csv_tables.append(table_csv_getvalue)
        table.replace_with(f"CSV_TABLE_MARKER_{i}")  # Place a marker

    # Convert HTML to markdown
    markdown_content = converter.handle(str(html))

    # Replace markers with CSV tables, highest index first: "CSV_TABLE_MARKER_1"
    # is a prefix of "CSV_TABLE_MARKER_10", so substituting in ascending order
    # would corrupt the markers of tables 10 and above.
    for i in reversed(range(len(csv_tables))):
        markdown_content = markdown_content.replace(
            f"CSV_TABLE_MARKER_{i}",
            "\n --- CSV table begins ---" + "\n" + csv_tables[i] + " --- CSV table ends ---",
        )

    return html, markdown_content.strip(), related_articles


In [56]:
from slugify import slugify
import json

# Fetch every article and store it as
# <section>/subsections/NN-<subsection>/articles/NN-<article>/{content.md, metadata.json}.
for section in data:
    os.makedirs(section['path'] + '/subsections/', exist_ok=True)
    for subsection in section['subsections']:
        subsection_folder = section['path'] + '/subsections/' + str(subsection['order']).zfill(2) + '-' + slugify(subsection['heading'])
        # makedirs creates the subsection folder itself as a parent.
        os.makedirs(subsection_folder + '/articles', exist_ok=True)
        for article in subsection['articles']:
            article_folder_name = slugify(article['title'])
            article_folder_path = subsection_folder + '/articles/' + str(article['order']).zfill(2) + '-' + article_folder_name
            content_path = article_folder_path + '/' + 'content.md'
            metadata_path = article_folder_path + '/' + 'metadata.json'
            # An article counts as done only when both output files exist. A
            # bare folder left behind by a failed fetch must not cause the
            # article to be skipped on the next run.
            if os.path.exists(content_path) and os.path.exists(metadata_path):
                continue
            article_url = base_url + article["link"]
            # Fetch before touching the filesystem so a network or parsing
            # error leaves no empty folder behind.
            _, md, related_articles = get_article_content(article_url)
            os.makedirs(article_folder_path, exist_ok=True)
            # Explicit UTF-8: article text may contain characters the
            # platform's default encoding cannot represent.
            with open(content_path, 'w', encoding='utf-8') as f:
                f.write('# ' + article['title']+'\n\n'+md)
            with open(metadata_path, 'w', encoding='utf-8') as f:
                metadata = dict(title=article['title'], link=article_url, related_articles=related_articles)
                json.dump(metadata, f, indent=4)


Found 1 tables in https://wise.com/help/articles/2571942/what-countries-can-i-send-to
Found 2 tables in https://wise.com/help/articles/2973523/paying-for-a-transfer-with-apple-or-google-pay
Found 1 tables in https://wise.com/help/articles/2977942/how-much-does-it-cost-to-send-large-transfers
Found 2 tables in https://wise.com/help/articles/2932638/how-long-do-you-store-my-data-for
Found 1 tables in https://wise.com/help/articles/2978082/what-fee-am-i-charged-to-hold-large-amounts-of-eur
Found 3 tables in https://wise.com/help/articles/31thpWvBl38OL54poT6VAE/who-can-use-assets
Found 1 tables in https://wise.com/help/articles/4vA3EJSQTbRlgQTQpoVQUo/what-taxes-do-i-need-to-pay-when-i-use-assets
Found 1 tables in https://wise.com/help/articles/5kdpzaHyM0pla2j2YzgeFj/managing-your-balance-or-jar-when-its-held-as-interest
Found 2 tables in https://wise.com/help/articles/10HD8UtqMLH89OPpjdVuu2/what-fees-do-i-pay-when-i-hold-money-in-an-interest-earning-fund
Found 7 tables in https://wise.com/