In [1]:
from bs4 import BeautifulSoup


def clean_text(text):
    """Return ``text`` with every run of whitespace squeezed into one space.

    Newlines count as whitespace, so they are flattened as well, and any
    leading or trailing whitespace disappears in the process.
    """
    # split() with no argument breaks on any whitespace (including "\n") and
    # discards empty pieces; joining the pieces yields single-spaced text.
    return " ".join(text.split())


def parse_section(html):
    """Extract the subsections and their article links from a section page.

    Each ``<button>`` that contains an ``<h2>`` is paired, in document order,
    with the ``<div>`` elements whose ``data-testid`` starts with
    ``accordion-content-``. The ``<h2>`` text becomes the subsection heading
    and every linked ``<li>`` inside the div becomes an article.

    Args:
        html: HTML markup of the section page.

    Returns:
        A list of dicts ``{'order', 'heading', 'articles'}`` where ``articles``
        is a list of ``{'title', 'link', 'order'}`` dicts. Both ``order``
        values are 1-based positions.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Buttons without an <h2> cannot supply a heading; dropping them up front
    # avoids an AttributeError and keeps the pairing with the content divs aligned.
    buttons = [button for button in soup.find_all('button') if button.find('h2') is not None]
    accordion_contents = soup.find_all('div', {'data-testid': lambda x: x and x.startswith('accordion-content-')})
    parsed_data = []
    for i, (button, content) in enumerate(zip(buttons, accordion_contents), 1):
        heading = button.find('h2').text
        articles = []
        for li in content.find_all('li'):
            a = li.find('a')
            # A list item without a link is not an article - skip it instead of crashing.
            if a is None or a.get('href') is None:
                continue
            # Number only the articles that are kept so the order stays contiguous.
            articles.append(dict(title=clean_text(a.text), link=a['href'], order=len(articles) + 1))
        parsed_data.append(dict(order=i, heading=clean_text(heading), articles=articles))
    return parsed_data


In [16]:
import importlib.util
import os

data = []

section_folder_names = [
    '01-sending-money',
    '02-managing-your-account',
    '03-holding-money',
    '04-wise-card',
    '05-receiving-money',
    '06-wise-business',
]

for section_folder_name in section_folder_names:
    # Folder of the section; its source.py holds the link/heading/title/html attributes read below
    path = 'scraped-data/sections/' + section_folder_name
    source_path = path + '/source.py'

    # Load source.py as a module straight from its file path
    module_name = os.path.splitext(os.path.basename(source_path))[0]
    spec = importlib.util.spec_from_file_location(module_name, source_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    data.append({
        'path': path,
        # The two leading characters of the folder name are the section's position
        'order': int(section_folder_name[:2]),
        'link': module.link,
        'heading': module.heading,
        'title': module.title,
        'subsections': parse_section(module.html),
    })

In [17]:
# Preview the first 500 characters of the first section's record
str(data[0])[:500]

"{'path': 'scraped-data/sections/01-sending-money', 'order': 1, 'link': 'https://wise.com/help/topics/5bVKT0uQdBrDp6T62keyfz/sending-money', 'heading': 'Sending money', 'title': 'Setting up, paying for, editing, and cancelling transfers.', 'subsections': [{'order': 1, 'heading': 'Sending money basics', 'articles': [{'title': 'How do I send money with Wise?', 'link': '/help/articles/2977959/how-do-i-send-money-with-wise', 'order': 1}, {'title': 'How long does a transfer take?', 'link': '/help/arti"

In [18]:
# Preview the first 500 characters of the first subsection of the first section
str(data[0]['subsections'][0])[:500]

"{'order': 1, 'heading': 'Sending money basics', 'articles': [{'title': 'How do I send money with Wise?', 'link': '/help/articles/2977959/how-do-i-send-money-with-wise', 'order': 1}, {'title': 'How long does a transfer take?', 'link': '/help/articles/2524078/how-long-does-a-transfer-take', 'order': 2}, {'title': 'Can I send exact amounts?', 'link': '/help/articles/2448314/can-i-send-exact-amounts', 'order': 3}, {'title': 'How do you notify me about a transfer?', 'link': '/help/articles/2553293/ho"

In [19]:
# Show the first article entry of the first subsection of the first section
data[0]['subsections'][0]['articles'][0]

{'title': 'How do I send money with Wise?',
 'link': '/help/articles/2977959/how-do-i-send-money-with-wise',
 'order': 1}

In [20]:
# Number of articles per section (added up over its subsections), keyed by the section's order
sanity_check = {
    section_entry['order']: sum(len(sub['articles']) for sub in section_entry['subsections'])
    for section_entry in data
}


In [21]:
# Total number of article entries across all sections (sum of the per-section counts)
sum(sanity_check.values())

289

In [34]:
import csv
from io import StringIO
import requests
import html2text

# Shared HTML -> Markdown converter used by get_article_content
converter = html2text.HTML2Text()
# Don't want to deal with inline links
converter.ignore_links = True
# Ignore images
converter.ignore_images = True
# Ignore tables (get_article_content swaps each <table> for a CSV rendering before converting)
converter.ignore_tables = True
converter.body_width = 0  # Disable line wrapping

# Site origin; prefixed to the site-relative article links to build full URLs
base_url = "https://wise.com"


def get_article_content(url, timeout=30):
    """Download an article page and convert its body to Markdown.

    Every ``<table>`` inside the article body is rendered as CSV and inserted
    into the Markdown between ``--- CSV table begins ---`` and
    ``--- CSV table ends ---`` lines; the module-level ``converter`` handles
    the rest of the markup.

    Args:
        url: Absolute URL of the article page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the run forever.

    Returns:
        A tuple ``(html, markdown, related_articles)``:
        ``html`` is the ``div.article-content`` element (with its tables
        replaced by marker strings), ``markdown`` is the stripped Markdown
        text and ``related_articles`` is a list of ``{"title", "link"}`` dicts
        with absolute links. ``(None, None, None)`` is returned when the page
        has no ``div.article-content`` element.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    html = soup.find("div", {"class": "article-content"})
    if not html:
        print(f"Could not find article content in {url}")
        return None, None, None
    related_articles = []
    related_articles_section = soup.find("ul", {"class": "css-1mcz8c5"})
    if related_articles_section:
        for li in related_articles_section.find_all("li"):
            a = li.find("a")
            # Skip list items that carry no link instead of raising on them
            if a is None or a.get("href") is None:
                continue
            related_articles.append({
                "title": a.get_text().strip(),
                "link": base_url + a["href"]
            })

    # Convert tables to CSV and replace them with markers
    tables = html.find_all("table")
    csv_tables = []
    if tables:
        print(f'Found {len(tables)} tables in {url}')
    for i, table in enumerate(tables):
        table_csv = StringIO()
        csv_writer = csv.writer(table_csv, lineterminator='\n')

        for row in table.find_all("tr"):
            cells = row.find_all(["th", "td"])
            csv_writer.writerow([cell.get_text().strip() for cell in cells])
        # Two trailing spaces before the newline force a Markdown line break per CSV row
        csv_tables.append(table_csv.getvalue().replace("\n", "  \n"))
        table.replace_with(f"CSV_TABLE_MARKER_{i}")  # Place a marker

    # Convert HTML to markdown
    markdown_content = converter.handle(str(html))

    # Replace markers with CSV tables, highest index first: "CSV_TABLE_MARKER_1"
    # is a prefix of "CSV_TABLE_MARKER_10", so substituting in ascending order
    # would corrupt the markers of tables 10 and above.
    for i in reversed(range(len(csv_tables))):
        markdown_content = markdown_content.replace(
            f"CSV_TABLE_MARKER_{i}",
            "\n--- CSV table begins ---" + "  \n" + csv_tables[i] + "--- CSV table ends ---  \n")

    return html, markdown_content.strip(), related_articles


In [8]:
from slugify import slugify
import json

# Fetch every article listed in `data` and store it in a folder tree of the form
#   <section path>/subsections/<NN>-<subsection slug>/articles/<NN>-<article slug>/
# Each article folder receives content.md and metadata.json. The folder paths are
# also recorded on the subsection/article dicts under 'folder_path'.
for section in data:
    path_subsections_ = section['path'] + '/subsections/'
    os.makedirs(path_subsections_, exist_ok=True)
    for subsection in section['subsections']:
        subsection_folder = path_subsections_ + str(subsection['order']).zfill(2) + '-' + slugify(subsection['heading'])
        os.makedirs(subsection_folder, exist_ok=True)
        subsection['folder_path'] = subsection_folder
        os.makedirs(subsection_folder + '/articles', exist_ok=True)
        for article in subsection['articles']:
            article_folder_name = slugify(article['title'])
            article_folder_path = subsection_folder + '/articles/' + str(article['order']).zfill(
                2) + '-' + article_folder_name
            # add article_folder_path to article
            article['folder_path'] = article_folder_path
            os.makedirs(article_folder_path, exist_ok=True)
            article_url = base_url + article["link"]
            print(f'Fetching {article_url} and saving to {article_folder_path}')
            html, md, related_articles = get_article_content(article_url)
            if not html:
                # get_article_content found no article body (it already printed a notice);
                # writing the files anyway would store the text "None" as content.
                continue
            md_with_headings = f"# {subsection['heading']}  \n## {article['title']}  \n{md}"
            # Explicit UTF-8 so the output does not depend on the platform's default encoding
            with open(article_folder_path + '/' + 'content.md', 'w', encoding='utf-8') as f:
                f.write(md_with_headings)
            with open(article_folder_path + '/' + 'metadata.json', 'w', encoding='utf-8') as f:
                metadata = dict(title=article['title'], link=article_url, related_articles=related_articles)
                json.dump(metadata, f, indent=4)


NameError: name 'os' is not defined

In [2]:
## serialize data
import json

# Persist the collected section index as JSON indented by two spaces
with open('scraped-data/index.json', 'w') as index_file:
    index_file.write(json.dumps(data, indent=2))

NameError: name 'data' is not defined

In [5]:
import json
# Load the section index back from disk into `data`
with open("scraped-data/index.json", "r") as index_file:
    data = json.loads(index_file.read())

In [64]:
# Absolute URLs of every article listed in the section index
existing_articles = {
    base_url + listed_article["link"]
    for listed_section in data
    for listed_subsection in listed_section["subsections"]
    for listed_article in listed_subsection["articles"]
}
len(existing_articles)

288

In [65]:
# Walk through all folders and subfolders under scraped-data/sections and find every file called metadata.json.
# Open each one and read its related_articles list. Each element is a dict with "link" and "title" keys; every link
# (query string removed) that is not already in existing_articles is added to the related_articles dict, mapped to its title.
import os
# Maps the URL of each related article that is not in existing_articles to its title
related_articles = dict()

for root, dirs, files in os.walk('scraped-data/sections'):
    for file in files:
        if file == 'metadata.json':
            metadata_path = os.path.join(root, file)
            # A zero-byte metadata.json cannot be parsed; skip it, as the
            # related-articles walks in this notebook do.
            if os.stat(metadata_path).st_size == 0:
                continue
            with open(metadata_path) as f:
                metadata = json.load(f)
            # related_articles can be null in metadata.json (json serialises None
            # that way), so fall back to an empty list instead of iterating None.
            for related_article in metadata.get('related_articles') or []:
                # drop the query string
                cleaned_up_url = related_article['link'].split('?')[0]
                if cleaned_up_url not in existing_articles:
                    related_articles[cleaned_up_url] = related_article['title']


In [66]:
# Number of related-article URLs that are not part of existing_articles
len(related_articles)

146

In [67]:
# scrape the remaining articles
# (first make sure the output root for related articles exists)
os.makedirs('scraped-data/related-articles/', exist_ok=True)


def scrape_related(related_articles_not_scraped):
    """Fetch related articles and store them under scraped-data/related-articles/.

    For every article whose content can be extracted, a folder named after the
    slugified title is created containing ``content.md`` (the title as a
    second-level heading followed by the Markdown body) and ``metadata.json``
    (title, link and the article's own related articles). Articles for which
    ``get_article_content`` returns no content are skipped.

    Args:
        related_articles_not_scraped: Mapping of article URL to article title.

    Raises:
        Exception: If ``get_article_content`` raises ``AttributeError``; the
            original error is attached as the cause.
    """
    for article_url, article_title_ in related_articles_not_scraped.items():
        article_folder_name = slugify(article_title_)
        print(f'Fetching {article_url}')
        try:
            html, md, related_articles = get_article_content(article_url)
        except AttributeError as err:
            # Chain explicitly so the traceback still shows what failed inside the parser
            raise Exception(f'Failed to fetch {article_url}') from err
        if not html:
            continue
        full_folder_path = 'scraped-data/related-articles/' + article_folder_name
        print(f'Saving to {full_folder_path}')
        os.makedirs(full_folder_path, exist_ok=True)
        md_with_headings = f"## {article_title_}  \n{md}"
        # Explicit UTF-8 so the output does not depend on the platform's default encoding
        with open(full_folder_path + '/' + 'content.md', 'w', encoding='utf-8') as f:
            f.write(md_with_headings)
        with open(full_folder_path + '/' + 'metadata.json', 'w', encoding='utf-8') as f:
            metadata = dict(title=article_title_, link=article_url, related_articles=related_articles)
            json.dump(metadata, f, indent=4)


In [68]:
# URLs of the related articles that already have a metadata.json on disk
existing_related_articles = set()
for folder, _subfolders, filenames in os.walk('scraped-data/related-articles'):
    if 'metadata.json' not in filenames:
        continue
    metadata_path = os.path.join(folder, 'metadata.json')
    with open(metadata_path) as metadata_file:
        # An empty file has nothing to parse - move on to the next folder
        if os.path.getsize(metadata_path) == 0:
            continue
        existing_related_articles.add(json.load(metadata_file)['link'])

In [69]:

# URL -> title of articles referenced by the scraped related articles
# that are not themselves in existing_related_articles
related_articles_not_scraped = {}
for folder, _subfolders, filenames in os.walk('scraped-data/related-articles'):
    if 'metadata.json' not in filenames:
        continue
    metadata_path = os.path.join(folder, 'metadata.json')
    with open(metadata_path) as metadata_file:
        # An empty file has nothing to parse - move on to the next folder
        if os.path.getsize(metadata_path) == 0:
            continue
        linked_articles = json.load(metadata_file)['related_articles']
    for linked in linked_articles:
        # Keep only the part of the link before any query string
        bare_url = linked['link'].partition('?')[0]
        if bare_url in existing_related_articles:
            continue
        related_articles_not_scraped[bare_url] = linked['title']
len(related_articles_not_scraped)

6

In [70]:
# Fetch and store the related articles collected in related_articles_not_scraped
scrape_related(related_articles_not_scraped)


Fetching https://wise.com/help/articles/PhxIBARqUVe4P7yZNVxZw/how-do-i-approvereject-a-payment-requiring-approval
Saving to scraped-data/related-articles/how-do-i-approve-reject-a-payment-requiring-approval
Fetching https://wise.com/help/articles/5dLOaJKVT5yAtq7G2OCCXS/how-do-i-check-the-details-of-an-approvedrejected-transfer
Saving to scraped-data/related-articles/how-do-i-check-the-details-of-an-approved-rejected-transfer
Fetching https://wise.com/help/articles/2958229/whats-an-api-token
Could not find article content in https://wise.com/help/articles/2958229/whats-an-api-token
Fetching https://wise.com/help/articles/6cskaMU5RKpsmxqN7n8X1C/how-does-my-connection-with-quickbooks-work
Saving to scraped-data/related-articles/how-does-my-connection-with-quickbooks-work
Fetching https://wise.com/help/articles/2932350/guide-to-gel-transfers
Saving to scraped-data/related-articles/guide-to-gel-transfers
Fetching https://wise.com/help/articles/6PgFe27V7Fw8v38Hygt4H0/earning-cash-back-with-y