In [None]:
import os
import json
import requests

# URLs of scraped articles whose live page now answers 404.
# NOTE(review): despite the name, this set collects article URLs, not folder
# paths — confirm what the clean-up step that consumes it expects.
folders_to_remove = set()

# Seconds to wait for a single link check before giving up; without a timeout
# one unresponsive request would hang the whole walk.
LINK_CHECK_TIMEOUT = 30

# Visit every scraped article folder (identified by its metadata.json) and
# probe the article's live URL.
for root, dirs, files in os.walk('scraped-data/sections'):
    if 'metadata.json' not in files:
        continue

    # Read the metadata and close the file before doing any network I/O.
    with open(os.path.join(root, 'metadata.json'), encoding='utf-8') as f:
        article_metadata = json.load(f)
    article_url = article_metadata['link']

    # NOTE(review): this probes with an OPTIONS request; confirm the server
    # reports missing pages the same way it would for GET/HEAD.
    response = requests.options(article_url, timeout=LINK_CHECK_TIMEOUT)
    if response.status_code == 404:
        print(f'404: {article_url}')
        folders_to_remove.add(article_url)
    elif response.status_code == 200:
        print(f'200: {article_url}')

In [96]:
import urllib
from typing import List
import psycopg2
from psycopg2 import extras

import os
import warnings

# Connection string for the article database, in the form
# postgres://user:password@host:port/dbname.
# It is read from the environment so the credential never lives in the
# notebook. The value that used to be hard-coded here has been committed in
# clear text and should be rotated.
DATABASE_URL = os.environ.get("DATABASE_URL", "")
if not DATABASE_URL:
    warnings.warn(
        "DATABASE_URL is not set; export it before running the database cells."
    )




def connect(db_url):
    """Open a psycopg2 connection described by a database URL.

    Args:
        db_url: Connection URL of the form
            ``postgres://user:password@host:port/dbname``.

    Returns:
        A new psycopg2 connection. Note that using the connection in a
        ``with`` block only ends the transaction; the caller still has to
        call ``close()`` on it.

    Raises:
        ValueError: If ``db_url`` is empty or is not a URL with a scheme.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlparse

    url = urlparse(db_url or "")
    if not url.scheme:
        # Deliberately do not echo db_url: it may carry a password.
        raise ValueError(
            "db_url must look like postgres://user:password@host:port/dbname"
        )

    return psycopg2.connect(
        host=url.hostname,
        # Honour a non-default port; psycopg2 drops arguments that are None.
        port=url.port,
        database=url.path[1:],  # strip the leading "/" of the URL path
        user=url.username,
        password=url.password,
    )


class Article:
    """Plain record that stores each constructor argument under the same name.

    ``get_articles`` builds instances positionally from database rows, so the
    parameter order must match the column order of the selected rows.
    """

    def __init__(self, id, title, url, html, markdown, md_hash, metadata,
                 section_title, section_subtitle, section_url, subsection_title,
                 created_at, updated_at, deleted_at, md_ada_002_embedding, qa):
        # Identity and content.
        self.id, self.title, self.url = id, title, url
        self.html, self.markdown, self.md_hash = html, markdown, md_hash
        self.metadata = metadata
        # Section / subsection placement.
        self.section_title, self.section_subtitle = section_title, section_subtitle
        self.section_url, self.subsection_title = section_url, subsection_title
        # Row timestamps.
        self.created_at, self.updated_at = created_at, updated_at
        self.deleted_at = deleted_at
        # Derived data.
        self.md_ada_002_embedding, self.qa = md_ada_002_embedding, qa


def get_articles() -> List[Article]:
    """Fetch every article row that has not been soft-deleted.

    Returns:
        One ``Article`` per row of ``article`` whose ``deleted_at`` is NULL,
        built positionally from the row's columns.
    """
    conn = connect(DATABASE_URL)
    try:
        # ``with conn`` only commits or rolls back the transaction; it does
        # not close the connection, hence the explicit close() below.
        with conn:
            with conn.cursor(cursor_factory=extras.DictCursor) as cur:
                cur.execute("SELECT * FROM article WHERE deleted_at IS NULL")
                rows = cur.fetchall()
    finally:
        conn.close()

    return [Article(*row) for row in rows]


class InsertableArticle:
    """Values for a new ``article`` row, one attribute per constructor argument.

    ``insert_article`` reads these attributes when it builds the INSERT.
    """

    def __init__(self, title, url, html, markdown, metadata,
                 section_title, section_subtitle, section_url, subsection_title,
                 md_ada_002_embedding):
        # Article content.
        self.title, self.url = title, url
        self.html, self.markdown = html, markdown
        self.metadata = metadata
        # Section / subsection placement.
        self.section_title, self.section_subtitle = section_title, section_subtitle
        self.section_url, self.subsection_title = section_url, subsection_title
        # Embedding stored in the md_ada_002_embedding column.
        self.md_ada_002_embedding = md_ada_002_embedding


def insert_article(conn, article: InsertableArticle):
    """Insert ``article`` as a new row of the ``article`` table and commit.

    Args:
        conn: Open database connection.
        article: Values to store; ``metadata`` is serialised with
            ``json.dumps`` before being sent.
    """
    insert_sql = """
            INSERT INTO article (title, url, html, markdown, metadata,
                                 section_title, section_subtitle, section_url, subsection_title,
                                 md_ada_002_embedding)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
    with conn.cursor() as cur:
        # Parameter order mirrors the column list in insert_sql.
        row_values = (
            article.title,
            article.url,
            article.html,
            article.markdown,
            json.dumps(article.metadata),
            article.section_title,
            article.section_subtitle,
            article.section_url,
            article.subsection_title,
            article.md_ada_002_embedding,
        )
        cur.execute(insert_sql, row_values)
    conn.commit()

def delete_article(conn, article_url):
    """Soft-delete: stamp ``deleted_at`` on the rows with this URL, then commit.

    Args:
        conn: Open database connection.
        article_url: Value matched against the ``url`` column.
    """
    soft_delete_sql = """
            UPDATE article
            SET deleted_at = now()
            WHERE url = %s
            """
    with conn.cursor() as cur:
        cur.execute(soft_delete_sql, (article_url,))
    conn.commit()


def article_with_url_exists(conn, url) -> bool:
    """Report whether any ``article`` row has the given URL.

    The query does not filter on ``deleted_at``, so soft-deleted rows count
    as existing.

    Args:
        conn: Open database connection.
        url: Value matched against the ``url`` column.

    Returns:
        The first column of the single row returned by ``SELECT EXISTS``.
    """
    exists_sql = """
            SELECT EXISTS(
                SELECT 1
                FROM article
                WHERE url = %s
            )
            """
    with conn.cursor() as cur:
        cur.execute(exists_sql, (url,))
        result_row = cur.fetchone()
        return result_row[0]

In [51]:
import json
# Load the scraped index; the insertion loop further down walks it as
# sections -> subsections -> articles.
# Read it as UTF-8 explicitly: open() otherwise falls back to the platform's
# locale encoding, which can mis-decode non-ASCII titles.
with open("scraped-data/index.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [52]:
import numpy as np
# Read the saved embeddings array from disk.
with open('embeddings/article_embeddings.npy', 'rb') as embeddings_file:
    article_embeddings = np.load(embeddings_file)

# Re-key the array as {position: embedding} so an article's embedding can be
# looked up by its running index.
article_embeddings = dict(enumerate(article_embeddings))

In [53]:
import requests
base_url = "https://wise.com"
# Running position of the current article across all sections; it is the key
# into article_embeddings, so it advances for skipped articles too.
counter = 0

# psycopg2's ``with conn`` only ends the transaction, it does not close the
# connection — close it explicitly so the cell does not leak one per run.
conn = connect(DATABASE_URL)
try:
    with conn:
        for section in data:
            for subsection in section['subsections']:
                for article in subsection['articles']:
                    article_title = article['title']
                    article_url = base_url + article['link']
                    print(f'Processing article {article_title} at {article_url}')
                    if article_with_url_exists(conn, article_url):
                        print(f'Article {article_url} already exists in database. Skipping.')
                    else:
                        # A timeout keeps one stalled request from hanging the cell.
                        response = requests.get(article_url, timeout=30)
                        article_html = response.text
                        # Read the scraped files as UTF-8 rather than the
                        # platform's locale encoding.
                        with open(article['folder_path'] + '/content.md', 'r', encoding='utf-8') as f:
                            article_md = f.read()
                        with open(article['folder_path'] + '/metadata.json', 'r', encoding='utf-8') as f:
                            article_metadata = json.load(f)
                        article_embedding = article_embeddings[counter]

                        insertable_article = InsertableArticle(
                            title=article_title,
                            url=article_url,
                            html=article_html,
                            markdown=article_md,
                            metadata=article_metadata,
                            section_title=section['heading'],
                            section_subtitle=section['title'],
                            section_url=section['link'],
                            subsection_title=subsection['heading'],
                            md_ada_002_embedding=article_embedding.tolist()
                        )

                        print(f'Inserting article {insertable_article.url}')

                        insert_article(conn, insertable_article)
                        # Pages that no longer exist are still stored, then
                        # soft-deleted, so later runs see the URL and skip it.
                        if response.status_code == 404:
                            print(f'404: {article_url}')
                            delete_article(conn, insertable_article.url)
                    counter += 1
finally:
    conn.close()


Processing article How do I send money with Wise? at https://wise.com/help/articles/2977959/how-do-i-send-money-with-wise
Inserting article https://wise.com/help/articles/2977959/how-do-i-send-money-with-wise
Processing article How long does a transfer take? at https://wise.com/help/articles/2524078/how-long-does-a-transfer-take
Inserting article https://wise.com/help/articles/2524078/how-long-does-a-transfer-take
Processing article Can I send exact amounts? at https://wise.com/help/articles/2448314/can-i-send-exact-amounts
Inserting article https://wise.com/help/articles/2448314/can-i-send-exact-amounts
Processing article How do you notify me about a transfer? at https://wise.com/help/articles/2553293/how-do-you-notify-me-about-a-transfer
Inserting article https://wise.com/help/articles/2553293/how-do-you-notify-me-about-a-transfer
Processing article How do I download a transfer receipt? at https://wise.com/help/articles/2977946/how-do-i-download-a-transfer-receipt
Inserting article h

In [59]:
# Load the question/answer sets from testing/qas.json and index them by
# article URL; the next cell writes them to the article table's
# questions_answers column.
# Read as UTF-8 explicitly: the questions contain non-ASCII punctuation, and
# open() otherwise uses the platform's locale encoding.
with open('testing/qas.json', 'r', encoding='utf-8') as f:
    qa_entries = json.load(f)
qas = {entry['link']: entry['QA'] for entry in qa_entries}

In [60]:

# Store each article's question/answer set in its questions_answers column.
# psycopg2's ``with conn`` only ends the transaction, it does not close the
# connection — close it explicitly so the cell does not leak one per run.
conn = connect(DATABASE_URL)
try:
    with conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT id, url
                FROM article
                WHERE deleted_at IS NULL
                """
            )
            # All rows are fetched up front, so the cursor can be reused for
            # the UPDATE statements inside the loop.
            articles = cur.fetchall()
            for article in articles:
                article_id = article[0]
                article_url = article[1]
                if article_url in qas:
                    questions_answers = qas[article_url]
                    cur.execute(
                        """
                        UPDATE article
                        SET questions_answers = %s
                        WHERE id = %s
                        """,
                        (json.dumps(questions_answers), article_id)
                    )
                    # Commit per row so progress survives a later failure.
                    conn.commit()
                    print(f'Inserted questions_answers for article {article_url}')
                else:
                    print(f'No questions_answers for article {article_url}')
finally:
    conn.close()

Inserted questions_answers for article https://wise.com/help/articles/2977959/how-do-i-send-money-with-wise
Inserted questions_answers for article https://wise.com/help/articles/2524078/how-long-does-a-transfer-take
Inserted questions_answers for article https://wise.com/help/articles/2448314/can-i-send-exact-amounts
Inserted questions_answers for article https://wise.com/help/articles/2553293/how-do-you-notify-me-about-a-transfer
Inserted questions_answers for article https://wise.com/help/articles/2971476/how-long-do-refunds-take
Inserted questions_answers for article https://wise.com/help/articles/2977946/how-do-i-download-a-transfer-receipt
Inserted questions_answers for article https://wise.com/help/articles/2565007/do-i-need-to-pay-any-tax-on-transfers
Inserted questions_answers for article https://wise.com/help/articles/2491525/how-can-i-repeat-a-transfer
Inserted questions_answers for article https://wise.com/help/articles/2977993/can-i-send-from-a-joint-bank-account
Inserted q

In [76]:
import psycopg2

class TestQuestionAnswer:
    """One test question with its answer, source article URL and embedding.

    Each constructor argument is stored under the same attribute name.
    """

    def __init__(self, id, question, answer, article_url, md_ada_002_question_embedding):
        self.id, self.question, self.answer = id, question, answer
        self.article_url = article_url
        self.md_ada_002_question_embedding = md_ada_002_question_embedding

def insert_test_question_answer(conn, test_question_answer):
    """Insert one test question row and commit.

    A row whose ``question`` already exists is left untouched
    (``ON CONFLICT (question) DO NOTHING``).

    Args:
        conn: Open database connection.
        test_question_answer: Object exposing ``question``, ``answer``,
            ``article_url`` and ``md_ada_002_question_embedding``.
    """
    insert_sql = """
            INSERT INTO test_question_answer (question, answer, article_url, md_ada_002_question_embedding)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (question) DO NOTHING
            """
    with conn.cursor() as cur:
        row_values = (
            test_question_answer.question,
            test_question_answer.answer,
            test_question_answer.article_url,
            test_question_answer.md_ada_002_question_embedding,
        )
        cur.execute(insert_sql, row_values)
    conn.commit()


In [78]:
# Name of the OpenAI embedding model; get_embedding uses it as its default.
EMBEDDING_MODEL = "text-embedding-ada-002"
import json
import openai

def set_pactum_api_key():
    """Configure the ``openai`` module with the Pactum account's API key.

    The key is read from the ``PACTUM_OPENAI_API_KEY`` environment variable so
    it never has to be pasted into the notebook. When the variable is unset
    the key is the empty string, exactly as before.
    """
    import os

    openai.api_key = os.environ.get('PACTUM_OPENAI_API_KEY', '')

def set_my_api_key():
    """Configure the ``openai`` module with the personal API key.

    The key is read from the ``OPENAI_API_KEY`` environment variable so it
    never has to be pasted into the notebook. When the variable is unset the
    key is the empty string, exactly as before.
    """
    import os

    openai.api_key = os.environ.get('OPENAI_API_KEY', '')

# Select the Pactum key before any embeddings are requested in this cell.
set_pactum_api_key()

def get_next_question():
    """Yield every question/answer pair stored in ``testing/qas.json``.

    The file is a list of entries, each with a ``"link"`` and a ``"QA"`` list
    of ``{"question", "answer"}`` objects.

    Yields:
        ``(question, answer, link)`` tuples, in file order.
    """
    # Read as UTF-8 explicitly: the questions contain non-ASCII punctuation,
    # and open() otherwise uses the platform's locale encoding.
    with open('testing/qas.json', 'r', encoding='utf-8') as f:
        qas = json.load(f)
    for entry in qas:
        for qa in entry["QA"]:
            yield qa["question"], qa["answer"], entry["link"]


def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """Request an embedding for ``text`` from OpenAI.

    Args:
        text: Input passed to the embedding endpoint.
        model: Embedding model name; defaults to ``EMBEDDING_MODEL``.

    Returns:
        The ``"embedding"`` field of the first item in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

# Embed every test question and store it together with its answer and link.
# psycopg2's ``with conn`` only ends the transaction, it does not close the
# connection — close it explicitly so the cell does not leak one per run.
conn = connect(DATABASE_URL)
try:
    with conn:
        for question, answer, link in get_next_question():
            print(question)
            emb = get_embedding(question)
            insert_test_question_answer(conn, TestQuestionAnswer(
                id=None,
                article_url=link,
                question=question,
                answer=answer,
                md_ada_002_question_embedding=emb
            ))
finally:
    conn.close()



How do I send money with Wise?
Can I send money using an email address if I don’t have the recipient's bank details?
How can I make sure my recipient gets an exact amount?
What are the payment options for sending money through Wise?
How will I know when my transfer is complete and the money is on its way?
How long does a transfer take?
How can I get an estimate of the transfer time?
Do the receiver's bank processing time affect the transfer duration?
Are some payment methods quicker than others?
What factors can add extra time to my transfer?
Can I send exact amounts when transferring money?
What is the purpose of the 3% amount lock deposit?
Do I always have to select the padlock icon to guarantee the recipient gets the exact amount?
In which currencies is the amount lock feature available?
How will I be notified about my transfer?
What if I don't receive an email about my transfer?
Will you contact me by phone about my transfer?
How can I download a transfer receipt on the website?
Ho

In [94]:
from bs4 import BeautifulSoup
import csv
from io import StringIO
import requests
import html2text

# Shared html2text converter used by get_article_content to turn article HTML
# into Markdown.
converter = html2text.HTML2Text()
# Drop link targets so the Markdown carries no inline links.
converter.ignore_links = True
# Drop images.
converter.ignore_images = True
# Skip html2text's own table handling; get_article_content replaces every
# table with a CSV placeholder before the HTML reaches the converter.
converter.ignore_tables = True
converter.body_width = 0  # 0 disables line wrapping

# Site root prepended to the relative links of related articles.
base_url = "https://wise.com"


def get_article_content(response_text):
    """Extract the article body of a help page as HTML and Markdown.

    Tables inside the article are converted to CSV text and embedded in the
    Markdown between ``--- CSV table begins ---`` / ``--- CSV table ends ---``
    lines.

    Args:
        response_text: Raw HTML of the page.

    Returns:
        A tuple ``(html, markdown, related_articles)``:
        ``html`` is the parsed ``div.article-content`` element, with each
        table replaced by its placeholder marker; ``markdown`` is the stripped
        Markdown text; ``related_articles`` is a list of
        ``{"title": ..., "link": ...}`` dicts.

    Raises:
        ValueError: If the page has no ``div.article-content`` element or the
            element converts to empty Markdown.
    """
    soup = BeautifulSoup(response_text, "html.parser")
    html = soup.find("div", {"class": "article-content"})
    if html is None:
        # Fail with the same "no content" error as an empty article instead
        # of an AttributeError further down.
        raise ValueError(f"No content found for {response_text}")

    related_articles = []
    related_articles_section = soup.find("ul", {"class": "css-1mcz8c5"})
    if related_articles_section:
        for li in related_articles_section.find_all("li"):
            a = li.find("a")
            # Skip list items that carry no usable link.
            if a is None or not a.get("href"):
                continue
            related_articles.append({
                "title": a.get_text().strip(),
                "link": base_url + a["href"]
            })

    # Convert tables to CSV and replace them with markers
    csv_tables = []
    for i, table in enumerate(html.find_all("table")):
        table_csv = StringIO()
        csv_writer = csv.writer(table_csv, lineterminator='\n')
        for row in table.find_all("tr"):
            cells = row.find_all(["th", "td"])
            csv_writer.writerow([cell.get_text().strip() for cell in cells])
        # Two trailing spaces force a Markdown line break after every CSV row.
        csv_tables.append(table_csv.getvalue().replace("\n", "  \n"))
        table.replace_with(f"CSV_TABLE_MARKER_{i}")  # Place a marker

    # Convert HTML to markdown
    markdown_content = converter.handle(str(html))
    if markdown_content.strip() == "":
        raise ValueError(f"No content found for {response_text}")

    # Replace markers with CSV tables. Go from the highest index down so that
    # "CSV_TABLE_MARKER_1" cannot match the prefix of "CSV_TABLE_MARKER_10".
    for i in reversed(range(len(csv_tables))):
        markdown_content = markdown_content.replace(
            f"CSV_TABLE_MARKER_{i}",
            "\n--- CSV table begins ---" + "  \n" + csv_tables[i] + "--- CSV table ends ---  \n")

    return html, markdown_content.strip(), related_articles


In [101]:

from json import JSONDecodeError
import os

import requests

# URLs of the live (not soft-deleted) articles: a cheap first-pass filter.
# URLs inserted during this run are added below so that a URL appearing in
# several folders is only inserted once.
existing_urls = {a.url for a in get_articles()}


# psycopg2's ``with conn`` only ends the transaction, it does not close the
# connection — close it explicitly so the cell does not leak one per run.
conn = connect(DATABASE_URL)
try:
    with conn:
        for root, dirs, files in os.walk('scraped-data/related-articles'):
            for dir_name in dirs:  # renamed from ``dir`` to stop shadowing the builtin
                os_path = os.path.join(root, dir_name, 'metadata.json')
                with open(os_path, 'r', encoding='utf-8') as f:
                    try:
                        article_metadata = json.load(f)
                    except Exception:
                        # Show which file is malformed before re-raising.
                        print(os_path)
                        raise

                article_url = article_metadata['link']
                article_title = article_metadata['title']
                # get_articles() leaves out soft-deleted rows, so also ask the
                # database directly; otherwise an article that was inserted
                # and then soft-deleted (404) is inserted again on every run.
                if article_url in existing_urls or article_with_url_exists(conn, article_url):
                    print(f'Article {article_url} already exists in database. Skipping.')
                    continue

                # A timeout keeps one stalled request from hanging the cell.
                response = requests.get(article_url, timeout=30)
                article_html = response.text
                # NOTE(review): get_article_content raises when the page has no
                # article body, so a 404 page without one aborts the loop
                # before the 404 branch below is reached — confirm the intent.
                _, article_md, _ = get_article_content(article_html)

                insertable_article = InsertableArticle(
                    title=article_title,
                    url=article_url,
                    html=article_html,
                    markdown=article_md,
                    metadata=article_metadata,
                    section_title=None,
                    section_subtitle=None,
                    section_url=None,
                    subsection_title=None,
                    md_ada_002_embedding=None
                )

                print(f'Inserting article {insertable_article.url}')

                insert_article(conn, insertable_article)
                existing_urls.add(article_url)
                # Pages that no longer exist are still stored, then soft-deleted.
                if response.status_code == 404:
                    print(f'404: {article_url}')
                    delete_article(conn, insertable_article.url)
finally:
    conn.close()


Article https://wise.com/help/articles/2935763/my-wise-card-hasnt-arrived-yet already exists in database. Skipping.
Article https://wise.com/help/articles/6cPPSuMCHSrDI1VYJCzy7v/can-i-transfer-my-business-account-to-someone-else already exists in database. Skipping.
Article https://wise.com/help/articles/4LB2UIQjHBGiof7UBipZpU/how-will-the-change-to-wise-affect-my-login-account-details-and-card already exists in database. Skipping.
Article https://wise.com/help/articles/5W5j4mHrJQezaHSEyAykHE/whats-the-risk-of-holding-money-in-interest already exists in database. Skipping.
Article https://wise.com/help/articles/2954849/how-to-take-a-screenshot already exists in database. Skipping.
Article https://wise.com/help/articles/2973523/paying-for-a-transfer-with-apple-or-google-pay already exists in database. Skipping.
Article https://wise.com/help/articles/2935789/why-doesnt-contactless-work-for-my-wise-card already exists in database. Skipping.
Article https://wise.com/help/articles/1mP764ZaF