In [2]:
%pip install -r requirements.txt

670.43s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import re
import shutil
import requests
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
from typing import List, Dict, Union

In [None]:
# shutil.rmtree('text')
# shutil.rmtree('processed')

Crawl the Zuddl knowledge base and preprocess the text

In [None]:
# Shape of one crawled page record: string fields plus a nested
# string-to-string metadata mapping.
Page = Dict[str, Union[str, Dict[str, str]]]

# Module-level accumulator for crawled pages (re-running this cell empties it).
PAGES: List[Page] = []

# A link matching this pattern is treated as an absolute http(s) URL.
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Knowledge-base host and the page the crawl starts from.
domain = "help.zuddl.com"
full_url = "https://" + domain + "/support/home"

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` start tag fed to the parser.

    After calling ``feed(html)``, the collected values are available in
    document order in ``self.hyperlinks``.
    """

    def __init__(self) -> None:
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs) -> None:
        """Record the href of an anchor tag.

        Args:
            tag: Lower-cased tag name supplied by HTMLParser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return

        href = dict(attrs).get("href")

        # HTMLParser reports a value-less attribute (``<a href>``) as None.
        # Storing None would break callers that treat every entry as a string,
        # so only real values are kept.
        if href is not None:
            self.hyperlinks.append(href)

# Function to get the hyperlinks from a URL
def get_hyperlinks(url: str, timeout: float = 10.0) -> List[str]:
    """Fetch ``url`` and return the raw href values of its anchor tags.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            single unresponsive page cannot stall the whole crawl.

    Returns:
        The href values found by ``HyperlinkParser``. An empty list is
        returned when the response is not HTML or when fetching/decoding
        fails (the error is printed, not raised).
    """
    # Try to open the URL and read the HTML
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            headers = response.info()

            # A missing Content-Type header comes back as None; treat it as
            # "not HTML" explicitly instead of relying on an AttributeError.
            content_type = headers.get('Content-Type') or ""
            if not content_type.startswith("text/html"):
                return []

            # Honour the charset declared by the server, falling back to
            # UTF-8. Undecodable bytes are replaced so that one bad byte does
            # not discard every link on the page.
            charset = headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
    except Exception as e:
        # Best-effort crawl: report the problem and carry on with no links.
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain: str, url: str) -> List[str]:
    """Return the links on ``url`` that point inside ``local_domain``.

    Absolute http(s) links are kept only when their host equals
    ``local_domain``. Any other link is treated as a path relative to the
    site root and turned into ``https://<local_domain>/<link>``. A single
    trailing slash is removed from every result.

    Args:
        local_domain: Host name the crawl is restricted to.
        url: Page whose links are extracted.

    Returns:
        De-duplicated links in sorted order, so repeated runs visit pages in
        the same sequence.
    """
    # Link prefixes that never lead to a crawlable page.
    skipped_prefixes = ("#", "mailto:", "tel:", "javascript:")
    clean_links = set()

    for link in set(get_hyperlinks(url)):
        # Defensive: a value-less href may be reported as None.
        if link is None:
            continue

        # Protocol-relative links ("//host/path") are absolute URLs without
        # a scheme; give them one so the domain check below applies.
        if link.startswith("//"):
            link = "https:" + link

        if re.search(HTTP_URL_PATTERN, link):
            # Absolute URL: keep it only if it stays on the same host.
            if urlparse(link).netloc != local_domain:
                continue
            clean_link = link
        elif link.startswith(skipped_prefixes):
            continue
        else:
            # Relative link: resolve it against the site root.
            relative = link[1:] if link.startswith("/") else link
            clean_link = "https://" + local_domain + "/" + relative

        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return sorted(clean_links)

In [None]:
def truncate_string(
    text: str,
    start_marker: str = "Sorry! nothing found for",
    end_marker: str = "Was this article helpful?",
) -> str:
    """Keep only the text between the page's start and end boilerplate markers.

    Args:
        text: Full page text.
        start_marker: Everything up to and including the first occurrence of
            this string is dropped. If it is absent, nothing is dropped from
            the front.
        end_marker: Everything from the first occurrence of this string
            (searched after the start marker) is dropped. If it is absent,
            nothing is dropped from the end.

    Returns:
        The slice of ``text`` between the markers; ``text`` unchanged when
        neither marker is present.
    """
    # Check for "not found" BEFORE adding the marker length; adding first
    # would turn the -1 sentinel into a valid-looking index.
    marker_pos = text.find(start_marker)
    start_index = marker_pos + len(start_marker) if marker_pos != -1 else 0

    end_index = text.find(end_marker, start_index)
    if end_index == -1:
        end_index = len(text)

    return text[start_index:end_index]

def remove_newlines(text: str) -> str:
    """Replace newlines with spaces and shrink runs of spaces.

    Both real newline characters and the literal two-character sequence
    backslash + ``n`` become a single space. Double spaces are then collapsed
    in two passes, so very long runs of spaces are shortened but not
    necessarily reduced to one.
    """
    # Order matters: newlines are converted first, then the double-space
    # replacement runs twice.
    for needle in ('\n', '\\n', '  ', '  '):
        text = text.replace(needle, ' ')
    return text

def process_text(text: str) -> str:
    """Clean raw page text: flatten newlines, trim to the article body, strip.

    The steps run in this order: ``remove_newlines``, then
    ``truncate_string``, then surrounding whitespace is stripped.
    """
    flattened = remove_newlines(text)
    return truncate_string(flattened).strip()

In [None]:
def crawl(url: str) -> None:
    """Crawl every same-domain page reachable from ``url``.

    For each page fetched, the cleaned text is written to
    ``text/<domain>/<url-as-filename>.txt`` and a record holding the content
    and its title/url metadata is appended to the module-level ``PAGES``
    list. The ``text/<domain>`` and ``processed`` directories are created if
    they do not exist.

    A page whose request fails is reported and skipped, so a single bad
    link does not abort the crawl.

    Args:
        url: Page to start from; its host defines the crawl boundary.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # URLs waiting to be visited, and every URL ever queued (no duplicates)
    queue = deque([url])
    seen = {url}

    # Create the output directories (text files and, later, csv/json files)
    text_dir = "text/" + local_domain
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:

        # pop() takes the most recently queued URL, so the traversal is
        # depth-first.
        url = queue.pop()

        # Fetch before opening the output file so that a failed request does
        # not leave an empty .txt file behind. The timeout keeps one stalled
        # server from blocking the crawl forever.
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print(f"Skipping {url}: {e}")
            continue

        # Get the text from the page using BeautifulSoup, without the tags
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text()
        processed_text = process_text(text)

        title = str(soup.title.string).strip() if soup.title else ""

        print(f"{title} - {url}")

        # Pages that require JavaScript are only reported; their (mostly
        # empty) text is still saved and the crawl continues.
        if "You need to enable JavaScript to run this app." in text:
            print("Unable to parse page " + url + " due to JavaScript being required")

        # Build the file name from the URL without its scheme. Stripping via
        # regex handles both http:// and https:// correctly.
        file_name = re.sub(r'^https?://', '', url).replace("/", "_") + ".txt"

        # Explicit UTF-8 so non-ASCII page text can be written regardless of
        # the platform's default encoding.
        with open(text_dir + "/" + file_name, "w", encoding="utf-8") as f:
            f.write(processed_text)

        PAGES.append({
            'content': processed_text + '\n\n',
            'metadata': {
                'title': title,
                'url': url
            }
        })

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

In [None]:
# Run the crawl from the configured start page. This performs network
# requests, writes files under text/ and appends to PAGES.
crawl(full_url)

Sample Metadata output

In [None]:
# 200 is an arbitrary sample position. Clamp it to the last page so the cell
# does not raise IndexError when the crawl returned fewer pages.
if PAGES:
    print(PAGES[min(200, len(PAGES) - 1)]['metadata'])
else:
    print("No pages were crawled.")

Using tiktoken to estimate the total number of tokens

In [None]:
import tiktoken

# cl100k_base is the encoding designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Flatten each page record into one row: title, url, content
data_modified = [
    dict(
        title=d['metadata']['title'],
        url=d['metadata']['url'],
        content=d['content'],
    )
    for d in PAGES
]

# One row per crawled page
df_tokens = pd.DataFrame(data_modified)

# Count the tokens of every page and store the count in a new column
df_tokens['n_tokens'] = df_tokens.content.apply(
    lambda page_text: len(tokenizer.encode(page_text))
)

# Histogram of the per-page token counts
df_tokens.n_tokens.hist()

In [None]:
# Display the per-page table (title, url, content, n_tokens).
df_tokens

Splitting the text into smaller chunks for efficiency

In [None]:
from langchain.text_splitter import TokenTextSplitter

# One chunk record: string fields, integer sizes and a list for the embedding.
chunk_type = Dict[str, Union[str, int, List]]

def chunk_data() -> List[chunk_type]:
    """Split every page in ``PAGES`` into overlapping token-based chunks.

    The splitter is configured with the ``cl100k_base`` encoding, a chunk
    size of 500 and an overlap of 100. Each chunk record carries the source
    article's title and url, the chunk text, its character length, its token
    count (via the module-level ``tokenizer``) and an empty ``embedding``
    list to be filled in later.

    Returns:
        All chunk records, in page order. Progress is printed per page.
    """
    splitter = TokenTextSplitter(encoding_name="cl100k_base", chunk_size=500, chunk_overlap=100)

    all_chunks = []

    for page in PAGES:
        metadata = page['metadata']
        pieces = splitter.split_text(page['content'])

        # The dict literal is evaluated once per piece, so every record gets
        # its own fresh "embedding" list.
        all_chunks.extend(
            {
                "article_title": metadata['title'],
                "article_url": metadata['url'],
                "content": piece,
                "content_length": len(piece),
                "content_tokens": len(tokenizer.encode(piece)),
                "embedding": [],
            }
            for piece in pieces
        )
        print(f"Splitted {metadata['url']} into {len(pieces)} chunks.")

    print(f'Total chunks created: {len(all_chunks)}.\n')
    return all_chunks

In [None]:
# Build the chunk records for every page currently held in PAGES.
chunks = chunk_data()

Sample chunked object

In [None]:
import json

# 7 is an arbitrary sample position. Clamp it to the last chunk so the cell
# does not raise IndexError when fewer chunks were created.
if chunks:
    print(json.dumps(chunks[min(7, len(chunks) - 1)], sort_keys=True, indent=4))
else:
    print("No chunks were created.")

Distribution of the number of tokens per chunk in the chunks DataFrame

In [None]:
# One row per chunk record; columns come from the keys of each chunk dict.
df_chunked = pd.DataFrame(chunks)

# Visualize the distribution of the number of tokens per chunk using a histogram
df_chunked.content_tokens.hist()

In [None]:
# Display the chunk table (the "embedding" column still holds empty lists here).
df_chunked

Saving the dataframe as CSV and JSON

In [None]:
# Make sure the output directory exists even if the crawl cell was skipped.
os.makedirs('processed', exist_ok=True)

df_chunked.to_csv('processed/chunks.csv', index=True)

# orient='records' never writes the index, and newer pandas releases (2.1+)
# raise ValueError when index=True is combined with it, so the argument is
# omitted. The file content is the same on older versions.
df_chunked.to_json('processed/chunks.json', orient='records')

Total number of tokens generated

In [None]:
# Sum of the token counts of all chunks.
df_chunked['content_tokens'].sum()

Generating embeddings using OpenAI embeddings API

In [None]:
import openai

# Read the API key from the environment so it is not stored in the notebook.
# os.environ.get returns None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')

def create_embedding(text: str):
    """Request an embedding for ``text`` and log the result to disk.

    Calls ``openai.Embedding.create`` with the ``text-embedding-ada-002``
    model, prints the full response and the embedding, and appends them to
    ``processed/embeds.txt`` and ``processed/response.txt``.

    Args:
        text: The chunk content to embed.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``.
    """
    response = openai.Embedding.create(
        input=text,
        model='text-embedding-ada-002'
    )

    print(f"response: {response}")

    embedding = response['data'][0]['embedding']

    print(f"embedding: {embedding}")

    # Explicit UTF-8: chunk text can contain non-ASCII characters, which the
    # platform default encoding may be unable to write.
    with open('processed/embeds.txt', 'a', encoding='utf-8') as f1:
        f1.write(str(embedding) + '\n\n')

    with open('processed/response.txt', 'a', encoding='utf-8') as f0:
        f0.write(text + '\n' + str(response) + '\n\n\n')

    return embedding

In [None]:
# Embed every chunk and store the vector in its row's "embedding" cell.
total_rows = len(df_chunked)

for i, row in df_chunked.iterrows():
    print(f"Processing row {i} of {total_rows}")

    embedding = create_embedding(row["content"])

    # Leave the existing cell value untouched when nothing usable came back.
    if not embedding:
        continue

    df_chunked.at[i, "embedding"] = embedding

In [None]:
# Display the chunk table after the embedding loop has written to the "embedding" column.
df_chunked

Saving the dataframe as CSV and JSON

In [None]:
# Make sure the output directory exists even if the crawl cell was skipped.
os.makedirs('processed', exist_ok=True)

df_chunked.to_csv('processed/chunks_with_embeds.csv', index=True)

# orient='records' never writes the index, and newer pandas releases (2.1+)
# raise ValueError when index=True is combined with it, so the argument is
# omitted. The file content is the same on older versions.
df_chunked.to_json('processed/chunks_with_embeds.json', orient='records')

Saving the DataFrame to the Postgres database (via Supabase)

In [None]:
from supabase import create_client, Client

# Connection settings are read from the environment; os.getenv yields None
# for a variable that is not set.
url: str = os.getenv("SUPABASE_URL")
key: str = os.getenv("SUPABASE_KEY")

# Client used by the insert cells below.
supabase: Client = create_client(url, key)

In [None]:
def insert_articles(row) -> None:
    """Insert one chunk row into the ``articles`` table and print the response.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``article_title``, ``article_url``, ``content_length``,
            ``content_tokens``, ``content`` and ``embedding``.
    """
    columns = (
        "article_title",
        "article_url",
        "content_length",
        "content_tokens",
        "content",
        "embedding",
    )
    payload = {column: row[column] for column in columns}

    data, count = supabase.table('articles').insert(payload).execute()

    print(f"API response: {data}, {count}\n")

In [None]:
# Send every chunk row to the database, one insert per row.
total_rows = len(df_chunked)

for i, row in df_chunked.iterrows():
    print(f"Inserting row {i} of {total_rows}")
    insert_articles(row)