In [1]:
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.preprocessing.pdf_extract import recursive_extract_text

recursive_extract_text('articles/raw/')

Processing articles/raw/Why I (Still) Like the Serverless Framework over the CDK _ DeBrie Advisory.pdf
Processing articles/raw/Event-Driven Architectures vs. Event-Based Compute in Serverless Applications _ DeBrie Advisory.pdf
Processing articles/raw/Key Takeaways from the DynamoDB Paper _ DeBrie Advisory.pdf
Processing articles/raw/How you should think about DynamoDB costs _ DeBrie Advisory.pdf
Processing articles/raw/GraphQL, DynamoDB, and Single-table Design _ DeBrie Advisory.pdf
Processing page 3/3

In [None]:
from src.preprocessing.pdf_extract import extract_text

pdf_path = 'articles/raw/Building a Massively Scalable Serverless Chat Application with AWS AppSync _ by Sarah Hamilton _ Serverless Transformation _ Medium.pdf'
output_path = 'Building a Massively Scalable Serverless Chat Application with AWS AppSync _ by Sarah Hamilton _ Serverless Transformation _ Medium.txt'

extract_text(pdf_path, output_path)


Processing page 5/5

In [2]:
from src.preprocessing.chunk import recursive_chunk_text
from src.preprocessing.chunk import chunk_text
import os

# delete articles/chunks/chunks.txt
if os.path.exists('articles/chunks/chunks.txt'):
    os.remove('articles/chunks/chunks.txt')
    
recursive_chunk_text('articles/parsed/')
# chunk_text('articles/parsed/Building a Massively Scalable Serverless Chat Application with AWS AppSync _ by Sarah Hamilton _ Serverless Transformation _ Medium.txt')

Processing articles/parsed/Event-Driven Architectures vs. Event-Based Compute in Serverless Applications _ DeBrie Advisory.txt
Processing articles/parsed/Key Takeaways from the DynamoDB Paper _ DeBrie Advisory.txt
Processing articles/parsed/Why I (Still) Like the Serverless Framework over the CDK _ DeBrie Advisory.txt
Processing articles/parsed/How you should think about DynamoDB costs _ DeBrie Advisory.txt
Processing articles/parsed/GraphQL, DynamoDB, and Single-table Design _ DeBrie Advisory.txt


In [4]:
from src.preprocessing.neutralise import create_neutralised_dataset

neutraliser = create_neutralised_dataset('articles/chunks/chunks.txt', 'articles/neutralised/training_data.jsonl')

Neutralising chunk: 29/29

In [3]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

with open('articles/chunks/chunks.txt', 'r') as file:
    chunks = file.read()

print(num_tokens_from_string(chunks, "cl100k_base"))

11512


In [15]:
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.document_loaders import WebBaseLoader

# Load
loader = WebBaseLoader(web_paths=["https://www.alexdebrie.com/posts/dynamodb-costs"])
doc = loader.load()



# Transform
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(doc)

print(docs_transformed)



[Document(page_content='\n\n\n\n\nHow you should think about DynamoDB costs | DeBrie Advisory\n\n\n\n\n\n\n\n\n\n\n\nSkip to main contentHomeAboutBlogBitesServicesThe DynamoDB BookRecent postsHow you should think about DynamoDB costsEvent-Driven Architectures vs. Event-Based Compute in Serverless ApplicationsWhy I (Still) Like the Serverless Framework over the CDKKey Takeaways from the DynamoDB PaperUnderstanding Eventual Consistency in DynamoDBGet the DynamoDB BookHow you should think about DynamoDB costsMay 8, 2023 · 16 min readAlex DeBrieFounder, DeBrie AdvisoryLast week, someone emailed me to ask about a potential cost optimization mechanism in DynamoDB. More on the specifics of that situation below, but the basic point is they were thinking about adding some additional application and architectural complexity because they were concerned about high DynamoDB costs for a particular use case.I responded the way I always respond for these requests -- "have you done the math?"One of my 

In [1]:
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.preprocessing.web_extract import extract_sections

urls = [
  "https://www.alexdebrie.com/posts/event-driven-vs-event-based",
  "https://www.alexdebrie.com/posts/dynamodb-costs",
  "https://www.alexdebrie.com/posts/serverless-framework-vs-cdk",
  "https://www.alexdebrie.com/posts/dynamodb-paper",
  "https://www.alexdebrie.com/posts/dynamodb-eventual-consistency",
  "https://www.alexdebrie.com/posts/dynamodb-graphql",
  "https://www.alexdebrie.com/posts/dynamodb-partitions"
]

extract_sections(urls)

Extracted 8 sections from the webpage.
Extracted 6 sections from the webpage.
Extracted 8 sections from the webpage.
Extracted 9 sections from the webpage.
Extracted 14 sections from the webpage.
Extracted 11 sections from the webpage.
Extracted 8 sections from the webpage.


In [7]:

from bs4 import BeautifulSoup
import requests


def soupify(url):
  """ Function to get the HTML content of a webpage and parse it using BeautifulSoup.
  """
  
  response = requests.get(url)
  html_content = response.content
  soup = BeautifulSoup(html_content, "html.parser")

  return soup


def get_sections(soup):
  """ Function to extract sections from a webpage using BeautifulSoup.
  """
  
  # Extract the <article> tag from the HTML content
  article = soup.find("article")

  headers = ["header", "h1", "h2", "h3", "h4", "h5", "h6"]
  elements = article.find_all(headers + ["p"])

  sections = []
  section = []
  for element in elements:
      # Maintaining the tags
      if element.name in headers:
          if len(section) > 1:
              sections.append(section)

          bare_element = f"<{element.name}>{element.text}</{element.name}>"
          section = [bare_element]

      else:
          section.append(element)

  print(f"Extracted {len(sections)} sections from the webpage.")
  return sections


url = "https://www.alexdebrie.com/posts/dynamodb-costs"

soup = soupify(url)
sections = get_sections(soup)

for section in sections:
  print(section)
  print(f"Paragraphs in this section: {len(section)}")
  for element in section:
      print(element)
      print("\n")
  print("\n")

Extracted 6 sections from the webpage.
['<h1>How you should think about DynamoDB costs</h1>', <p>Last week, someone emailed me to ask about a potential cost optimization mechanism in DynamoDB. <a href="#doin-the-math-pt-1-tracking-view-counts">More on the specifics of that situation below</a>, but the basic point is they were thinking about adding some additional application and architectural complexity because they were concerned about high DynamoDB costs for a particular use case.</p>, <p>I responded the way I always respond for these requests -- "have you done the math?"</p>, <p>One of my favorite things about DynamoDB is that you can easily do the math when considering how much it will cost you. I use this all the time in a few different ways, from getting a rough guess at how much DynamoDB will cost for my application to deciding between different approaches to solving a specific access pattern.</p>, <p class="break-words whitespace-pre-wrap prose px-2 mb-0 font-bold">You can and 