In [1]:
import sys
import os

# Put the parent of the current working directory at the front of sys.path so
# that the `src` package can be imported from this notebook. The guard keeps
# the cell idempotent: re-running it no longer stacks duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

from src.preprocessing.pdf_extract import recursive_extract_text

# Run the project's PDF extraction over the raw-articles directory.
# NOTE(review): presumably writes the extracted text under articles/parsed/
# (that is what the chunking cell reads) — confirm in pdf_extract.
recursive_extract_text('articles/raw/')

Processing articles/raw/Why I (Still) Like the Serverless Framework over the CDK _ DeBrie Advisory.pdf
Processing articles/raw/Event-Driven Architectures vs. Event-Based Compute in Serverless Applications _ DeBrie Advisory.pdf
Processing articles/raw/Key Takeaways from the DynamoDB Paper _ DeBrie Advisory.pdf
Processing articles/raw/How you should think about DynamoDB costs _ DeBrie Advisory.pdf
Processing articles/raw/GraphQL, DynamoDB, and Single-table Design _ DeBrie Advisory.pdf
Processing page 3/3

In [None]:
# Extract the text of one specific PDF instead of walking the whole raw directory.
from src.preprocessing.pdf_extract import extract_text

# Source PDF, relative to the notebook's working directory.
pdf_path = 'articles/raw/Building a Massively Scalable Serverless Chat Application with AWS AppSync _ by Sarah Hamilton _ Serverless Transformation _ Medium.pdf'
# NOTE(review): this path has no directory component, so it resolves against the
# working directory rather than articles/parsed/ — confirm that is intended.
output_path = 'Building a Massively Scalable Serverless Chat Application with AWS AppSync _ by Sarah Hamilton _ Serverless Transformation _ Medium.txt'

# Presumably reads pdf_path and writes the extracted text to output_path — verify in pdf_extract.
extract_text(pdf_path, output_path)


Processing page 5/5

In [2]:
# Rebuild the chunks file from the parsed article texts.
from src.preprocessing.chunk import recursive_chunk_text
from src.preprocessing.chunk import chunk_text
import os

# Remove any chunks file left over from a previous run before chunking again.
# NOTE(review): presumably the chunker appends to this file, which is why it is
# cleared first — confirm in src/preprocessing/chunk.
if os.path.exists('articles/chunks/chunks.txt'):
    os.remove('articles/chunks/chunks.txt')
    
# Chunk everything under the parsed-articles directory.
recursive_chunk_text('articles/parsed/')
# Single-file alternative (disabled); chunk_text is imported above only for this:
# chunk_text('articles/parsed/Building a Massively Scalable Serverless Chat Application with AWS AppSync _ by Sarah Hamilton _ Serverless Transformation _ Medium.txt')

Processing articles/parsed/Event-Driven Architectures vs. Event-Based Compute in Serverless Applications _ DeBrie Advisory.txt
Processing articles/parsed/Key Takeaways from the DynamoDB Paper _ DeBrie Advisory.txt
Processing articles/parsed/Why I (Still) Like the Serverless Framework over the CDK _ DeBrie Advisory.txt
Processing articles/parsed/How you should think about DynamoDB costs _ DeBrie Advisory.txt
Processing articles/parsed/GraphQL, DynamoDB, and Single-table Design _ DeBrie Advisory.txt


In [4]:
# Build the neutralised dataset from the chunks file.
from src.preprocessing.neutralise import create_neutralised_dataset

# Arguments are the chunks text file and a JSONL path; presumably input and
# output respectively — verify against create_neutralised_dataset's signature.
# The return value is kept in `neutraliser`; what it holds is not visible here.
neutraliser = create_neutralised_dataset('articles/chunks/chunks.txt', 'articles/neutralised/training_data.jsonl')

Neutralising chunk: 29/29

In [4]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` encodes to under a named tiktoken encoding.

    Args:
        string: Text to tokenise.
        encoding_name: Name handed to ``tiktoken.get_encoding`` (this notebook
            passes "cl100k_base").

    Returns:
        The length of the token sequence produced by the encoding.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Read the neutralised JSONL file as a single string and print its total token
# count. The encoding is pinned to UTF-8 so the decoded text (and therefore the
# count) does not depend on the platform's locale default.
# NOTE(review): the neutralising cell above writes training_data.jsonl, while
# this cell reads neutralised_chunks.jsonl — confirm this is the intended file.
with open('articles/neutralised/neutralised_chunks.jsonl', 'r', encoding='utf-8') as file:
    chunks = file.read()

print(num_tokens_from_string(chunks, "cl100k_base"))

64891


# Web Loading

#### 1. Web scraping

In [1]:
import sys
import os

# Put the parent of the current working directory at the front of sys.path so
# that the `src` package can be imported from this notebook. The guard keeps
# the cell idempotent: re-running it no longer stacks duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

from src.preprocessing.web_extract import extract_sections

# Blog posts to scrape; the recorded output shows one
# "Extracted N sections" line per URL.
urls = [
  "https://www.alexdebrie.com/posts/event-driven-vs-event-based",
  "https://www.alexdebrie.com/posts/dynamodb-costs",
  "https://www.alexdebrie.com/posts/serverless-framework-vs-cdk",
  "https://www.alexdebrie.com/posts/dynamodb-paper",
  "https://www.alexdebrie.com/posts/dynamodb-eventual-consistency",
  "https://www.alexdebrie.com/posts/dynamodb-graphql",
  "https://www.alexdebrie.com/posts/dynamodb-partitions"
]

# NOTE(review): where the extracted sections are stored is decided inside
# extract_sections; presumably articles/parsed/, which the chunking step reads — confirm.
extract_sections(urls)

Extracted 9 sections from the webpage.
Extracted 7 sections from the webpage.
Extracted 9 sections from the webpage.
Extracted 10 sections from the webpage.
Extracted 15 sections from the webpage.
Extracted 12 sections from the webpage.
Extracted 9 sections from the webpage.


#### 2. Chunking

In [2]:
# Chunk the parsed articles with the project's Chunking class.
from src.preprocessing.chunking.chunks import Chunking 

# Point the chunker at the parsed-articles directory.
chunking = Chunking(input_dir='articles/parsed/')
# Keep whatever chunk_corpus() returns in `chunks`.
# NOTE(review): the neutralising step reads articles/chunks/chunks.txt, so
# chunk_corpus() presumably also writes that file — confirm in the Chunking class.
chunks = chunking.chunk_corpus()


#### 3. Neutralising

In [1]:
# Run the LLM-based neutraliser over the chunks file.
from src.preprocessing.neutralising.llm import NeutraliserLLM

# Chunks file handed to the neutraliser.
input_file_path = 'articles/chunks/chunks.txt'
# Directory passed as the neutraliser's output location.
output_dir = 'articles/neutralised/'
# Style label passed through to NeutraliserLLM.
# NOTE(review): the set of accepted values is not visible in this notebook — check the class.
output_style = 'draft'

neutraliser = NeutraliserLLM(input_file_path, output_dir, output_style)
# The recorded output reports progress as "Anthropic succeeded i/188".
# NOTE(review): presumably this issues one external LLM request per chunk, so a
# re-run is slow and billable — confirm before using Run All.
neutraliser.neutralise_corpus()


Anthropic succeeded 2/188
Anthropic succeeded 6/188
Anthropic succeeded 7/188
Anthropic succeeded 9/188
Anthropic succeeded 11/188
Anthropic succeeded 12/188
Anthropic succeeded 14/188
Anthropic succeeded 17/188
Anthropic succeeded 19/188
Anthropic succeeded 23/188
Anthropic succeeded 25/188
Anthropic succeeded 27/188
Anthropic succeeded 31/188
Anthropic succeeded 35/188
Anthropic succeeded 38/188
Anthropic succeeded 39/188
Anthropic succeeded 40/188
Anthropic succeeded 42/188
Anthropic succeeded 43/188
Anthropic succeeded 44/188
Anthropic succeeded 45/188
Anthropic succeeded 47/188
Anthropic succeeded 53/188
Anthropic succeeded 57/188
Anthropic succeeded 58/188
Anthropic succeeded 59/188
Anthropic succeeded 61/188
Anthropic succeeded 62/188
Anthropic succeeded 63/188
Anthropic succeeded 66/188
Anthropic succeeded 67/188
Anthropic succeeded 71/188
Anthropic succeeded 72/188
Anthropic succeeded 73/188
Anthropic succeeded 77/188
Anthropic succeeded 78/188
Anthropic succeeded 79/188
Anthr

["- This post offers information on DynamoDB Partitions.\n- DynamoDB powers high-traffic systems worldwide, including Amazon.com's shopping cart, real-time ad platform bidding, and low-latency gaming applications.\n- DynamoDB's popularity is due to its fast, consistent performance regardless of scale.\n- The consistent and predictable scaling properties of DynamoDB are due to basic computer science principles, not superior computing power.",
 "- This post will delve into DynamoDB partitions: what they are, why they matter, and how they should influence data modeling.\n- Understanding DynamoDB partitions is crucial for comprehending DynamoDB's behavior and the rationale behind its API restrictions and single-table design principles.\n- Initially, the DynamoDB API may seem unnecessarily restrictive, and single-table design principles may seem bizarre.\n- However, once DynamoDB partitions are understood, the reasons for these aspects become clear.\n- The post will cover the following sect

In [11]:
# Print the character length and cl100k_base token count of every
# newline-separated entry in the chunks file. The encoding is pinned to UTF-8
# so lengths and token counts do not depend on the platform's locale default.
# Requires num_tokens_from_string, which is defined in an earlier cell.
with open('articles/chunks/chunks.txt', 'r', encoding='utf-8') as file:
  chunks = file.read()

# NOTE: split('\n') yields a final empty entry when the file ends with a newline.
for i, chunk in enumerate(chunks.split('\n')):
  print(f"Chunk {i}: Length {len(chunk)}. Tokens {num_tokens_from_string(chunk, 'cl100k_base')}")

Chunk 0: Length 681. Tokens 148
Chunk 1: Length 583. Tokens 123
Chunk 2: Length 787. Tokens 179
Chunk 3: Length 550. Tokens 123
Chunk 4: Length 651. Tokens 135
Chunk 5: Length 969. Tokens 211
Chunk 6: Length 821. Tokens 168
Chunk 7: Length 615. Tokens 141
Chunk 8: Length 923. Tokens 207
Chunk 9: Length 992. Tokens 188
Chunk 10: Length 676. Tokens 128
Chunk 11: Length 524. Tokens 106
Chunk 12: Length 952. Tokens 213
Chunk 13: Length 795. Tokens 173
Chunk 14: Length 534. Tokens 109
Chunk 15: Length 953. Tokens 214
Chunk 16: Length 895. Tokens 179
Chunk 17: Length 790. Tokens 175
Chunk 18: Length 561. Tokens 120
Chunk 19: Length 920. Tokens 176
Chunk 20: Length 343. Tokens 70
Chunk 21: Length 629. Tokens 145
Chunk 22: Length 890. Tokens 187
Chunk 23: Length 841. Tokens 185
Chunk 24: Length 558. Tokens 113
Chunk 25: Length 861. Tokens 190
Chunk 26: Length 957. Tokens 180
Chunk 27: Length 830. Tokens 166
Chunk 28: Length 878. Tokens 186
Chunk 29: Length 746. Tokens 161
Chunk 30: Length 726.