## Input data

In [1]:
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

In [9]:
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    The document is opened as a context manager so it is closed again once
    all pages have been read (or if text extraction raises part-way through).

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: One dictionary per page, in document order, with the keys:
            - "page_number": 1-based page number (enumerate index + 1).
            - "page_char_count": length of the cleaned page text.
            - "page_word_count": number of pieces obtained by splitting the
              text on single spaces (rough; an empty page still counts as 1).
            - "page_sentence_count_raw": number of pieces obtained by
              splitting the text on ". " (rough; not real sentence detection).
            - "page_token_count": character count divided by 4 (rule of thumb).
            - "text": page text with newlines replaced by spaces and
              leading/trailing whitespace stripped.
    """
    pages_and_texts = []
    # Context manager closes the document when the block exits.
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so pass the page count explicitly to let
        # the progress bar show "n/total" instead of an open-ended counter.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):  # iterate the document pages
            text = page.get_text()  # get plain text encoded as UTF-8
            text = text.replace("\n", " ").strip()
            pages_and_texts.append({"page_number": page_number + 1,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts


In [None]:
# Try the reader on the RAG paper PDF (path is relative to the working directory).
# The result is not assigned; as the cell's last expression the whole list of page dicts is displayed.
open_and_read_pdf(pdf_path="Retrieval-AugmentedGenerationRAG-AdvancingAIwithDynamicKnowledgeIntegration.pdf")

In [12]:
# Read the AWS log-sources article PDF into a list of per-page dicts.
# Later cells add keys ("sentences", "sentence_chunks", "num_chunks") to these same dicts in place.
isc2_1 = open_and_read_pdf(pdf_path="Am I Logging the Right AWS Log Sources_.pdf")
# Bare name as the last expression: displays every page dict, including the full page text.
isc2_1

0it [00:00, ?it/s]

[{'page_number': 1,
  'page_char_count': 2438,
  'page_word_count': 376,
  'page_sentence_count_raw': 17,
  'page_token_count': 609.5,
  'text': "Disclaimer: The views and opinions expressed in this article belong solely to the author and do not necessarily reflect those of ISC2. As organizations increasingly rely on external cloud platforms such as Amazon Web Services (AWS), maintaining a clear understanding of what’s going on with external services is essential. Based on his experience, Jatin Mannepalli CISSP, CCSP, argues that effective logging is one of the most critical aspects of securing any cloud environment. He considers why having the right logs and knowing how to use them can make or break your security posture. When talking about logging, I can't help but think of some real-world incidents that have had a lasting impact. In 2022, Pegasus Airlines experienced unauthorized access to its AWS environment, leading to significant operational disruptions. The 2019 Capital One brea

In [18]:
# Naive sentence segmentation: split each page's text on ". " and store the pieces under "sentences".
# str.split removes the ". " separator, so pieces other than the last carry no trailing full stop,
# and any ". " inside an abbreviation is also treated as a sentence boundary.
for item in tqdm(isc2_1): # isc2_1 is a list of dictionaries (one per page); each dict is modified in place
    item["sentences"] = item["text"].split(". ")

  0%|          | 0/5 [00:00<?, ?it/s]

In [21]:
# Define split size to turn groups of sentences into chunks:
# the maximum number of consecutive sentences per chunk, passed to split_list as slice_size below.
num_sentence_chunk_size = 7

# Create a function that splits a list into consecutive slices of a desired size
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits input_list into consecutive sublists of at most slice_size items.

    Every sublist has exactly slice_size items except possibly the last one,
    which holds the remainder. Order is preserved and no item is dropped.

    For example, with slice_size=10 a list of 17 sentences is split into two
    lists of lengths 10 and 7; with slice_size=7 it is split into lengths
    7, 7 and 3. An empty input_list returns an empty list.

    Parameters:
        input_list (list): The items to split (sentences in this notebook).
        slice_size (int): Maximum number of items per sublist; must be > 0.

    Returns:
        list[list[str]]: The sublists, in original order.

    Raises:
        ValueError: If slice_size is zero or negative.
    """
    # A negative step would make range() empty and silently return no chunks,
    # and a zero step fails with an unrelated-looking range() error, so reject both up front.
    if slice_size <= 0:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i : (i + slice_size)] for i in range(0, len(input_list), slice_size)]

# For every page dict: group its sentences into chunks of num_sentence_chunk_size
# and record how many chunks that produced. The dicts are updated in place.
for item in tqdm(isc2_1):  # one dict per page
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
import pandas as pd

# Build a per-page DataFrame from the page dicts and show summary statistics
# for its numeric columns, rounded to 2 decimal places.
df = pd.DataFrame(isc2_1)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,num_chunks
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,3.0,2406.2,360.4,16.4,601.55,2.8
std,1.58,1192.04,178.74,6.11,298.01,1.1
min,1.0,477.0,75.0,7.0,119.25,1.0
25%,2.0,2330.0,330.0,16.0,582.5,3.0
50%,3.0,2438.0,376.0,17.0,609.5,3.0
75%,4.0,3331.0,503.0,18.0,832.75,3.0
max,5.0,3455.0,518.0,24.0,863.75,4.0


In [23]:
# Display the DataFrame currently bound to `df` (rich table output).
df

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,sentence_chunks,num_chunks
0,1,2438,376,17,609.5,Disclaimer: The views and opinions expressed i...,[Disclaimer: The views and opinions expressed ...,[[Disclaimer: The views and opinions expressed...,3
1,2,3331,503,24,832.75,world scenarios and how they’ve helped me and ...,[world scenarios and how they’ve helped me and...,[[world scenarios and how they’ve helped me an...,4
2,3,3455,518,18,863.75,forward logs. Monitor web-based threats effect...,"[forward logs, Monitor web-based threats effec...","[[forward logs, Monitor web-based threats effe...",3
3,4,2330,330,16,582.5,Quick Links The Center for Cyber Safety & Educ...,[Quick Links The Center for Cyber Safety & Edu...,[[Quick Links The Center for Cyber Safety & Ed...,3
4,5,477,75,7,119.25,Frequently Asked Questions Contact Us Policies...,[Frequently Asked Questions Contact Us Policie...,[[Frequently Asked Questions Contact Us Polici...,1


In [73]:
import re

# Flatten the per-page structure: one record per sentence chunk
pages_and_chunks = []
for item in tqdm(isc2_1):
    for sentence_chunk in item["sentence_chunks"]:
        # Re-join the chunk's sentences into one string; ". " puts back the
        # separator that the earlier split on ". " removed between sentences.
        joined_sentence_chunk = ". ".join(sentence_chunk).strip()

        # Key order is kept as-is because it determines the column order of
        # any DataFrame built from these records.
        pages_and_chunks.append({
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            "chunk_token_count": len(joined_sentence_chunk) / 4,  # 1 token = ~4 characters
        })

# Total number of chunks across all pages
len(pages_and_chunks)

  0%|          | 0/5 [00:00<?, ?it/s]

14

In [None]:
import random
# View a random sample: spot-check one randomly chosen chunk record.
# No seed is set in this cell, so a different chunk may be shown on each run.
random.sample(pages_and_chunks, k=1)

[{'page_number': 3,
  'sentence_chunk': 'forward logs. Monitor web-based threats effectively through comprehensive log analysis.\xa0 the details (http traffic logs) to block malicious activity and fine-tune WAF rules. AWS Lambda Function Logs Enable logging to Amazon CloudWatch and set up a CloudWatch Logs subscription filter or use Kinesis Data Firehose to stream logs to your SIEM. Alternatively, install and configure a Lambda Extension from your SIEM provider to forward logs directly from the Lambda function to the SIEM, bypassing CloudWatch for reduced latency and greater flexibility.\xa0 Debugging Lambda issues or detecting unauthorized function executions would have been nearly impossible without these logs, given they are server-less and leave no trace except for the log files. They’ve also proven to be critical for auditing and forensic analysis. AWS CloudWatch Logs Ensure necessary logs are captured in relevant Log Groups. Create a Subscription Filter in CloudWatch Logs and cho

In [32]:
# Get stats about our chunks (summary of the numeric columns, rounded to 2 decimal places).
# NOTE: this rebinds `df`, which an earlier cell used for the per-page DataFrame.
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,14.0,14.0,14.0,14.0
mean,2.64,848.36,123.86,212.09
std,1.28,316.54,45.15,79.13
min,1.0,353.0,47.0,88.25
25%,2.0,547.25,79.75,136.81
50%,2.5,955.0,137.0,238.75
75%,3.75,1058.25,150.25,264.56
max,5.0,1278.0,196.0,319.5


In [36]:
# Preview the first two chunk records.
pages_and_chunks[:2]

[{'page_number': 1,
  'sentence_chunk': "Disclaimer: The views and opinions expressed in this article belong solely to the author and do not necessarily reflect those of ISC2As organizations increasingly rely on external cloud platforms such as Amazon Web Services (AWS), maintaining a clear understanding of what’s going on with external services is essentialBased on his experience, Jatin Mannepalli CISSP, CCSP, argues that effective logging is one of the most critical aspects of securing any cloud environmentHe considers why having the right logs and knowing how to use them can make or break your security postureWhen talking about logging, I can't help but think of some real-world incidents that have had a lasting impactIn 2022, Pegasus Airlines experienced unauthorized access to its AWS environment, leading to significant operational disruptionsThe 2019 Capital One breach, in which a simple AWS misconfiguration compromised over 100 million records, showing how devastating small mistak

In [37]:
from sentence_transformers import SentenceTransformer
# Load the "all-mpnet-base-v2" sentence-transformers model; the same model object is used
# further down to embed both the text chunks and the search query.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [38]:
# Make sure the model is on the CPU before encoding.
embedding_model.to("cpu")

# Embed each chunk one by one; the result is stored on the chunk's own dict under "embedding",
# so pages_and_chunks is modified in place.
for item in tqdm(pages_and_chunks):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/14 [00:00<?, ?it/s]

In [39]:
# Put the chunk records (text, stats and embedding) into a DataFrame and save it as CSV
# in the working directory, without the index column.
# NOTE: CSV is text-only, so the "embedding" column comes back as strings when the file
# is read again and has to be parsed into arrays (the reload cell further down does this).
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [40]:
# Preview the first two rows of the in-memory frame (before any CSV round-trip).
text_chunks_and_embeddings_df.head(2)

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,1,Disclaimer: The views and opinions expressed i...,968,145,242.0,"[0.059340823, 0.050530076, 0.030840993, -0.029..."
1,1,"In June 2021, Turkish beauty brand Cosmolog Ko...",1053,148,263.25,"[0.049672864, 0.017929126, 0.020043159, -0.014..."


### Optional: reload the text chunks and embeddings from the saved CSV (skip this section if the DataFrame of text chunks and embeddings is already in memory)

In [41]:
# Read the saved CSV back into a separate frame to inspect how it survives the round-trip.
text_chunks_and_embeddings_df_v2 = pd.read_csv(embeddings_df_save_path)

In [None]:
# NOTE: after the CSV round-trip the 'embedding' column no longer holds arrays; each value is a
# string of space-separated numbers in scientific ("e") notation, which must be parsed before use.
text_chunks_and_embeddings_df_v2.head(2)

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,1,Disclaimer: The views and opinions expressed i...,968,145,242.0,[ 5.93408234e-02 5.05300760e-02 3.08409929e-...
1,1,"In June 2021, Turkish beauty brand Cosmolog Ko...",1053,148,263.25,[ 4.96728644e-02 1.79291256e-02 2.00431589e-...


In [None]:
import random

import torch
import numpy as np 
import pandas as pd

# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV):
# strip the surrounding brackets, then parse the space-separated numbers
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# The Retrieval section reads the frame under the plural name `text_chunks_and_embeddings_df`.
# Bind that name too, so the notebook also works when this reload path is the only one
# that was run (e.g. on a fresh kernel); the singular name above is kept for compatibility.
text_chunks_and_embeddings_df = text_chunks_and_embedding_df

# Convert texts and embedding df to list of dicts (replaces any in-memory pages_and_chunks)
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

### Continue here if you skipped the optional reload section above

## Retrieval

In [47]:
import torch 
import numpy as np

# Stack the per-chunk embeddings from the DataFrame into a single float32 tensor on the CPU.
# NOTE: this reads `text_chunks_and_embeddings_df` (plural "embeddings") and rebinds `embeddings`.
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df["embedding"].tolist()), dtype=torch.float32).to("cpu")
# Display the tensor's shape as a sanity check.
embeddings.shape

torch.Size([14, 768])

In [49]:
from sentence_transformers import util

# 1. Define the query
# Note: This could be anything. But since the indexed document is an article about AWS log sources,
# we'll stick with logging-related queries.
query = "importance of logs"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples 
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
# dot_score returns a 2-D score matrix; [0] selects the row belonging to our single query
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5).
# torch.topk raises if k is larger than the number of scores, so clamp k for short documents
# that produce fewer than 5 chunks.
top_results_dot_product = torch.topk(dot_scores, k=min(5, len(dot_scores)))
top_results_dot_product

Query: importance of logs
Time take to get scores on 14 embeddings: 0.00243 seconds.


torch.return_types.topk(
values=tensor([0.5495, 0.5167, 0.5145, 0.4558, 0.4534]),
indices=tensor([ 4,  0, 10,  2,  1]))

In [50]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    """
    Prints text word-wrapped so that no output line exceeds wrap_length characters.

    Parameters:
        text (str): The text to print.
        wrap_length (int): Maximum line width in characters (default 80).

    Returns:
        None. The wrapped text is written to standard output.
    """
    # Joining the wrapped lines with newlines is how textwrap.fill is defined.
    print("\n".join(textwrap.wrap(text, width=wrap_length)))

In [51]:
# Show the query followed by each retrieved chunk with its score and page number.
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indices from torch.topk
# ([0] holds the scores/values, [1] holds the matching indices)
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    # idx is used directly as an index into the list of chunk records
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can look the passage up in the source PDF (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'importance of logs'

Results:
Score: 0.5495
Text:
They’re essential for tracking resource changes and catching misconfigurations,
like open security groups or overly permissive IAM rolesVPC Flow Logs Enable VPC
Flow Logs to CloudWatch or S3Use Lambda or a CloudWatch subscription filter to
forward logs to your SIEM or leverage Kinesis or SQS for scalable, real-time or
batch processingEnsure logs are parsed and monitored for security events such as
non-standard ports, malicious domains or unauthorized DNS server trafficWhen
analyzing network behavior or investigating anomalies, these logs provide a
detailed history of traffic patternsThey’ve been particularly useful for
detecting unauthorized access attempts, especially when you are investigating a
breachAWS GuardDuty Logs Export GuardDuty findings to CloudWatch Events or an S3
bucket
Page number: 2


Score: 0.5167
Text:
Disclaimer: The views and opinions expressed in this article belong solely to
the author and do not necessaril

In [1]:
# Shell escape: ask the NVIDIA driver tool for GPU status.
# The recorded output shows the command was not found on the machine this was run on.
!nvidia-smi

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.
