In [None]:
# import PDF
import os
import requests

# Path where the PDF is expected locally; the download cell also saves to this path.
# NOTE(review): the leading "/" makes this an absolute path under the filesystem
# root - confirm that is intended rather than a relative "sample_data/..." path.
pdf_path = "/sample_data/(2010) Kirschenbaum - What is Digital Humanities.pdf"
print(pdf_path)

In [None]:
# Download the PDF if it is not already available locally
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")

    # URL of the PDF
    url = "https://mkirschenbaum.wordpress.com/wp-content/uploads/2011/03/ade-final.pdf"

    # Local filename to save the downloaded file to
    filename = pdf_path

    # Send a GET request. requests has no default timeout, so without one this
    # cell could hang indefinitely if the server never answers.
    response = requests.get(url, timeout=30)

    # Check if the request was successful
    if response.status_code == 200:
        # Make sure the target folder exists; otherwise open() fails with
        # FileNotFoundError even though the download itself succeeded.
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)

        # Write the raw bytes of the response body to disk
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {filename}")
    else:
        print(
            f"[INFO] Failed to download the file. Status code: {response.status_code}"
        )

else:
    print(f"File {pdf_path} exists.")

In [None]:
# Open the PDF file, see: http://github.com/pymupdf/PyMuPDF
import fitz

print("PyMuPDF is installed and the fitz module is available.")

In [None]:
from tqdm.auto import tqdm


def text_formatter(text: str) -> str:
    """Flatten line breaks into spaces and trim both ends of ``text``."""
    # Splitting on newlines and re-joining with a space swaps every newline for a space.
    single_line = " ".join(text.split("\n"))
    # Further formatting steps can be added here.
    return single_line.strip()


def open_and_read_pdf(pdf_path: str, page_number_offset: int = -5) -> list[dict]:
    """Read a PDF and collect the cleaned text plus simple size statistics per page.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        page_number_offset: Value added to the zero-based page index to obtain
            the reported ``page_number``. The default of ``-5`` is the
            adjustment this notebook has always applied; pass another value
            when the document's printed page numbers start elsewhere.

    Returns:
        One dict per page with the keys ``page_number``, ``page_char_count``,
        ``page_word_count``, ``page_sentence_count_raw``, ``page_token_count``
        and ``text``. The word and sentence counts are rough: they come from
        splitting on a single space and on ``". "`` respectively.
    """
    pages_and_texts = []
    # Open the document as a context manager so the file is closed again even
    # if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so hand tqdm the page count explicitly
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())  # plain text of the page, cleaned
            pages_and_texts.append(
                {
                    "page_number": page_number + page_number_offset,
                    "page_char_count": len(text),
                    "page_word_count": len(text.split(" ")),
                    "page_sentence_count_raw": len(text.split(". ")),
                    # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                    "page_token_count": len(text) / 4,
                    "text": text,
                }
            )
    return pages_and_texts


pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

In [None]:
import random

random.sample(pages_and_texts, k=3)

In [None]:
# reading some stats from the text
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe().round(2)

#### Note: The Importance of Token Count

It’s essential to consider token count when working with embedding models or Large Language Models (LLMs), because each model can only process a limited number of tokens at once. Text that exceeds this limit is typically truncated, so the excess information is lost, which can degrade the quality of the embeddings and the model’s understanding of the text. Carefully managing token counts is therefore crucial, especially for applications requiring detailed analysis, such as close reading of text.


#### Further Text Processing: Splitting Pages into Sentences

1. The simplest approach is to split the text on `". "`.
2. Alternatively, this step can be done with an NLP library such as spaCy or NLTK.


In [None]:
import sys

print(sys.executable)

Depending on your corpus, you may choose which spaCy model to use:

**Small model**

```bash
python -m spacy download en_core_web_sm
```

**Medium model**

```bash
python -m spacy download en_core_web_md
```

**Large model**

```bash
python -m spacy download en_core_web_lg
```


In [None]:
from spacy.lang.en import English

# Create an English pipeline object
nlp = English()

# Register the sentencizer component, c.f. https://spacy.io/api/sentencizer
nlp.add_pipe("sentencizer")

# Run the pipeline on a small example that contains three sentences
doc = nlp(
    "This is a sentence. This is another sentence. This is about digital humanities."
)
# Sanity check: exactly three sentences must be detected
assert sum(1 for _ in doc.sents) == 3

# Show the sentences that were found
list(doc.sents)

In [None]:
pages_and_texts[1]

In [None]:
for item in tqdm(pages_and_texts):
    # Split the page text into sentences and convert each one
    # from its spaCy datatype to a plain string in a single pass
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Record how many sentences were found on this page
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
random.sample(pages_and_texts, k=1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

#### Chunking Text for Embedding and Retrieval

Chunking text into manageable segments is essential for effective processing in a Retrieval-Augmented Generation (RAG) pipeline.

**LangChain** provides tools to handle this chunking, making it easier to organize content to fit within the embedding model’s context window. Proper chunking ensures that the context passed to the LLM is focused and relevant, leading to more accurate and precise responses.


In [None]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10  # Experiment with the number


def split_list(
    input_list: list[str], slice_size: int = num_sentence_chunk_size
) -> list[list[str]]:
    """Split ``input_list`` into consecutive sub-lists of at most ``slice_size`` items.

    The last sub-list holds the remainder, e.g. a list of 20 items becomes
    ``[10, 10]`` and a list of 25 items becomes ``[10, 10, 5]`` (sizes shown).

    Args:
        input_list: The items to split; an empty list yields an empty result.
        slice_size: Maximum number of items per sub-list. The default is the
            value ``num_sentence_chunk_size`` had when this function was
            defined; re-run this cell after changing the constant.

    Returns:
        The sub-lists, in the original order.

    Raises:
        ValueError: If ``slice_size`` is smaller than 1. Without this check a
            negative size would silently return an empty list and drop all items.
    """
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [
        input_list[start : start + slice_size]
        for start in range(0, len(input_list), slice_size)
    ]


test_list = list(range(25))
split_list(test_list)

In [None]:
# Loop through pages and texts and split each page's sentences into chunks
for item in tqdm(pages_and_texts):
    # Group the page's sentences into lists of at most num_sentence_chunk_size
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)
    # Keep the number of chunks per page for the statistics below
    item["num_chunks"] = len(item["sentence_chunks"])

In [None]:
random.sample(pages_and_texts, k=1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

### Splitting and Embedding Text Chunks

Each chunk of text is split into individual items and embedded as its own unique numerical representation. This process helps ensure that each segment of content is distinct and appropriately prepared for retrieval and analysis in the RAG pipeline.

The following code performs this function by:

1. **Combining Sentences into Chunks**: Each chunk is created by joining related sentences into a paragraph-like structure.
2. **Cleaning and Formatting**: Simple formatting (e.g., adding spaces after full stops) ensures consistency and readability within each chunk.
3. **Generating Statistics**: The code calculates character count, word count, and an estimated token count for each chunk, which can be useful for managing context windows and model constraints.


In [None]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Join the sentences into a paragraph-like structure, aka a chunk (a single string).
        # The separator must be a space: the string form of a spaCy sentence does not
        # include the whitespace that followed it, so joining with "" glued neighbouring
        # sentences together whenever the boundary was not "." + capital letter
        # (e.g. "...is it?Yes" or "...end.the next").
        joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()

        # ".A" -> ". A" for any remaining full-stop/capital letter combo inside the text.
        # NOTE: this also inserts spaces into dotted abbreviations such as "U.S.A".
        joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk)

        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            # Stats about the chunk
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            "chunk_token_count": len(joined_sentence_chunk) / 4,  # 1 token = ~4 characters
        }
        pages_and_chunks.append(chunk_dict)

# Number of chunks we have
len(pages_and_chunks)

In [None]:
random.sample(pages_and_chunks, k=1)

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

In [None]:
df.head()

In [None]:
print(df.head())  # Displays the first few rows of the DataFrame
print(
    df["chunk_token_count"].describe()
)  # Shows statistics for the chunk_token_count column

In [None]:
# Very short chunks may not contain much useful information, so look at a few of them.
# The threshold below (70 tokens) may need adjusting for your data; the code first
# checks whether any chunk at or below the threshold exists in the DataFrame at all.

# Token threshold used to identify short chunks
min_token_length = 70

# Select the chunks at or below the threshold and report how many there are
filtered_df = df.loc[df["chunk_token_count"] <= min_token_length]
print(
    f"Number of chunks found with token count <= {min_token_length}: {len(filtered_df)}"
)

if filtered_df.empty:
    print(f"No chunks found with token count <= {min_token_length}.")
else:
    # Show up to five randomly chosen short chunks
    sample_size = min(5, len(filtered_df))
    for _, row in filtered_df.sample(sample_size).iterrows():
        print(
            f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}'
        )

In [None]:
# Keep only the rows whose token count is above `min_token_length`
# (i.e. drop the short chunks) and convert them to a list of dicts, one per chunk.
# You will need to adjust the number of tokens depending on your DataFrame structure

pages_and_chunks_over_min_token_len = df[
    df["chunk_token_count"] > min_token_length
].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

In [None]:
import random

random.sample(pages_and_chunks_over_min_token_len, k=1)

#### Embedding Text Chunks

see https://vickiboykis.com/what_are_embeddings/


In [None]:
## Note: Downgraded NumPy to version <2 for compatibility with sentence-transformers.
# NumPy 2.x versions are currently not fully supported by sentence-transformers and may cause errors.
# This downgrade ensures stable integration when encoding embeddings.

import numpy as np

print("NumPy version:", np.__version__)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model; set device to "cpu" or "cuda" depending on available hardware and speed
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2", device="cpu"
)


# Example sentences to embed
sentences = [
    "The Sentence Transformer library provides an easy way to create embeddings.",
    "Sentence embedding is part of the process.",
    "This is a digital humanities project.",
]

# Encode the sentences by calling model.encode() and pair each sentence with its result
embeddings = embedding_model.encode(sentences)
embedding_dict = {text: vector for text, vector in zip(sentences, embeddings)}

# Inspect each sentence next to its embedding, followed by an empty line
for sentence, embedding in embedding_dict.items():
    print(f"Sentence: {sentence}", f"Embedding: {embedding}", "", sep="\n")

In [None]:
embeddings[0].shape

In [None]:
import time
# NOTE: this rebinds the name `tqdm`, which an earlier cell imported from
# `tqdm.auto`, to the one exported by the top-level `tqdm` package.
from tqdm import tqdm

# Start the timer (wall-clock time for the whole embedding loop)
start_time = time.time()

# Move the model to the CPU for embedding creation
embedding_model.to("cpu")

# Generate embeddings for each chunk on the CPU
# This iterates over each item in pages_and_chunks_over_min_token_len,
# calls encode() once per chunk and stores the result on that chunk's
# dict under the "embedding" key (the dicts are modified in place).
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

# Calculate and print the elapsed time
elapsed_time = time.time() - start_time
print(f"Time taken for CPU embedding creation: {elapsed_time:.2f} seconds")

# Optional: Move the model to the GPU for faster embedding creation
# Uncomment the following lines if a GPU is available for use with "cuda"

# embedding_model.to("cuda")  # Requires a GPU to be installed

# # Start timing for GPU embedding
# start_time = time.time()

# # Create embeddings on the GPU (if available)
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

# # Calculate and print the elapsed time for GPU processing
# elapsed_time = time.time() - start_time
# print(f"Time taken for GPU embedding creation: {elapsed_time:.2f} seconds")

In [None]:
import pandas as pd

# Measure execution time for batch embedding and saving (if in Jupyter, uncomment %%time)
# %%time

# Embed all text chunks in batches to optimize memory and performance
# Uncomment the following to batch embeddings:
# text_chunk_embeddings = embedding_model.encode(
#     text_chunks,  # List of text chunks to embed
#     batch_size=32,  # Adjust batch size as needed to improve speed and memory efficiency
#     convert_to_tensor=True  # Optional: return embeddings as PyTorch tensor for further tensor operations
# )

# Optional: Display or use `text_chunk_embeddings` for further processing

# Save the text chunks and embeddings to a DataFrame
# Note: Adjust path as needed
# Each row comes from one chunk dict in pages_and_chunks_over_min_token_len.
# NOTE(review): CSV is a text format, so the "embedding" values are presumably
# written in their string form and must be parsed back after loading - confirm.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(
    embeddings_df_save_path, index=False
)  # Save DataFrame as CSV

In [None]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

### Query


In [None]:
import random
import torch
import numpy as np
import pandas as pd

# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df[
    "embedding"
].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(
    np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32
).to(device)

# Only the last expression of a cell is displayed, so show shape and dtype together
embeddings.shape, embeddings.dtype

In [None]:
text_chunks_and_embedding_df.head()

In [None]:
text_chunks_and_embedding_df["embedding"]

In [None]:
# Stack the per-row embedding arrays along a new first axis into one NumPy array.
# NOTE(review): this rebinds `embeddings`, which an earlier cell set to a torch
# tensor on `device`; after this cell the name refers to a NumPy array instead.
embeddings = np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0)
embeddings

In [None]:
embeddings.shape

In [None]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2", device=device
)  # choose the device to load the model to

### Steps for Querying with Embeddings

1. **Define a Query String**: Start by creating a query as a text string that represents what you’re searching for.

2. **Convert the Query to an Embedding**: Use the same embedding model to transform the query string into an embedding, similar to how the text chunks were embedded.

3. **Calculate Similarity**: Perform a similarity comparison (e.g., dot product or cosine similarity) between the query embedding and each text embedding in the dataset.

4. **Sort Results by Relevance**: Sort the similarity scores in descending order to identify the most relevant results for your query.


In [None]:
import torch
from sentence_transformers import util

# 1. Define the query
query = "digital humanities"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# Make sure the stored embeddings are a tensor with the same device and dtype as
# the query embedding. At this point `embeddings` may be a NumPy array or a tensor
# that lives elsewhere, and the dot product below needs both operands to match;
# a CPU tensor against a CUDA query embedding would raise a device-mismatch error.
embeddings = torch.as_tensor(embeddings).to(
    device=query_embedding.device, dtype=query_embedding.dtype
)

# 3. Get similarity scores with the dot product
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(query_embedding, embeddings)[0]
end_time = timer()

print(
    f"Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds."
)

# 4. Get the top-k results (keeping this to 5 for now); torch.topk raises if k is
# larger than the number of scores, so cap it for very small documents
top_results_dot_product = torch.topk(dot_scores, k=min(5, len(dot_scores)))
top_results_dot_product

In [None]:
# Query the output by inputting a number from the indices
pages_and_chunks[2]

In [None]:
# Build a random matrix with 100x as many rows as the real embeddings to see how
# the dot product scales. The width is taken from the real embeddings instead of a
# hard-coded 768 so the cell keeps working with a different embedding model.
larger_embeddings = torch.randn(100 * embeddings.shape[0], embeddings.shape[1]).to(
    device
)
print(f"Embeddings shape:  {larger_embeddings.shape}")

# Time only the dot product between the query and every synthetic embedding
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()

print(
    f"Time take to get scores on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} seconds."
)