In [3]:
# Download PDF file
import os
import requests

# Get PDF document
pdf_path = "_The Art of Electronics 3rd ed [2015].pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File _The Art of Electronics 3rd ed [2015].pdf exists.


In [4]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 33,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': -33,
  'page_char_count': 72,
  'page_word_count': 21,
  'page_sentence_count_raw': 1,
  'page_token_count': 18.0,
  'text': 'THIRD EDITION THE O F E L E C T R O N I C S PAUL  HOROWITZ WINFIELD HILL'},
 {'page_number': -32,
  'page_char_count': 2148,
  'page_word_count': 322,
  'page_sentence_count_raw': 15,
  'page_token_count': 537.0,
  'text': 'The Art of Electronics Third Edition At long last, here is the thoroughly revised and updated, and long-anticipated, third edition of the hugely successful The Art of Electronics. Widely accepted as the best single authoritative text and reference on electronic circuit design, both analog and digital, the ﬁrst two editions were translated into eight languages, and sold more than a million copies worldwide. The art of electronics is explained by stressing the methods actually used by circuit designers – a combination of some basic laws, rules of thumb, and a nonmathematical treatment that encourages understanding why and how a

In [5]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 963,
  'page_char_count': 5139,
  'page_word_count': 881,
  'page_sentence_count_raw': 35,
  'page_token_count': 1284.75,
  'text': '964 13.13. Phase-locked loops Art of Electronics Third Edition and R4/R3 determines the damping, i.e., absence of over- shoot for step changes in frequency. You might begin with a value of R4 somewhere in the range of 10% to 20% of R3. D. Loop damping and jitter A side effect of the nonzero “damping” resistor R4 is the creation of some jitter in the PLL output. An easy way to see this is to realize that even at high frequencies the loop ﬁlter permits a fraction R4/(R3 + R4) of the raw phase- detector output to reach the VCO. For typical ratios, R3 ≈ 10R4, this can add substantial jitter to the VCO output. The usual solution is to add a small capacitor (∼ C2/20) from the VCO control input to ground, preferably close to the VCO pin to ﬁlter any other high-frequency noise as well. E. PLL real-world design We sailed through this design exampl

In [6]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-33,72,21,1,18.0,THIRD EDITION THE O F E L E C T R O N I C S PA...
1,-32,2148,322,15,537.0,The Art of Electronics Third Edition At long l...
2,-31,34,5,1,8.5,This page intentionally left blank
3,-30,111,16,1,27.75,THE ART OF ELECTRONICS Third Edition Paul Horo...
4,-29,1163,167,7,290.75,"32 Avenue of the Americas, New York, NY 10013-..."


In [7]:
# Get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1225.0,1225.0,1225.0,1225.0,1225.0
mean,579.0,3869.54,664.74,27.03,967.39
std,353.77,1082.84,211.93,11.68,270.71
min,-33.0,15.0,4.0,1.0,3.75
25%,273.0,3292.0,556.0,22.0,823.0
50%,579.0,3952.0,670.0,28.0,988.0
75%,885.0,4656.0,788.0,33.0,1164.0
max,1191.0,7118.0,1865.0,148.0,1779.5


In [8]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [9]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1225 [00:00<?, ?it/s]

In [10]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': 545,
  'page_char_count': 4656,
  'page_word_count': 799,
  'page_sentence_count_raw': 29,
  'page_token_count': 1164.0,
  'text': '546 8.11. Noise in transimpedance ampliﬁers Art of Electronics Third Edition gain-setting feedback resistor, it is stable.100 The input stage has a noise voltage density of en≈0.6nV/ √ Hz. The second op-amp (LT6230) is a wideband (200 MHz) low- noise (1.1 nV/ √ Hz) op-amp, powered by a low-noise ±5 V power supply101 (Figure 8.80D); with the JFET input stage, the combined GBW of these two stages is fT=gm/2πCc, or about 200 MHz with a Cc of 100 pF. The last stage has a gain of 50 and an fT of 65 MHz, boosting the composite ampliﬁer’s GBW to 10 GHz. When conﬁgured as a transimpedance ampliﬁer (i.e., Figures 8.80A and C), this circuit has 5× less noise voltage (thus 5× less enCin noise) than the composite TIA of Fig- ure 8.79. It also has greater bandwidth, thanks to its 20× higher fT, even with the larger value of feedback resistor (20M, chos

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,579.0,3869.54,664.74,27.03,967.39,25.8
std,353.77,1082.84,211.93,11.68,270.71,10.91
min,-33.0,15.0,4.0,1.0,3.75,1.0
25%,273.0,3292.0,556.0,22.0,823.0,21.0
50%,579.0,3952.0,670.0,28.0,988.0,27.0
75%,885.0,4656.0,788.0,33.0,1164.0,31.0
max,1191.0,7118.0,1865.0,148.0,1779.5,147.0


In [12]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 26

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 26 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1225 [00:00<?, ?it/s]

In [13]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
random.sample(pages_and_texts, k=1)

[{'page_number': 874,
  'page_char_count': 5033,
  'page_word_count': 821,
  'page_sentence_count_raw': 52,
  'page_token_count': 1258.25,
  'text': 'Art of Electronics Third Edition Review of Chapter 12 875 Review of Chapter 12 An A-to-S summary of what we have learned in Chap- ter 12. This summary reviews basic principles, facts, and application advice in Chapter 12. ¶A. Logic Interconnections. The subject of this chapter is the interconnection of digital logic signals and logic devices to. . . everything, where “ev- erything” includes (a) other logic devices, (b) input sources (switches, optoelectronics, cables), and (c) output devices (dc and ac power loads, optoelectonics, cables). So it’s a long chapter, rich with multiple themes. Here we try to or- ganize these diverse topics into manageable paragraphs. ¶B. Logic Families. §12.1.1. Contemporary digital logic is owned by CMOS, with the minor exception of some emitter-coupled logic families (ECL, PECL, and LVPECL) and some BiCMOS 

In [14]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,579.0,3869.54,664.74,27.03,967.39,25.8,1.52
std,353.77,1082.84,211.93,11.68,270.71,10.91,0.55
min,-33.0,15.0,4.0,1.0,3.75,1.0,1.0
25%,273.0,3292.0,556.0,22.0,823.0,21.0,1.0
50%,579.0,3952.0,670.0,28.0,988.0,27.0,2.0
75%,885.0,4656.0,788.0,33.0,1164.0,31.0,2.0
max,1191.0,7118.0,1865.0,148.0,1779.5,147.0,6.0


In [15]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1225 [00:00<?, ?it/s]

1865

In [16]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 566,
  'sentence_chunk': 'Art of Electronics Third Edition 8.13.5. Measuring the noise voltage 567 100Hz–1kHz 10Hz–100Hz 0.1Hz–1Hz 1Hz–10Hz (10s/div) (100ms/div) (10ms/div) (1s/div) Figure 8.106. LT1012 noise current versus time, for successive decade bandpasses. Vertical: 5 pA/div. Horizontal: scaled to band- pass, as indicated.10 Hz is only 3.5 times greater than that between 0.1 Hz and 10 Hz; going down another six decades (to 10−12 Hz), the corresponding ratio grows only to 6.5. Put another way, the 1/f total noise power, going all the way down to a frequency that is the reciprocal of 32,000 years (when Neanderthals still roamed the planet, and there were no op-amps), is just six times greater than that of the usual datasheet 0.1–10 Hz “low-frequency noise.”So much for catastrophes. To ﬁnd out whether the low-frequency noise of real op- amps continues to conform to a 1/f spectrum, we mea- sured the current noise spectrum of an LT1012 op-amp all the way down to 0.5 

In [17]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1865.0,1865.0,1865.0,1865.0
mean,566.37,2537.2,432.51,634.3
std,353.43,1486.46,268.22,371.62
min,-33.0,3.0,1.0,0.75
25%,256.0,1056.0,174.0,264.0
50%,562.0,2943.0,498.0,735.75
75%,876.0,3762.0,630.0,940.5
max,1191.0,6662.0,1824.0,1665.5


In [18]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 26.25 | Text: How did these magicians accomplish this? (The same way you shake hands with a gorilla – very carefully!).
Chunk token count: 17.75 | Text: THIRD EDITION THE O F E L E C T R O N I C S PAUL HOROWITZ WINFIELD HILL
Chunk token count: 11.75 | Text: Complete speciﬁcations also include the effects
Chunk token count: 8.5 | Text: This page intentionally left blank
Chunk token count: 19.5 | Text: 11 We think he meant that it will outlive us, not kill us. But we’re not sure.


In [19]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -32,
  'sentence_chunk': 'The Art of Electronics Third Edition At long last, here is the thoroughly revised and updated, and long-anticipated, third edition of the hugely successful The Art of Electronics. Widely accepted as the best single authoritative text and reference on electronic circuit design, both analog and digital, the ﬁrst two editions were translated into eight languages, and sold more than a million copies worldwide. The art of electronics is explained by stressing the methods actually used by circuit designers – a combination of some basic laws, rules of thumb, and a nonmathematical treatment that encourages understanding why and how a circuit works. Paul Horowitz is a Research Professor of Physics and of Electrical Engineering at Harvard University, where in 1974 he originated the Laboratory Electronics course from which emerged The Art of Electronics. In addition to his work in circuit design and electronic instrumentation, his research interests have

In [20]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cpu") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)

# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")



Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07981747e-02  3.03164832e-02 -2.01218091e-02  6.86483681e-02
 -2.55255867e-02 -8.47688038e-03 -2.07095101e-04 -6.32377341e-02
  2.81606503e-02 -3.33353430e-02  3.02634705e-02  5.30721024e-02
 -5.03526144e-02  2.62288135e-02  3.33314687e-02 -4.51577976e-02
  3.63044180e-02 -1.37116888e-03 -1.20171215e-02  1.14946431e-02
  5.04510738e-02  4.70857024e-02  2.11913753e-02  5.14607094e-02
 -2.03746147e-02 -3.58889364e-02 -6.67880930e-04 -2.94394009e-02
  4.95859012e-02 -1.05639324e-02 -1.52014028e-02 -1.31759769e-03
  4.48196940e-02  1.56023549e-02  8.60379828e-07 -1.21383998e-03
 -2.37978939e-02 -9.09392838e-04  7.34483125e-03 -2.53929431e-03
  5.23370206e-02 -4.68044169e-02  1.66214276e-02  4.71578985e-02
 -4.15599532e-02  9.01981664e-04  3.60279083e-02  3.42214406e-02
  9.68227461e-02  5.94828650e-02 -1.64984800e-02 -3.51249650e-02
  5.92523487e-03 -7.07979023e-04 -2.4103

In [23]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [24]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: total: 1.17 s
Wall time: 3.21 s


tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0')

In [24]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [26]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242675e-02 9.02281553e-02 -5.09548280e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156307e-02 5.92139475e-02 -1.66167468e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,116,191.5,[ 2.79801842e-02 3.39813866e-02 -2.06426680e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,144,235.25,[ 6.82566985e-02 3.81275080e-02 -8.46853852e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264494e-02 -8.49768054e-03 9.57158953e-...


In [25]:
import random

import torch
import numpy as np 
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1813, 768])

In [26]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-32,The Art of Electronics Third Edition At long l...,2149,323,537.25,"[-0.00654813927, -0.0412254333, -0.0644022673,..."
1,-29,"32 Avenue of the Americas, New York, NY 10013-...",1161,165,290.25,"[-0.00998866744, -0.0643754452, -0.0292186998,..."
2,-24,CONTENTS List of Tables xxii Preface to the Fi...,2207,342,551.75,"[-0.0398231223, -0.100760438, -0.0599776283, 0..."
3,-23,x Contents Art of Electronics Third Edition Ad...,2803,415,700.75,"[-0.023741452, -0.0902954713, -0.0472430512, 0..."
4,-22,Art of Electronics Third Edition Contents xi 3...,2841,420,710.25,"[-0.0447079688, -0.0523321927, -0.0625134856, ..."


In [27]:
embeddings[0]

tensor([-6.5481e-03, -4.1225e-02, -6.4402e-02,  1.6427e-02, -1.6908e-02,
        -2.5385e-02,  3.2495e-02, -2.6058e-02,  5.3695e-02,  1.2287e-02,
        -1.6656e-02,  4.2595e-02,  5.2783e-02,  3.7763e-02,  2.1270e-02,
         1.4935e-02, -1.1150e-02,  2.5732e-02, -2.9821e-02, -8.2697e-03,
        -1.8214e-02, -3.8519e-02,  2.2010e-02,  1.6685e-02, -7.8782e-03,
         4.5970e-02, -2.0625e-02, -1.4414e-02, -2.4003e-04,  2.0524e-02,
        -1.5947e-02,  4.4469e-02, -3.8072e-02,  4.5238e-02,  2.3233e-06,
         8.3988e-03,  4.2182e-02,  1.7289e-02, -4.6551e-02,  5.4275e-02,
        -2.1486e-02,  3.2109e-02,  1.4298e-02, -1.0042e-02,  2.8033e-02,
        -1.1275e-02,  4.2179e-02, -3.2163e-02, -4.8631e-02,  4.0192e-02,
         9.7969e-03, -3.4346e-02,  5.1446e-02,  4.7456e-02,  5.4358e-02,
         3.6822e-02, -4.5360e-02,  1.4342e-02,  3.5378e-02,  1.8059e-02,
        -4.2338e-02,  1.8228e-02,  1.6923e-03, -1.7731e-02,  5.8096e-02,
        -1.4673e-02, -6.4336e-02,  4.0720e-02, -4.1

In [28]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device='cpu') # choose the device to load the model to



In [29]:
# 1. Define the query
# Note: This could be anything. But since we're working with a nutrition textbook, we'll stick with nutrition-based queries.
query = "oscilloscope functions"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples 
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: oscilloscope functions
Time take to get scores on 1813 embeddings: 0.00132 seconds.


torch.return_types.topk(
values=tensor([0.6111, 0.6079, 0.5985, 0.5971, 0.5636]),
indices=tensor([1775, 1785, 1777, 1783, 1782]))

In [30]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [31]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'oscilloscope functions'

Results:
Score: 0.6111
Text:
THE OSCILLOSCOPE APPENDIX O The oscilloscope (“scope” for short) is, by far, the
most useful and versatile electronic circuit test instrument.1 As usually used,
it lets you “see” voltages in a circuit as a function of time, triggering on a
particular point of the waveform so that a stationary display results. Contempo-
rary scopes are almost invariably digital (input signals are digitized,
processed, and displayed), and they do (and usu- ally better) what their analog
ancestors did. To understand how to use an oscilloscope, we think it best to
start with the traditional (and nearly extinct) 2-channel analog scope, for
which we’ve drawn a block diagram (Figure O.1) and typ- ical front panel (Figure
O.2). Digital scopes carry forward nearly all of its features, to which they add
an impressive array of capabilities (and a few hazards). O.1 The analog
oscilloscope O.1.1 Vertical Beginning with the signal inputs, most analog scop

In [36]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

# Example tensors
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# Calculate dot product
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# Calculate cosine similarity
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))

Dot product between vector1 and vector2: tensor(14.)
Dot product between vector1 and vector3: tensor(32.)
Dot product between vector1 and vector4: tensor(-14.)
Cosine similarity between vector1 and vector2: tensor(1.0000)
Cosine similarity between vector1 and vector3: tensor(0.9746)
Cosine similarity between vector1 and vector4: tensor(-1.0000)
