In [2]:
# Using https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1

# import the SentenceTransformer, a wrapper on top of the model
from sentence_transformers import SentenceTransformer

# Load the pretrained 'sentence-transformers/multi-qa-mpnet-base-cos-v1'
# checkpoint through the SentenceTransformer wrapper.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

# Two short sample texts to confirm that encoding works.
docs = ["A paragon of virtue", "The hero of legend"]

# Input limits noted for this model:
# - at most 512 word pieces; anything longer is truncated.
# - training only used inputs of up to 250 word pieces, so quality
#   may drop on longer text.
embeddings = model.encode(docs, batch_size=32, show_progress_bar=True)

# Show the dimensions of the encoded result.
print(embeddings.shape)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.66it/s]

(2, 768)





In [59]:
import re

def makeParagraphs(text):
  '''
  Splits text into paragraph-like chunks.

  A break is a period at the very end of a line, followed by any number
  of further line breaks. The period and the line breaks are consumed,
  so the returned chunks do not end with that period.

  text: a string of arbitrary length.
  returns: a list of strings holding only the text between breaks. Chunks
    may still be empty or whitespace-only (for example when the text ends
    with a break).
  '''
  # Non-capturing groups on purpose: re.split() inserts the text of every
  # capturing group into its result, which used to leak '.\n' fragments and
  # None entries into the paragraph list.
  paragraph_separator_re = re.compile(r'(?:\.\n\r?)+(?:\n\r?)*')
  return paragraph_separator_re.split(text)

def removeEmpty(chunks):
  '''
  Drops chunks that carry no real content and trims the rest.

  chunks: an iterable of strings; None entries are tolerated and skipped.
  returns: a list of the surviving chunks, in their original order, with
    surrounding whitespace stripped. No entry is an empty string.
  '''
  def hasContent(chunk):
    # Keep a chunk only if some character is neither whitespace nor a
    # period. Testing every kind of whitespace (not just newline, tab and
    # space) stops a chunk such as '\r' from passing the filter and then
    # being returned as '' once it is stripped.
    return any(not (char.isspace() or char == '.') for char in chunk)

  return [chunk.strip() for chunk in chunks if chunk and hasContent(chunk)]

def split(chunk, size_limit=300):
  '''
  Breaks an over-long chunk into sentences.

  chunk: a string.
  size_limit: the largest number of whitespace-separated words a chunk may
    hold before it gets broken up.
  returns: [chunk] untouched when it is within the limit. Otherwise a list
    of its sentences, trimmed, with the sentence-ending periods removed and
    empty fragments dropped. A single sentence longer than size_limit is
    returned whole, so a fragment can still exceed the limit.
  '''
  # str.split() without an argument splits on any run of whitespace.
  # Counting with split(' ') treated words joined by a line break as one
  # word (and runs of spaces as extra words), so hard-wrapped text was
  # under-counted and never broken up.
  if len(chunk.split()) <= size_limit:
    return [chunk]
  # Only a period followed by whitespace, or sitting at the end of the
  # chunk, ends a sentence; periods inside decimals or host names stay put.
  sentence_end_re = re.compile(r'\.(?=\s|$)')
  sentences = [sentence.strip() for sentence in sentence_end_re.split(chunk)]
  return [sentence for sentence in sentences if sentence]

def rightSize(chunks, size_limit=300):
  '''
  Runs split() over every chunk and flattens the results.

  chunks: an iterable of strings.
  size_limit: passed straight through to split() for each chunk.
  returns: one flat list holding, in order, every piece split() produced.
  '''
  return [
    piece
    for chunk in chunks
    for piece in split(chunk, size_limit)
  ]

def makeEmbedding(chunks, batch_size=32, device=None):
  '''
  Encodes chunks with the notebook-level `model`.

  chunks: a list of strings to embed.
  batch_size: how many chunks the model encodes per forward pass.
  device: the device name handed to model.encode(), e.g. 'mps', 'cuda' or
    'cpu'. When left as None, 'mps' (Metal shaders on Apple-silicon Macs)
    is used if this machine offers it; otherwise None is passed on and the
    model decides for itself.
  returns: whatever model.encode() returns for these chunks.
  '''
  if device is None:
    # Imported here because the notebook does not import torch itself;
    # sentence_transformers depends on it, so it is installed.
    import torch
    # Older torch builds have no `mps` backend attribute at all.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
      device = 'mps'
  return model.encode(
    chunks
    , batch_size=batch_size
    , device=device
  )

def makeChunks(raw_text):
  '''
  Turns raw text into a list of chunks.

  raw_text: a string of arbitrary length.
  returns: the result of passing the text through makeParagraphs(), then
    removeEmpty(), then rightSize() with its default size limit.
  '''
  paragraphs = makeParagraphs(raw_text)
  non_empty_paragraphs = removeEmpty(paragraphs)
  return rightSize(non_empty_paragraphs)

def fileToChunks(filePath, encoding='utf-8'):
  '''
  Reads a text file and chunks its contents.

  filePath: path of the file to read.
  encoding: text encoding used to decode the file.
  returns: the result of makeChunks() on the full file contents.
  '''
  with open(filePath, encoding=encoding) as source_file:
    contents = source_file.read()
  # The whole file is already in memory, so chunking can run after the
  # handle has been closed.
  return makeChunks(contents)

def makeEmbeddingsPerChunk(forFile, step_size=32):
  '''
  Chunks a text file and embeds every chunk.

  forFile: path of the text file to process.
  step_size: how many chunks are handed to makeEmbedding() per call; must
    be a positive integer.
  returns: a dict with two parallel lists: 'chunks' holds the chunk
    strings and 'embeddings' holds one embedding per chunk, in the same
    order.
  '''
  chunks = fileToChunks(forFile)
  embeddings = []
  # The end of the slice has to advance together with its start. With a
  # fixed end of `step_size`, every batch after the first was empty, so
  # only the first `step_size` chunks were ever embedded.
  for batch_start in range(0, len(chunks), step_size):
    batch = chunks[batch_start:batch_start + step_size]
    embeddings.extend(makeEmbedding(batch))
  return {'chunks': chunks, 'embeddings': embeddings}

# Embed the text in pg2680.txt, then print the first two chunks, each
# followed by the second component of its embedding, as a quick check.
embeddings = makeEmbeddingsPerChunk('pg2680.txt')
for i in (0, 1):
  print(
    embeddings['chunks'][i],
    embeddings['embeddings'][i][1],
  )

The Project Gutenberg eBook of Meditations
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook 0.09277532
Title: Meditations


Author: Emperor of Rome Marcus Aurelius

Release date: June 1, 2001 [eBook #2680]
                Most recently updated: March 9, 2021

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK MEDITATIONS ***



MEDITATIONS

By Marcus Aurelius




CONTENTS


     NOTES

     INTRODUCTION

     FIRST BOOK

     SECOND BOOK

     THIRD BOOK

     FOURTH BOOK

     FIFTH BOOK

     SIXTH BOOK

     SEVENTH BOOK

     EIGHTH BOOK

     NINTH BOOK

     TENTH BOOK

     ELE