In [13]:
# Using https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1

# import the SentenceTransformer, a wrapper on top of the model
from sentence_transformers import SentenceTransformer

# get 'sentence-transformers/multi-qa-mpnet-base-cos-v1', a pretrained model
model = SentenceTransformer(
  'sentence-transformers/multi-qa-mpnet-base-cos-v1'
)

docs = [
  "A paragon of virtue",
  "The hero of legend"
]

# Note that there is a limit of 512 word pieces:
# Text longer than that will be truncated.
# Further note that the model was just trained on
# input text up to 250 word pieces.
# It might not work well for longer text.

embeddings = model.encode(
  docs
  , batch_size=32
  , show_progress_bar=True
)

print(embeddings.shape)

Batches: 100%|██████████| 1/1 [00:00<00:00, 21.11it/s]

(2, 768)





In [21]:
import re

def makeParagraphs(text):
  '''
  creates a list of paragraphs.
  text: a string of arbitrary length.
  '''
  paragraph_separator_re = re.compile(r'(\.\n\r?)+(\n\r?)*')
  return re.split(paragraph_separator_re, text)

def removeEmpty(chunks):
  '''
  removes chunks that only contain dots or new lines
  '''
  return [chunk.strip() for chunk in chunks if chunk and len(chunk.strip('.\n\t ')) > 0]

def split(chunk, size_limit=150):
  '''
  splits chunks so that they contain no more than 150 words or the chosen limit
  '''
  if (len(chunk.split(' ')) > size_limit):
    periods = re.compile(r'\.|\?|;')
    return re.split(periods, chunk)
  return [chunk]

def rightSize(chunks, size_limit=150):
  '''
  creates new chunks if needed so that chunks do not exceed the size limit.
  '''
  batch = []
  for chunk in chunks:
    rightsized_chunks = split(chunk, size_limit)
    for right_sized_chunk in rightsized_chunks:
      batch.append(right_sized_chunk)
  return batch

def makeEmbedding(chunks, model=model):
  '''
  makes embeddings out of a group of chunks
  model is 'sentence-transformers/multi-qa-mpnet-base-cos-v1' and warpped in SentenceTransformer
  '''
  return model.encode(
    chunks
    , batch_size=32
    , device='mps' # send work to Metal shaders in M1 macs
    , show_progress_bar=True
  )

def makeChunks(raw_text):
  '''
  creates chunks out of raw text. Chunks will have default length
  '''
  paragraphs = removeEmpty(makeParagraphs(raw_text))
  return rightSize(paragraphs)

def fileToChunks(filePath, encoding='utf-8'):
  '''
  turns a file of raw text into chunks that are rightsized
  '''
  with open(filePath, encoding=encoding) as file:
    raw_text = file.read()
    return makeChunks(raw_text)

def makeEmbeddingsPerChunk(forFile):
  '''
  makes embeddings out of a file and returns chunks per file and its respective embeddings
  '''
  chunks = fileToChunks(forFile)
  embeddings = makeEmbedding(chunks)
  return { chunks[i]:embeddings[i] for i in range(len(chunks))}

# these are ready to get inserted into a database:
embeddings = makeEmbeddingsPerChunk('pg2680.txt')

Batches: 100%|██████████| 84/84 [00:07<00:00, 10.81it/s]


In [15]:
# print one of the vectors
one_item = None
for i in range(2):
  one_item = next(iter(embeddings.items()))
  print(one_item)

('\ufeffThe Project Gutenberg eBook of Meditations\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook', array([ 5.57664409e-02,  9.27753225e-02, -1.24235163e-02,  3.14126872e-02,
       -3.73991742e-03,  3.00762244e-02,  6.48141429e-02, -2.78083771e-03,
        1.91256171e-03,  2.53671926e-04, -1.09115914e-02,  1.96832493e-02,
       -1.41875250e-02, -2.61894632e-02,  1.68217961e-02, -4.98465560e-02,
       -1.43776238e-02,  1.50673147e-02, -5.30020557e-02, -3.47360559e-02,
        2.97887977e-02,  3.66981290e-02, -5.00558876e-02,  2.01754626e-02,
        4.50561196e-02, -2.59661544e-02, -

In [22]:
import psycopg2
from tqdm import tqdm

DATABASE = "semantic_search"
HOST = "127.0.0.1"
USER = "postgres"
PASSWORD = "123456"
CMD = """
insert into 
  items(embedding, text_chunk)
  values (%s, %s)
  returning id;
"""

with psycopg2.connect(
    host=HOST,
    database=DATABASE,
    user=USER,
    password=PASSWORD
  ) as connection:
    with connection.cursor() as cursor:
      for txt in tqdm(embeddings.keys()):
        if len(txt) > 1024:
           print('too big: %s' % txt)
           continue
        cursor.execute(CMD, (embeddings[txt].tolist(), txt))
        id = cursor.fetchone()[0]  
      cursor.close()
      connection.commit()


  0%|          | 0/2615 [00:00<?, ?it/s]

100%|██████████| 2615/2615 [00:06<00:00, 424.53it/s]


In [41]:
# query the book
q = "what is virtue?"
q_encoded = makeEmbedding([q])[0].tolist()
CMD = "select text_chunk from items order by embedding <=> '%s' limit 5"
results = None
with psycopg2.connect(
    host=HOST,
    database=DATABASE,
    user=USER,
    password=PASSWORD
  ) as connection:
    with connection.cursor() as cursor:
      cursor.execute(CMD % q_encoded)
      results = cursor.fetchall()

print(results)


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.27it/s]

[(' which of all the virtues is the proper\nvirtue for this present use',), (' This dictum might easily be taken to mean that virtue consists\nin yielding to each natural impulse',), (' All these things are merely the sphere in which\nvirtue may act',), (' This conforming of the life to nature was the Stoic idea of\nVirtue',), (' For each fault in others, Nature (says\nhe) has given us a counteracting virtue',)]



