In [None]:
#
#
# split and embed by chunking
#
#

import re
import uuid
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings

# Open the persistent Chroma client (no path argument is given, so
# chromadb's default storage location is used).
persistent_client = chromadb.PersistentClient()

collection_name = 'all-MiniLM-L6-v2_1000_split_clean'

# Start from an empty collection on every run: drop any previous copy first.
# Catch Exception rather than using a bare `except:` so KeyboardInterrupt and
# SystemExit still propagate, and print the reason so an unexpected failure
# is not silently mistaken for "the collection did not exist".
try:
    persistent_client.delete_collection(name=collection_name)
except Exception as exc:
    print('nothing to delete:', exc)

collection = persistent_client.get_or_create_collection(collection_name)

# Embedding model used for every chunk added to the collection.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Names of the source documents; each one is read from txts/<name>.txt.
pdfs = ['London_Borough_of_Southwark',
        'London_Borough_of_Tower_Hamlets', 'London_Borough_of_Islington']

# The splitter configuration does not depend on the document, so build it
# once instead of on every loop iteration.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

for pdf in pdfs[0:3]:
    # Read the whole file, then release the handle before the slow
    # split/embed/add steps run.
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used — confirm the encoding of the txts/ files.
    with open('txts/' + pdf + '.txt') as f:
        string = f.read()

    # Flatten line breaks, then collapse any run of two or more whitespace
    # characters into a single space. The pattern is a raw string so that
    # `\s` is not treated as an (invalid) string escape sequence.
    cleaner_string = string.replace('\n', ' ').replace('\r', '')
    clean_string = re.sub(r"\s\s+", " ", cleaner_string)

    # Split text
    split_texts = text_splitter.create_documents([clean_string])
    split_texts_list = [str(txt.page_content) for txt in split_texts]
    display(len(split_texts_list))

    # An empty document yields no chunks; skip it rather than handing empty
    # lists to the embedding model and the collection.
    if not split_texts_list:
        continue

    # Embed text
    embedded_texts = embedding_model.embed_documents(
        texts=split_texts_list)

    # add vectors to collection: one random UUID and one {"LPA": <name>}
    # metadata record per chunk
    ids = [str(uuid.uuid4()) for _ in split_texts_list]
    metadatas = [{"LPA": pdf} for _ in split_texts_list]
    collection.add(
        embeddings=embedded_texts,
        documents=split_texts_list,
        ids=ids,
        metadatas=metadatas
    )

In [1]:
#
#
# split and embed by sentences
#
#

import chromadb
from nltk.tokenize import sent_tokenize
from langchain.embeddings import SentenceTransformerEmbeddings
import re
from helpers import embed

collection_name = 'all-MiniLM-L6-v2_sentence_split'

# Open the persistent Chroma client (no path argument is given, so
# chromadb's default storage location is used).
persistent_client = chromadb.PersistentClient()

# Start from an empty collection on every run: drop any previous copy first.
# Catch Exception rather than using a bare `except:` so KeyboardInterrupt and
# SystemExit still propagate, and print the reason so an unexpected failure
# is not silently mistaken for "the collection did not exist".
try:
    persistent_client.delete_collection(name=collection_name)
except Exception as exc:
    print('nothing to delete:', exc)


collection = persistent_client.get_or_create_collection(collection_name)

# Sanity check: a freshly recreated collection should report a count of 0.
print('Collection deleted?: ')
display(collection.count())

# Embedding model handed to the embed() helper below.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Names of the source documents; each one is read from txts/<name>.txt.
pdfs = ['London_Borough_of_Southwark',
        'London_Borough_of_Tower_Hamlets', 'London_Borough_of_Islington']


# Only the first document is processed in this cell.
for pdf in pdfs[0:1]:
    # Read the whole file, then release the handle before the slow
    # tokenize/embed steps run.
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used — confirm the encoding of the txts/ files.
    with open('txts/' + pdf + '.txt') as f:
        string = f.read()

    # Flatten line breaks, then collapse any run of two or more whitespace
    # characters into a single space. The pattern is a raw string so that
    # `\s` is not treated as an (invalid) string escape sequence.
    cleaner_string = string.replace('\n', ' ').replace('\r', '')
    clean_string = re.sub(r"\s\s+", " ", cleaner_string)

    # Split text
    split_texts_list = sent_tokenize(clean_string)

    # Keep only sentences containing at least six spaces; shorter fragments
    # are dropped.
    proper_setences = [
        i for i in split_texts_list if i.count(' ') >= 6]

    print('number of sentences to embed')
    print(len(proper_setences))

    # Sliding window over the kept sentences: groups of n sentences,
    # advancing by n - m (i.e. one sentence) each step. The final windows
    # contain fewer than n sentences because the slice runs off the end.
    n = 3  # group size
    m = 2  # overlap size
    triplets = [" ".join(proper_setences[i:i+n])
                for i in range(0, len(proper_setences), n-m)]

    # Cap each group at 1000 characters.
    triplets_trucated = [
        i[:1000] for i in triplets]

    # NOTE(review): the groups built above are not passed to embed(); only
    # the individual sentences are embedded. Confirm whether
    # `triplets_trucated` was meant to be embedded here instead.
    embed(collection, embedding_model, proper_setences, pdf)

Collection deleted?: 


0

  from .autonotebook import tqdm as notebook_tqdm


number of sentences to embed
3352
857546
3352
3352
3352
number of embeddings 
3352


In [30]:
# Sanity check: display a sample of the records stored in `collection`.
collection.peek()

{'ids': ['cc0c8ef2-b3cb-4100-8c23-c13475bca9fc',
  '19c98ef0-0b0a-48d8-a8ea-866399762df0',
  '9a2bafed-e6e6-477f-a285-ac9a5bedb6fd',
  '7fae9376-d0ac-4465-b5c6-f34fe2e13b01',
  'ad6c0806-e4dd-4bd3-a522-7dc1d02a930f',
  '64ac662b-bc9d-45e1-918c-b766e8cada45',
  'd5792e9e-be8c-4de2-8527-ecff27d55392',
  '86f69b7f-f1f5-4510-9053-3a3783cb01e8',
  'ea64059e-edf9-4a45-9c2f-4b12b1324648',
  '960f53ce-0570-4579-a13b-9b4f80945821'],
 'embeddings': [[0.007049312349408865,
   0.0037307392340153456,
   0.05868951231241226,
   -0.12151091545820236,
   -0.026203883811831474,
   0.06338552385568619,
   -0.06024651601910591,
   -0.018123134970664978,
   -0.15988048911094666,
   0.0583149716258049,
   -0.0285935141146183,
   -0.060437120497226715,
   0.021241866052150726,
   0.027746431529521942,
   -0.0008897087536752224,
   0.0534585602581501,
   0.031648579984903336,
   -0.07946105301380157,
   -0.028515184298157692,
   -0.015992064028978348,
   0.07133373618125916,
   -0.023155035451054573,
   0.01