In [None]:
#
#
# split and embed by chunking
#
#

import re
import uuid
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings

# Open the on-disk Chroma store using the client's default settings.
persistent_client = chromadb.PersistentClient()

collection_name = 'all-MiniLM-L6-v2_1000_split_clean'

# Start from an empty collection: every chunk gets a fresh random UUID below,
# so re-running this cell against an existing collection would add duplicates.
try:
    persistent_client.delete_collection(name=collection_name)
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
    # are no longer swallowed.
    print('nothing to delete')

collection = persistent_client.get_or_create_collection(collection_name)

embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

pdfs = ['London_Borough_of_Southwark',
        'London_Borough_of_Tower_Hamlets',
        'London_Borough_of_Islington']

# The splitter configuration is the same for every document, so build it once
# instead of once per file.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

for pdf in pdfs[0:3]:
    # Read the whole file, then release the handle before the slow
    # split / embed / store steps.
    with open('txts/' + pdf + '.txt') as f:
        string = f.read()

    # Flatten line breaks, then collapse any run of 2+ whitespace characters
    # into a single space. Raw string: "\s" is not a valid str escape.
    cleaner_string = string.replace('\n', ' ').replace('\r', '')
    clean_string = re.sub(r"\s\s+", " ", cleaner_string)

    # Split text
    split_texts = text_splitter.create_documents([clean_string])
    split_texts_list = [str(txt.page_content) for txt in split_texts]
    display(len(split_texts_list))

    # An empty file produces no chunks; skip it rather than sending empty
    # lists to the collection.
    if not split_texts_list:
        continue

    # Embed text
    embedded_texts = embedding_model.embed_documents(
        texts=split_texts_list)

    # add vectors to collection: one random id and one {"LPA": <name>}
    # metadata record per chunk, aligned by position.
    ids = [str(uuid.uuid4()) for _ in split_texts_list]
    metadatas = [{"LPA": pdf} for _ in split_texts_list]
    collection.add(
        embeddings=embedded_texts,
        documents=split_texts_list,
        ids=ids,
        metadatas=metadatas
    )

In [1]:
#
#
# split and embed by sentences
#
#

import chromadb
from nltk.tokenize import sent_tokenize
from langchain.embeddings import SentenceTransformerEmbeddings
import re
from helpers import embed, pinecone_embed

collection_name = 'all-MiniLM-L6-v2_sentence_split'

# Open the on-disk Chroma store using the client's default settings.
persistent_client = chromadb.PersistentClient()

# Drop any previous version of the collection so this cell starts clean.
try:
    persistent_client.delete_collection(name=collection_name)
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
    # are no longer swallowed.
    print('nothing to delete')


collection = persistent_client.get_or_create_collection(collection_name)
# Sanity check: a freshly (re)created collection should report 0 items.
print('Collection deleted?: ')
display(collection.count())
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

pdfs = ['London_Borough_of_Southwark',
        'London_Borough_of_Tower_Hamlets', 'London_Borough_of_Islington']


# Only the first document is processed here (slice [0:1]).
for pdf in pdfs[0:1]:
    # Read the whole file, then release the handle before tokenising/embedding.
    with open('txts/' + pdf + '.txt') as f:
        string = f.read()

    # Flatten line breaks, then collapse any run of 2+ whitespace characters
    # into a single space. Raw string: "\s" is not a valid str escape.
    cleaner_string = string.replace('\n', ' ').replace('\r', '')
    clean_string = re.sub(r"\s\s+", " ", cleaner_string)

    # Split text
    split_texts_list = sent_tokenize(clean_string)

    # Keep only sentences containing at least 6 spaces (i.e. 7+ space-separated
    # tokens); shorter fragments are dropped.
    proper_sentences = [
        sentence for sentence in split_texts_list if sentence.count(' ') >= 6]

    print('number of sentences to embed')
    print(len(proper_sentences))

    n = 3  # group size
    m = 2  # overlap size
    # Sliding windows of n sentences advancing by n - m (= 1) each step, so
    # there is one window per sentence; the final windows are shorter than n.
    triplets = [" ".join(proper_sentences[i:i+n])
                for i in range(0, len(proper_sentences), n-m)]

    # Cap each window at 1000 characters.
    triplets_truncated = [
        window[:1000] for window in triplets]

    # NOTE(review): the windows above are built but not used; `embed` receives
    # the individual sentences. Confirm which of the two is intended.
    # NOTE(review): at the time of review the saved output of this cell ended
    # in "ValueError: Expected ID to be a str, got (<uuid>, <vector>, <metadata>)",
    # raised from inside `helpers.embed` — that helper is not visible here.
    embed(collection, embedding_model, proper_sentences, pdf)

Collection deleted?: 


0

  from .autonotebook import tqdm as notebook_tqdm


number of sentences to embed
3352
3352
3352
3352


ValueError: Expected ID to be a str, got ('ea917922-6bf9-421b-b2b0-df973ba4d0ef', [0.007049312349408865, 0.0037307392340153456, 0.05868951231241226, -0.12151091545820236, -0.026203883811831474, 0.06338552385568619, -0.06024651601910591, -0.018123134970664978, -0.15988048911094666, 0.0583149716258049, -0.0285935141146183, -0.060437120497226715, 0.021241866052150726, 0.027746431529521942, -0.0008897087536752224, 0.0534585602581501, 0.031648579984903336, -0.07946105301380157, -0.028515184298157692, -0.015992064028978348, 0.07133373618125916, -0.023155035451054573, 0.018104810267686844, -0.0049970513209700584, -0.09808901697397232, -0.01911879889667034, 0.07275884598493576, 0.014110165648162365, -0.04004855453968048, 0.06201932579278946, 0.08114098012447357, 0.05467699468135834, -0.05132041126489639, -0.04241568222641945, 0.04462660849094391, 0.004436573479324579, 0.02685990184545517, -0.03510517254471779, 0.05156895890831947, -0.06374090909957886, -0.030079089105129242, -0.13062024116516113, -0.009631755761802197, 0.015886560082435608, 0.11390777677297592, 0.018420385196805, -0.003884058678522706, -0.01769823208451271, -0.07630152255296707, -0.031087491661310196, 0.03111291490495205, -0.011201485991477966, 0.006852882448583841, -0.08633913099765778, -0.03920772671699524, -0.0951298177242279, -0.06871294230222702, -0.04143739491701126, -0.012737831100821495, 0.036610301584005356, -0.12390118092298508, 0.05833679437637329, -0.03295230492949486, 0.02825004979968071, 0.04700757935643196, -0.03769423067569733, -0.026363637298345566, 0.04494086280465126, 0.06592946499586105, -0.004683416802436113, -0.0020479317754507065, -0.05277920141816139, 0.07100371271371841, -0.07879336178302765, -0.024561667814850807, 0.0005976302782073617, -0.03657855838537216, 0.09286227822303772, -0.02358214557170868, -0.15294834971427917, 0.01162131316959858, -0.02088131010532379, 0.0218668133020401, 0.07979875802993774, -0.02385084517300129, -0.06495512276887894, -0.043261971324682236, 
0.01841338910162449, 0.014292958192527294, 0.02842588536441326, 0.07604965567588806, -0.03868488222360611, 0.05425497889518738, 0.09073752909898758, 0.0007025804370641708, 0.0547848604619503, -0.04202796146273613, -0.05269916355609894, -0.006251059006899595, 0.010118166916072369, -0.04453360661864281, 0.06480038911104202, -0.042416758835315704, 0.034953523427248, -0.05117040500044823, -0.05914692208170891, -0.03425202891230583, -0.024489855393767357, 0.006186860613524914, -0.08534905314445496, -0.011747783981263638, -0.02282688580453396, 0.016577810049057007, -0.03770388662815094, -0.03292609378695488, 0.017456894740462303, -0.058951444923877716, -0.044850654900074005, 0.006272000260651112, -0.047896482050418854, 0.01473766565322876, 0.06277043372392654, -0.023210853338241577, -0.026347294449806213, -0.02676379680633545, -0.03943553566932678, 0.029943622648715973, -2.1171158720880252e-32, -0.09270716458559036, -0.030240366235375404, -0.008301837369799614, 0.07347524911165237, -0.006122083403170109, -0.07728977501392365, 0.009871832095086575, 0.02222016267478466, 0.02873336896300316, 0.021275071427226067, 0.08219899982213974, -0.13223229348659515, -0.006232312880456448, 0.00485003087669611, 0.00452396972104907, -0.06232145428657532, 0.017015201970934868, 0.03316453471779823, -0.06636817008256912, 0.0028446651995182037, -0.028783222660422325, -0.05756930634379387, 0.0629068911075592, -0.08050686866044998, 0.03451812267303467, -0.002317120786756277, 0.08960642665624619, 0.05487607419490814, 0.005049906205385923, 0.040897585451602936, 0.046690452843904495, 0.06458599120378494, -0.025001714006066322, -0.018998602405190468, -0.0522192046046257, 0.05122971534729004, -0.016645153984427452, -0.12975937128067017, -0.04449431225657463, 0.032956965267658234, -0.036213468760252, -0.004170452244579792, 0.040403857827186584, 0.021113526076078415, 0.04283631965517998, -0.04151172935962677, -0.009331738576292992, -0.08197055011987686, 0.05033445730805397, 0.00531454011797905, 
-0.02137135900557041, -0.011908822692930698, -0.119925357401371, -0.1015389934182167, 0.020764492452144623, -0.016082799062132835, 0.004252560902386904, -0.0005632895627059042, 0.005960875190794468, 0.02869284711778164, 0.04610426351428032, -0.04304321110248566, -0.11607170104980469, -0.00432239705696702, -0.03832477703690529, -0.0606938973069191, -0.021909773349761963, -0.06652875244617462, -0.0371452160179615, -0.08547189086675644, -0.020888902246952057, 0.06578918546438217, 0.06682037562131882, 0.009732520207762718, -0.027770526707172394, 0.023180091753602028, 0.043595921248197556, 0.06922220438718796, 0.04036518186330795, 0.00823616236448288, -0.014171271584928036, 0.11661576479673386, -0.016685547307133675, 0.004375182557851076, 0.11817798763513565, -0.08211009949445724, 0.07805189490318298, -0.03669465333223343, -0.052675485610961914, -0.02019401267170906, -0.02776658535003662, -0.11263056844472885, -0.025010470300912857, 0.05036953091621399, 0.015528679825365543, 8.252216461902162e-33, -0.04714452847838402, 0.017830880358815193, -0.04731416329741478, -0.02505233883857727, -0.02074717916548252, -0.004722436424344778, 0.004234957508742809, -0.017343703657388687, 0.062481895089149475, 0.0206594355404377, -0.07541698962450027, 0.07303241640329361, 0.10126238316297531, -0.00793857965618372, 0.00360323884524405, -0.05248536169528961, 0.05680295452475548, -0.08156882971525192, -0.05919823795557022, 0.04839836433529854, 0.11293128877878189, -0.02631772868335247, -0.028139527887105942, 0.05515696853399277, -0.014178808778524399, -0.005843652877956629, 0.05260893702507019, -0.0681675523519516, -0.053373172879219055, -0.04195244982838631, -0.07472303509712219, -0.12803496420383453, -0.0933191105723381, -0.021557651460170746, -0.021188827231526375, -0.11815294623374939, 0.08196853846311569, -0.009192509576678276, -0.03790657967329025, 0.0030397227965295315, 0.05032181367278099, -0.01407338585704565, -0.02336583100259304, 0.013555091805756092, -0.0027313902974128723, 
0.012907851487398148, -0.007199686486274004, -0.01470259577035904, 0.05703667551279068, -0.05808130279183388, 0.021713947877287865, 0.07878787070512772, -0.03417040407657623, -0.12987735867500305, 0.05894332379102707, -0.011220238171517849, -0.03949591889977455, -0.06598271429538727, 0.014651954174041748, -0.048834070563316345, 0.03315787389874458, 0.07745006680488586, -0.04193867743015289, 0.04833272099494934, -0.03846622258424759, -0.0031291351187974215, -0.05426067113876343, -0.03217501565814018, 0.01815761625766754, -0.006150817032903433, -0.0485050305724144, -0.07579643279314041, -0.055171359330415726, -0.056920360773801804, 0.05824737995862961, -0.0921459048986435, -0.023313460871577263, -0.007943489588797092, -0.03271319344639778, 0.018763337284326553, 0.01647571660578251, -0.006804874632507563, -0.011158649809658527, -0.0748186707496643, 0.0504630021750927, -0.07198766618967056, -0.012254004366695881, 0.012368433177471161, 0.025291167199611664, -0.032474637031555176, -0.050933368504047394, -0.008313391357660294, -0.03765570744872093, 0.037244029343128204, 0.0415775291621685, -4.649118778843331e-08, 0.021262958645820618, 0.021563665941357613, -0.009834805503487587, -0.019384004175662994, -0.02809830754995346, -0.03381434455513954, -0.05879770591855049, 0.0011529580224305391, -0.0023247881326824427, -0.004965544678270817, 0.04515598714351654, 0.017548473551869392, -0.04291708394885063, 0.024290885776281357, -0.10595259070396423, -0.00836084596812725, -0.008171834982931614, -0.017537273466587067, -0.059401802718639374, -0.026430439203977585, -0.02047484554350376, 0.0242674108594656, -0.042262062430381775, -0.017382262274622917, -0.03168819099664688, 0.040682196617126465, -0.04887528717517853, -0.024575138464570045, -0.00918505433946848, 0.06000257283449173, 0.011426057666540146, 0.031077438965439796, -0.01580214872956276, 0.016353245824575424, -0.017202887684106827, -0.008711702190339565, 0.006817817222326994, 0.0796644389629364, 0.07918009907007217, 
0.0005691376863978803, 0.024084851145744324, -0.02705584652721882, 0.053119052201509476, 0.07646995037794113, 0.05848423391580582, 0.011089996434748173, -0.04053753986954689, 0.02393989823758602, -0.03967314958572388, -0.09922149777412415, -0.012327087111771107, -0.05561758577823639, 0.04610744118690491, 0.022683851420879364, 0.08344092965126038, 0.06709633767604828, -0.027880266308784485, -0.033802375197410583, -0.030899997800588608, 0.038676198571920395, 0.03352509066462517, -0.007238936144858599, -0.04212578013539314, 0.09477773308753967], {'LPA': 'London_Borough_of_Southwark'})

In [4]:
#
#
# split and embed by sentences TO PINECONE
#
#


from nltk.tokenize import sent_tokenize
from langchain.embeddings import SentenceTransformerEmbeddings
import re
import os
from helpers import pinecone_embed, pinecone_connect

index = pinecone_connect()

# Destructive: removes everything stored under the 'localplans' namespace
# before re-populating it below.
index.delete(delete_all=True, namespace='localplans')

embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Process the text files in a stable (sorted) order. Other directory entries
# (hidden files, sub-directories) are skipped so `open` is only ever called
# on the .txt exports.
pdfs = sorted(name for name in os.listdir('txts') if name.endswith('.txt'))

GROUP_SIZE = 3    # sentences joined into one window
OVERLAP = 2       # sentences shared between consecutive windows
BATCH_SIZE = 500  # windows handed to pinecone_embed per call

for pdf in pdfs:
    print('txts/' + pdf)
    # Read the whole file, then release the handle before tokenising/embedding.
    with open('txts/' + pdf) as f:
        string = f.read()

    # Flatten line breaks, then collapse any run of 2+ whitespace characters
    # into a single space. Raw string: "\s" is not a valid str escape.
    cleaner_string = string.replace('\n', ' ').replace('\r', '')
    clean_string = re.sub(r"\s\s+", " ", cleaner_string)

    # Split text
    split_texts_list = sent_tokenize(clean_string)

    # Keep only sentences containing at least 6 spaces (i.e. 7+ space-separated
    # tokens); shorter fragments are dropped.
    proper_sentences = [
        sentence for sentence in split_texts_list if sentence.count(' ') >= 6]

    print('number of sentences to embed')
    print(len(proper_sentences))

    # Sliding windows of GROUP_SIZE sentences advancing by
    # GROUP_SIZE - OVERLAP (= 1) each step, so there is one window per
    # sentence; the final windows are shorter than GROUP_SIZE.
    triplets = [" ".join(proper_sentences[i:i + GROUP_SIZE])
                for i in range(0, len(proper_sentences), GROUP_SIZE - OVERLAP)]

    # Cap each window at 1000 characters.
    triplets_truncated = [window[:1000] for window in triplets]

    # Upload in fixed-size batches. The batch size previously reused the
    # name `n`, silently overwriting the group size defined above.
    # NOTE(review): `pdf` still carries the '.txt' suffix here, unlike the bare
    # names used by the Chroma cells — confirm what pinecone_embed expects.
    for start in range(0, len(triplets_truncated), BATCH_SIZE):
        chunk = triplets_truncated[start:start + BATCH_SIZE]
        pinecone_embed(index, embedding_model, chunk, pdf)

txts/Allerdale_Borough_Council.txt
number of sentences to embed
1880
500
500
500
500
500
500
500
500
500
380
380
380
txts/Arun_District_Council.txt
number of sentences to embed
895
500
500
500
395
395
395
txts/Ashfield_District_Council.txt
number of sentences to embed
3624
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
124
124
124
txts/Barnsley_Metropolitan_Borough_Council.txt
number of sentences to embed
2795
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
295
295
295
txts/Basildon_Borough_Council.txt
number of sentences to embed
4929
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
500
429
429
429
txts/Bassetlaw_District_Council.txt
number of sentences to embed
1509
500
500
500
500
500
500
500
500
500
9
9
9
txts/Bath_and_North_East_Somerset_Council.txt
number of sentences to embed
1704
500
500
500
500
500
500
500
500
500
204
204
204
txts/Bedford_Borough_Council.txt
number of senten

In [2]:
from helpers import pinecone_connect
# Obtain an index handle from the project helper, called with no arguments.
# Presumably this is the same Pinecone index the upload cell writes to —
# see helpers.pinecone_connect to confirm.
index = pinecone_connect()

  from tqdm.autonotebook import tqdm
