In [4]:
import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import chromadb
import numpy as np
import zipfile
import io
import re
import random


In [5]:
conn = sqlite3.connect('/content/drive/MyDrive/eng_subtitles_database.db')
query = "SELECT * FROM zipfiles"
df = pd.read_sql_query(query, conn)

num_rows_to_process = int(len(df) * 0.1) #15%
random.seed(42)
selected_rows = df.sample(n=num_rows_to_process)

selected_rows.head()

Unnamed: 0,num,name,content
73447,9483507,the.rookie.feds.s01.e19.burn.run.(2023).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00kh\x9aV\xc...
8031,9214995,the.things.we.bloom.(2021).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x11\x90\x...
5099,9203423,the.dunes.(2021).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00h\x8d\x99V...
24239,9277942,the.ateam.s04.e07.body.slam.(1985).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xdd\xb0\x...
41170,9349301,mythical.beasts.s01.e05.blood.for.the.snake.go...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xd8\xbe\x...


In [6]:
def preprocess_subtitle(subtitle_content):
    cleaned_text = ""

    # Unzip the content
    with zipfile.ZipFile(io.BytesIO(subtitle_content), 'r') as zip_ref:

        extracted_file = zip_ref.extract(zip_ref.namelist()[0])

    # Decode
    with open(extracted_file, 'r', encoding='latin-1') as f:
        for line in f:
            # Remove timestamp
            cleaned_line = re.sub(r'\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+\n', '', line)
            cleaned_text += cleaned_line

    subtitle_text_lower = cleaned_text.lower()

    # Remove special characters
    cleaned_text = re.sub("Watch any video online with Open-SUBTITLES|Free Browser extension: osdb.link/ext","", subtitle_text_lower)
    cleaned_text = re.sub("Please rate this subtitle at www.osdb.link/agwma|Help other users to choose the best subtitles","",cleaned_text)
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_text)
    cleaned_text = re.sub(r'[\n\t]', ' ', cleaned_text)
    return cleaned_text

def chunk_subtitle(subtitle_content, chunk_size, overlap_size):
    chunks = []
    chunk_start = 0
    while chunk_start < len(subtitle_content):
        chunk_end = min(chunk_start + chunk_size, len(subtitle_content))
        chunks.append(subtitle_content[chunk_start:chunk_end])
        chunk_start += (chunk_size - overlap_size)
    return chunks

In [7]:
#filename-subtitle pairs
subtitle_map = {}

for index, row in selected_rows.iterrows():
    filename = row['name']
    cleaned_filename = filename.replace('eng.1cd', '').replace('.', ' ')
    subtitle_content = row['content']
    cleaned_subtitle = preprocess_subtitle(subtitle_content)
    # Chunk
    chunks = chunk_subtitle(cleaned_subtitle, chunk_size=500, overlap_size=100)
    subtitle_map[cleaned_filename] = chunks

In [9]:
import csv
csv_file = 'data.csv'

# Open the CSV file in write mode
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Name', 'Content'])
    for key, value in subtitle_map.items():
        writer.writerow([key, value])

print('CSV file created successfully!')

CSV file created successfully!


In [10]:
df=pd.read_csv('/content/data.csv')

In [11]:
df.head()

Unnamed: 0,Name,Content
0,the rookie feds s01 e19 burn run (2023),['1 ipreviously on rookie fedsi 2 i think you...
1,the things we bloom (2021),['1 support us and become vip member to remov...
2,the dunes (2021),['1 watch any video online with opensubtitles ...
3,the ateam s04 e07 body slam (1985),['1 all right he gonna put him down now i can ...
4,mythical beasts s01 e05 blood for the snake go...,['1 hissing 2 narrator the feathered serpen...


In [12]:
df['Content'][3]

"['1 all right he gonna put him down now i can feel it  2 crowd cheering  3 ba hey man  4 he didnt save my life i saved his life  5 is there such a thing as a fiveman tag team  6 bring em on daddy  7 shouting  8 no you took the shell  9 hulkamania is gonna run wild  10 no way out this time smith  11 iface now were cookingi  12 oh i love it  13 imale narrator in 1972 a crack commando uniti  14 iwas sent to prison by a military courti  15 ifor a crime they didnt commiti  16 ithese men promptly escape', 'nt to prison by a military courti  15 ifor a crime they didnt commiti  16 ithese men promptly escapedi  17 ifrom a maximumsecurity stockadei  18 ito the los angeles undergroundi  19 itoday still wanted by the governmenti  20 ithey survive as soldiers of fortunei  21 iif you have a problem if no one else can helpi  22 iand if you can find themi  23 imaybe you can hire the ateami  24 gun firing  25 crickets chirping  26 iring announcer and big john studd slams hulk hogan on the mati  27 ith

In [13]:
def preprocess(text):

     cleaned_text = re.sub("use the free code joinnow at  wwwplayshipseu","", text)
     cleaned_text = re.sub("watch any video online with opensubtitles free browser extension osdblinkext","",cleaned_text)
     # Remove the ID
     cleaned_text = re.sub(r'\d+\r\n', ' ', cleaned_text)
     cleaned_text = re.sub(r'<[^>]*>', '', cleaned_text)
     cleaned_text = re.sub(r'[^a-zA-Z0-9 ]+', ' ', cleaned_text)
     cleaned_text = cleaned_text.lower()

     return cleaned_text

In [14]:
df['content_clean'] = df['Content'].apply(preprocess)

In [15]:
df.to_csv('Avideo_subtitles.csv')

In [16]:
df['combined_text'] = df['Name'] + ' ' + df['content_clean']

In [5]:
df=pd.read_csv('Avideo_subtitles.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Content,content_clean
0,0,american dad s01 e17 rough trade (2006),['1 good morning usa 2 i got a feeling tha...,good morning usa i got a feeling that i...
1,1,sister sister s04 e13 little man date (1997),['1 well another saturday night 2 and were of...,well another saturday night and were off t...
2,2,love now episode 1 16 (),['1 apiopensubtitlesorg is deprecated please i...,apiopensubtitlesorg is deprecated please imp...
3,3,the sleep experiment (2022),['1 watch any video online with opensubtitles ...,stop take off the bag and face the wind...
4,4,murdoch mysteries s09 e17 from buffalo with lo...,['1 use the free code joinnow at wwwplayships...,indistinct conversations midtempo piano m...


In [6]:
df['combined_text']=df['Name']+' '+df['content_clean']

In [7]:
chunk_size = 1000
chunks = [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

for chunk in chunks:
    text_column = chunk['combined_text']
    vectors = vectorizer.fit_transform(text_column)

In [9]:
import torch

In [10]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def bert_vectorize(text):
    # Tokenize text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings from BERT model output
    embeddings = outputs.last_hidden_state.mean(dim=1)  
    return embeddings.numpy()


In [11]:
# Vectorize each chunk
for chunk in chunks:
    text_column = chunk['combined_text']
    embeddings = text_column.apply(bert_vectorize)


In [44]:
# Connect to ChromaDB
client = chromadb.PersistentClient()
collection = client.create_collection(name="subtitles", metadata={"hnsw:space": "cosine"})


TypeError: Collection.add() missing 1 required positional argument: 'ids'

In [54]:
print(embeddings)

7000    [[-0.22828665, 0.10737817, 0.93315804, -0.3094...
7001    [[-0.35071284, 0.36272472, 0.5276593, -0.16588...
7002    [[-0.09392768, 0.2070764, 0.3695174, -0.036049...
7003    [[-0.27933416, 0.17398375, 0.79185164, -0.2466...
7004    [[-0.17893156, 0.06789591, 0.5646217, 0.084793...
                              ...                        
7790    [[-0.10816643, 0.056510597, 0.3465413, -0.1412...
7791    [[-0.34774515, -0.010042042, 0.438911, -0.0483...
7792    [[-0.40614253, 0.3010406, 0.51430047, -0.22900...
7793    [[-0.15506384, 0.13409151, 0.29507613, -0.0746...
7794    [[-0.31907433, 0.15267548, 0.49797863, -0.1297...
Name: combined_text, Length: 795, dtype: object


In [56]:
for i in range(7000, 7795): 
    subtitle_name = df.loc[i, 'Name']  
    subtitle_embedding = embeddings[i] 
    subtitle_id = str(i)  
    collection.add(documents=subtitle_name, embeddings=subtitle_embedding, ids=subtitle_id)

In [43]:
# client.delete_collection('subtitles')

In [59]:
query_embeddings = bert_vectorize("cars")
k = 5  
results = collection.query(query_embeddings=[query_embeddings], n_results=k)

for result in results:
    print(result)


ValueError: Expected each embedding in the embeddings to be a list, got [array([[ 3.00807208e-01,  7.22174272e-02, -5.40377833e-02,
         1.43974826e-01, -5.45000024e-02, -1.45927772e-01,
         1.37229785e-01, -5.20829372e-02, -1.35657415e-01,
        -9.78249777e-03,  1.82801887e-01,  4.90129925e-02,
         2.85392683e-02, -7.69456709e-03, -3.92446488e-01,
        -2.20101953e-01, -1.04609430e-01,  1.38398498e-01,
        -1.92499701e-02,  8.07900131e-02,  2.71441370e-01,
        -3.10674254e-02,  2.81607717e-01, -1.82386920e-01,
        -2.45416164e-03,  3.09585363e-01, -1.30774722e-01,
        -6.54224530e-02, -2.29234576e-01, -2.46564627e-01,
         4.39392542e-03, -5.85095771e-02,  6.71626255e-02,
         6.58630192e-01,  1.93724334e-02, -3.07913929e-01,
         3.44403721e-02, -2.62571096e-01, -7.84052610e-02,
        -1.18868547e-02,  4.22666483e-02,  7.46082738e-02,
         8.90171528e-03, -2.90871084e-01,  7.57003725e-02,
         1.22824572e-01, -5.89056425e-02,  3.81080396e-02,
        -1.97814763e-01,  1.94930315e-01, -6.96417242e-02,
         1.22316241e-01,  5.45325540e-02,  4.10658360e-01,
         1.80532530e-01,  1.91115692e-01,  7.13359192e-02,
        -3.35573442e-02, -1.13902241e-03,  2.34412387e-01,
         5.79590946e-02,  2.34621763e-03, -1.96594909e-01,
        -3.23623419e-04,  4.43072557e-01, -3.24211381e-02,
         2.13241920e-01, -2.38522980e-02, -3.92990321e-01,
         1.73244569e-02, -8.67935047e-02, -3.65728259e-01,
         2.29721025e-01,  1.39260581e-02,  1.36988714e-01,
        -1.09799258e-01, -3.43430728e-01,  4.37925816e-01,
         1.85258910e-01, -3.10717255e-01,  9.56228971e-02,
         2.67934144e-01,  1.94768056e-01,  4.35018271e-01,
         4.86395545e-02,  1.90450117e-01, -1.19170137e-01,
        -2.12085962e-01, -2.72888631e-01, -7.96700791e-02,
         7.06247911e-02,  7.68604353e-02,  3.54234986e-02,
         5.68579026e-02, -1.55092403e-01,  2.34524310e-01,
        -1.73879042e-01, -4.96935435e-02,  1.37664393e-01,
        -3.74282300e-02,  1.08329445e-01,  9.49946642e-02,
        -1.22634023e-02,  1.76564515e-01, -4.42365170e-01,
         1.05647534e-01,  1.14798568e-01,  4.84129079e-02,
         5.41205294e-02, -6.29548311e-01,  2.19072506e-01,
        -4.34591360e-02, -1.05501311e-02,  4.06312384e-02,
        -2.39428461e-01,  2.77749866e-01,  1.52096882e-01,
         1.56352863e-01,  7.65240416e-02,  1.46579191e-01,
        -3.79319578e-01, -2.60312349e-01,  1.73921049e-01,
         4.87954348e-01, -8.19406062e-02,  4.70211953e-02,
        -4.14252914e-02, -1.89511791e-01,  7.21665621e-02,
        -2.54142046e-01, -2.77359989e-02,  1.75909832e-01,
         5.88819496e-02,  3.32582027e-01, -1.28851190e-01,
         3.35438251e-02,  1.59135714e-01,  5.74764200e-02,
        -2.13403895e-01, -3.49099964e-01, -7.12973401e-02,
         4.13290858e-01, -9.45957363e-01, -4.31308359e-01,
         8.51607397e-02, -5.05227558e-02,  3.52347374e-01,
         6.65158927e-02,  2.66070843e-01, -1.58533558e-01,
         2.00647950e-01,  1.69147447e-01, -3.84740859e-01,
        -5.02405204e-02, -2.83917934e-01, -1.96276288e-02,
        -9.46147367e-02, -1.30340174e-01,  2.00071812e-01,
         5.00630140e-01, -1.34121282e-02,  1.95147827e-01,
         5.58943748e-02,  1.29161611e-01, -3.32343489e-01,
         4.91541810e-02, -1.91098496e-01,  2.91514575e-01,
         9.84644517e-02,  1.53002441e-01, -2.42116615e-01,
        -4.08060074e-01,  3.21183622e-01, -3.38992238e-01,
        -1.11680843e-01,  2.35033795e-01,  2.47689202e-01,
         2.58137852e-01,  2.00328514e-01, -7.67244920e-02,
        -3.00055957e+00,  2.68556867e-02,  9.23126638e-02,
        -8.12512934e-02,  3.00396085e-01, -2.06546187e-01,
         7.47049227e-02, -3.43867876e-02, -4.93378043e-02,
        -4.08126473e-01,  8.13451633e-02,  3.64052989e-02,
        -3.01256806e-01,  3.16985697e-01,  2.57846564e-01,
        -2.25959778e-01,  2.77201027e-01, -2.20367417e-01,
         5.28315715e-02, -2.21239612e-01, -5.55720413e-03,
        -7.01386034e-02,  6.60159672e-03,  2.76518255e-01,
        -1.87049434e-01,  9.79662001e-01,  2.51485586e-01,
        -2.57553011e-01, -3.77593446e-03,  8.76972675e-02,
        -3.79539251e-01,  4.99802917e-01, -1.46789281e-02,
        -1.95383787e-01,  1.91541016e-01, -4.28971142e-01,
         6.76284954e-02, -3.21113877e-02, -4.88413215e-01,
        -2.29596421e-01, -1.98918879e-02, -4.57959734e-02,
        -1.01060472e-01,  4.84771766e-02, -2.87233293e-01,
        -3.20878714e-01,  3.07380944e-01,  1.98885724e-01,
         2.92732865e-01, -1.54107466e-01,  8.89653563e-02,
         4.50983159e-02,  5.85860200e-02,  9.03916955e-02,
        -3.30470741e-01,  3.42332870e-01,  1.09613419e-01,
        -1.75202087e-01, -1.80293873e-01, -2.36114815e-01,
        -9.23821796e-03, -1.58455595e-01,  1.30998954e-01,
         8.15651938e-02, -2.02536985e-01, -2.91051745e-01,
         2.20908269e-01,  2.80681580e-01,  1.80088297e-01,
        -2.35463306e-01,  9.74201486e-02, -6.55312538e-01,
         2.76915818e-01, -4.16789144e-01,  7.05471560e-02,
         7.24288896e-02,  1.02472030e-01, -2.19159409e-01,
         2.71203607e-01,  1.16803668e-01,  1.67057469e-01,
         3.22934926e-01,  3.72097373e-01, -1.15981854e-01,
        -3.83187205e-01,  2.78058082e-01,  9.97201633e-03,
        -1.70225978e-01, -8.91730189e-02,  1.36720985e-01,
        -7.82888904e-02, -1.53215155e-01, -9.16961506e-02,
        -1.37869942e+00, -2.13287994e-01, -5.19826412e-02,
         1.75259516e-01,  2.00683892e-01,  1.15591608e-01,
         2.80595809e-01, -8.35957527e-02,  3.47029895e-01,
         4.63728160e-02,  1.84085906e-01,  2.97349632e-01,
        -3.56038481e-01, -2.29049519e-01, -6.99093714e-02,
         3.31393838e-01, -2.66671687e-01, -7.31852800e-02,
        -1.02277867e-01, -1.14109619e-02,  1.28249988e-01,
         3.61502886e-01, -4.92915027e-02,  8.17207098e-02,
         2.35668734e-01, -1.55148804e-01, -2.22242400e-01,
        -2.02355191e-01,  1.58702910e-01, -3.82685512e-01,
        -1.49475500e-01, -2.44954064e-01,  1.69230327e-01,
         6.98487312e-02, -5.78507930e-02, -2.31485772e+00,
        -2.68375576e-02,  8.76461193e-02, -3.33364487e-01,
         3.67710352e-01, -1.59394130e-01,  1.48977086e-01,
        -4.40135896e-02, -2.14556858e-01, -1.21597804e-01,
         1.75185680e-01, -3.93799096e-01,  1.11982144e-01,
         3.72059703e-01, -2.66947169e-02,  3.39510202e-01,
        -6.36758730e-02, -1.55861124e-01, -3.29968005e-01,
        -1.89784262e-02, -5.48898093e-02, -2.34872252e-02,
         5.40624857e-02, -3.25666398e-01,  2.78346181e-01,
        -1.40481755e-01, -1.86034754e-01,  1.30736157e-01,
        -4.38323647e-01,  1.49046049e-01, -2.60543466e-01,
        -1.02865495e-01, -4.13268328e-01,  1.19403340e-01,
         1.41990945e-01,  4.16305587e-02, -2.97684968e-02,
         1.76698998e-01,  4.96502548e-01, -4.43826109e-04,
        -7.57084042e-02,  2.21614003e-01,  2.61832029e-01,
        -4.31600809e-02, -1.31610379e-01,  1.49513083e-02,
        -9.57137644e-02, -2.36426875e-01,  2.08822563e-01,
         3.62742275e-01,  3.15662414e-01, -5.88545091e-02,
         3.28113943e-01,  1.26995621e-02,  3.91977243e-02,
        -1.76050499e-01,  3.24756056e-01,  1.81825817e-01,
        -3.49543281e-02, -9.64925885e-02,  1.88229203e-01,
        -3.54620695e-01, -7.64526650e-02,  2.95713097e-01,
        -2.22974420e-01, -1.13198549e-01,  4.77851816e-02,
        -1.16923638e-01, -1.05271421e-01, -6.04348481e-02,
        -3.33947212e-01, -1.20015763e-01,  1.05561214e-02,
        -5.90828001e-01, -3.03944081e-01,  8.85966197e-02,
        -1.98085770e-01, -1.27300499e-02,  3.76931518e-01,
         1.79263368e-01,  2.95672327e-01, -8.94598383e-03,
        -7.06072077e-02, -1.25177607e-01,  5.80498688e-02,
        -4.32367735e-02, -8.00365359e-02, -2.92533815e-01,
        -1.19630627e-01, -3.56524177e-02,  1.21584706e-01,
         1.15729630e-01,  4.60017353e-01, -8.60099420e-02,
         1.83655322e-01,  3.89905930e-01,  3.62948924e-01,
        -1.77527949e-01,  1.40801981e-01, -1.79720879e-01,
        -4.19669628e-01, -8.36629644e-02,  2.40265176e-01,
        -2.54725933e-01, -2.80449390e-01, -2.67410308e-01,
        -3.32515717e-01,  2.24163815e-01, -3.34634572e-01,
        -9.02665332e-02,  1.65532589e-01, -1.73890248e-01,
         1.66777089e-01, -1.72477681e-02,  1.38259545e-01,
        -2.44720936e-01,  1.30113810e-01,  6.32469535e-01,
         1.31159812e-01,  3.27201813e-01,  1.70142844e-01,
         2.61728615e-01,  3.67427282e-02, -1.95358619e-01,
         5.78281283e-03,  1.90216109e-01,  6.23775385e-02,
        -5.56565225e-01, -2.85347909e-01, -2.15915680e-01,
         1.50032103e-01,  3.92318368e-01, -6.42218962e-02,
        -3.15366864e-01, -1.89760566e-01,  7.12438449e-02,
         1.76379263e-01,  1.49976224e-01,  1.17921956e-01,
         8.27116892e-02,  3.55915070e-01,  7.26262107e-02,
        -4.13667917e-01, -1.89574435e-01,  1.68590799e-01,
         3.11322808e-01, -2.39351347e-01,  6.31624088e-02,
         6.45725310e-01,  2.04460308e-01, -2.26095498e-01,
        -2.90389299e-01,  3.06283683e-01, -2.68773705e-01,
        -7.46975392e-02, -3.01911056e-01,  1.56117424e-01,
        -9.61529836e-02, -2.12233543e-01, -2.62115598e-02,
        -1.62714884e-01,  3.11028391e-01, -9.54146758e-02,
         2.11854875e-02, -2.85505448e-02, -1.30449057e-01,
         3.10942024e-01, -1.17793411e-01, -1.89081267e-01,
         2.69487172e-01,  4.01292652e-01, -3.28274786e-01,
         2.54577607e-01, -1.51260242e-01, -2.93575916e-02,
         2.32131377e-01,  2.27381513e-01, -1.21554673e-01,
        -1.71649769e-01, -1.07347704e-01,  3.17143835e-02,
        -9.54682454e-02, -3.84684168e-02,  6.54539764e-02,
         2.81520009e-01,  1.38924107e-01, -2.51999885e-01,
        -2.86614954e-01,  2.52113283e-01,  3.10667992e-01,
         3.34533811e-01, -1.12347402e-01,  3.07301760e-01,
        -1.74779773e-01, -3.48381847e-01,  2.43381932e-01,
         2.41768673e-01, -2.06422210e-01,  3.81319433e-01,
         7.91088417e-02,  4.19590443e-01, -4.15724926e-02,
        -1.31823987e-01,  6.44827029e-03,  9.35727730e-02,
         1.60259336e-01,  1.26284435e-01,  1.60734113e-02,
        -5.01716852e-01,  5.95439458e-03,  1.09319560e-01,
        -3.64927530e-01, -4.12818193e-01, -2.43688688e-01,
         1.93201508e-02,  2.24088982e-01,  1.35840014e-01,
         4.05816324e-02,  2.00109258e-01, -7.52064073e-03,
        -5.22629082e-01, -2.92398274e-01,  6.57260120e-02,
        -2.61420101e-01, -4.51943241e-02, -5.91196418e-02,
        -4.48745757e-01, -1.96981430e-01, -3.32115114e-01,
        -6.60746917e-02, -4.56409641e-02,  1.14222944e-01,
        -8.52322951e-02,  3.85226198e-02,  5.99608600e-01,
         3.09564829e-01, -1.45975292e-01, -1.75101876e-01,
         3.93066043e-03,  6.18840270e-02, -8.97048339e-02,
        -2.58266896e-01, -3.05508345e-01, -1.77162349e-01,
         2.10532472e-01,  1.68406799e-01,  3.12137995e-02,
         4.87602323e-01,  1.96331903e-01, -2.94777870e-01,
        -2.55288839e-01, -2.22848300e-02, -4.73452918e-02,
         1.97617766e-02,  7.83937350e-02, -4.03465122e-01,
        -3.67478490e-01,  1.34589702e-01,  2.63104141e-01,
        -4.44231220e-02, -2.46809244e-01,  1.12843953e-01,
         3.19618493e-01,  1.03264451e-01, -1.07038379e-01,
        -5.03969705e-03, -1.97874844e-01,  3.25252563e-01,
         3.66354346e-01,  2.58403987e-01, -2.59747356e-03,
         3.43407750e-01, -1.88837945e-02,  2.66697049e-01,
        -2.74611145e-01, -3.44330072e-02,  2.70128697e-01,
        -3.66780728e-01, -2.31667101e-01,  1.27088398e-01,
         8.54434729e-01,  5.79956591e-01, -2.44625453e-02,
         2.59621348e-02,  1.01123042e-01,  7.24678710e-02,
        -6.92102090e-02,  2.83136666e-01, -3.36062521e-01,
        -3.17698717e-01,  5.71914852e-01, -1.61668137e-01,
         3.37901711e-02,  1.88275978e-01, -2.17393324e-01,
         2.45200738e-01, -1.52883872e-01,  2.93812361e-02,
        -1.58486634e-01,  1.91613272e-01,  1.69859733e-02,
         4.09291536e-01,  3.80781479e-02, -2.20045730e-01,
         1.61348265e-02,  1.90123916e-02,  1.74206778e-01,
        -3.99149090e-01,  7.48932874e-03, -2.51207143e-01,
        -3.99907827e-01, -1.10274829e-01, -2.10130308e-02,
         4.03790027e-01, -5.93857110e-01, -2.10047528e-01,
         5.93225472e-02, -4.55451965e-01, -1.04889177e-01,
         2.58354336e-01, -5.89613430e-02, -4.37787287e-02,
         1.14905417e-01,  6.31306469e-02,  1.14692159e-01,
         2.62809306e-01, -5.61590672e-01, -2.63019860e-01,
        -2.39696220e-01,  2.00065657e-01,  1.89638481e-01,
         4.76024151e-02, -1.74822927e-01,  9.71137062e-02,
        -8.00043866e-02, -1.56014517e-01, -1.13216989e-01,
        -3.45059228e-03,  1.85944065e-01, -3.21188271e-02,
         1.41863167e-01,  5.76707423e-02,  1.44274086e-02,
        -1.64204493e-01, -9.13991034e-02,  1.61491588e-01,
        -1.72287617e-02, -1.54543594e-01, -1.44427521e-02,
         2.35927463e-01,  1.93325892e-01, -1.49309054e-01,
        -1.98927876e-02,  8.58855471e-02, -3.63016039e-01,
        -8.00525248e-02,  3.13675165e-01, -5.99318743e-03,
        -1.23709448e-01, -1.13757765e+00,  1.34875372e-01,
        -8.09725523e-02, -1.08901300e-01, -1.84848130e-01,
         2.41620362e-01,  6.31195486e-01,  1.89451054e-01,
        -1.90942839e-01, -2.44864389e-01, -2.86674500e-03,
         1.05888426e-01,  1.59785971e-01, -1.78021982e-01,
        -3.05950642e-03, -8.59760344e-02,  2.71369934e-01,
        -1.85389981e-01,  1.52479574e-01, -4.06152248e-01,
        -3.34390029e-02,  9.66610536e-02, -2.35861421e-01,
        -2.52170116e-01, -2.29328558e-01,  1.19716287e-01,
        -7.92418197e-02, -1.82763115e-01, -2.10709020e-01,
         4.26654220e-01, -3.26856822e-01,  3.25299710e-01,
        -1.65819645e-01,  1.00485891e-01, -2.11032912e-01,
         8.97816718e-02,  5.83699793e-02, -2.96401773e-02,
        -1.20919868e-02,  2.39158332e-01,  1.91697702e-01,
         2.44462833e-01,  1.18594319e-01, -4.81461436e-01,
         2.53497452e-01, -2.80633777e-01,  2.69080121e-02,
         5.00311613e-01, -9.99705717e-02,  2.05433667e-01,
        -2.69041121e-01, -3.93294483e-01,  1.25640482e-01,
        -1.00632298e+00,  2.81518012e-01, -7.29729459e-02,
        -4.29362357e-02, -1.20453708e-01,  2.46761739e-01,
        -1.55347139e-02, -1.07485689e-01, -4.22660001e-02,
        -5.06359972e-02,  4.95910883e-01,  2.72718519e-01,
        -2.02229425e-01,  1.01983778e-01, -2.69294411e-01,
        -1.55363679e-02, -9.96563807e-02,  1.16876550e-01,
        -2.45688353e-02,  3.40327114e-01, -8.48575830e-02,
         1.79671690e-01,  1.86001644e-01,  7.07584545e-02,
        -9.82515812e-02,  2.28808280e-02,  5.43533377e-02,
        -1.02233626e-01, -3.84292342e-02,  4.76244278e-02,
         3.64664078e-01,  1.87960938e-01, -2.92477202e+00,
         4.47429009e-02,  1.03286542e-01, -2.00737119e-02,
         2.46488318e-01, -4.32638712e-02,  2.51687914e-01,
        -1.72380030e-01, -1.19961224e-01, -1.04466520e-01,
         2.64389485e-01, -6.01649404e-01,  2.96786696e-01,
        -2.48856381e-01, -2.89099067e-01,  8.47464334e-03]], dtype=float32)]

In [33]:
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = [row[0] for row in cursor.fetchall()]

In [34]:
for table_name in table_names:
    print(table_name)

migrations
embeddings_queue
collection_metadata
segments
segment_metadata
tenants
databases
collections
embeddings
embedding_metadata
max_seq_id
embedding_fulltext_search
embedding_fulltext_search_data
embedding_fulltext_search_idx
embedding_fulltext_search_content
embedding_fulltext_search_docsize
embedding_fulltext_search_config
