In [1]:
import chromadb

In [2]:
client = chromadb.PersistentClient(path=f"./chroma.db")

In [3]:
client.heartbeat() # returns a nanosecond heartbeat. Useful for making sure the client remains connected.


1704900084070207600

client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

In [None]:
# collection = client.create_collection(name="te_collection")
# collection = client.get_collection(name="te_collection")

In [None]:

# collection.modify(name="new_name") # Rename the collection

In [3]:
collection = client.create_collection(
    name="quran_english_semantic_search",
    metadata={"hnsw:space": "cosine"} # l2 is the default
)

In [5]:
import json

# Open the JSON file
with open('te-chat-demo-langchain.json', 'r') as file:
    # Load the JSON data
    data = json.load(file)

# Fields to exclude
fields_to_exclude = ['embeddings', 'text_chunks', 'list', 'header', 'footer', 'title', 'tables','$oid']

# Maximum length for each sublist
max_sublist_length = 512

# Function to chunk the text lists
def chunk_text_list(text_list, max_length):
    result = []
    current_chunk = []

    for text in text_list:
        # Check if adding the next text to the current chunk exceeds the max length
        if len(' '.join(current_chunk + [text]).split()) <= max_length:
            current_chunk.append(text)
        else:
            result.append(' '.join(current_chunk))
            current_chunk = [text]

    # Add the last chunk if it's not empty
    if current_chunk:
        result.append(' '.join(current_chunk))

    return result

# Iterate through the length of the data
for index, entry in enumerate(data):
    print(f"Entry {index + 1}:")

    # Iterate over the subrecords within each entry
    for key, value in entry.items():
        # Check if the key is not in the fields to exclude
        if key not in fields_to_exclude:
            print(f"{key}:")

            # Initialize metadata and id variables
            metadata = {'pages': entry.get('pages')}
            file_name = entry.get('file_name', 'N/A')
            
            # Check if the key is 'text' and the value is a list
            if key == 'text' and isinstance(value, list):
                # Chunk the 'text' list into sublists of maximum length
                sublists = chunk_text_list(value, max_sublist_length)

                # Process each sublist as a separate document
                for i, sublist in enumerate(sublists):
                    # Create the document ID and metadata
                    document_id = f"{file_name} {i + 1}"
                    metadata['bucket_path'] = entry.get('bucket_path')
                    metadata['file_name'] = entry.get('file_name')

                    # Print or use the document, metadata, and ID as needed
                    print(f"  Document ID: {document_id}")
                    print(f"  Metadata: {metadata}")
                    print(f"  Document Text: {sublist}")
                    collection.add(
                        documents=[sublist],
                        metadatas=[metadata],
                        ids=[document_id]
                    )
                    print("-----")
            elif isinstance(value, list):
                # If the value is a list, print it
                print(f"{value}")
            else:
                # If the value is not an array or list, it's a single entry
                print(f"{value}")

    print("-----")


ValueError: Expected metadata to be a dict or None, got Author: ahmedraza, Surah: 1, Verse: 1

# 148 minutes to execute

In [8]:
import csv

# Function to iterate through the CSV file and write metadata and record for all authors to a single file
def process_csv(input_csv, output_txt):
    with open(input_csv, 'r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)  # Read the header

        with open(output_txt, 'w', encoding='utf-8') as output_file:
            for author_index in range(3, len(header)):
                author_name = header[author_index]

                csv_file.seek(0)  # Reset file pointer to start
                next(csv_reader)  # Skip header for subsequent reads

                for row in csv_reader:
                    surah_number, verse_number, *translations = row
                    metadata = {
                        'Author': author_name,
                        'Surah': surah_number,
                        'Verse': verse_number
                    }
                    document_id = f"{author_name}_{surah_number}_{verse_number}" 
                    translation = translations[author_index - 3] if translations else ""
                    output_file.write(f"{metadata}\n{translation}\n\n")
                    # Add the document to the collection
                    collection.add(
                        documents=[translation],
                        metadatas=[metadata],
                        ids=[document_id]
                    )


# Example usage
input_csv = 'quran.csv'
output_txt = 'metadata_all_authors.txt'
process_csv(input_csv, output_txt)


In [1]:
import chromadb
client = chromadb.PersistentClient(path=f"./chroma.db")
client.heartbeat() 
collection = client.get_collection(name="quran_english_semantic_search")

In [2]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
query = "who are the lsoers"
embeddings = embedder.encode(query)

collection.query(
    query_embeddings=embeddings.tolist(),
    n_results=5
)


{'ids': [['yusufali_43_69',
   'wahiduddin_23_48',
   'yusufali_114_6',
   'hilali_2_250',
   'sarwar_26_206']],
 'distances': [[0.7044472694396973,
   0.7177495956420898,
   0.7231388321872747,
   0.7265528440475464,
   0.7315015196800232]],
 'metadatas': [[{'Author': 'yusufali', 'Surah': '43', 'Verse': '69'},
   {'Author': 'wahiduddin', 'Surah': '23', 'Verse': '48'},
   {'Author': 'yusufali', 'Surah': '114', 'Verse': '6'},
   {'Author': 'hilali', 'Surah': '2', 'Verse': '250'},
   {'Author': 'sarwar', 'Surah': '26', 'Verse': '206'}]],
 'embeddings': None,
 'documents': [['those who believed in Our revelations and surrendered themselves to Us.',
   'So they rejected them and became of those who were destroyed.',
   'from jinn and men."',
   'And when they arrayed themselves against Jalut and his hosts, they said: our Lord pour forth on us patience, and set firm our feet, and make us triumph over the infidel people.',
   'And then there came to them that which they were promised?']]}

need to verify

In [4]:
collection.query(
    query_texts=["who are the losers"],
    n_results=5

)

{'ids': [['maududi_6_31',
   'wahiduddin_11_22',
   'daryabadi_6_140',
   'yusufali_11_22',
   'ahmedraza_16_109']],
 'distances': [[0.27176374197006226,
   0.28356873989105225,
   0.30186188220977783,
   0.30398643016815186,
   0.305777907371521]],
 'metadatas': [[{'Author': 'maududi', 'Surah': '6', 'Verse': '31'},
   {'Author': 'wahiduddin', 'Surah': '11', 'Verse': '22'},
   {'Author': 'daryabadi', 'Surah': '6', 'Verse': '140'},
   {'Author': 'yusufali', 'Surah': '11', 'Verse': '22'},
   {'Author': 'ahmedraza', 'Surah': '16', 'Verse': '109'}]],
 'embeddings': None,
 'documents': [['Losers are those who deny the encounter with God. Then, when the Hour comes upon them suddenly, they will say, “Alas for us, how we have neglected it.” And they will carry their burdens on their backs—evil is what they carry.',
   'Truly in the hereafter they are the greatest losers.',
   'Losers are they who slay their children in folly, without knowledge, and have forbidden what God has provided them, fo

In [None]:
collection.query(
    query_texts=["who are losers"],
    n_results=5,
    where={"file_name": "889780-2-contact-systems.pdf"},

)

In [11]:
query_results = collection.query(
     query_texts=["information accuracy"],
     n_results=5
     ,
 )

In [13]:
query_results.keys()

dict_keys(['ids', 'distances', 'metadatas', 'embeddings', 'documents'])

In [14]:
query_results["documents"]

[['DISCLAIMER While TE Connectivity (TE) has made every reasonable effort to ensure the accuracy of the information in this white paper, TE does not guarantee that it is error-free, nor does TE make any other representation, warranty or guarantee that the information is accurate, correct, reliable or current. TE reserves the right to make any adjustments to the information contained herein at any time without notice. TE expressly disclaims all implied warranties regarding the information contained herein, including, but not limited to, any implied warranties of merchantability or fitness for a particular purpose. The dimen- sions in this white paper are for reference purposes only and are subject to change without notice. Specifications are subject to change without notice. Consult TE for the latest dimensions and design specifications.']]

In [15]:
query_results["ids"]

[['160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf 4']]

In [16]:
query_results["distances"]

[[0.7493338584899902]]

In [17]:
query_results["metadatas"]

[[{'bucket_path': 'te-chat-demo-files/160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf',
   'file_name': '160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf',
   'pages': 8}]]

In [18]:
collection._embedding_function

<chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 at 0x24e7eaa9bd0>

In [21]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

In [25]:

from sentence_transformers import SentenceTransformer

query = "electrical and mechanical"
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = embedder.encode(query)

collection.query(
    query_embeddings=embeddings.tolist(),
    n_results=10
)


{'ids': [['160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf 1',
   '889780-2-contact-systems.pdf 35',
   '160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf 2',
   '889780-2-contact-systems.pdf 27',
   '160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf 3',
   '889780-2-contact-systems.pdf 36',
   '889780-2-contact-systems.pdf 7',
   '889780-2-contact-systems.pdf 32',
   '889780-2-contact-systems.pdf 37',
   '889780-2-contact-systems.pdf 55']],
 'distances': [[0.6573805212974548,
   0.6755064129829407,
   0.6811323165893555,
   0.6826337575912476,
   0.6859297752380371,
   0.6860485076904297,
   0.7063804268836975,
   0.7073202729225159,
   0.7091423273086548,
   0.7114173769950867]],
 'metadatas': [[{'bucket_path': 'te-chat-demo-files/160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf',
    'file_name': '160603_WhitePaper_MCON_8_06-2016_engl_CLEAR.pdf',
    'pages': 8},
   {'bucket_path': 'te-chat-demo-files/889780-2-contact-systems.pdf',
    'file_name': '889780-2-contact-systems.pdf',
    'pa

In [5]:
from sentence_transformers import SentenceTransformer
# "electrical and mechanical"
query = "who is financly strong"
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = embedder.encode(query)

collection.query(
    query_embeddings=embeddings.tolist(),
    n_results=10
)

{'ids': [['shakir_51_58',
   'shakir_53_48',
   'yusufali_51_58',
   'hilali_2_245',
   'qarai_2_245',
   'qarai_57_11',
   'daryabadi_2_245',
   'maududi_51_58',
   'ahmedraza_19_81',
   'yusufali_19_81']],
 'distances': [[0.6311873197555542,
   0.6616016030311584,
   0.6651338338851929,
   0.6800140142440796,
   0.6820188164710999,
   0.6880834698677063,
   0.6885476112365723,
   0.7002965807914734,
   0.701044499874115,
   0.702289879322052]],
 'metadatas': [[{'Author': 'shakir', 'Surah': '51', 'Verse': '58'},
   {'Author': 'shakir', 'Surah': '53', 'Verse': '48'},
   {'Author': 'yusufali', 'Surah': '51', 'Verse': '58'},
   {'Author': 'hilali', 'Surah': '2', 'Verse': '245'},
   {'Author': 'qarai', 'Surah': '2', 'Verse': '245'},
   {'Author': 'qarai', 'Surah': '57', 'Verse': '11'},
   {'Author': 'daryabadi', 'Surah': '2', 'Verse': '245'},
   {'Author': 'maududi', 'Surah': '51', 'Verse': '58'},
   {'Author': 'ahmedraza', 'Surah': '19', 'Verse': '81'},
   {'Author': 'yusufali', 'Surah':