In [1]:
import os 
from dotenv import load_dotenv

import chromadb
from chromadb.utils import embedding_functions
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import ChatPromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings


load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
default_ef = embedding_functions.DefaultEmbeddingFunction()

Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
Settings.llm = OpenAI(model= 'gpt-3.5-turbo')



In [3]:
client = chromadb.PersistentClient(path="../chroma_db")

In [7]:
def add_docs_to_collection(folder_name, collection):
    documents_objects = SimpleDirectoryReader(input_dir=f"../docs/{folder_name}").load_data()
    
    documents = [document.text for document in documents_objects] # ids have to be unique identifiers for the documents, here we choose the file name for simplicity.
    ids = [document.metadata['file_name'] for document in documents_objects] # ids have to be unique identifiers for the documents, here we choose the file name for simplicity.

    collection.add(documents=documents,
                   ids=ids)

    print(f"Folder {folder_name} successfully added to the collection")
    return collection

In [8]:

try:
    collection = client.get_collection(name="docs_collection")
    collection_exists = True
except ValueError:
    collection_exists = False

if not collection_exists:
    print('COLLECTION DOES NOT EXIST')
    collection = client.create_collection(name="docs_collection")

    documents_objects = SimpleDirectoryReader(input_dir="../docs/acs_cs_ut_ee").load_data()
    
    dir_as_list = os.listdir("../docs")
    print(dir_as_list)

    for dir in dir_as_list:
        add_docs_to_collection(dir, collection)
else:
    print('COLLECTION EXISTS')



COLLECTION DOES NOT EXIST
['acs_cs_ut_ee', 'adl_cs_ut_ee', 'bigdata_cs_ut_ee', 'biit_cs_ut_ee', 'blog_cs_ut_ee', 'cit_cs_ut_ee', 'courses_cs_ut_ee', 'crypto_cs_ut_ee', 'cs_ut_ee', 'didaktika_cs_ut_ee', 'dps_cs_ut_ee', 'elixir_ut_ee', 'hcis_cs_ut_ee', 'health-informatics_cs_ut_ee', 'infosec_cs_ut_ee', 'its_cs_ut_ee', 'majandus_ut_ee', 'math_ut_ee', 'mc_cs_ut_ee', 'ml_cs_ut_ee', 'nail_cs_ut_ee', 'sail_cs_ut_ee', 'sep_cs_ut_ee', 'sws_cs_ut_ee', 'tartunlp_ai', 'ut_ee']
Folder acs_cs_ut_ee successfully added to the collection
Folder adl_cs_ut_ee successfully added to the collection
Folder bigdata_cs_ut_ee successfully added to the collection
Folder biit_cs_ut_ee successfully added to the collection
Folder blog_cs_ut_ee successfully added to the collection
Folder cit_cs_ut_ee successfully added to the collection


Add of existing embedding ID: courses_cs_ut_ee_2024_webappsec_spring.txt
Insert of existing embedding ID: courses_cs_ut_ee_2024_webappsec_spring.txt


Folder courses_cs_ut_ee successfully added to the collection
Folder crypto_cs_ut_ee successfully added to the collection


Add of existing embedding ID: acs_cs_ut_ee_.txt
Add of existing embedding ID: ajalugu-arheoloogia_ut_ee_et_esileht.txt
Add of existing embedding ID: arvutimuuseum_ut_ee.txt
Add of existing embedding ID: biit_cs_ut_ee_.txt
Add of existing embedding ID: biomeditsiin_ut_ee_et_esileht-bio-ja-siirdemeditsiin-instituut.txt
Add of existing embedding ID: cgvr_cs_ut_ee_cg-demo-reels_.txt
Add of existing embedding ID: cgvr_cs_ut_ee_computer-graphics-seminar-slides_.txt
Add of existing embedding ID: cgvr_cs_ut_ee_courses_.txt
Add of existing embedding ID: cgvr_cs_ut_ee_student-projects_.txt
Add of existing embedding ID: chem_ut_ee_et_esileht.txt
Add of existing embedding ID: cit_cs_ut_ee_.txt
Add of existing embedding ID: comserv_cs_ut_ee_ati_practice_offers_.txt
Add of existing embedding ID: courses_cs_ut_ee.txt
Add of existing embedding ID: courses_cs_ut_ee_2018_progmaa_spring_Main_Maalahealusedvordlus.txt
Add of existing embedding ID: courses_cs_ut_ee_2018_turve_spring_Main_HomePage.txt
Add of

Folder cs_ut_ee successfully added to the collection


Add of existing embedding ID: didaktika_cs_ut_ee_mooc_kursus-programmeerimise-alused_.txt
Add of existing embedding ID: didaktika_cs_ut_ee_mooc_kursus-programmeerimisest-maalahedaselt_.txt
Add of existing embedding ID: didaktika_cs_ut_ee_moocid_.txt
Add of existing embedding ID: didaktika_cs_ut_ee_progttl_.txt
Add of existing embedding ID: didaktika_cs_ut_ee_sundmused_konverents-2021_.txt
Add of existing embedding ID: ut_ee_et_avaleht.txt
Insert of existing embedding ID: didaktika_cs_ut_ee_mooc_kursus-programmeerimise-alused_.txt
Insert of existing embedding ID: didaktika_cs_ut_ee_mooc_kursus-programmeerimisest-maalahedaselt_.txt
Insert of existing embedding ID: didaktika_cs_ut_ee_moocid_.txt
Insert of existing embedding ID: didaktika_cs_ut_ee_progttl_.txt
Insert of existing embedding ID: didaktika_cs_ut_ee_sundmused_konverents-2021_.txt
Insert of existing embedding ID: ut_ee_et_avaleht.txt


Folder didaktika_cs_ut_ee successfully added to the collection
Folder dps_cs_ut_ee successfully added to the collection
Folder elixir_ut_ee successfully added to the collection
Folder hcis_cs_ut_ee successfully added to the collection
Folder health-informatics_cs_ut_ee successfully added to the collection
Folder infosec_cs_ut_ee successfully added to the collection
Folder its_cs_ut_ee successfully added to the collection


Add of existing embedding ID: majandus_ut_ee_et_node_60166.txt
Insert of existing embedding ID: majandus_ut_ee_et_node_60166.txt


Folder majandus_ut_ee successfully added to the collection


Add of existing embedding ID: math_ut_ee_et_matemaatika-ja-statistika-instituudi-esileht.txt
Insert of existing embedding ID: math_ut_ee_et_matemaatika-ja-statistika-instituudi-esileht.txt


Folder math_ut_ee successfully added to the collection
Folder mc_cs_ut_ee successfully added to the collection
Folder ml_cs_ut_ee successfully added to the collection
Folder nail_cs_ut_ee successfully added to the collection
Folder sail_cs_ut_ee successfully added to the collection
Folder sep_cs_ut_ee successfully added to the collection
Folder sws_cs_ut_ee successfully added to the collection
Folder tartunlp_ai successfully added to the collection


Add of existing embedding ID: ajalugu-arheoloogia_ut_ee_et_esileht.txt
Add of existing embedding ID: biomeditsiin_ut_ee_et_esileht-bio-ja-siirdemeditsiin-instituut.txt
Add of existing embedding ID: chem_ut_ee_et_esileht.txt
Add of existing embedding ID: cs_ut_ee_et_arvutiteaduse-instituudi-esileht.txt
Add of existing embedding ID: farmaatsia_ut_ee_et_farmaatsia-instituut.txt
Add of existing embedding ID: fi_ut_ee_et_node_60177.txt
Add of existing embedding ID: filsem_ut_ee_et_avaleht.txt
Add of existing embedding ID: genomics_ut_ee_et_node_60221.txt
Add of existing embedding ID: hambaarstiteadus_ut_ee_et_node_60173.txt
Add of existing embedding ID: haridus_ut_ee_et_haridusteaduste-instituudi-avaleht.txt
Add of existing embedding ID: humanitaarteadused_ut_ee_et_node_60060.txt
Add of existing embedding ID: keel_ut_ee_et_node_69803.txt
Add of existing embedding ID: kliinilinemeditsiin_ut_ee_et_node_69438.txt
Add of existing embedding ID: kosmos_ut_ee_et_esileht.txt
Add of existing embeddi

Folder ut_ee successfully added to the collection


In [58]:

add_docs_to_collection("ut_ee", collection)

Add of existing embedding ID: ajalugu-arheoloogia_ut_ee_et_esileht.txt
Add of existing embedding ID: biomeditsiin_ut_ee_et_esileht-bio-ja-siirdemeditsiin-instituut.txt
Add of existing embedding ID: chem_ut_ee_et_esileht.txt
Add of existing embedding ID: cs_ut_ee_et_arvutiteaduse-instituudi-esileht.txt
Add of existing embedding ID: farmaatsia_ut_ee_et_farmaatsia-instituut.txt
Add of existing embedding ID: fi_ut_ee_et_node_60177.txt
Add of existing embedding ID: filsem_ut_ee_et_avaleht.txt
Add of existing embedding ID: genomics_ut_ee_et_node_60221.txt
Add of existing embedding ID: hambaarstiteadus_ut_ee_et_node_60173.txt
Add of existing embedding ID: haridus_ut_ee_et_haridusteaduste-instituudi-avaleht.txt
Add of existing embedding ID: humanitaarteadused_ut_ee_et_node_60060.txt
Add of existing embedding ID: keel_ut_ee_et_node_69803.txt
Add of existing embedding ID: kliinilinemeditsiin_ut_ee_et_node_69438.txt
Add of existing embedding ID: kosmos_ut_ee_et_esileht.txt
Add of existing embeddi

Folder ut_ee successfully added to the collection


Collection(name=docs_collection)

In [9]:
collection.count()

2275

In [10]:
vector_store = ChromaVectorStore(chroma_collection=collection) 
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [14]:
q1 = "What does the Autonomous Driving Lab help to lay foundations for?"
q2 = "Where does Applied Cyber Security Group get it's funding from?"
q3 = "What room does autonomous driving lab work in?"

In [11]:
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store
)

In [15]:
chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)   

res = chat_engine.chat(q1)
print(res)

Querying with: What does the Autonomous Driving Lab help to lay foundations for?
The Autonomous Driving Lab helps to lay foundations for the development of self-driving technologies and autonomous driving industry in Estonia.


In [85]:
#index.as_query_engine(llm="Mis teleskoobid on Tartu ülikoolil?")

res = index.as_retriever().retrieve("What does the Autonomous Driving Lab help to lay foundations for?")
print(res)
docs_names = [res_obj.node.node_id for res_obj in res]
print(docs_names)

[NodeWithScore(node=TextNode(id_='adl_cs_ut_ee_research_projects.txt', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="<LINK>https://adl.cs.ut.ee/research/projects</LINK>\r\nProjects — Autonomous Driving Lab Autonomous Driving Lab Discover ADL The Lab Research Teaching Blog Research Sure, big ideas might begin on the back of a napkin, but if they're to become a reality, they must be put to paper. Here you can see the many culminations of our efforts on self-driving research. Research areas Publications Projects Projects We cannot possibly imagine delivering the self-driving future by ourselves. To get there, we have to combine forces with other like-minded thinkers and doers. Here you can see what we’ve been working on in cooperation with our partners. Project Vision-based off-road navigation with geographical hints Learn more Project Applied Research on Development of Autonomous Driving Lab for Level 4 Autonomy Learn

In [6]:
client.delete_collection("docs_collection")

ValueError: Collection docs_collection does not exist.