# Feed and query our RAG with DBPedia descriptions

During class, we will go back and forth varying some parameters (chunk sizes, etc.) and look at how the results change.


In [None]:
import csv
import json
from tqdm import tqdm
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
def read_csv(filename):
    """Read a CSV file with a header row into a list of row dictionaries.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row, keyed by the column names taken
        from the first line of the file. A file without data rows yields an
        empty list.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields (e.g. multi-paragraph abstracts) are silently rewritten.
    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
        return list(csv.DictReader(csvfile))

# Create documents from CSV entries
def create_documents(entries, *, chunk_size=5000, chunk_overlap=200):
    """Turn CSV rows into text chunks ready to be embedded.

    Each row is rendered as a three-line text (URL, label, description),
    wrapped in a ``Document`` and passed through a
    ``RecursiveCharacterTextSplitter`` configured with the given sizes.

    Args:
        entries: Iterable of mappings providing the keys ``subject``,
            ``subjectLabel`` and ``abstract``.
        chunk_size: ``chunk_size`` handed to the text splitter. Defaults to
            5000, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.
            Defaults to 200, the value previously hard-coded here.

    Returns:
        A flat list with the chunks of all rows, in input order.

    Raises:
        KeyError: If a row lacks one of the three expected keys.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    documents = []
    for entry in tqdm(entries, desc="Creating Documents"):
        content = (
            f"URL: {entry['subject']}\n"
            f"Label: {entry['subjectLabel']}\n"
            f"Description: {entry['abstract']}"
        )
        # Split per row so that a chunk never mixes text of two entries.
        documents.extend(text_splitter.split_documents([Document(page_content=content)]))
    return documents

# Create vector store with progress indication
def create_vectorstore_with_progress(documents, embedding, *,
                                     persist_directory='./data/', batch_size=1):
    """Build a Chroma vector store from documents while showing a progress bar.

    Args:
        documents: Iterable of documents to add to the store.
        embedding: Embedding function object handed to ``Chroma``.
        persist_directory: Directory handed to ``Chroma`` as
            ``persist_directory``. Defaults to ``'./data/'``, the value
            previously hard-coded here.
        batch_size: Number of documents passed to each ``add_documents``
            call. The default of 1 reproduces the original one-by-one
            behaviour; larger values mean fewer calls.

    Returns:
        The ``Chroma`` instance after all documents have been added.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so that generators are accepted as before and the
    # progress bar knows its total.
    documents = list(documents)
    vectorstore = Chroma(embedding_function=embedding, persist_directory=persist_directory)
    with tqdm(total=len(documents), desc="Adding Documents to Vector Store") as progress:
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            vectorstore.add_documents(batch)
            progress.update(len(batch))
    return vectorstore


# Input CSV; create_documents reads its 'subject', 'subjectLabel' and
# 'abstract' columns.
filename = './data/dbpedia_building_materials_small.csv'
entries = read_csv(filename)
documents = create_documents(entries)

# Embeddings are requested from an Ollama server at localhost:11434 using the
# "nomic-embed-text" model.
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")
# One-call alternative without a per-document progress bar:
#vectorstore = Chroma.from_documents(documents=documents, embedding=oembed)
vectorstore = create_vectorstore_with_progress(documents, oembed)

# NOTE(review): printed unconditionally once the store object has been built;
# nothing in this cell checks that data was actually written to disk.
print("Vectorstore data saved successfully.")



In [None]:
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

# Alternative questions kept for experimenting in class:
#question="what is the URL of the material that is created of isocyanate and polyol resin"
#question="what is the URL of the material that closest fitting to the category 'Wood' / 'Derived timber products' / '3- and 5-ply wood'? Only give the URL and a single digit that indicates how sure you are between 0.0 and 1.0"

# The prompt is sent verbatim to the model, so typos and the stray doubled
# quote of the earlier version were removed. The braces are literal text
# (this is not an f-string).
question = """using the SKOS vocabulary, create a turtle triple {subject} {predicate} {object}
use a SKOS relation that describes the similarity to a subject from another vocabulary in the namespace odb: with a label 'Solid wood' / 'Glue-laminated timber board'
write a valid line using turtle syntax.
do not write anything else"""

# Text generation is delegated to the "mistral" model served by Ollama at
# localhost:11434.
ollama = Ollama(
    base_url='http://localhost:11434',
    model="mistral"
)

# Not consumed by the chain below (it retrieves through its own retriever);
# kept so the matched chunks can be inspected by hand.
docs = vectorstore.similarity_search(question)

# Answer the question with the vector store acting as the retriever.
qachain = RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())
res = qachain.invoke({"query": question})
print(res['result'])