In [12]:
import os 
from dotenv import load_dotenv

import chromadb
from chromadb.utils import embedding_functions
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import ChatPromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import openai

load_dotenv()

True

In [13]:
# Chroma's built-in default embedding function.
# NOTE(review): `default_ef` is not referenced again in the visible cells — confirm it is still needed.
default_ef = embedding_functions.DefaultEmbeddingFunction()

# Alternative LlamaIndex embedding model, currently disabled:
# Settings.embed_model = HuggingFaceEmbedding(
#     model_name="sentence-transformers/all-MiniLM-L6-v2"
# )

# Embedding model registered on the global LlamaIndex `Settings` object.
Settings.embed_model = OpenAIEmbedding(model = 'text-embedding-3-small')

# Chroma-side embedding function with the same OpenAI model name; it is passed to the
# `docs_openai2_collection` get/create calls further down. The API key is read from the
# OPENAI_API_KEY environment variable (load_dotenv() was called in the import cell).
embedding_function = OpenAIEmbeddingFunction(model_name = 'text-embedding-3-small', api_key=os.getenv("OPENAI_API_KEY"))

# LLM registered on the global LlamaIndex `Settings` object.
Settings.llm = OpenAI(model = 'gpt-3.5-turbo')

In [None]:
client = chromadb.PersistentClient(path="../chroma_db")
# chroma_client = chromadb.HttpClient(host="chroma", port = 8000, settings=Settings(allow_reset=True, anonymized_telemetry=False))

In [None]:
def add_docs_to_collection(folder_name, collection):
    """Load the files under ``../docs/<folder_name>`` and add them to a collection.

    Args:
        folder_name: Name of a sub-folder of ``../docs``.
        collection: Object exposing ``add(documents=..., ids=...)`` (a Chroma
            collection in this notebook).

    Returns:
        The same ``collection`` object that was passed in.
    """
    documents_objects = SimpleDirectoryReader(input_dir=f"../docs/{folder_name}").load_data()

    if not documents_objects:
        # Nothing to add; avoid calling `add` with empty lists.
        print(f"Folder {folder_name} contains no documents, nothing added")
        return collection

    # Raw text of every loaded document, in reader order.
    documents = [document.text for document in documents_objects]

    # ids have to be unique identifiers for the documents; the file name is used for
    # simplicity. If the reader returns several documents that share a file name, the
    # repeats get a numeric suffix ("name", "name-2", "name-3", ...) so ids stay unique
    # within this call.
    # NOTE(review): ids can still collide across folders that contain equally named files.
    ids = []
    seen_counts = {}
    for document in documents_objects:
        file_name = document.metadata['file_name']
        occurrence = seen_counts.get(file_name, 0) + 1
        seen_counts[file_name] = occurrence
        ids.append(file_name if occurrence == 1 else f"{file_name}-{occurrence}")

    collection.add(documents=documents,
                   ids=ids)

    print(f"Folder {folder_name} successfully added to the collection")
    return collection

In [None]:
# Reuse the persisted "docs_collection" when it exists; otherwise create and populate it.
# NOTE(review): the exception raised for a missing collection depends on the installed
# chromadb version — confirm that ValueError is still the right one to catch.
try:
    collection = client.get_collection(name="docs_collection")
    collection_exists = True
except ValueError:
    collection_exists = False

if not collection_exists:
    print('COLLECTION DOES NOT EXIST')
    collection = client.create_collection(name="docs_collection", metadata={"hnsw:space": "cosine"})

    # os.listdir also returns plain files; only sub-directories can be handed to
    # add_docs_to_collection, which reads "../docs/<name>" as a directory.
    dir_as_list = [
        entry for entry in os.listdir("../docs")
        if os.path.isdir(os.path.join("../docs", entry))
    ]
    print(dir_as_list)

    # `folder_name` instead of `dir`, so the builtin dir() is not shadowed for later cells.
    for folder_name in dir_as_list:
        add_docs_to_collection(folder_name, collection)
else:
    print('COLLECTION EXISTS')

In [None]:
collection.count()

In [None]:
results = collection.query(
    query_texts=["mitme täiendkoolituse vahel on võimalik valida?"], # Chroma will embed this for you
    n_results=4 # how many results to return
)
print(results)

In [None]:
vector_store = ChromaVectorStore(chroma_collection=collection) 
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
q1 = "What does the Autonomous Driving Lab help to lay foundations for?"
q2 = "Where does Applied Cyber Security Group get it's funding from?"
q3 = "What room does autonomous driving lab work in?"

In [None]:
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
)

In [None]:
chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)   

res = chat_engine.chat("Kui palju tudengeid on Tartu ülikoolis?")
print(res)

In [None]:
#index.as_query_engine(llm="Mis teleskoobid on Tartu ülikoolil?")

res = index.as_retriever().retrieve("What does the Autonomous Driving Lab help to lay foundations for?")
print(res)
docs_names = [res_obj.node.node_id for res_obj in res]
print(docs_names)

# Test 2: Chroma collection embedded with OpenAI `text-embedding-3-small`

In [None]:
# Open (or create and fill) the OpenAI-embedded collection and always leave
# `vector_store`, `storage_context` and `index` pointing at it, so the following
# cells never pick up an `index` left over from an earlier section.
# NOTE(review): the exception raised for a missing collection depends on the installed
# chromadb version — confirm that ValueError is still the right one to catch.
try:
    collection = client.get_collection(name="docs_openai2_collection", embedding_function=embedding_function)
    collection_exists = True
except ValueError:
    collection_exists = False

if collection_exists:
    print('COLLECTION EXISTS')
else:
    print('COLLECTION DOES NOT EXIST')
    collection = client.create_collection(name="docs_openai2_collection", embedding_function=embedding_function, metadata={"hnsw:space": "cosine"})

# set up ChromaVectorStore on top of the collection (needed in both branches)
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if collection_exists:
    # Data is already stored: just attach an index to the existing vectors.
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store, storage_context=storage_context
    )
else:
    # Fresh collection: load every file under ../docs and index it into the store.
    documents = SimpleDirectoryReader(input_dir="../docs", recursive=True).load_data()
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )

In [None]:
chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)

In [None]:
collection = client.get_collection(name="docs_openai2_collection", embedding_function=embedding_function)

vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, storage_context=storage_context
)

chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)   

In [None]:
res = chat_engine.chat("Kui palju tudengeid on Tartu ülikoolis?")
print(res)

In [None]:
# Add every sub-folder of ../docs except 'ut_ee' to the current collection.
# os.listdir also returns plain files, so keep directories only: add_docs_to_collection
# reads "../docs/<name>" as a directory.
dir_as_list = [
    entry for entry in os.listdir("../docs")
    if os.path.isdir(os.path.join("../docs", entry))
]
print(dir_as_list)

# `folder_name` instead of `dir`, so the builtin dir() is not shadowed for later cells.
for folder_name in dir_as_list:
    if folder_name == 'ut_ee':
        continue
    add_docs_to_collection(folder_name, collection)

# Pinecone

In [None]:
import logging
import sys
import os

# Send INFO-level logs to stdout through exactly one root handler.
# force=True replaces any handlers already on the root logger, so the configuration
# takes effect even if logging was configured earlier and re-running the cell does not
# stack handlers. The extra StreamHandler that used to be added here wrote to the same
# stream and made every record appear twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [None]:
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Create the client in this cell: `pc` is otherwise only defined much further down,
# so this cell would fail with a NameError on a fresh top-to-bottom run.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Only create the index when it is missing, so the cell can be re-run safely.
if "quickstart" not in pc.list_indexes().names():
    pc.create_index(
        name="quickstart",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Create Dataset of Docs Embeddings

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from IPython.display import Markdown, display
import itertools

In [None]:
# load documents
documents_objects = SimpleDirectoryReader(input_dir="../docs", recursive=True).load_data()

In [None]:
len(documents_objects)

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens `string` encodes to under the named tiktoken encoding."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def chunk_text(text, max_tokens):
    """Split `text` into space-joined chunks of NLTK word tokens.

    Despite its name, `max_tokens` is a budget in *characters*: a chunk is closed
    as soon as appending the next word (plus one joining space) would make the
    joined string longer than `max_tokens` characters.

    Chunks are rebuilt with ``' '.join``, so the original whitespace of `text` is
    not preserved. A single word longer than `max_tokens` cannot be split and is
    emitted as its own over-long chunk.

    Args:
        text: The string to split.
        max_tokens: Maximum length, in characters, of each joined chunk.

    Returns:
        A list of non-empty chunk strings (empty list when `text` has no tokens).
    """
    # Tokenize the text into words
    tokens = word_tokenize(text)

    chunks = []
    current_chunk = []
    current_chunk_len = 0  # always equals len(' '.join(current_chunk))

    for token in tokens:
        token_len = len(token)
        # Length of the joined chunk if this token were appended; the joining
        # space only exists when the chunk already holds a word.
        if current_chunk:
            joined_len = current_chunk_len + 1 + token_len
        else:
            joined_len = token_len

        if current_chunk and joined_len > max_tokens:
            # Close the current chunk and start a new one with this token.
            # The `current_chunk` guard prevents emitting an empty chunk when
            # the very first word is already longer than the budget.
            chunks.append(' '.join(current_chunk))
            current_chunk = [token]
            current_chunk_len = token_len
        else:
            current_chunk.append(token)
            current_chunk_len = joined_len

    # Append the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [None]:
import math
import re 

def get_embeddings(text, model="text-embedding-3-small"):
    """Embed `text` with the OpenAI embeddings endpoint, chunking it when too long.

    Newlines are replaced by spaces first. If the text is within 8191 tokens
    (counted by `num_tokens_from_string`) it is embedded as a single input.
    Otherwise it is split with `chunk_text`, the first ``<LINK>...</LINK>`` tag
    found in the text (if any) is prepended to every chunk, and all chunks are
    embedded in one request.

    Args:
        text: Document text to embed.
        model: Name of the OpenAI embedding model.

    Returns:
        A ``(embeddings, texts)`` tuple of two equally long lists: the embedding
        vectors and the exact strings that were embedded.
    """
    max_input_tokens = 8191

    text = text.replace("\n", " ")
    num_tokens = num_tokens_from_string(text)

    if num_tokens <= max_input_tokens:
        response = openai.embeddings.create(input=[text], model=model)
        return [response.data[0].embedding], [text]

    print(f"Tokens limit exceede, tokens: {num_tokens}")

    # The link tag is only needed on this path, and a text without a tag must not
    # crash: fall back to an empty prefix instead of indexing an empty findall().
    link_match = re.search(r"<LINK>.*?<\/LINK>", text)
    link_tag = link_match.group(0) if link_match else ""

    # The same number is passed on as chunk_text's size budget.
    text_chunks = [link_tag + chunk for chunk in chunk_text(text, max_input_tokens)]
    embeddings = openai.embeddings.create(input=text_chunks, model=model)

    return [data.embedding for data in embeddings.data], text_chunks
    

def divide_text_into_chunks(text, token_limit):
    """Cut `text` into equally wide character slices when it exceeds a token limit.

    The token count (``cl100k_base`` encoding) is only used to decide how many
    slices are needed; the slicing itself is done on characters, so an individual
    slice is not guaranteed to stay below `token_limit` tokens.

    Args:
        text: The string to split.
        token_limit: Maximum number of tokens aimed for per chunk; must be >= 1.

    Returns:
        ``[text]`` when the text is within the limit, otherwise the list of
        consecutive character slices.

    Raises:
        ValueError: If `token_limit` is smaller than 1.
    """
    if token_limit < 1:
        raise ValueError(f"token_limit must be >= 1, got {token_limit}")

    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(text))
    if num_tokens <= token_limit:
        return [text]

    num_chunks = math.ceil(num_tokens / token_limit)
    # Keep at least one character per slice: the floor division reaches 0 when the
    # text has fewer characters than required chunks, and range() rejects a zero step.
    num_symbols_per_chunk = max(1, len(text) // num_chunks)
    return [
        text[start: start + num_symbols_per_chunk]
        for start in range(0, len(text), num_symbols_per_chunk)
    ]


In [None]:
import pandas as pd

# One row per loaded document: id (file name), empty placeholder for the embedding,
# and the document text wrapped in a metadata dict.
docs_df = pd.DataFrame(
    dict(
        id=[doc.metadata['file_name'] for doc in documents_objects],
        values=['' for _ in documents_objects],
        metadata=[{'text': doc.text} for doc in documents_objects],
    )
)


In [None]:
docs_df

In [None]:
# Embed every document in `docs_df` and rebuild the frame with the vectors filled in.
#
# Records are collected in plain lists and turned into DataFrames once at the end:
#  * assigning to the row yielded by iterrows() only changes a copy, so the vectors of
#    single-embedding documents would otherwise never reach the frame;
#  * rows are neither dropped from nor appended to a frame while it is being iterated.
# The loop variable is called `row_label` (not `index`) and the id `doc_id` (not `id`)
# so that neither the notebook-level `index` object nor the builtin id() is clobbered.
count = 0
record_columns = ["id", "values", "metadata"]

single_records = []  # documents embedded in one piece (original id and metadata kept)
chunk_records = []   # one record per chunk of a document that had to be split

for row_label, row in docs_df.iterrows():
    doc_id = row['id']
    doc_text = row['metadata']['text']

    embeddings, texts = get_embeddings(doc_text)

    if len(embeddings) > 1:
        print(len(embeddings), len(texts))
        print()
        # Chunk ids are "<id>-1", "<id>-2", ...; the un-chunked document is not kept.
        for chunk_number, (embedding, chunk) in enumerate(zip(embeddings, texts), start=1):
            chunk_records.append(
                {"id": f"{doc_id}-{chunk_number}", "values": embedding, "metadata": {'text': chunk}}
            )
    else:
        single_records.append(
            {"id": doc_id, "values": embeddings[0], "metadata": row['metadata']}
        )

    print(f"{row_label}: id: {doc_id}, num_of_embeddings: {len(embeddings)}")
    count += 1
    if count % 500 == 0:
        # Periodic checkpoint of everything embedded so far.
        pd.DataFrame(single_records + chunk_records, columns=record_columns).to_csv(
            'docs_df_local.csv', index=False
        )

# Chunk rows on their own (inspected in the next cell) ...
docs_add_df = pd.DataFrame(chunk_records, columns=record_columns)
# ... and the final frame: single-embedding documents first, then all chunk rows.
docs_df = pd.DataFrame(single_records + chunk_records, columns=record_columns)


In [None]:
docs_add_df

In [None]:
docs_df 


In [None]:
docs_df

In [None]:
import json
docs_df['metadata'] = docs_df['metadata'].apply(json.dumps)
docs_df['values'] = docs_df['values'].apply(json.dumps)

In [None]:
docs_df

In [None]:
# Save DataFrame to CSV
docs_df.to_csv('data.csv', index=False)

In [None]:
# Read DataFrame from CSV
df = pd.read_csv('data.csv')

In [None]:
df = df.rename(columns={'embedding': 'values'})

In [None]:
df

In [None]:
df.dropna(subset=['values'], inplace=True)

In [None]:
import ast
# Decode the JSON strings read from the CSV back into Python objects.
# Both columns were serialised with json.dumps, so both are parsed with json.loads
# (relies on the `json` import from the cell that did the encoding).
df['metadata'] = df['metadata'].apply(json.loads)
df['values'] = df['values'].apply(lambda x: [float(i) for i in json.loads(x)])




In [None]:
df['metadata'][0]['text']

In [None]:
df

In [None]:
def text_chunks(iterable, batch_size=100):
    """Yield consecutive tuples of at most `batch_size` items from `iterable`.

    Args:
        iterable: Any iterable; it is consumed lazily, one batch at a time.
        batch_size: Maximum number of items per yielded tuple.

    Yields:
        Tuples of items in their original order; the last tuple may be shorter.
        Nothing is yielded for an empty iterable.
    """
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))

    # (A leftover debug print of the function object itself was removed here.)
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

In [None]:
# initialize without metadata filter
from llama_index.core import StorageContext

if "OPENAI_API_KEY" not in os.environ:
    raise EnvironmentError("Environment variable OPENAI_API_KEY is not set")

# Fetch the Pinecone index handle explicitly. At this point in the notebook the name
# `index` does not refer to a Pinecone index (the Pinecone handle is only created in a
# later cell), so it cannot be passed to PineconeVectorStore.
pinecone_index = Pinecone(api_key=os.environ["PINECONE_API_KEY"]).Index("quickstart")

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Use `documents_objects`, loaded unconditionally in this section with the same reader
# arguments; `documents` only exists if an earlier cell took its "create" branch.
# NOTE(review): this indexes the documents through LlamaIndex in addition to the manual
# upsert further down — confirm both ingestion paths are intended.
index = VectorStoreIndex.from_documents(
    documents_objects, storage_context=storage_context
)

In [None]:
def chunks(iterable, batch_size=100):
    """Yield successive tuples of at most `batch_size` items taken from `iterable`."""
    source = iter(iterable)
    # Two-argument iter(): keep pulling batches until the slicer returns the empty tuple.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())

In [None]:
# One upsert record per DataFrame row, built by walking the three columns in parallel.
data = [
    {"id": doc_id, "values": vector, "metadata": meta}
    for doc_id, vector, meta in zip(df['id'], df['values'], df['metadata'])
]


In [None]:
data

In [None]:
api_key = os.environ["PINECONE_API_KEY"]

pc = Pinecone(api_key=api_key)

In [None]:
index = pc.Index("quickstart")

In [None]:
df


In [None]:
type(data[0]['values'])

In [None]:
df

In [None]:
# Upsert data with 100 vectors per upsert request
# Upsert data with 100 vectors per upsert request.
# Only a one-line summary is printed per batch: printing the batch itself dumps every
# full embedding vector into the cell output.
for batch_number, ids_vectors_chunk in enumerate(chunks(data, batch_size=100), start=1):
    print(f"Upserting batch {batch_number}: {len(ids_vectors_chunk)} vectors")
    index.upsert(vectors=ids_vectors_chunk)

# Retrieving from Pinecone

In [14]:
from pinecone import Pinecone, ServerlessSpec
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from IPython.display import Markdown, display
import itertools

api_key = os.environ["PINECONE_API_KEY"]

pc = Pinecone(api_key=api_key)

index = pc.Index("quickstart")


In [15]:
# initialize without metadata filter
from llama_index.core import StorageContext

if "OPENAI_API_KEY" not in os.environ:
    raise EnvironmentError("Environment variable OPENAI_API_KEY is not set")

# Ask the client for the Pinecone handle here instead of reading the name `index`:
# this cell rebinds `index` to a VectorStoreIndex below, so on a second run `index`
# would no longer be a Pinecone index and the vector store could not be built from it.
pinecone_index = pc.Index("quickstart")

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Attach to the vectors already stored in Pinecone (nothing is re-embedded here).
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, storage_context=storage_context
)

In [16]:
query_engine = index.as_query_engine()

In [20]:
response = query_engine.query("Kui palju õppekavasid on Tartu ülikoolis")
print(response)

Üle 160 õppekava


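# NB! Run the cells below at your own risk: they add the `ut_ee` folder to the current collection and permanently delete `docs_openai2_collection`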

In [None]:
add_docs_to_collection("ut_ee", collection)

In [None]:
client.delete_collection("docs_openai2_collection")