In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# This should work, but this would require you to recreate a TFIDF matrix for every query, which is not efficient
def retrieve_documents(query, documents, num_results=5):
    """Return the documents most similar to ``query`` under TF-IDF cosine similarity.

    A fresh ``TfidfVectorizer`` is fitted on ``documents`` plus the query on
    every call, so the cost grows with the corpus size for each query.

    Args:
        query: The search string.
        documents: Sequence of document strings to rank.
        num_results: Maximum number of documents to return.

    Returns:
        Up to ``num_results`` documents, ordered from most to least similar.
        An empty list when ``documents`` is empty or ``num_results`` is not
        positive.
    """
    # Nothing to rank (or nothing requested): avoid fitting on a query-only
    # corpus and avoid the ``[-0:]`` slice that would return everything.
    if not documents or num_results <= 0:
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(list(documents) + [query])

    # The last row is the query; every other row is a document.
    # cosine_similarity returns a 2-D array of shape (1, n_docs), so flatten it
    # before ranking -- otherwise the slicing below acts on rows, not scores.
    # Magnitude usually doesn't matter for text vectors, only orientation does,
    # hence cosine similarity.
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).ravel()

    # argsort is ascending: take the last ``num_results`` entries, best first.
    top_indices = scores.argsort()[-num_results:][::-1]
    return [documents[i] for i in top_indices]

In [3]:
# I can use ChromaDB, since ChromaDB has their own built in vector similarity/search feature
class DocumentDB:
    """Small convenience wrapper around a ChromaDB client.

    Documents are stored under string ids taken from a single running counter
    (``num_docs``), so ids are unique across every collection managed by this
    instance.
    """

    def __init__(self):
        """Create the underlying client and start the id counter at zero."""
        self.db = chromadb.Client()
        self.num_docs = 0

    def add_collection(self, name):
        """Make sure a collection called ``name`` exists (created if missing)."""
        self.db.get_or_create_collection(name)

    def add_documents(self, collection, docs):
        """Add ``docs`` to the existing collection named ``collection``.

        Each document gets the next consecutive integer id, stringified. The
        counter only advances after the add call returns.
        """
        first_id = self.num_docs
        doc_ids = [str(n) for n in range(first_id, first_id + len(docs))]
        target = self.db.get_collection(collection)
        target.add(documents=docs, ids=doc_ids)
        self.num_docs = first_id + len(docs)

    def retrieve_documents(self, collection, prompt, num_documents=3):
        """Query ``collection`` with ``prompt`` and return the raw query result.

        ``num_documents`` is forwarded as the number of results requested.
        """
        target = self.db.get_collection(collection)
        return target.query(query_texts=prompt, n_results=num_documents)

In [4]:
# Build the wrapper and make sure the "Test" collection exists before any
# documents are added to it.
test_db = DocumentDB()
test_db.add_collection("Test")


In [5]:

# Toy corpus: five short passages about Roblox, stored in the "Test" collection.
docs = [
    "Roblox (/ˈroʊblɒks/ ROH-bloks) is an online game platform and game creation system developed by Roblox Corporation that allows users to program and play games created by themselves or other users.",
    "Roblox was created by David Baszucki and Erik Cassel in 2004 and released in 2006, the platform hosts user-created games of multiple genres coded in the programming language Lua.",
    "Roblox is free to play, with in-game purchases available through a virtual currency called Robux. As of August 2020, Roblox had over 164 million monthly active users, including more than half of all American children under 16.[12][13] Although Roblox has received generally positive reviews from critics, it has faced criticism for its moderation, microtransactions, and allegations of exploitative practices toward children",
    "Roblox allows players to create their own games using its proprietary engine, Roblox Studio, which can then be played by other users.[14] Games, officially referred to as 'experiences' on the platform, are made with a derivative of the language Lua named Luau.[15][16] Users are able to create purchasable content through one-time purchases, known as 'game passes', as well as microtransactions which can be purchased more than once, known as 'developer products' or 'products'.[17][18] The majority of games produced using Roblox Studio are developed by minors, and a total of 20 million games a year are produced using it",
    "Roblox allows players to buy, sell, and create virtual items which can be used to decorate their virtual character that serves as their avatar on the platform.[12] Previously, only Roblox administrators had the ability to sell accessories, body parts, gear, and packages under the official Roblox user account,[20] with virtual hats and accessories also being able to be published by a select few users with past experience working with Roblox Corporation.[21][22]",
]

test_db.add_documents("Test", docs)

In [6]:
question = "Who created Roblox?"
# Fetch the single closest passage. NOTE(review): the collection was created
# without a distance setting, so the reported `distances` use Chroma's default
# metric (squared L2 per the Chroma docs), not cosine -- confirm if it matters.
res = test_db.retrieve_documents("Test", question, num_documents=1)

In [7]:
print(res)
# Retrieval itself is very fast, but the first run is slow: installing the
# embedding model for ChromaDB took about 1 minute.
# After that, adding and searching documents is effectively instantaneous.

{'ids': [['1']], 'distances': [[0.38205447793006897]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['Roblox was created by David Baszucki and Erik Cassel in 2004 and released in 2006, the platform hosts user-created games of multiple genres coded in the programming language Lua.']], 'uris': None, 'data': None}


In [8]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the quantized GGUF weights from the Hugging Face Hub and get the
# local file path to hand to llama.cpp.
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
model_file = "mistral-7b-instruct-v0.2.Q2_K.gguf"
model_path = hf_hub_download(model_name, filename=model_file)

llm = Llama(
    model_path=model_path,
    n_ctx=2048,
)

# `res["documents"][0]` holds the passages retrieved for the first (only) query.
context = "\n".join(res["documents"][0])
# Reuse `question` instead of repeating its text, so the prompt always matches
# what was actually retrieved for. Named `prompt_text` rather than `input` to
# avoid shadowing the Python builtin.
prompt_text = f"Here is some context:{context} Q: {question} A: "
output = llm(
    prompt=prompt_text,
    max_tokens=100,
    stop=["Q:"],  # stop before the model starts inventing a follow-up question
    echo=False,
)
print(output)
# took 35.8 seconds to generate the output given 1 document (about a sentence long) as context
# can def do better prompt engineering to get a better result, but it works pretty well

print(output['choices'])