In [4]:
import numpy as np
import requests
import os
import json, ast
import re
from transformers import AutoTokenizer
import uuid
from llama_index.embeddings import HuggingFaceEmbedding
from typing import Dict, List, Any
import torch
from transformers import AutoModel, AutoTokenizer

In [5]:
# Loaded (tokenizer, model) pairs keyed by their on-disk locations, so that
# repeated calls do not reload the weights from disk every time.
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(tokenizer_path, model_path):
    """Return the (tokenizer, model) pair for the given paths, loading each pair only once."""
    key = (tokenizer_path, model_path)
    if key not in _EMBEDDING_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        model = AutoModel.from_pretrained(model_path)
        _EMBEDDING_MODEL_CACHE[key] = (tokenizer, model)
    return _EMBEDDING_MODEL_CACHE[key]


def compute_embeddings(
    text,
    tokenizer_path="/Users/joesasson/Desktop/articles/rag-from-scratch/model/tokenizer",
    model_path="/Users/joesasson/Desktop/articles/rag-from-scratch/model/embedding",
):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Args:
        text: Input passed straight to the tokenizer (a string, or a list of
            strings for a batch).
        tokenizer_path: Directory the tokenizer is loaded from.
        model_path: Directory the embedding model is loaded from.

    Returns:
        The pooled embedding converted with ``.tolist()`` after ``.squeeze()``:
        a flat list of floats for a single text, a list of lists for a batch.

    Notes:
        The tokenizer/model pair is loaded once per path pair and reused, so a
        query no longer pays the model load cost on every call.
        Pooling is weighted by the attention mask so padding positions (which
        appear when a batch of unequal-length texts is padded) do not dilute
        the average. For a single un-padded text the mask is all ones and the
        result is the plain mean, up to floating-point rounding.
    """
    tokenizer, model = _load_embedding_model(tokenizer_path, model_path)

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Generate the embeddings without tracking gradients.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to the plain mean over dim 1.
        pooled = hidden_states.mean(dim=1)
    else:
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp guards against a division by zero for an all-padding row.
        pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return pooled.squeeze().tolist()

def compute_matches(vector_store, query_str, top_k):
    """Rank every stored chunk by cosine similarity to a query string.

    Args:
        vector_store: Nested mapping ``{doc_id: {chunk_id: embedding}}`` where
            each embedding is a sequence of numbers.
        query_str: The query text; it is embedded with ``compute_embeddings``.
        top_k: Maximum number of matches to return.

    Returns:
        A list of ``(doc_id, chunk_id, score)`` tuples sorted by descending
        score, truncated to ``top_k`` entries. A chunk (or query) whose
        embedding has zero norm scores ``0.0`` instead of dividing by zero.
    """
    # Embed the query once; its norm does not change across chunks, so it is
    # computed here rather than inside the loop.
    query_vec = np.array(compute_embeddings(query_str))
    query_norm = np.linalg.norm(query_vec)

    scores = {}
    for doc_id, chunks in vector_store.items():
        for chunk_id, chunk_embedding in chunks.items():
            chunk_vec = np.array(chunk_embedding)
            chunk_norm = np.linalg.norm(chunk_vec)
            if query_norm == 0 or chunk_norm == 0:
                # Avoid division by zero
                score = 0.0
            else:
                score = np.dot(chunk_vec, query_vec) / (query_norm * chunk_norm)

            # Key by (document, chunk) so the caller can find the source text.
            scores[(doc_id, chunk_id)] = score

    # Sort scores and keep the top_k results
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
    return [(doc_id, chunk_id, score) for (doc_id, chunk_id), score in ranked]

def open_json(path):
    """Read the JSON file at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def retrieve_docs(doc_store, matches):
    """Return the stored entry for the best-ranked match.

    Only the first element of ``matches`` is consulted; its first two items
    are used as the document id and chunk id to index into ``doc_store``.
    """
    best = matches[0]
    return doc_store[best[0]][best[1]]

# Root of the project checkout. Set the RAG_PROJECT_DIR environment variable to
# run on another machine; the default keeps the original location.
PROJECT_DIR = os.environ.get(
    "RAG_PROJECT_DIR", "/Users/joesasson/Desktop/articles/rag-from-scratch"
)

# vec_store is iterated by compute_matches as {doc_id: {chunk_id: embedding}};
# doc_store is indexed by retrieve_docs with the same (doc_id, chunk_id) keys.
vec_store = open_json(os.path.join(PROJECT_DIR, "api", "vector_store.json"))
doc_store = open_json(os.path.join(PROJECT_DIR, "api", "doc_store.json"))

matches = compute_matches(vector_store=vec_store,
                query_str="Wall-mounted electric fireplace with realistic LED flames",
                top_k=3)

# Only the best-ranked match is turned into context for the prompt.
retrieved_docs = retrieve_docs(doc_store, matches)

print(retrieved_docs)

{'text': 'Wall-mounted electric fireplace with realistic LED flames and heat settings. Features a black glass frame and remote control for easy operation. Ideal for adding warmth and ambiance. Manufactured by Hearth & Home. Dimensions: 50"W x 6"D x 21"H.', 'metadata': {'file_name': 'ENT-4001'}}


In [4]:
from llama_cpp import Llama
import sys

def stream_and_buffer(base_prompt, llm, max_tokens=800, stop=["Q:", "\n"], echo=True, stream=True):
    """Run ``llm`` on a Q/A-formatted prompt and write the reply to stdout word by word.

    Args:
        base_prompt: Prompt text; it is wrapped as ``"Q: {base_prompt} A: "``.
        llm: Callable invoked as ``llm(prompt, max_tokens=..., stop=..., echo=...,
            stream=...)``. Each yielded item must expose its text at
            ``item['choices'][0]['text']``.
        max_tokens, stop, echo, stream: Forwarded to ``llm`` unchanged. The
            ``stop`` default is a shared list; it is never mutated here.

    Returns:
        None. The generated text is only written to ``sys.stdout``.

    Text is held back until a space is seen so that a word split across two
    streamed chunks is written in one piece.
    """
    # Formatting the base prompt
    formatted_prompt = f"Q: {base_prompt} A: "

    response = llm(formatted_prompt, max_tokens=max_tokens, stop=stop, echo=echo, stream=stream)

    # A non-streaming call returns one completion dict instead of an iterator
    # of chunks; iterating that dict would yield its string keys and fail on
    # the ['choices'] lookup. Wrap it so the loop below handles both shapes.
    if isinstance(response, dict):
        response = [response]

    pending = ""
    for message in response:
        pending += message['choices'][0]['text']

        # Everything up to and including the last space is complete words;
        # what follows may be the start of a word continued in the next chunk.
        complete, sep, remainder = pending.rpartition(' ')
        if sep:
            sys.stdout.write(complete + sep)
            sys.stdout.flush()  # Ensure it gets displayed immediately
            pending = remainder

    # Print any remaining content once the stream is exhausted
    if pending:
        sys.stdout.write(pending)
        sys.stdout.flush()

def construct_prompt(system_prompt, retrieved_docs, user_query):
    """Assemble the final prompt from the system prompt, the retrieved context and the query.

    The layout (including the four-space indentation of the context and query
    sections and the trailing indented line) is part of the returned string.
    """
    indent = "    "
    sections = [
        f"{system_prompt}",
        "",
        f"{indent}Here is the retrieved context:",
        f"{indent}{retrieved_docs}",
        "",
        f"{indent}Here is the users query:",
        f"{indent}{user_query}",
        indent,
    ]
    return "\n".join(sections)

# Usage
system_prompt = """
You are an intelligent search engine. You will be provided with some retrieved context, as well as the users query.

Your job is to understand the request, and answer based on the retrieved context.
"""

# retrieved_docs comes from the retrieval cell above, which must be run first.
prompt = construct_prompt(system_prompt=system_prompt,
                          retrieved_docs=retrieved_docs,
                          user_query="I am looking for a wall-mounted electric fireplace with realistic LED flames")

# Location of the GGUF model file. Set the LLAMA_MODEL_PATH environment variable
# to run on another machine; the default keeps the original location.
MODEL_PATH = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/joesasson/Downloads/mistral-7b-instruct-v0.2.Q3_K_L.gguf",
)

llm = Llama(model_path=MODEL_PATH, n_gpu_layers=1)

# Stream the model's answer to stdout
stream_and_buffer(base_prompt=prompt, llm=llm)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/joesasson/Downloads/mistral-7b-instruct-v0.2.Q3_K_L.gguf (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q3_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q3_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q3_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q5_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q3_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q3_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q5_K     [ 1

 Based on the retrieved context, there appears to be an electric fireplace that fits your description. The product mentioned in the context has realistic LED flames and can be mounted on a wall. It also comes with heat settings and a remote control for easy operation. Additionally, the dimensions provided match those you've specified (50"W x 6"D x 21"H). The manufacturer is Hearth & Home.


llama_print_timings:        load time =  9182.07 ms
llama_print_timings:      sample time =    67.88 ms /    90 runs   (    0.75 ms per token,  1325.91 tokens per second)
llama_print_timings: prompt eval time =  9181.92 ms /   179 tokens (   51.30 ms per token,    19.49 tokens per second)
llama_print_timings:        eval time =  5517.81 ms /    89 runs   (   62.00 ms per token,    16.13 tokens per second)
llama_print_timings:       total time = 15070.34 ms
