In [None]:
%pip install llama-index
%pip install llama-index-embeddings-huggingface
%pip install 'accelerate>=0.26.0'
%pip install einops

In [None]:
# loading/embedding&indexing/storing
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex

# retrieve stage
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- embedding / chunking configuration -------------------------------------
# Any embedding model from the HF hub can be used here
# (https://huggingface.co/spaces/mteb/leaderboard).
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
# EMBED_MODEL_NAME = "thenlper/gte-large"  # alternative model
CHUNK_SIZE = 256
CHUNK_OVERLAP = 25
DOCS_DIR = "./PrimeTime"

Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)
# No LLM on the LlamaIndex side (it reports falling back to MockLLM);
# this notebook only uses LlamaIndex for retrieval.
Settings.llm = None
Settings.chunk_overlap = CHUNK_OVERLAP
Settings.chunk_size = CHUNK_SIZE

# --- load documents ----------------------------------------------------------
docs_reader = SimpleDirectoryReader(DOCS_DIR)
primetime = docs_reader.load_data()

# --- embed + index -----------------------------------------------------------
index = VectorStoreIndex.from_documents(primetime, show_progress=True)
# See https://docs.llamaindex.ai/en/stable/understanding/storing/storing/ for
# persisting the index locally, so it doesn't have to be rebuilt on every run.

LLM is explicitly disabled. Using MockLLM.


In [33]:
# number of most-similar chunks to retrieve per query
top_k = 10

# retriever: asks the vector index for the top_k most similar chunks
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

# response synthesizer with the library defaults
response_synthesizer = get_response_synthesizer()

# post-processing step configured with a similarity cutoff of 0.7
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.7)

# query engine wiring retriever -> post-processor -> synthesizer together
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)

In [42]:
# Query the indexed documents and collect the retrieved chunks into one
# "Context:" string that can be pasted into an LLM prompt.
query = "get relationships between input delay and hold analysis"
response = query_engine.query(query)

# Iterate over the nodes that were actually returned instead of assuming
# exactly `top_k` of them: the query engine is configured with a similarity
# cutoff, so fewer than `top_k` nodes may come back, and indexing
# source_nodes[i] for i in range(top_k) would then raise IndexError.
context = "Context:\n" + "".join(
    node.text + "\n\n" for node in response.source_nodes
)
print(len(context))

8026


In [12]:
# load fine-tuned model from hub


###model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True, safe_serialization=True)

# load tokenizer
#tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
###tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
# Hugging Face Hub id of the causal LM used for generation
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# target device: "cuda" for GPU usage or "cpu" for CPU usage
device = "cpu"

# tokenizer and model weights both come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and instead do
# `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = model.to(device)

In [21]:
# Ask the model a question without any retrieved context.
# Input format: a chat conversation given as a list of {"role", "content"} dicts.
def message_template(x):
    """Wrap the user text `x` as a single-message chat conversation."""
    return [{"role": "user", "content": x}]


messages = message_template("what does -add_delay do in set_output_delay command in PrimeTime")
# add_generation_prompt=True appends the assistant-turn header to the rendered
# prompt, so generation starts a new assistant reply rather than continuing
# from the end of the user turn.
input_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
# sample up to 50 new tokens
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)

In [45]:
# Ask the model a question with the retrieved `context` string appended to it.
message_w_context = f"what will happen to hold analysis when I set the input delay to 0? plase use the context below for your reference: {context}"


def message_template_w_context(x):
    """Wrap the user text `x` as a single-message chat conversation."""
    return [{"role": "user", "content": x}]


messages_w_context = message_template_w_context(message_w_context)
# add_generation_prompt=True appends the assistant-turn header to the rendered
# prompt, so generation starts a new assistant reply rather than continuing
# from the end of the (long) user turn.
input_text = tokenizer.apply_chat_template(
    messages_w_context, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
# sample up to 500 new tokens
outputs = model.generate(inputs, max_new_tokens=500, temperature=0.2, do_sample=True)

In [46]:
# Save the decoded token sequence `outputs[0]` to ./response.txt.
# The encoding is pinned to UTF-8: without it open() uses the platform's
# default encoding, and writing generated text that contains characters
# outside that encoding raises UnicodeEncodeError.
with open("./response.txt", "w", encoding="utf-8") as output:
    output.write(tokenizer.decode(outputs[0]))

In [33]:
# prompt (with context)
def prompt_template_w_context(context, comment):
    """Render the [INST]...[/INST] prompt for one comment.

    Layout: the ShawGPT persona paragraph, a blank line, `context`, the
    instruction sentence, a blank line, `comment`, and the closing [/INST]
    tag followed by a newline.
    """
    persona = (
        "[INST]ShawGPT, functioning as a virtual data science consultant on YouTube, "
        "communicates in clear, accessible language, escalating to technical depth upon request. "
        "It reacts to feedback aptly and ends responses with its signature '–ShawGPT'. "
        "ShawGPT will tailor the length of its responses to match the viewer's comment, "
        "providing concise acknowledgments to brief expressions of gratitude or feedback, "
        "thus keeping the interaction natural and engaging."
    )
    instruction = "Please respond to the following comment. Use the context above if it is helpful."
    return (
        f"{persona}\n\n"
        f"{context}\n"
        f"{instruction}\n\n"
        f"{comment}\n"
        "[/INST]\n"
    )