In [None]:
# Install the notebook's dependencies with %pip so they land in the running kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases - consider pinning.
%pip install huggingface_hub --quiet
%pip install transformers --quiet
%pip install torch torchvision torchaudio --quiet
%pip install --upgrade pip --quiet
%pip install tensorflow --quiet
%pip install python-dotenv --quiet
%pip install --upgrade --upgrade-strategy eager "optimum[openvino]" --quiet
%pip install tf-keras --quiet
%pip install sentence-transformers --quiet
%pip install langchain_community --quiet
%pip install langchain_openai --quiet
%pip install pypdf --quiet
%pip install chromadb --quiet
%pip install langchain-chroma --quiet

In [None]:
import shutil
import logging
import os
import gc
from pathlib import Path
from dotenv import load_dotenv

from huggingface_hub import login, whoami
from optimum.intel import OVQuantizer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import nncf

# Restrict NNCF logging to the ERROR level.
nncf.set_log_level(logging.ERROR)

# Load settings from a .env file (presumably in the working directory - confirm) into os.environ.
load_dotenv(verbose=True)
# Cache directory shared by the model export/load cells; raises KeyError when CACHE_DIR is not set.
cache_dir = os.environ['CACHE_DIR']

In [None]:
def prepare_model(model_vendor, model_id, group_size:int, ratio:float, int4_mode:str='SYM', generate_fp16:bool=True, generate_int8:bool=True, generate_int4:bool=True, cache_dir='./cache'):
    """Export a Hugging Face causal LM to OpenVINO IR in FP16, INT8 and INT4 variants.

    The IR files are written to ``<model_id>/FP16``, ``<model_id>/INT8`` and
    ``<model_id>/INT4`` relative to the current working directory. A variant is
    skipped when its flag is False or when its ``openvino_model.xml`` already
    exists. The INT8 and INT4 variants are both derived from the FP16 IR.

    Args:
        model_vendor: Hub namespace; combined with ``model_id`` as ``vendor/id``.
        model_id: Model name on the hub; also used as the local output directory.
        group_size: Forwarded to ``nncf.compress_weights`` for the INT4 variant.
        ratio: Forwarded to ``nncf.compress_weights`` for the INT4 variant.
        int4_mode: ``'ASYM'`` selects ``INT4_ASYM``; any other value selects ``INT4_SYM``.
        generate_fp16: Build the FP16 IR when it does not exist yet.
        generate_int8: Build the INT8 IR when it does not exist yet.
        generate_int4: Build the INT4 IR when it does not exist yet.
        cache_dir: Passed as ``cache_dir`` and as the ``CACHE_DIR`` ov_config entry
            to the optimum loaders.

    Raises:
        FileNotFoundError: An INT8/INT4 build was requested but the FP16 IR it is
            derived from is missing.
    """
    pt_model_id = f'{model_vendor}/{model_id}'
    ov_model_file_name = 'openvino_model.xml'

    fp16_model_dir = Path(model_id) / "FP16"
    int8_model_dir = Path(model_id) / "INT8"
    int4_model_dir = Path(model_id) / "INT4"
    fp16_model_file = fp16_model_dir / ov_model_file_name

    def _should_generate(enabled, model_dir, label):
        """Return True when the variant must be built; otherwise print why it is skipped."""
        if not enabled:
            print(f'\n** Skip generation of {label} IR model (disabled by caller)')
            return False
        if (model_dir / ov_model_file_name).exists():
            print(f'\n** Skip generation of {label} IR model (directory already exists)')
            return False
        return True

    def _require_fp16(label):
        """Fail early with a clear message when the FP16 source IR is missing."""
        if not fp16_model_file.exists():
            raise FileNotFoundError(
                f'Cannot generate the {label} IR model: FP16 IR not found at {fp16_model_file}. '
                'Run prepare_model with generate_fp16=True first.'
            )

    print(f'** Preparing model : {pt_model_id}')

    # FP16: export the original checkpoint and store it in half precision.
    if _should_generate(generate_fp16, fp16_model_dir, 'FP16'):
        print('\n** Generating an FP16 IR model')
        ov_model = OVModelForCausalLM.from_pretrained(pt_model_id, export=True, compile=False, cache_dir=cache_dir, ov_config={'CACHE_DIR':cache_dir})
        ov_model.half()
        ov_model.save_pretrained(fp16_model_dir)
        # Release the model before the next stage loads another copy.
        del ov_model
        gc.collect()

    # INT8: weight-only quantization of the FP16 IR.
    if _should_generate(generate_int8, int8_model_dir, 'INT8'):
        _require_fp16('INT8')
        print('\n** Generating an INT8 IR model')
        ov_model = OVModelForCausalLM.from_pretrained(fp16_model_dir, compile=False, cache_dir=cache_dir, ov_config={'CACHE_DIR':cache_dir})
        quantizer = OVQuantizer.from_pretrained(ov_model, cache_dir=cache_dir)
        # NOTE(review): newer optimum-intel releases may expect an OVConfig with a weight
        # quantization config instead of `weights_only` - confirm against the installed version.
        quantizer.quantize(save_directory=int8_model_dir, weights_only=True)
        del quantizer
        del ov_model
        gc.collect()

    # INT4: compress the FP16 IR weights with NNCF.
    if _should_generate(generate_int4, int4_model_dir, 'INT4'):
        _require_fp16('INT4')
        print(f'\n** Generating an INT4_{int4_mode} IR model')
        int4_model_dir.mkdir(parents=True, exist_ok=True)
        # Read the raw IR directly; an optimum wrapper is not needed for NNCF compression.
        ov_model = ov.Core().read_model(fp16_model_file)
        # The IR alone carries no model config, so copy it next to the compressed IR.
        shutil.copy(fp16_model_dir / 'config.json', int4_model_dir / 'config.json')
        comp_mode = nncf.CompressWeightsMode.INT4_ASYM if int4_mode=='ASYM' else nncf.CompressWeightsMode.INT4_SYM
        compressed_model = nncf.compress_weights(ov_model, mode=comp_mode, ratio=ratio, group_size=group_size)
        ov.save_model(compressed_model, int4_model_dir / ov_model_file_name)
        del ov_model
        del compressed_model
        gc.collect()

In [None]:
# hf_token = hf_xxxxxx

In [None]:
print('*** LLM model downloader')
try:
    # whoami() raising OSError is treated as "no usable Hugging Face token yet".
    whoami()
    print('Authorization token already provided')
except OSError:
    print('The Llama 3.2 model is a gated model.')
    print('You need to login to HuggingFace hub to download the model.')
    login()

# Runs only after authentication succeeded. With the previous `finally:` the export
# was attempted even when login() raised, hiding the real error.
# cache_dir comes from CACHE_DIR so the export uses the same cache as the loading cells.
prepare_model('meta-llama', 'Llama-3.2-1B', group_size=128, ratio=0.8, cache_dir=cache_dir)

In [None]:
from langchain.document_loaders import  DirectoryLoader, PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
import os
import shutil

# Endpoint and API key used by OpenAIEmbeddings.
# setdefault keeps values already supplied by the environment or .env file; the literals
# below are only fallbacks and should not be replaced with a real credential in the notebook.
os.environ.setdefault("OPENAI_API_BASE", 'http://10.35.151.101:8001/v1')
os.environ.setdefault("OPENAI_API_KEY", "sk-1234")
# Directory in which the Chroma vector store is persisted.
CHROMA_PATH = "docs_embedding"

In [None]:
def load_documents(docs_dir: str = 'docs'):
  """Load the PDF files found in a directory.

  Args:
    docs_dir: Directory handed to PyPDFDirectoryLoader; defaults to 'docs'.

  Returns:
    Whatever ``PyPDFDirectoryLoader.load()`` returns - a list of Document
    objects (presumably one per PDF page; confirm against the loader docs).
  """
  document_loader = PyPDFDirectoryLoader(docs_dir)
  return document_loader.load()

In [None]:
def split_text(documents: list[Document], chunk_size: int = 500, chunk_overlap: int = 100):
  """Split documents into overlapping chunks and print a preview of the first one.

  Args:
    documents: Documents to split.
    chunk_size: Maximum chunk length, measured with ``len`` (characters).
    chunk_overlap: Number of characters shared by consecutive chunks.

  Returns:
    The list of chunks produced by the splitter; empty when there is nothing to split.
  """
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, # Size of each chunk in characters
    chunk_overlap=chunk_overlap, # Overlap between consecutive chunks
    length_function=len, # Function to compute the length of the text
    add_start_index=True, # Flag to add start index to each chunk
  )

  chunks = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

  # Preview the first chunk; the guard avoids an IndexError when no chunk was produced.
  if chunks:
    document = chunks[0]
    print(document.page_content)
    print(document.metadata)

  return chunks

In [None]:
def save_to_chroma(chunks: list[Document]):
  """Embed the chunks and store them in a fresh Chroma database at CHROMA_PATH.

  An existing directory at CHROMA_PATH is deleted first, so every call rebuilds
  the store from scratch.

  Args:
    chunks: Document chunks to embed with OpenAIEmbeddings and store.
  """
  # Remove a previous store so old embeddings are not mixed with the new ones.
  if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

  # Build the store unconditionally. It used to be nested under the `if` above,
  # so nothing was written when the directory did not exist yet.
  db = Chroma.from_documents(
    chunks,
    OpenAIEmbeddings(),
    persist_directory=CHROMA_PATH
  )

  # Not every Chroma wrapper exposes persist(); call it only when it is available.
  persist = getattr(db, "persist", None)
  if callable(persist):
    persist()
  print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [None]:
def generate_data_store():
  """Run the ingestion pipeline: load the documents, split them into chunks, store the chunks in Chroma."""
  chunks = split_text(load_documents())
  save_to_chroma(chunks)

# Build the vector store now; this loads, splits and embeds the documents via the helpers above.
print('*** Converting documents into embeddings and creating a vector store(s)')
generate_data_store()

In [None]:
from langchain import hub
# Pull the "rlm/rag-prompt" prompt from the LangChain hub.
# `prompt` is later passed to RetrievalQA through chain_type_kwargs, so this name must keep
# referring to the pulled prompt object when that cell runs.
prompt = hub.pull("rlm/rag-prompt", api_url="https://api.hub.langchain.com")

INT4

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_chroma import Chroma


# Settings read from the environment (.env); a missing key raises KeyError.
# NOTE(review): neither value is used by the store below, which is opened from CHROMA_PATH
# with OpenAI embeddings - confirm whether these settings are still needed.
vectorstore_dir   = os.environ['VECTOR_DB_DIR']
embeddings_model  = os.environ['MODEL_EMBEDDINGS']


embeddings = OpenAIEmbeddings()
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
 - -
Answer the question based on the above context: {question}
"""


# Open the store at CHROMA_PATH and expose it as a retriever.
vectorstore = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
retriever = vectorstore.as_retriever()
# To drop weak matches, pass search_type='similarity_score_threshold' and
# search_kwargs={'score_threshold': 0.8, 'k': 4} to as_retriever().

# Report the directory that was actually opened (previously this printed VECTOR_DB_DIR).
print(f'** Vector store : {CHROMA_PATH}')

In [None]:
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA


# Build a RetrievalQA chain whose model directory, precision, device, token budget and
# chain type all come from environment variables (a missing key raises KeyError).
# NOTE(review): `model_id` is assigned but not referenced in this cell.
model_id = 'Llama-3.2-1B'
model_name        = os.environ['MODEL_NAME']
model_precision   = os.environ['MODEL_PRECISION']
inference_device  = os.environ['INFERENCE_DEVICE']
# Options forwarded to the OpenVINO model via ov_config.
ov_config         = {"PERFORMANCE_HINT":"LATENCY", "NUM_STREAMS":"1", "CACHE_DIR":cache_dir}
num_max_tokens    = int(os.environ['NUM_MAX_TOKENS'])
rag_chain_type    = os.environ['RAG_CHAIN_TYPE']

# The tokenizer is loaded by hub id, while the model is loaded from the local IR directory.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
ov_model_path = f'./{model_name}/{model_precision}'
model = OVModelForCausalLM.from_pretrained(model_id=ov_model_path, device=inference_device, ov_config=ov_config, cache_dir=cache_dir)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=num_max_tokens)
llm = HuggingFacePipeline(pipeline=pipe)
# Uses the `retriever` created in the vector store cell.
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type=rag_chain_type, retriever=retriever)

In [None]:
# Ask one question through the current `qa_chain` and print the answer.
text_user_en = "Who administrates the Promotions?"
ans = qa_chain.run(text_user_en)
print(ans)

In [None]:
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA


# Rebuild the RetrievalQA chain on the INT4 IR with a 200-token generation limit.
# This rebinds the shared notebook names (model, pipe, llm, qa_chain, ...).
# NOTE(review): `model_id` is assigned but not referenced in this cell.
model_id = 'Llama-3.2-1B'
model_name        = os.environ['MODEL_NAME']
model_precision   = "INT4"
inference_device  = os.environ['INFERENCE_DEVICE']
# Options forwarded to the OpenVINO model via ov_config.
ov_config         = {"PERFORMANCE_HINT":"LATENCY", "NUM_STREAMS":"1", "CACHE_DIR":cache_dir}
num_max_tokens    = 200
rag_chain_type    = os.environ['RAG_CHAIN_TYPE']

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
# Local IR directory: ./<MODEL_NAME>/INT4
ov_model_path = f'./{model_name}/{model_precision}'
model = OVModelForCausalLM.from_pretrained(model_id=ov_model_path, device=inference_device, ov_config=ov_config, cache_dir=cache_dir)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=num_max_tokens)
llm = HuggingFacePipeline(pipeline=pipe)
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type=rag_chain_type, retriever=retriever)

In [None]:
# Ask one question through the current `qa_chain` and print the answer.
text_user_en = "What is the promotion policy for non teaching staff?"
ans = qa_chain.run(text_user_en)
print(ans)

INT8

In [None]:
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA


# Rebuild the RetrievalQA chain on the INT8 IR with a 100-token generation limit.
# This rebinds the shared notebook names (model, pipe, llm, qa_chain, ...).
# NOTE(review): `model_id` is assigned but not referenced in this cell.
model_id = 'Llama-3.2-1B'
model_name        = os.environ['MODEL_NAME']
model_precision   = "INT8"
inference_device  = os.environ['INFERENCE_DEVICE']
# Options forwarded to the OpenVINO model via ov_config.
ov_config         = {"PERFORMANCE_HINT":"LATENCY", "NUM_STREAMS":"1", "CACHE_DIR":cache_dir}
num_max_tokens    = 100
rag_chain_type    = os.environ['RAG_CHAIN_TYPE']

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
# Local IR directory: ./<MODEL_NAME>/INT8
ov_model_path = f'./{model_name}/{model_precision}'
model = OVModelForCausalLM.from_pretrained(model_id=ov_model_path, device=inference_device, ov_config=ov_config, cache_dir=cache_dir)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=num_max_tokens)
llm = HuggingFacePipeline(pipeline=pipe)
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type=rag_chain_type, retriever=retriever)

In [None]:

# Ask one question through the current `qa_chain` and print the answer.
text_user_en = "Can you tell me about the staff referal incentive policy"
ans = qa_chain.run(text_user_en)
print(ans)

In [None]:
# Manual retrieval: fetch the three best-scoring chunks for the query from the persisted store.
query_text = "What is the promotion policy for non teaching staff?"
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
results = db.similarity_search_with_relevance_scores(query_text, k=3)

# Only a warning: the context below is still assembled from whatever was returned.
if not results or results[0][1] < 0.7:
    print("Unable to find matching results.")
context_text = "\n\n - -\n\n".join(doc.page_content for doc, _score in results)

In [None]:
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate


# Answer `query_text` by filling PROMPT_TEMPLATE with the manually retrieved `context_text`
# and sending the rendered prompt straight to the INT8 model (no RetrievalQA chain involved).
model_id = 'Llama-3.2-1B'
model_name        = os.environ['MODEL_NAME']
model_precision   = "INT8"
inference_device  = os.environ['INFERENCE_DEVICE']
# Options forwarded to the OpenVINO model via ov_config.
ov_config         = {"PERFORMANCE_HINT":"LATENCY", "NUM_STREAMS":"1", "CACHE_DIR":cache_dir}
num_max_tokens    = 200
rag_chain_type    = os.environ['RAG_CHAIN_TYPE']
PROMPT_TEMPLATE   = """
Answer the question based only on the following context:
{context}
 - -
Answer the question based on the above context: {question}
"""

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
ov_model_path = f'./{model_name}/{model_precision}'
model = OVModelForCausalLM.from_pretrained(model_id=ov_model_path, device=inference_device, ov_config=ov_config, cache_dir=cache_dir)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=num_max_tokens)
llm = HuggingFacePipeline(pipeline=pipe)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
# Keep the rendered text under its own name. `prompt` holds the hub prompt object that a
# later cell passes to RetrievalQA, so it must not be overwritten with a plain string.
prompt_text = prompt_template.format(context=context_text, question=query_text)
# Equivalent runnable form: chain.invoke({"context": context_text, "question": query_text})
chain = prompt_template | llm
# invoke() replaces the deprecated predict().
response_text = llm.invoke(prompt_text)

In [None]:

# Re-run the retrieval QA chain. The cell above does not build a chain, so `qa_chain` and
# `text_user_en` are whatever earlier cells left in the kernel.
ans = qa_chain.run(text_user_en)
print(ans)

In [None]:
# Show the model's reply to the manually rendered prompt.
print(response_text)

FP16

In [None]:
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA


# Rebuild the RetrievalQA chain on the FP16 IR with a 200-token generation limit.
# This rebinds the shared notebook names (model, pipe, llm, qa_chain, ...).
# NOTE(review): `model_id` is assigned but not referenced in this cell.
model_id = 'Llama-3.2-1B'
model_name        = os.environ['MODEL_NAME']
model_precision   = "FP16"
inference_device  = os.environ['INFERENCE_DEVICE']
# Options forwarded to the OpenVINO model via ov_config.
ov_config         = {"PERFORMANCE_HINT":"LATENCY", "NUM_STREAMS":"1", "CACHE_DIR":cache_dir}
num_max_tokens    = 200
rag_chain_type    = os.environ['RAG_CHAIN_TYPE']

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
# Local IR directory: ./<MODEL_NAME>/FP16
ov_model_path = f'./{model_name}/{model_precision}'
model = OVModelForCausalLM.from_pretrained(model_id=ov_model_path, device=inference_device, ov_config=ov_config, cache_dir=cache_dir)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=num_max_tokens)
llm = HuggingFacePipeline(pipeline=pipe)
# Unlike the earlier chains, this one is given a custom prompt.
# NOTE(review): presumably `prompt` must be a prompt template object (e.g. the one pulled from
# the hub), not a formatted string - confirm what `prompt` refers to when this cell runs.
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type=rag_chain_type, retriever=retriever, chain_type_kwargs={"prompt": prompt})

In [None]:
# Ask one question through the current `qa_chain` and print the answer.
text_user_en = "Can you tell me about the Extension of Probation Period"
ans = qa_chain.run(text_user_en)
print(ans)

In [None]:

# Ask one question through the current `qa_chain` and print the answer.
text_user_en = "Can you tell me about the staff referal incentive policy"
ans = qa_chain.run(text_user_en)
print(ans)

In [None]:
# Ask one question through the current `qa_chain` and print the answer.
text_user_en = "Can you tell me about the leave policy?"
ans = qa_chain.run(text_user_en)
print(ans)

In [None]:
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load and compress model from Hugging Face
# Standalone demo: exports the hub checkpoint on the fly (export=True) and requests 8-bit
# loading (load_in_8bit=True). It rebinds `model_id`, `model`, `tokenizer`, `pipe` and
# `results`, which earlier cells also use.
model_id = "unsloth/Llama-3.2-1B-Instruct"
model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_8bit=True)

# Inference
tokenizer = AutoTokenizer.from_pretrained(model_id)
# No max_new_tokens is given, so the generation length is left to the pipeline's defaults.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
phrase = "The weather is"
results = pipe(phrase)
print(results)

In [None]:
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load and compress model from Hugging Face
# NOTE(review): unlike the previous demo cell, load_in_8bit is not passed here, so whether
# any compression is applied depends on the library default - confirm.
model_id = "unsloth/Llama-3.2-1B-Instruct"
model = OVModelForCausalLM.from_pretrained(model_id, export=True)

# Inference
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Generation is capped at 512 new tokens.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
phrase = "The weather is"
results = pipe(phrase)
print(results)