In [1]:
import torch
import torch.nn as nn
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
)
from IPython.display import Markdown
from huggingface_hub import notebook_login
from datasets import load_dataset
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import chromadb
import numpy as np

In [2]:
# Load PTB dataset (Penn Treebank)
dataset = load_dataset("ptb_text_only")

documents = []
for item in dataset["train"]:
    # For the "ptb_text_only" config, the text is typically in item["sentence"]
    text = item["sentence"]
    documents.append(text)

# Use the recursive character splitter
recur_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=60, separators=["\n\n", "\n", "\.", " ", ""]
)

# Perform the splits using the splitter
data_splits = recur_splitter.split_text("".join(documents))
print("Number of splits:", len(data_splits))

  chunk_size=1000, chunk_overlap=60, separators=["\n\n", "\n", "\.", " ", ""]


Number of splits: 5293


In [3]:
# Set the model id to load the model from HuggingFace
model_id = "meta-llama/Llama-3.2-3B"  # context length of 262k
# While waiting access to Llama model, you can use the falcon model to run the code.
# model_id = "tiiuae/falcon-7b-instruct"

# Load the default tokenizer for the selected model
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the model from HuggingFace
llama3 = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
)

# Wrap the model and tokenizer into a text generation pipeline
hf_pipeline = pipeline(
    "text-generation",
    model=llama3,
    tokenizer=tokenizer,
    max_new_tokens=50,
    min_new_tokens=30,
    temperature=0.1,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps


In [4]:
# Define the PDF loader
pdf_loader = PyMuPDFLoader("llama2.pdf")

# Load data from the pdf
pages = pdf_loader.load()

# Observe number of pages loaded
print("Number of pages loaded: {} \n".format(len(pages)))

Markdown(pages[10].page_content)

Number of pages loaded: 77 



Dataset
Num. of
Comparisons
Avg. # Turns
per Dialogue
Avg. # Tokens
per Example
Avg. # Tokens
in Prompt
Avg. # Tokens
in Response
Anthropic Helpful
122,387
3.0
251.5
17.7
88.4
Anthropic Harmless
43,966
3.0
152.5
15.7
46.4
OpenAI Summarize
176,625
1.0
371.1
336.0
35.1
OpenAI WebGPT
13,333
1.0
237.2
48.3
188.9
StackExchange
1,038,480
1.0
440.2
200.1
240.2
Stanford SHP
74,882
1.0
338.3
199.5
138.8
Synthetic GPT-J
33,139
1.0
123.3
13.0
110.3
Meta (Safety & Helpfulness)
1,418,091
3.9
798.5
31.4
234.1
Total
2,919,326
1.6
595.7
108.2
216.9
Table 6: Statistics of human preference data for reward modeling. We list both the open-source and
internally collected human preference data used for reward modeling. Note that a binary human preference
comparison contains 2 responses (chosen and rejected) sharing the same prompt (and previous dialogue).
Each example consists of a prompt (including previous dialogue if available) and a response, which is the
input of the reward model. We report the number of comparisons, the average number of turns per dialogue,
the average number of tokens per example, per prompt and per response. More details on Meta helpfulness
and safety data per batch can be found in Appendix A.3.1.
knows. This prevents cases where, for instance, the two models would have an information mismatch, which
could result in favoring hallucinations. The model architecture and hyper-parameters are identical to those
of the pretrained language models, except that the classification head for next-token prediction is replaced
with a regression head for outputting a scalar reward.
Training Objectives.
To train the reward model, we convert our collected pairwise human preference data
into a binary ranking label format (i.e., chosen & rejected) and enforce the chosen response to have a higher
score than its counterpart. We used a binary ranking loss consistent with Ouyang et al. (2022):
Lranking = −log(σ(rθ(x, yc) −rθ(x, yr)))
(1)
where rθ(x, y) is the scalar score output for prompt x and completion y with model weights θ. yc is the
preferred response that annotators choose and yr is the rejected counterpart.
Built on top of this binary ranking loss, we further modify it separately for better helpfulness and safety
reward models as follows. Given that our preference ratings is decomposed as a scale of four points (e.g.,
significantly better), as presented in Section 3.2.1, it can be useful to leverage this information to explicitly
teach the reward model to assign more discrepant scores to the generations that have more differences. To
do so, we further add a margin component in the loss:
Lranking = −log(σ(rθ(x, yc) −rθ(x, yr) −m(r)))
(2)
where the margin m(r) is a discrete function of the preference rating. Naturally, we use a large margin
for pairs with distinct responses, and a smaller one for those with similar responses (shown in Table 27).
We found this margin component can improve Helpfulness reward model accuracy especially on samples
where two responses are more separable. More detailed ablation and analysis can be found in Table 28 in
Appendix A.3.3.
Data Composition.
We combine our newly collected data with existing open-source preference datasets
to form a larger training dataset. Initially, open-source datasets were used to bootstrap our reward models
while we were in the process of collecting preference annotation data. We note that in the context of RLHF in
this study, the role of reward signals is to learn human preference for Llama 2-Chat outputs rather than
any model outputs. However, in our experiments, we do not observe negative transfer from the open-source
preference datasets. Thus, we have decided to keep them in our data mixture, as they could enable better
generalization for the reward model and prevent reward hacking, i.e. Llama 2-Chat taking advantage of
some weaknesses of our reward, and so artificially inflating the score despite performing less well.
With training data available from different sources, we experimented with different mixing recipes for both
Helpfulness and Safety reward models to ascertain the best settings. After extensive experimentation, the
11

In [4]:
### Using embeddings by MPNET: https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
encode_kwargs = {"normalize_embeddings": False}
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  hf_embeddings = HuggingFaceEmbeddings(


In [5]:
# Run for the first time if you don't have the original vector db
# Define the location to persist data
persist_directory = "./vector_store/"
# Generate and store embeddings

# vectordb = Chroma.from_texts(
#     texts=data_splits,
#     embedding=hf_embeddings,
#     persist_directory=persist_directory,
# )
quantized_directory = "./quantized_vector_store/"
# quantized_vectordb = Chroma(
#     collection_name="quantized",
#     persist_directory=quantized_directory,
#     embedding_function=hf_embeddings,
# )

In [6]:
original_client = chromadb.PersistentClient(path="./vector_store")
org_collection = original_client.get_or_create_collection(
    name="langchain", metadata={"hnsw:space": "l2"}
)

original_data = org_collection.get(include=["embeddings", "documents"])
# print(original_data.keys())
# print(original_data["embeddings"])
original_embeddings = torch.Tensor(original_data["embeddings"])
# print(original_embeddings.shape)
original_ids = original_data["ids"]
original_documents = original_data["documents"]
print(len(original_ids))

5293


In [7]:
query = "the chains include bloomingdale 's owned by campeau corp. toronto saks fifth"
query_embedding = hf_embeddings.embed_query(query)

In [9]:
org_result = org_collection.query(
    query_embedding,
    n_results=4,
    include=["embeddings", "documents"],
)
print(org_result)

{'ids': [['f780bd65-c9aa-42b8-9f7c-57c4f9cd86f6', '6f13c270-3968-465f-80eb-8bf3c016db6f', '25ff322b-125f-4614-80b6-09479793f47d', '74f447f5-8d53-4101-bd17-0eacd21bacc4']], 'embeddings': [array([[ 0.02809857,  0.052302  ,  0.01488034, ..., -0.04275348,
        -0.02404324,  0.0008042 ],
       [ 0.05341762, -0.00395871,  0.00827139, ..., -0.04405971,
        -0.02083498, -0.01610375],
       [ 0.05040516,  0.00120951,  0.01073335, ..., -0.04083301,
        -0.04182988, -0.01017789],
       [-0.01398617,  0.074091  , -0.02127344, ..., -0.02011109,
        -0.02271994, -0.01761337]], shape=(4, 768))], 'documents': [["in very early stages stillbloomingdale 's is a <unk> chain acquired last year by campeau in its $ N billion acquisition of federatedbloomingdale 's does an estimated $ N billion in annual salesthe sale of bloomingdale 's is a condition of efforts by toronto-based olympia & york developments ltd. to arrange $ N million in bridge financing for campeau which disclosed last month

In [10]:
# Quantize the embeddings
quantized_client = chromadb.PersistentClient(path="./quantized_vector_store")
quantized_collection = quantized_client.get_or_create_collection(
    name="quantized", metadata={"hnsw:space": "l2"}
)
quantized_data = quantized_collection.get(include=["embeddings"])
quantized_embeddings = torch.Tensor(quantized_data["embeddings"])
print(f"Loaded {len(quantized_embeddings)} vectors from 'quantized_vectordb'.")

Loaded 0 vectors from 'quantized_vectordb'.


In [23]:
quantized_collection.delete(ids=original_ids)

In [13]:
def uniform_quantization(tensor: torch.Tensor, clip_val: torch.Tensor, bit):
    scale = (2 ** (bit - 1)) - 1
    tensor_q: torch.Tensor = tensor.clamp(-clip_val, clip_val) / clip_val * scale
    tensor_q = (tensor_q.round() - tensor_q).detach() + tensor_q  # STE 적용
    tensor_q_int = tensor_q.to(torch.int8)
    # print(tensor_q_int)
    msb_2_bits = tensor_q_int & 0xC0
    mid_2_bits = tensor_q_int & 0x30
    mid2_2_bits = tensor_q_int & 0x0C
    lsb_4_bits = tensor_q_int & 0x03
    # print(msb_2_bits, mid_2_bits, mid2_2_bits, lsb_4_bits)
    msb_2_bits_scaled = msb_2_bits / scale * clip_val
    mid_2_bits_scaled = mid_2_bits / scale * clip_val
    mid2_2_bits_scaled = mid2_2_bits / scale * clip_val
    lsb_4_bits_scaled = lsb_4_bits / scale * clip_val
    # print(msb_2_bits_scaled, mid_2_bits_scaled, mid2_2_bits_scaled, lsb_4_bits_scaled)
    # return msb_2_bits_scaled, mid_2_bits_scaled, mid2_2_bits_scaled, lsb_4_bits_scaled
    return tensor_q_int / scale * clip_val

In [14]:
def noise_inject_tensor(weight_tensor: torch.Tensor, std: torch.Tensor, typ: bool):
    device = weight_tensor.device
    std = std.to(device)
    if typ:
        std_reshaped = std.view(-1, 1) if std.dim() == 1 else std
        adjusted_noise = 1.0 + std_reshaped * torch.randn_like(weight_tensor)
    else:
        adjusted_noise = 1.0 + std * torch.randn_like(weight_tensor)
    return torch.mul(weight_tensor, adjusted_noise).to(device)

In [15]:
quantized_collection.add(
    embeddings=uniform_quantization(original_embeddings, 1.0, 8).numpy(),
    ids=original_ids,
    documents=original_documents,
)

In [16]:
# Quantize the embeddings
qn_client = chromadb.PersistentClient(path="./quantized_noise_vector_store")
qn_collection = quantized_client.get_or_create_collection(
    name="quantized_noise", metadata={"hnsw:space": "l2"}
)
qn_data = qn_collection.get(include=["embeddings"])
qn_embeddings = torch.Tensor(qn_data["embeddings"])
print(f"Loaded {len(qn_embeddings)} vectors from 'quantized_noise_vectordb'.")

Loaded 0 vectors from 'quantized_noise_vectordb'.


In [17]:
qn_collection.add(
    ids=original_ids,
    documents=original_documents,
    embeddings=noise_inject_tensor(
        weight_tensor=uniform_quantization(
            tensor=original_embeddings,
            clip_val=1.0,
            bit=8,
        ),
        std=torch.Tensor([0.03]),
        typ=True,
    ).numpy(),
)

In [None]:
original_db: Chroma = Chroma(
    collection_name="langchain",
    persist_directory="./vector_store",
    embedding_function=hf_embeddings,
)
original_db.persist()
quantized_db: Chroma = Chroma(
    collection_name="quantized",
    persist_directory="./quantized_vector_store",
    embedding_function=hf_embeddings,
)
quantized_db.persist()

  original_db: Chroma = Chroma(
  original_db.persist()


In [18]:
query = "the chains include bloomingdale 's owned by campeau corp. toronto saks fifth"
query_embedding = hf_embeddings.embed_query(query)
q_query_embedding = uniform_quantization(torch.Tensor(query_embedding), 1.0, 8).numpy()

In [19]:
org_result = org_collection.query(
    query_embeddings=query_embedding,
    n_results=4,
    include=["embeddings", "documents", "distances"],
)
print(org_result.keys())
query_ids = org_result["ids"][0]
print(query_ids)
# query_docs = org_result["documents"][0]
# print(query_docs)
# query_dist = org_result["distances"][0]
# print(query_dist)
# query_emb = org_result["embeddings"]
q_result = quantized_collection.query(
    query_embeddings=q_query_embedding,
    n_results=4,
    include=["embeddings", "documents", "distances"],
)
print(q_result.keys())
q_query_ids = q_result["ids"][0]
# q_query_embs = q_result["embeddings"]
# print(q_query_embs)
# print(q_query_ids)


qn_result = qn_collection.query(
    query_embeddings=q_query_embedding,
    n_results=4,
    include=["embeddings", "documents", "distances"],
)
print(qn_result["ids"][0])
for i in range(4):
    org_emb = org_result["embeddings"][0][i]
    q_emb = q_result["embeddings"][0][i]
    qn_emb = qn_result["embeddings"][0][i]
    print(
        torch.norm(torch.Tensor(org_emb) - torch.Tensor(q_emb)),
        torch.norm(torch.Tensor(org_emb) - torch.Tensor(qn_emb)),
    )

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'distances', 'included'])
['f780bd65-c9aa-42b8-9f7c-57c4f9cd86f6', '6f13c270-3968-465f-80eb-8bf3c016db6f', '25ff322b-125f-4614-80b6-09479793f47d', '74f447f5-8d53-4101-bd17-0eacd21bacc4']
dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'distances', 'included'])
['f780bd65-c9aa-42b8-9f7c-57c4f9cd86f6', '6f13c270-3968-465f-80eb-8bf3c016db6f', '25ff322b-125f-4614-80b6-09479793f47d', '74f447f5-8d53-4101-bd17-0eacd21bacc4']
tensor(0.0634) tensor(0.0710)
tensor(0.0639) tensor(0.0695)
tensor(0.0628) tensor(0.0688)
tensor(0.0622) tensor(0.0692)
