# Binary Quantization

# Importing all the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import os,sys
sys.path.append('../')
from sentence_transformers import SentenceTransformer
from src.model_evaluation import evaluation
from src.other_function import functions

# Creating the evaluation object

In [None]:
# Instance of the project's `evaluation` class; later cells call its
# `get_top_code_and_docstring` and `check_response` methods.
eval_object=evaluation()

# Creating the helper-functions object

In [None]:
# Instance of the project's `functions` helper class.
# NOTE(review): presumably provides the table helpers (`add_column`,
# `reorder_columns`) used when building the response table - TODO confirm.
func_object=functions()

# Loading the data

In [17]:
# Read the processed dataset; the path is relative to the notebook's folder.
df = pd.read_csv("..//Data/processed_data.csv")

# Plain Python list holding the `tokenized_docstring` column, one entry per row of `df`.
list_data = df['tokenized_docstring'].to_list()

# Loading the testing data

In [18]:
# Evaluation queries; the response loop reads the `Questions` column of this frame.
queries=pd.read_csv("..//Data/query.csv")

# Loading the model to be quantized


In [2]:
# Load the `all-mpnet-base-v2` sentence-transformers checkpoint. It is passed to
# `eval_object.get_top_code_and_docstring` together with each query.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


  _torch_pytree._register_pytree_node(


# Loading the embeddings saved in local drive

In [3]:
# Load the precomputed embeddings from disk instead of re-encoding the corpus.
# NOTE(review): going by the file name these were produced with all-mpnet-base-v2,
# presumably one row per entry of `list_data` - TODO confirm.
embeddings=np.load("..//embeddings/embeddings_all_mpnet_base_v2.npy")

# Applying binary quantization to the embeddings

In [4]:
# Sign-based quantization: every component >= 0 becomes 1; everything else
# (negative values, and NaN because the comparison is False) becomes -1.
# Passing int8 scalars makes np.where produce an int8 array directly, so no
# default-integer intermediate (8x larger on 64-bit platforms) is allocated
# and then copied down.
binary_embeddings = np.where(embeddings >= 0, np.int8(1), np.int8(-1))

# Report the element dtype and the memory footprint of the WHOLE array.
# `nbytes` is the total over all vectors, and each component still occupies
# one full byte (int8), not one bit.
print("Data type:", binary_embeddings.dtype)
print("Size in bytes:", binary_embeddings.nbytes)


Data type: int8
Size in bytes: 87462912


In [7]:
# Convert the total byte count to megabytes (1024 * 1024 bytes each) for readability.
size_in_mb = binary_embeddings.nbytes / (1024 * 1024)
print(f"Size of binary embeddings is {size_in_mb:.2f} in MB")

Size of binary embeddings is 83.41 in MB


# Generating responses for the test queries

In [22]:
# Column order of the final results frame.
column_order = ['Query', 'Docstrings', 'Code', 'Match']

# Collect one table per query and concatenate once at the end; calling
# pd.concat inside the loop re-copies every previously accumulated row on each
# iteration. The empty frame goes first so the result starts with the columns
# above even when there are no queries.
per_query_tables = [pd.DataFrame(columns=column_order)]

for query in queries['Questions']:
    # Top-matching docstrings/code for this query, retrieved against the
    # binary-quantized embeddings.
    table = eval_object.get_top_code_and_docstring(query, binary_embeddings, model, list_data, df)

    # NOTE(review): `add_column` / `reorder_columns` were called as bare names
    # that are never defined or imported in this notebook (NameError on a fresh
    # kernel). They are presumably methods of the otherwise unused
    # `func_object` - confirm against src/other_function.py.
    # First call attaches the query text, second call adds the 'Match' column.
    table = func_object.add_column(table, query)
    table = func_object.add_column(table, 'Match')

    # Reorder the columns for easier reading.
    table = func_object.reorder_columns(table)

    for index, row in table.iterrows():
        # Ask the judge whether this retrieved code answers the query.
        response = eval_object.check_response(query, row['Code'])

        # Collapse the free-form reply to a YES/NO label for this row.
        table.at[index, 'Match'] = 'YES' if 'YES' in response.content else 'NO'

    per_query_tables.append(table)

# One row per (query, retrieved code) pair across all queries.
binary_emb_query_response = pd.concat(per_query_tables, ignore_index=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# MAP for binary quantized embedding

In [36]:
# Count how many retrieved rows were labelled YES and how many NO.
match_labels = binary_emb_query_response['Match']
total_yes_model_3 = match_labels.eq('YES').sum()
total_no_model_3 = match_labels.eq('NO').sum()

# Percentage of YES rows among all YES/NO rows.
# NOTE(review): this ratio ignores the rank of each hit, so it is a
# precision-style score; confirm that the "MAP@10" label is intended.
yes_percentage = (total_yes_model_3 / (total_yes_model_3 + total_no_model_3)) * 100
print(f"MAP@10 (mean average precision) of model_3 : {yes_percentage}")


MAP@10 (mean average precision) of model_3 : 63.859649122807014
