### About 
### Developing a retrieval system using semantic vector search.

### Import necessary packages and libraries

In [18]:
pip install faiss-cpu sentence-transformers -qq

Note: you may need to restart the kernel to use updated packages.


In [19]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from tqdm import tqdm

from sentence_transformers import SentenceTransformer, util
from sentence_transformers import CrossEncoder
import faiss
from sklearn.metrics.pairwise import cosine_similarity



### Load data

In [20]:

# Labelled data: one Query/Response pair per row; later used as the retrieval corpus.
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/secure-offline-rag-system/Matter_Dataset_v3-1728292563373.csv"
)
# Unlabelled data: a trustii_id and the Query that has to be answered.
test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/secure-offline-rag-system/Matter_Dataset_v3-1728292563373 (1).csv"
)


### Train and test info

#### Train data

In [6]:
# Summarise the training frame: column dtypes and non-null counts per column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2424 entries, 0 to 2423
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Query     2418 non-null   object
 1   Response  2418 non-null   object
dtypes: object(2)
memory usage: 38.0+ KB


#### The training dataset has 2,424 QnR pairs. The training dataset also has some duplicate rows.

#### Duplicate rows

In [7]:
# Show every copy of a duplicated row (keep=False flags all occurrences, not only the repeats).
train.loc[train.duplicated(keep=False)]

Unnamed: 0,Query,Response
3,What is the subject key identifier of the DAC?,The subject key identifier of the DAC is 96:C2...
48,Is a ProductID value present in either the sub...,"No, a ProductID value is not present in either..."
88,What does the EventTrigger field indicate in t...,The EventTrigger field indicates the test or t...
216,What is the purpose of the TestEventTrigger co...,The TestEventTrigger command provides a means ...
219,How should applications handle copying and pas...,Applications presenting the Manual Pairing Cod...
...,...,...
2274,What does the deobfuscated output (M) represent?,The deobfuscated output (M) represents the dec...
2328,What does the deobfuscated output (M) represent?,The deobfuscated output (M) represents the dec...
2369,How many VendorID values are present in the su...,There are no VendorID values present in the su...
2399,How should Commissioners restart from step 2 i...,They can either immediately expire the fail-sa...


#### Test data

In [21]:
# Summarise the test frame: column dtypes and non-null counts per column.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   trustii_id  1040 non-null   int64 
 1   Query       1039 non-null   object
dtypes: int64(1), object(1)
memory usage: 16.4+ KB


#### The test dataset has 1,040 queries that need to be answered based on the training dataset information.

### Missing values

#### Train data

In [24]:
# Number of missing values per column in the training frame.
train.isna().sum()

Query       6
Response    6
dtype: int64

#### The train dataset has 6 missing QnR pairs.

#### Test data

In [25]:
# Number of missing values per column in the test frame.
test.isna().sum()

trustii_id    0
Query         1
dtype: int64

#### The test dataset has 1 missing query.

### Let's drop rows with missing values.

#### Train data

In [26]:
# Drop rows that lack either field. Both columns are lower-cased and embedded
# together as a (query, response) pair further down, so a row with only one of
# them present cannot be indexed. The index is reset so that row positions stay
# contiguous (0..n-1) for positional lookups later on.
train = train.dropna(subset=['Query', 'Response']).reset_index(drop=True)



#### Fill missing row values with an empty string in the test dataset.

In [27]:
# Keep every test row; a missing value is replaced by an empty string rather than dropped.
test = test.fillna(value="")

### Create the list of query and response

In [28]:
# Lower-cased (query, response) tuples, one per training row, in row order.
data = list(zip(train['Query'].str.lower(), train['Response'].str.lower()))
    

In [29]:
# Peek at the first five (query, response) tuples.
data[:5]

[('when should ota requestors invoke the applyupdaterequest command?',
  'ota requestors should invoke the applyupdaterequest command once they are ready to apply a previously downloaded software image (section applyupdaterequest command).'),
 ("what does the 'wifisecuritybitmap' data type encode?",
  "the 'wifisecuritybitmap' data type encodes the supported wi-fi security types present in the security field of the wifiinterfacescanresultstruct."),
 ('what is the minimal requirement for subscription path?',
  'defined in section 2.11.2.2, “subscribe interaction limits”.'),
 ('what is the subject key identifier of the dac?',
  'the subject key identifier of the dac is 96:c2:d9:24:94:ea:97:85:c0:d1:67:08:e3:88:f1:c0:91:ea:0f:d5.'),
 ('what is matter?',
  'matter is the foundation for connected things, serving as an industry-unifying standard for reliable, seamless, and secure connectivity across smart home devices and ecosystems.')]

### Create embeddings for the QnR pairs using a Sentence Transformers dense vector model.

In [31]:
# Name of the bi-encoder checkpoint; it is also embedded in the output CSV filename.
bi_encoder_name = "multi-qa-MiniLM-L6-cos-v1"

### Sentence transformer

In [32]:
# Load the checkpoint named by `bi_encoder_name` rather than repeating the literal,
# so the loaded model and the name written into the output filename cannot drift apart.
bi_encoder_model = SentenceTransformer(bi_encoder_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Encode the QnR pairs

In [33]:
# Encode every (query, response) tuple in `data`. `convert_to_numpy=True` already
# yields a NumPy array, so no additional np.array(...) copy is needed.
embeddings = bi_encoder_model.encode(data, convert_to_numpy=True)

Batches:   0%|          | 0/76 [00:00<?, ?it/s]

### FAISS: Hierarchical Navigable Small Worlds

#### Add embeddings to a FAISS HNSW index.

In [56]:
def add_to_faiss(embed_vector, m=32, ef_construction=40, ef_search=50):
    """Build a FAISS HNSW (flat storage) index over the given embeddings.

    Args:
        embed_vector: 2-D array-like of shape (n_vectors, dim).
        m: Number of bi-directional links per node in the HNSW graph.
        ef_construction: Candidate-list size used while building the graph.
        ef_search: Candidate-list size used at query time.

    Returns:
        The populated ``faiss.IndexHNSWFlat`` index.
    """
    # FAISS expects C-contiguous float32 input.
    vectors = np.ascontiguousarray(embed_vector, dtype=np.float32)
    d = vectors.shape[1]  # Dimensionality of vectors

    index = faiss.IndexHNSWFlat(d, m)
    # The HNSW parameters are exposed as `efConstruction` / `efSearch`. Assigning
    # to other attribute names (e.g. `ef_search`) does not reach the underlying
    # index, which would leave the library defaults in effect.
    index.hnsw.efConstruction = ef_construction  # must be set before vectors are added
    index.hnsw.efSearch = ef_search

    index.add(vectors)
    print("Faiss Index Created")
    return index

    

In [57]:
# Build the HNSW index over the corpus embeddings.
index = add_to_faiss(embed_vector=embeddings)

Faiss Index Created


### Retrieval with re-ranking

### Retrieve the top 10 results using the dense embeddings, then create query-response pairs from these results, and use the cross-encoder model to re-rank them.

In [58]:
# Cross-encoder checkpoint name (without the hub organisation prefix); it is also
# embedded in the output CSV filename.
cross_encode_name = "ms-marco-MiniLM-L-12-v2"
# Derive the hub id from the name above so the loaded model and the filename stay in sync.
cross_encoder = CrossEncoder(f"cross-encoder/{cross_encode_name}")
    
def cross_enc_rerank(query,retrieved_docs):
    """Return the candidate document that the cross-encoder scores highest for `query`.

    Args:
        query: The query string.
        retrieved_docs: Sequence (list or 1-D array) of candidate documents.

    Returns:
        The element of `retrieved_docs` with the highest cross-encoder score, or
        an empty string when there are no candidates. On equal scores the
        earliest candidate wins.
    """
    # Nothing to rank: return the same "no answer" value the prediction loop uses.
    if len(retrieved_docs) == 0:
        return ""

    # Prepare the query-document pairs
    pairs = [(query, doc) for doc in retrieved_docs]

    # One relevance score per pair
    reranked_scores = cross_encoder.predict(pairs, show_progress_bar=False)

    # Only the best candidate is needed, so take the arg-max instead of a full sort.
    top_rank_index = int(np.argmax(reranked_scores))

    return retrieved_docs[top_rank_index]
    


In [60]:
def retrieve_response(faiss_index, query, df, k=10):
    """Retrieve the best response for `query` via dense search plus cross-encoder re-ranking.

    Args:
        faiss_index: Index exposing ``search(embeddings, k)``.
        query: The query string.
        df: DataFrame with a 'Response' column whose row positions correspond to
            the ids returned by `faiss_index`.
        k: Number of nearest neighbours to fetch before re-ranking.

    Returns:
        The top re-ranked response, or an empty string if the index returned no hits.
    """
    query_embedding = bi_encoder_model.encode([query],show_progress_bar=False)

    # Search for the top-k most similar documents
    _, indices = faiss_index.search(query_embedding, k)

    # FAISS pads missing results with the label -1; `iloc[-1]` would silently
    # select the last row of `df`, so such placeholders are discarded.
    hit_positions = [int(i) for i in indices[0] if i >= 0]
    if not hit_positions:
        return ""

    # Retrieve top-k documents
    retrieved_docs = df.iloc[hit_positions]['Response'].values

    return cross_enc_rerank(query,retrieved_docs)


    

### Sample retrieval

In [85]:
# Pick one training query to sanity-check the retrieval pipeline.
train['Query'][14]

'What is the purpose of the Time Synchronization Cluster?'

In [86]:
# Retrieve and re-rank for the sample training query.
retrieve_response(faiss_index=index, query=train['Query'][14], df=train)

'The Time Synchronization Cluster provides attributes for reading a Node’s current time and allows Administrators to set the current time, time zone, and daylight savings time (DST) settings.'

### Test prediction

In [72]:
def predict_test_data(df):
    """Answer every query in ``df['Query']`` using the module-level index and training frame.

    Returns a list with one entry per query, in order; a falsy retrieval result
    is recorded as an empty string.
    """
    answers = []
    for question in tqdm(df['Query']):
        answer = retrieve_response(index, question, train)
        answers.append(answer if answer else "")
    return answers
    
    

In [81]:
# First query of the test set.
test['Query'][0]

'What qualities are defined in tables describing attributes, commands, etc.?'

In [82]:
# Smoke-test the prediction loop on the first test row only.
predict_test_data(test.iloc[:1])

100%|██████████| 1/1 [00:00<00:00,  1.27it/s]


['Command fields are defined in a table with columns for ID, Name, Type, Constraint, Quality, Default, and Conformance.']

### Add results to test dataframe

In [None]:
# Run retrieval + re-ranking for every test query and store the answers in a new 'Response' column.
test['Response']  = predict_test_data(test)

In [None]:
# Write the predictions to a CSV whose name records both model names; the row index is omitted.
test.to_csv(
    f"faiss_sentence_transformer_{bi_encoder_name}_{cross_encode_name}_hnsw.csv",
    index=False,
)