## Upload the sample_corpus zip file with name 'sample_corpus'
Note: Make sure it is a zip file.

In [1]:
from google.colab import files
import zipfile
import os

# Upload ZIP file
uploaded = files.upload()  # Manually select sample_corpus.zip

# Extract ZIP file
zip_file = "sample_corpus.zip"
if zip_file in uploaded:
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall("/content/")  # Extracts directly inside /content/
    print("Extracted 'sample_corpus' successfully!")

# Verify extraction
print("Extracted folder structure:")
os.system("ls -R /content/sample_corpus")  # Lists all files & subfolders.

Saving sample_corpus.zip to sample_corpus.zip
Extracted 'sample_corpus' successfully!
Extracted folder structure:


0

## Upload the sample_benchmarks zip file with name 'sample_benchmarks' following the same structure as legalBench data.

Note: Make sure it is a zip file.

In [2]:
from google.colab import files
import zipfile
import os

# Upload ZIP file
uploaded = files.upload()  # Manually select sample_corpus.zip

# Extract ZIP file
zip_file = "sample_benchmarks.zip"
if zip_file in uploaded:
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall("/content/")  # Extracts directly inside /content/
    print("Extracted 'sample_benchmarks' successfully!")

# Verify extraction
print("Extracted folder structure:")
os.system("ls -R /content/sample_benchmarks")

Saving sample_benchmarks.zip to sample_benchmarks.zip
Extracted 'sample_benchmarks' successfully!
Extracted folder structure:


0

## Chunking and Embedding Test Files for Retrievalfor colab.

Code cell below processed the text documents stored in the sampl_corpus folder located in content folder in google colab, splits them using the given chunking method and generates embeddings using SentenceTransformer models. These processed chunks and their corresponding embeddings are then stored in json files following the same folder structure as the original sample corpus folder for convenience.these chunked documents are stored in the 'sample_corpus_chunked' folder in the '/content' folder. These json files will be used for similarity search and reranking.

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("Shitao/RetroMAE")
model = AutoModelForMaskedLM.from_pretrained("Shitao/RetroMAE")

tokenizer_config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [4]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

#  Define Model Storage
supported_models = {
    "sbert": "sentence-transformers/all-mpnet-base-v2",
    "distilled_sbert": "sentence-transformers/all-MiniLM-L6-v2",
    "gte_large": "thenlper/gte-large",
    "retromae": "Shitao/RetroMAE"
}

#  Load Model
def load_embedding_model(model_name):
    return SentenceTransformer(supported_models[model_name])

#  Chunking Function
def naive_chunking(text, chunk_size, overlap):
    """Splits text into overlapping chunks"""
    start = 0
    chunks = []
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

def recursive_chunking(text, chunk_size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap,
                                            separators=["\n\n", "\n", "!", "?", ".", ":", ";", ",", " ", ""])
    return splitter.split_text(text)


#  Batch Embedding Function
def get_embeddings(chunks, model_name, batch_size):
    model = load_embedding_model(model_name)
    embeddings = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        batch_embeddings = model.encode(batch, convert_to_numpy=True)
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

#  Save Chunks to JSON
def save_chunks_to_json(chunks, file_name, output_folder, sub_folder):
    """Save chunked text in structured JSON format."""

    output_path = os.path.join(output_folder, sub_folder)
    os.makedirs(output_path, exist_ok=True)  # Ensure subdirectory exists

    json_file_path = os.path.join(output_path, f"{file_name}.json")

    # Save JSON File
    with open(json_file_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, indent=2)

    print(f"Saved: {json_file_path}")

# Main Function to Process Text Files
def process_text_file(chunk_size, overlap, batch_size, chunking_method, model_name):
    folder_path = "/content/sample_corpus"  # Google Colab Path
    output_folder = "/content/sample_corpus_chunked"  # Output Directory

    if not os.path.exists(folder_path):
        print(f"Folder '{folder_path}' does not exist!")
        return

    sub_folders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]

    print("Subfolders:", sub_folders)
    chunk_id = 0

    for sub in sub_folders:
        sub_folder_path = os.path.join(folder_path, sub)
        text_files = [f for f in os.listdir(sub_folder_path) if f.endswith(".txt")]

        for filename in text_files:
            file_path = os.path.join(sub_folder_path, filename)
            print(f"\nProcessing file: {file_path}")

            try:
                with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                    text = file.read()

                # Generate Chunks
                if chunking_method=='naive':
                    chunks = naive_chunking(text, chunk_size, overlap)
                elif chunking_method == 'recursive':
                    chunks = recursive_chunking(text, chunk_size, overlap)

                # Add span information
                spans = []
                current_position = 0
                for chunk in chunks:
                    start = text.find(chunk, current_position)
                    end = start + len(chunk)
                    spans.append((start, end))
                    current_position = end

                print(f"Total chunks created: {len(chunks)}")

                # Compute Embeddings
                embeddings = get_embeddings(chunks, model_name, batch_size)
                print(f"Embeddings generated")

                # Store Chunks in JSON Format
                chunk_data = [{"chunk_id": chunk_id + i + 1, "text": chunk, "embedding": embedding.tolist(), "span": spans[i], "filepath": file_path}
                                for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))]

                chunk_id += len(chunks)
                file_base_name = filename.removesuffix(".txt")
                save_chunks_to_json(chunk_data, file_base_name, output_folder, sub)

            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    print("Processing completed!")

# Run
if __name__ == "__main__":
    chunk_size = 500
    overlap = 0
    batch_size = 32
    chunking_method =  'recursive' #select either 'naive' or 'recursive'.
    model_name = 'retromae'
    process_text_file(chunk_size, overlap, batch_size, chunking_method, model_name)

Subfolders: ['maud', 'cuad', 'contractnli', 'privacy_qa']

Processing file: /content/sample_corpus/maud/Cubic Corporation_Investment Group.pdf||Cubic_Corporation_Investment_Group_Amendment No.1.txt
Total chunks created: 1362


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Cubic Corporation_Investment Group.pdf||Cubic_Corporation_Investment_Group_Amendment No.1.json

Processing file: /content/sample_corpus/maud/Alexion Pharmaceuticals, Inc._AstraZeneca PLC.txt
Total chunks created: 1520


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Alexion Pharmaceuticals, Inc._AstraZeneca PLC.json

Processing file: /content/sample_corpus/maud/TIFFANY_&_CO._LVMH_MOE╠êT_HENNESSY-LOUIS_VUITTON.txt
Total chunks created: 1202


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/TIFFANY_&_CO._LVMH_MOE╠êT_HENNESSY-LOUIS_VUITTON.json

Processing file: /content/sample_corpus/maud/W_R_Grace_Co_40_North_Management_LLC.txt
Total chunks created: 1083


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/W_R_Grace_Co_40_North_Management_LLC.json

Processing file: /content/sample_corpus/maud/American_Renal_Associates_Holdings_IRC_Superman_Midco.txt
Total chunks created: 1164


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/American_Renal_Associates_Holdings_IRC_Superman_Midco.json

Processing file: /content/sample_corpus/maud/Telenav, Inc._Management Led Buyout.pdf||Telenav, Inc._Management Led Buyout Amendment No. 1.txt
Total chunks created: 984


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Telenav, Inc._Management Led Buyout.pdf||Telenav, Inc._Management Led Buyout Amendment No. 1.json

Processing file: /content/sample_corpus/maud/Inovalon_Holdings_Management_Led_Buyout.txt
Total chunks created: 1129


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Inovalon_Holdings_Management_Led_Buyout.json

Processing file: /content/sample_corpus/maud/HMS Holdings Corp._Veritas Capital.txt
Total chunks created: 1045


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/HMS Holdings Corp._Veritas Capital.json

Processing file: /content/sample_corpus/maud/Glu Mobile Inc._Electronic Arts Inc..txt
Total chunks created: 993


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Glu Mobile Inc._Electronic Arts Inc..json

Processing file: /content/sample_corpus/maud/Protective Insurance Corporation_The Progressive Corporation.txt
Total chunks created: 894


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Protective Insurance Corporation_The Progressive Corporation.json

Processing file: /content/sample_corpus/maud/Altabancorp_Glacier Bancorp, Inc..txt
Total chunks created: 744


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Altabancorp_Glacier Bancorp, Inc..json

Processing file: /content/sample_corpus/maud/Chiasma, Inc._Amryt Pharma plc.txt
Total chunks created: 1436


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Chiasma, Inc._Amryt Pharma plc.json

Processing file: /content/sample_corpus/maud/Prevail Therapeutics Inc._Eli Lilly and Company.txt
Total chunks created: 914


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Prevail Therapeutics Inc._Eli Lilly and Company.json

Processing file: /content/sample_corpus/maud/Bank of Commerce Holdings_Columbia Banking System, Inc..txt
Total chunks created: 925


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Bank of Commerce Holdings_Columbia Banking System, Inc..json

Processing file: /content/sample_corpus/maud/Endurance International Group Holdings, Inc._Clearlake Capital Group, L.P..txt
Total chunks created: 1161


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/Endurance International Group Holdings, Inc._Clearlake Capital Group, L.P..json

Processing file: /content/sample_corpus/maud/The Michaels Companies, Inc._Apollo Global Management, LLC.txt
Total chunks created: 1233


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/maud/The Michaels Companies, Inc._Apollo Global Management, LLC.json

Processing file: /content/sample_corpus/cuad/TomOnlineInc_20060501_20-F_EX-4.46_749700_EX-4.46_Co-Branding Agreement.txt
Total chunks created: 431


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/TomOnlineInc_20060501_20-F_EX-4.46_749700_EX-4.46_Co-Branding Agreement.json

Processing file: /content/sample_corpus/cuad/EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B(01)_525118_EX-10.B(01)_Content License Agreement.txt
Total chunks created: 101


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B(01)_525118_EX-10.B(01)_Content License Agreement.json

Processing file: /content/sample_corpus/cuad/IntegrityMediaInc_20010329_10-K405_EX-10.17_2373875_EX-10.17_Co-Branding Agreement.txt
Total chunks created: 100


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/IntegrityMediaInc_20010329_10-K405_EX-10.17_2373875_EX-10.17_Co-Branding Agreement.json

Processing file: /content/sample_corpus/cuad/GentechHoldingsInc_20190808_1-A_EX1A-6 MAT CTRCT_11776814_EX1A-6 MAT CTRCT_Distributor Agreement.txt
Total chunks created: 91


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/GentechHoldingsInc_20190808_1-A_EX1A-6 MAT CTRCT_11776814_EX1A-6 MAT CTRCT_Distributor Agreement.json

Processing file: /content/sample_corpus/cuad/GopageCorp_20140221_10-K_EX-10.1_8432966_EX-10.1_Content License Agreement.txt
Total chunks created: 132


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/GopageCorp_20140221_10-K_EX-10.1_8432966_EX-10.1_Content License Agreement.json

Processing file: /content/sample_corpus/cuad/CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.txt
Total chunks created: 107


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.json

Processing file: /content/sample_corpus/cuad/DeltathreeInc_19991102_S-1A_EX-10.19_6227850_EX-10.19_Co-Branding Agreement_ Service Agreement.txt
Total chunks created: 92


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/DeltathreeInc_19991102_S-1A_EX-10.19_6227850_EX-10.19_Co-Branding Agreement_ Service Agreement.json

Processing file: /content/sample_corpus/cuad/MusclepharmCorp_20170208_10-KA_EX-10.38_9893581_EX-10.38_Co-Branding Agreement.txt
Total chunks created: 297


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/cuad/MusclepharmCorp_20170208_10-KA_EX-10.38_9893581_EX-10.38_Co-Branding Agreement.json

Processing file: /content/sample_corpus/cuad/FuseMedicalInc_20190321_10-K_EX-10.43_11575454_EX-10.43_Distributor Agreement.txt
Total chunks created: 86


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/FuseMedicalInc_20190321_10-K_EX-10.43_11575454_EX-10.43_Distributor Agreement.json

Processing file: /content/sample_corpus/cuad/FulucaiProductionsLtd_20131223_10-Q_EX-10.9_8368347_EX-10.9_Content License Agreement.txt
Total chunks created: 64


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/FulucaiProductionsLtd_20131223_10-Q_EX-10.9_8368347_EX-10.9_Content License Agreement.json

Processing file: /content/sample_corpus/cuad/EtonPharmaceuticalsInc_20191114_10-Q_EX-10.1_11893941_EX-10.1_Development Agreement.txt
Total chunks created: 217


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/EtonPharmaceuticalsInc_20191114_10-Q_EX-10.1_11893941_EX-10.1_Development Agreement.json

Processing file: /content/sample_corpus/cuad/ReedsInc_20191113_10-Q_EX-10.4_11888303_EX-10.4_Development Agreement.txt
Total chunks created: 96


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/ReedsInc_20191113_10-Q_EX-10.4_11888303_EX-10.4_Development Agreement.json

Processing file: /content/sample_corpus/cuad/IdeanomicsInc_20160330_10-K_EX-10.26_9512211_EX-10.26_Content License Agreement.txt
Total chunks created: 141


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/IdeanomicsInc_20160330_10-K_EX-10.26_9512211_EX-10.26_Content License Agreement.json

Processing file: /content/sample_corpus/cuad/FuelcellEnergyInc_20191106_8-K_EX-10.1_11868007_EX-10.1_Development Agreement.txt
Total chunks created: 333


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated




Saved: /content/sample_corpus_chunked/cuad/FuelcellEnergyInc_20191106_8-K_EX-10.1_11868007_EX-10.1_Development Agreement.json

Processing file: /content/sample_corpus/cuad/ImineCorp_20180725_S-1_EX-10.5_11275970_EX-10.5_Distributor Agreement.txt
Total chunks created: 97


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/ImineCorp_20180725_S-1_EX-10.5_11275970_EX-10.5_Distributor Agreement.json

Processing file: /content/sample_corpus/cuad/ConformisInc_20191101_10-Q_EX-10.6_11861402_EX-10.6_Development Agreement.txt
Total chunks created: 197


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/ConformisInc_20191101_10-Q_EX-10.6_11861402_EX-10.6_Development Agreement.json

Processing file: /content/sample_corpus/cuad/EdietsComInc_20001030_10QSB_EX-10.4_2606646_EX-10.4_Co-Branding Agreement.txt
Total chunks created: 207


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/cuad/EdietsComInc_20001030_10QSB_EX-10.4_2606646_EX-10.4_Co-Branding Agreement.json

Processing file: /content/sample_corpus/contractnli/INFOMAGNET%20NDA.txt
Total chunks created: 28


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/INFOMAGNET%20NDA.json

Processing file: /content/sample_corpus/contractnli/DBT%20Mutual%20NDA.txt
Total chunks created: 24


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/DBT%20Mutual%20NDA.json

Processing file: /content/sample_corpus/contractnli/IPTK-CO-MutualNon-DisclosureAgreement.txt
Total chunks created: 48


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/IPTK-CO-MutualNon-DisclosureAgreement.json

Processing file: /content/sample_corpus/contractnli/Grindrod%20SA%20Confidentiality%20and%20Non-Disclosure%20Undertaking.txt
Total chunks created: 16


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/Grindrod%20SA%20Confidentiality%20and%20Non-Disclosure%20Undertaking.json

Processing file: /content/sample_corpus/contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt
Total chunks created: 44


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.json

Processing file: /content/sample_corpus/contractnli/Data Use Agreement New York City.txt
Total chunks created: 66


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/Data Use Agreement New York City.json

Processing file: /content/sample_corpus/contractnli/Evelozcity%20OESA%20NDA.txt
Total chunks created: 28


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/Evelozcity%20OESA%20NDA.json

Processing file: /content/sample_corpus/contractnli/Focus-Group-APIC-Seattle-Confidentiality-Agreement-031115.txt
Total chunks created: 8


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/Focus-Group-APIC-Seattle-Confidentiality-Agreement-031115.json

Processing file: /content/sample_corpus/contractnli/DoiT-ICN-NonDisclosure-Agreement.txt
Total chunks created: 11


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/DoiT-ICN-NonDisclosure-Agreement.json

Processing file: /content/sample_corpus/contractnli/HNBA-2017-18-Confidentiality-Agreement.txt
Total chunks created: 35


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/HNBA-2017-18-Confidentiality-Agreement.json

Processing file: /content/sample_corpus/contractnli/ExcelerateStandardNDAFormat.txt
Total chunks created: 33


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/ExcelerateStandardNDAFormat.json

Processing file: /content/sample_corpus/contractnli/Geheimhaltungsvereinbarung_Abschlussarbeiten_HFU_englisch.txt
Total chunks created: 14


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/Geheimhaltungsvereinbarung_Abschlussarbeiten_HFU_englisch.json

Processing file: /content/sample_corpus/contractnli/IBC-PMS-NDA-agreement.txt
Total chunks created: 66


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/IBC-PMS-NDA-agreement.json

Processing file: /content/sample_corpus/contractnli/epsteen_nda.txt
Total chunks created: 24


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/epsteen_nda.json

Processing file: /content/sample_corpus/contractnli/GreenStorm%20NDCSC.txt
Total chunks created: 31


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/GreenStorm%20NDCSC.json

Processing file: /content/sample_corpus/contractnli/Eskom%20Template%20Confidentiality%20and%20Non-disclosure%20Agreement%20Rev%204%20Effective%20August%202017_11.txt
Total chunks created: 67


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/Eskom%20Template%20Confidentiality%20and%20Non-disclosure%20Agreement%20Rev%204%20Effective%20August%202017_11.json

Processing file: /content/sample_corpus/contractnli/FNHA-2019RFP-02-NDA-form.txt
Total chunks created: 18


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/FNHA-2019RFP-02-NDA-form.json

Processing file: /content/sample_corpus/contractnli/eulerhermes-nda.txt
Total chunks created: 17


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/eulerhermes-nda.json

Processing file: /content/sample_corpus/contractnli/IGC-Non-Disclosure-Agreement-LSE-Sample.txt
Total chunks created: 47


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/IGC-Non-Disclosure-Agreement-LSE-Sample.json

Processing file: /content/sample_corpus/contractnli/EFCAConfidentialityAgreement.txt
Total chunks created: 17


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/contractnli/EFCAConfidentialityAgreement.json

Processing file: /content/sample_corpus/privacy_qa/23andMe.txt
Total chunks created: 141


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/privacy_qa/23andMe.json

Processing file: /content/sample_corpus/privacy_qa/Keep.txt
Total chunks created: 27


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/privacy_qa/Keep.json

Processing file: /content/sample_corpus/privacy_qa/Wordscapes.txt
Total chunks created: 71


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/privacy_qa/Wordscapes.json

Processing file: /content/sample_corpus/privacy_qa/Groupon.txt
Total chunks created: 72


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/privacy_qa/Groupon.json

Processing file: /content/sample_corpus/privacy_qa/Viber Messenger.txt
Total chunks created: 77


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/privacy_qa/Viber Messenger.json

Processing file: /content/sample_corpus/privacy_qa/TickTick: To Do List with Reminder, Day Planner.txt
Total chunks created: 9


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/privacy_qa/TickTick: To Do List with Reminder, Day Planner.json

Processing file: /content/sample_corpus/privacy_qa/Fiverr.txt
Total chunks created: 64


Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings generated
Saved: /content/sample_corpus_chunked/privacy_qa/Fiverr.json
Processing completed!


## Chunked Data Retreival and Similarity Search for Google colab.

The code cell below retrievs text chunks from the json files that were stored in the sample_corpus_chunked folder, along with conducting a similarity search. It supports cosine similarity and BM25.

Key steps are:
  1. Loads the JSON files containing all the sampled text chunks and embeddings from the sample_corpus_folder.
  2. Similarity Search: retrival of most relevant chunks for given k_value using either cosine similarity or BM25.
  3. Duplicates JSON folder: duplication of the sample_benchmarks folder containgn the questions and answers JSON files which is named as 'sample_benchmarks_unranked'.
  4. updating the duplicated JSON files: Modifying the duplication sample_benchmarks folder to include the retrived text chunk for each of the query in the same json format as the q&a pairs.


In [5]:
import json
import numpy as np
!pip install rank_bm25
!pip install nltk
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import nltk
import shutil
import os
from nltk.tokenize import word_tokenize
from google.colab import drive

nltk.download('punkt_tab')

# Set base path for Colab (inside /content/)
BASE_PATH = "/content/sample_corpus_chunked"

def load_chunked_data(directory):
    """Load precomputed chunked embeddings from JSON files."""
    # folder_path = directory
    chunked_data = []

    for folder in os.listdir(directory):
      folder_path = os.path.join(directory, folder)
      for files in os.listdir(folder_path):
          if files.endswith(".json"):
            file_path = os.path.join(folder_path, files)
            with open(file_path, "r", encoding="utf-8") as f:
                file_data = json.load(f)
                if isinstance(file_data, list):
                        chunked_data.extend(file_data)  # Append data from each file

    return chunked_data

# Similarity Search (Cosine)
def cosine_retrieved_chunks(query, embed_model, chunked_data, top_k=3):
    """Find the most similar chunks using cosine similarity."""

    embedding_vectors = np.array([chunk["embedding"] for chunk in chunked_data])
    if embedding_vectors.ndim == 1:
      embedding_vectors = embedding_vectors.reshape(1, -1)

    query_embedding = embed_model.encode([query], convert_to_numpy=True)
    query_embedding = np.array(query_embedding).reshape(1, -1)

    similarities = cosine_similarity(query_embedding, embedding_vectors)[0]
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return [chunked_data[i] for i in top_indices]

# BM25 Search
def BM25_retrieved_chunks(query, chunked_data, top_k):
    corpus = [word_tokenize(chunk["text"].lower()) for chunk in chunked_data]
    tokenized_query = word_tokenize(query.lower())

    bm25 = BM25Okapi(corpus)
    scores = bm25.get_scores(tokenized_query)

    top_indices = sorted(range(len(scores)), key=lambda x: scores[x], reverse=True)[:top_k]
    return [chunked_data[i] for i in top_indices]

# Select Similarity Search Method
def similarity_search_model(model, embed_model, query, chunked_data, top_k):
    if model == "cosine":
        return cosine_retrieved_chunks(query, embed_model, chunked_data, top_k)
    elif model == "BM25":
        return BM25_retrieved_chunks(query, chunked_data, top_k)

# Duplicate JSON Folder for Unranked Chunks
def duplicate_json_file(original_folder, duplicate_folder):
    """Duplicate folder inside Colab environment."""
    original_path = os.path.join("/content", original_folder)
    duplicated_path = os.path.join("/content", duplicate_folder)

    if os.path.exists(duplicated_path):
        shutil.rmtree(duplicated_path)
    shutil.copytree(original_path, duplicated_path)

# Update JSON Files with Retrieved Chunks
def add_retrieved_chunks(duplicate_folder, chunked_data, embed_model, model, top_k):
    """Update duplicated JSON files with retrieved chunks for each query."""
    duplicate_folder_path = os.path.join("/content", duplicate_folder)
    prefix_to_remove = "/content/sample_corpus/"

    for filename in os.listdir(duplicate_folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(duplicate_folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            for test in data:
                query_text = test["query"]
                retrieved_chunks = similarity_search_model(model, embed_model, query_text, chunked_data, top_k)

                # Add span info from mapping
                test["retrieved_chunks_unranked"] = [
                    {
                        "chunk_id": chunk["chunk_id"],
                        "filepath": chunk["filepath"].replace(prefix_to_remove, ""),
                        "span": chunk["span"],
                        "text": chunk["text"]
                    }
                    for chunk in retrieved_chunks
                ]

            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)


# Main Execution
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer
    embed_model = SentenceTransformer("Shitao/RetroMAE")  # Load model

    original_folder_="sample_benchmarks"
    duplicate_folder_="sample_benchmarks_unranked"

    print("Loading chunked data...")
    BASE_PATH = "/content/sample_corpus_chunked"
    directory=BASE_PATH
    chunked_data = load_chunked_data(BASE_PATH)
    print(f"Loaded {len(chunked_data)} chunks.")

    model = "cosine"    #'cosine' or 'BM25'
    top_k = 50

    print("Duplicating JSON folder...")
    duplicate_json_file(original_folder_, duplicate_folder_)

    print("Adding retrieved chunks...")
    add_retrieved_chunks(duplicate_folder_, chunked_data, embed_model, model, top_k)

    print("Processing complete.")

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Some weights of BertModel were not initialized from the model checkpoint at Shitao/RetroMAE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading chunked data...
Loaded 21681 chunks.
Duplicating JSON folder...
Adding retrieved chunks...
Processing complete.


In [None]:
import torch
torch.cuda.empty_cache()

## Re-Ranking Retrieved Chunks with a Cross-Encoder in Google colab

The code cell below, performs re-ranking of retrieved text chunks using a cross-encoder model.

This includes the following steps:
1. Duplicating JSON files: the sample_benchmarks_unranked folder containing the retrieved chunks in the original sample benchmarks json strucutre for a given top_k are duplicated and are saved in '/content' as 'sample_benchmarks_ranked'.
2. Load Retrieved chunks: Reads the JSON files containing the retrieved text chunks in the 'sample_benchmarks_ranked'.
3. Re-rank Chunks using Coss-Encoder: Uses MS MARCO MiniLM Cross-Encoder to score query-text pairs, and sorts the retieved chunks based on their relevance score.
4. Stores the ranked Chunks: updates the ranked text chunks to the json files in 'sample_benchmarks_ranked' folder.


In [6]:
import shutil
import os
import json
from sentence_transformers import CrossEncoder

BASE_PATH = "/content/"

# Load Cross Encoder Model
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Set Directories
original_folder = os.path.join(BASE_PATH, "sample_benchmarks_unranked")
duplicate_folder_path = os.path.join(BASE_PATH, "sample_benchmarks_ranked")  # FIXED VARIABLE NAME

# Duplicate Folder
def duplicate_folder():
    """Create a duplicate of the original folder before re-ranking."""

    if os.path.exists(duplicate_folder_path):
        shutil.rmtree(duplicate_folder_path)  # Remove if already exists
    shutil.copytree(original_folder, duplicate_folder_path)  # Copy contents

    print(f"Folder duplicated: '{original_folder}' to '{duplicate_folder_path}'.")

# Load and Re-rank Retrieved Chunks
def load_and_rerank(directory=duplicate_folder_path):  # UPDATED VARIABLE NAME
    """Load and re-rank retrieved chunks using a cross-encoder."""

    for filename in os.listdir(directory):
        if filename.endswith(".json"):  # Process only JSON files
            file_path = os.path.join(directory, filename)

            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            for test in data:
                query_text = test["query"]

                # Extract retrieved chunks
                retrieved_chunks = [
                    {
                        "chunk_id": chunk["chunk_id"],
                        "filepath": chunk["filepath"],
                        "span": chunk["span"],
                        "text": chunk["text"]
                    }
                    for chunk in test["retrieved_chunks_unranked"]
                ]

                # Prepare query-chunk pairs for cross-encoder
                query_chunk_pairs = [(query_text, chunk["text"]) for chunk in retrieved_chunks]

                # Compute cross-encoder scores
                scores = cross_encoder.predict(query_chunk_pairs)

                # Attach scores and re-rank
                for i, chunk in enumerate(retrieved_chunks):
                    chunk["cross_encoder_score"] = float(scores[i])  # Convert to float for JSON compatibility

                # Sort by score (higher is better)
                ranked_chunks = sorted(retrieved_chunks, key=lambda x: x["cross_encoder_score"], reverse=True)

                # Store ranked chunks in JSON
                test["retrieved_chunks_ranked"] = ranked_chunks

            # Save updated JSON file
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Processed and re-ranked all JSON files in '{directory}' successfully.")

# Run duplication and re-ranking
if __name__ == "__main__":
    print("Duplicating JSON folder...")
    duplicate_folder()

    print("Re-ranking retrieved chunks...")
    load_and_rerank()

    print("Processing complete.")


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Duplicating JSON folder...
Folder duplicated: '/content/sample_benchmarks_unranked' to '/content/sample_benchmarks_ranked'.
Re-ranking retrieved chunks...
Processed and re-ranked all JSON files in '/content/sample_benchmarks_ranked' successfully.
Processing complete.


## Downloading the files

In [7]:

!zip -r sample_benchmarks_unranked.zip sample_benchmarks_unranked/
!zip -r sample_benchmarks_ranked.zip sample_benchmarks_ranked/
!zip -r sample_corpus_chunked.zip sample_corpus_chunked/

from google.colab import files

files.download("sample_benchmarks_unranked.zip")
files.download("sample_benchmarks_ranked.zip")
files.download("sample_corpus_chunked.zip")  # Change filename accordingly


  adding: sample_benchmarks_unranked/ (stored 0%)
  adding: sample_benchmarks_unranked/sampled_queries_privacy_qa.json (deflated 82%)
  adding: sample_benchmarks_unranked/sampled_queries_contractnli.json (deflated 82%)
  adding: sample_benchmarks_unranked/sampled_queries_cuad.json (deflated 85%)
  adding: sample_benchmarks_unranked/sampled_queries_maud.json (deflated 85%)
  adding: sample_benchmarks_ranked/ (stored 0%)
  adding: sample_benchmarks_ranked/sampled_queries_privacy_qa.json (deflated 86%)
  adding: sample_benchmarks_ranked/sampled_queries_contractnli.json (deflated 86%)
  adding: sample_benchmarks_ranked/sampled_queries_cuad.json (deflated 86%)
  adding: sample_benchmarks_ranked/sampled_queries_maud.json (deflated 87%)
  adding: sample_corpus_chunked/ (stored 0%)
  adding: sample_corpus_chunked/maud/ (stored 0%)
  adding: sample_corpus_chunked/maud/Altabancorp_Glacier Bancorp, Inc..json (deflated 65%)
  adding: sample_corpus_chunked/maud/Endurance International Group Holding

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## What I am currently working on.

1. Cleaning the code so it is more readable.
2. Adding more metadata for the json files containing the retrieved chunks after cosine similarity and reranking(such as the top_k vlaue used and the file paths of the individual retrieved chunks).
3. Adapting the code to implement FAISS for storing embeddings and SQLite for storing the Metadata if our computers cannot handle the whole size of the knwoledge corpus.
4. Implementing FAISS automate automate the RAG pipleine wihout doing seperate similarity search(So that we don't have to do similarity search and reranking seperately)