In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize


def get_sentences_from_document(filepath):
    """Read a text file and split its contents into sentences.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The list of sentences produced by ``sent_tokenize`` for the whole
        file contents.
    """
    # Decode explicitly as UTF-8 rather than with the platform default (for
    # example cp1252 on Windows), so the same file yields the same text on
    # every machine. Undecodable bytes become U+FFFD instead of aborting the
    # whole run on a single stray byte.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
        document = file.read()
    return sent_tokenize(document)


_EMBEDDING_MODEL_NAME = 'bert-base-nli-mean-tokens'
_embedding_model = None


def _get_embedding_model():
    """Return the shared SentenceTransformer, constructing it on first use."""
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
    return _embedding_model


def get_document_embedding(sentences):
    """Embed a document as the mean of its sentence embeddings.

    Args:
        sentences: Non-empty sequence of sentence strings.

    Returns:
        The result of averaging the encoded sentences along axis 0.

    Raises:
        ValueError: If ``sentences`` is empty. Averaging nothing would give
            NaN, which only fails later and far from the cause.
    """
    if len(sentences) == 0:
        raise ValueError("cannot embed a document with no sentences")
    # The model is constructed once and reused; building it on every call
    # would repeat that work for every document in a corpus.
    sentence_embeddings = _get_embedding_model().encode(sentences)
    return np.mean(sentence_embeddings, axis=0)


def rank_documents(query_embedding, document_embeddings, documents, k):
    """Return the ``k`` documents most similar to the query, best first.

    Args:
        query_embedding: Query vector; reshaped to a single row before
            comparison.
        document_embeddings: Stacked document vectors, one row per entry of
            ``documents``.
        documents: Document identifiers, in the same order as the rows of
            ``document_embeddings``.
        k: Maximum number of documents to return.

    Returns:
        A list of at most ``k`` entries of ``documents`` ordered by
        decreasing cosine similarity to the query.
    """
    query_row = query_embedding.reshape(1, -1)
    scores = cosine_similarity(query_row, document_embeddings)[0]
    # argsort is ascending, so sorting the negated scores lists the most
    # similar documents first.
    order = np.argsort(-scores)
    top_documents = []
    for position in order[:k]:
        top_documents.append(documents[position])
    return top_documents


def process_query_case(query_case_filepath, corpus_folder, k, batch_size=50):
    """Rank corpus documents against a query document, one batch at a time.

    The corpus is embedded in batches of ``batch_size`` documents. Each batch
    is ranked on its own against the query, so every yielded list covers only
    the documents of that batch and is not a ranking of the whole corpus.

    Args:
        query_case_filepath: Path of the query document.
        corpus_folder: Directory whose files form the corpus.
        k: Maximum number of documents to return per batch.
        batch_size: Number of documents embedded before a batch is ranked.

    Yields:
        For each batch, a list of at most ``k`` file names ordered by
        decreasing similarity to the query.
    """
    query_sentences = get_sentences_from_document(query_case_filepath)
    query_embedding = get_document_embedding(query_sentences)

    document_embeddings = []
    documents = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # composition of each batch reproducible between runs.
    for document_file in sorted(os.listdir(corpus_folder)):
        document_filepath = os.path.join(corpus_folder, document_file)
        # Subdirectories and other non-file entries cannot be read as text.
        if not os.path.isfile(document_filepath):
            continue

        document_sentences = get_sentences_from_document(document_filepath)
        # A document with no sentences has no meaningful embedding and would
        # break the stacking step below, so it is left out of the ranking.
        if not document_sentences:
            continue

        document_embeddings.append(get_document_embedding(document_sentences))
        documents.append(document_file)

        if len(document_embeddings) == batch_size:
            ranked_documents = rank_documents(
                query_embedding, np.vstack(document_embeddings), documents, k
            )
            # Start a fresh batch before handing control back to the caller.
            document_embeddings = []
            documents = []

            yield ranked_documents

    # Rank whatever is left over when the corpus size is not a multiple of
    # batch_size.
    if document_embeddings:
        yield rank_documents(
            query_embedding, np.vstack(document_embeddings), documents, k
        )


def main():
    """Rank the prior-case corpus against one query case and print each batch."""
    corpus_folder = r"C:\Users\This PC\Desktop\Task_2\Prior_Cases"
    query_case_filepath = r"C:\Users\This PC\Desktop\Task_2\Prior_Cases\prior_case_0001.txt"
    top_k = 100  # Number of top-ranked documents to retrieve
    documents_per_batch = 50  # Batch size for processing documents

    batches = process_query_case(
        query_case_filepath, corpus_folder, top_k, documents_per_batch
    )

    # Batches and ranks are reported 1-based.
    for batch_number, ranking in enumerate(batches, start=1):
        print(f"Batch {batch_number} - Ranked Documents:")
        for rank, document_name in enumerate(ranking, start=1):
            print(f"Rank {rank}: {document_name}")
        print()

# Run the ranking only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()


Batch 1 - Ranked Documents:
Rank 1: prior_case_0001.txt
Rank 2: prior_case_0045.txt
Rank 3: prior_case_0037.txt
Rank 4: prior_case_0011.txt
Rank 5: prior_case_0031.txt
Rank 6: prior_case_0050.txt
Rank 7: prior_case_0030.txt
Rank 8: prior_case_0039.txt
Rank 9: prior_case_0015.txt
Rank 10: prior_case_0004.txt
Rank 11: prior_case_0043.txt
Rank 12: prior_case_0042.txt
Rank 13: prior_case_0025.txt
Rank 14: prior_case_0023.txt
Rank 15: prior_case_0033.txt
Rank 16: prior_case_0028.txt
Rank 17: prior_case_0021.txt
Rank 18: prior_case_0008.txt
Rank 19: prior_case_0036.txt
Rank 20: prior_case_0027.txt
Rank 21: prior_case_0009.txt
Rank 22: prior_case_0003.txt
Rank 23: prior_case_0020.txt
Rank 24: prior_case_0049.txt
Rank 25: prior_case_0017.txt
Rank 26: prior_case_0010.txt
Rank 27: prior_case_0048.txt
Rank 28: prior_case_0012.txt
Rank 29: prior_case_0002.txt
Rank 30: prior_case_0016.txt
Rank 31: prior_case_0006.txt
Rank 32: prior_case_0035.txt
Rank 33: prior_case_0014.txt
Rank 34: prior_case_0038