In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize


def get_sentences_from_document(filepath):
    with open(filepath, 'r') as file:
        document = file.read()
    sentences = sent_tokenize(document)
    return sentences


def get_document_embedding(sentences):
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentence_embeddings = model.encode(sentences)
    document_embedding = np.mean(sentence_embeddings, axis=0)
    return document_embedding


def rank_documents(query_embedding, document_embeddings, documents, k):
    similarities = cosine_similarity(query_embedding.reshape(1, -1), document_embeddings)[0]
    ranked_indices = np.argsort(-similarities)[:k]
    ranked_documents = [documents[i] for i in ranked_indices]
    return ranked_documents


def process_query_case(query_case_filepath, corpus_folder, k, batch_size=50):
    query_sentences = get_sentences_from_document(query_case_filepath)
    query_embedding = get_document_embedding(query_sentences)

    document_embeddings = []
    documents = []

    for document_file in os.listdir(corpus_folder):
        document_filepath = os.path.join(corpus_folder, document_file)
        document_sentences = get_sentences_from_document(document_filepath)
        document_embedding = get_document_embedding(document_sentences)
        document_embeddings.append(document_embedding)
        documents.append(document_file)

        if len(document_embeddings) == batch_size:
            document_embeddings = np.vstack(document_embeddings)
            ranked_documents = rank_documents(query_embedding, document_embeddings, documents, k)
            document_embeddings = []
            documents = []

            yield ranked_documents

    if len(document_embeddings) > 0:
        document_embeddings = np.vstack(document_embeddings)
        ranked_documents = rank_documents(query_embedding, document_embeddings, documents, k)

        yield ranked_documents


def main():
    query_case_filepath = r"C:\Users\This PC\Desktop\Task_2\Prior_Cases\prior_case_0001.txt"
    corpus_folder = r"C:\Users\This PC\Desktop\Task_2\Prior_Cases"
    k = 5  # Number of top-ranked documents to retrieve
    batch_size = 50  # Batch size for processing documents

    batch_generator = process_query_case(query_case_filepath, corpus_folder, k, batch_size)

    document_embeddings = []
    documents = []

    for i, ranked_documents in enumerate(batch_generator):
        print(f"Batch {i+1} - Ranked Documents:")
        for j, document in enumerate(ranked_documents):
            print(f"Rank {j+1}: {document}")
        print()

        document_embeddings.extend([get_document_embedding(get_sentences_from_document(doc)) for doc in ranked_documents])
        documents.extend(ranked_documents)

    document_embeddings = np.vstack(document_embeddings)
    similarity_matrix = cosine_similarity(document_embeddings)
    
    # Create heatmap
    fig, ax = plt.subplots()
    im = ax.imshow(similarity_matrix, cmap='hot')

    # Set axis labels
    ax.set_xticks(np.arange(len(documents)))
    ax.set_yticks(np.arange(len(documents)))
    ax.set_xticklabels(documents)
    ax.set_yticklabels(documents)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    # Set colorbar
    cbar = ax.figure.colorbar(im, ax=ax)
    cbar.ax.set_ylabel("Similarity Score", rotation=-90, va="bottom")

    # Set title and show the plot
    ax.set_title("Similarity Matrix - Query vs. Documents")
    plt.show()


if __name__ == '__main__':
    main()


Batch 1 - Ranked Documents:
Rank 1: prior_case_0001.txt
Rank 2: prior_case_0045.txt
Rank 3: prior_case_0037.txt
Rank 4: prior_case_0011.txt
Rank 5: prior_case_0031.txt



FileNotFoundError: [Errno 2] No such file or directory: 'prior_case_0001.txt'

In [6]:
file_list = os.listdir(corpus_folder)
print(file_list)


NameError: name 'corpus_folder' is not defined