# In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize


def extract_topic_keywords(sentences, n_keywords=5):
    """Return the top TF-IDF keywords of each cluster of similar texts.

    The texts are vectorised with TF-IDF (English stop words removed), a
    thresholded cosine-similarity graph is built, and its rows are grouped
    with average-linkage agglomerative clustering. For every cluster the
    terms with the highest summed TF-IDF weight are collected.

    Args:
        sentences: Sequence of strings (sentences or paragraphs). Entries
            that are empty or whitespace-only are ignored.
        n_keywords: Maximum number of keywords taken from each cluster.

    Returns:
        A flat list of keyword strings, cluster after cluster. A cluster
        contributes fewer than ``n_keywords`` entries when it contains fewer
        distinct terms. The list is empty when there is no usable text.
    """
    # Blank entries carry no terms; left in, they would form a cluster of
    # all-zero rows whose "top" keywords are arbitrary vocabulary entries.
    documents = [text for text in sentences if text and text.strip()]
    if not documents:
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, i.e. when
        # every text consists only of stop words or non-word characters.
        return []

    if len(documents) < 2:
        # AgglomerativeClustering requires at least two samples, so a lone
        # document is treated as its own single cluster.
        labels = np.zeros(len(documents), dtype=int)
    else:
        similarity_matrix = cosine_similarity(tfidf_matrix)
        # Threshold for similarity; cast to float so the clustering receives
        # a plain numeric 0/1 feature matrix.
        similarity_graph = (similarity_matrix > 0.5).astype(float)
        labels = AgglomerativeClustering(
            n_clusters=None, distance_threshold=0.5, linkage='average',
        ).fit(similarity_graph).labels_

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    name_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = vectorizer.get_feature_names
    feature_names = np.asarray(name_getter())

    keywords = []
    for cluster_id in np.unique(labels):
        member_rows = np.flatnonzero(labels == cluster_id)
        # The fitted matrix already holds the rows transform() would
        # recompute for these documents, so index it directly.
        scores = np.asarray(tfidf_matrix[member_rows].sum(axis=0)).ravel()
        ranked = np.argsort(-scores, kind='stable')[:n_keywords]
        # A zero score means the term never occurs in this cluster.
        keywords.extend(str(feature_names[i]) for i in ranked if scores[i] > 0)
    return keywords


def get_paragraphs_from_document(filepath):
    """Read a text file and return its non-blank lines as paragraphs.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. A leading byte-order mark is dropped and undecodable bytes are
    replaced rather than aborting the whole document.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of strings, one per newline-separated line that contains at
        least one non-whitespace character, in file order.
    """
    with open(filepath, 'r', encoding='utf-8-sig', errors='replace') as file:
        document = file.read()
    # Blank separator lines are not paragraphs; keeping them would feed empty
    # strings into tokenisation, embedding and keyword extraction.
    return [line for line in document.split('\n') if line.strip()]


def get_sentences_from_paragraph(paragraph):
    """Split *paragraph* into sentences using NLTK's ``sent_tokenize``.

    Returns whatever ``sent_tokenize`` produces for the given text.
    """
    sentences = sent_tokenize(paragraph)
    return sentences


# Loaded SentenceTransformer models keyed by model name, so the expensive
# weight loading happens once per process instead of once per call.
_SENTENCE_MODEL_CACHE = {}


def _load_sentence_model(model_name):
    """Return the cached SentenceTransformer for *model_name*, loading it on first use."""
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model
    return model


def get_document_embedding(sentences, model_name='bert-base-nli-mean-tokens'):
    """Embed a group of sentences as the mean of their sentence embeddings.

    Args:
        sentences: Non-empty sequence of sentence strings.
        model_name: Name of the SentenceTransformer model used for encoding.
            Defaults to the model this function has always used.

    Returns:
        The element-wise mean (over axis 0) of the encoded sentences.

    Raises:
        ValueError: If ``sentences`` is empty. Averaging nothing would
            otherwise yield NaN and fail later, far from the cause.
    """
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one sentence")
    sentence_embeddings = _load_sentence_model(model_name).encode(sentences)
    return np.mean(sentence_embeddings, axis=0)


def cluster_paragraphs(paragraphs):
    """Cluster paragraphs by their mean sentence embedding.

    Each paragraph is split into sentences and embedded; paragraphs that
    yield no sentences are skipped, so the returned labels follow the order
    of the remaining paragraphs only.

    Returns:
        The ``labels_`` array of an average-linkage agglomerative clustering
        (distance threshold 0.5), or an empty array when fewer than two
        paragraphs could be embedded.
    """
    # Lazy generator: each paragraph is tokenised right before it is embedded.
    tokenized = (get_sentences_from_paragraph(text) for text in paragraphs)
    vectors = [get_document_embedding(parts) for parts in tokenized if parts]

    if len(vectors) < 2:
        return np.array([])

    stacked = np.vstack(vectors)
    model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=0.5,
        linkage='average',
    )
    model.fit(stacked)
    return model.labels_


def process_case_documents(dataset_folder):
    """Cluster, summarise and plot every ``.txt`` document in a folder.

    For each file (in sorted name order) the paragraphs are printed,
    clustered by embedding, summarised with topic keywords, and shown in a
    scatter plot of the first two embedding dimensions coloured by cluster.
    Files with fewer than two clusterable paragraphs are reported and
    skipped.

    Args:
        dataset_folder: Directory containing the ``.txt`` files to process.
    """
    # os.listdir() order is arbitrary; sort for reproducible output.
    case_files = sorted(
        name for name in os.listdir(dataset_folder) if name.endswith(".txt")
    )
    for case_file in case_files:
        document_filepath = os.path.join(dataset_folder, case_file)

        paragraphs = get_paragraphs_from_document(document_filepath)
        print(f"Processing file: {case_file}")
        print("Paragraphs:")
        for i, paragraph in enumerate(paragraphs):
            print(f"Paragraph {i+1}: {paragraph}")

        # cluster_paragraphs() skips paragraphs that yield no sentences, so
        # its labels only line up with the paragraphs that survive that
        # filter. Filter up front and remember each survivor's original
        # number so labels are reported against the right paragraph.
        numbered = []
        for i, paragraph in enumerate(paragraphs):
            sentences = get_sentences_from_paragraph(paragraph)
            if sentences:
                numbered.append((i + 1, paragraph, sentences))
        kept_paragraphs = [paragraph for _, paragraph, _ in numbered]

        paragraph_clusters = cluster_paragraphs(kept_paragraphs)
        if len(paragraph_clusters) < 2:
            print("No paragraphs to cluster.")
            continue

        print("\nParagraph Clusters:")
        for (number, _, _), cluster_id in zip(numbered, paragraph_clusters):
            print(f"Paragraph {number}: Cluster {cluster_id}")

        # Only paragraphs with content are passed on; blank ones contribute
        # no terms to keyword extraction.
        topic_keywords = extract_topic_keywords(kept_paragraphs)
        print("\nTopic Keywords:")
        for i, keyword in enumerate(topic_keywords):
            print(f"Topic {i+1}: {keyword}")
        print()

        # Visualize the clusters, reusing the sentence lists tokenised above.
        embeddings = np.vstack(
            [get_document_embedding(sentences) for _, _, sentences in numbered]
        )

        fig, ax = plt.subplots()
        scatter = ax.scatter(embeddings[:, 0], embeddings[:, 1], c=paragraph_clusters, cmap='rainbow')
        legend = ax.legend(*scatter.legend_elements(), title="Clusters")
        ax.add_artist(legend)

        ax.set_title("Paragraph Clusters")
        plt.show()
        # Release the figure so processing many files does not accumulate
        # open figures.
        plt.close(fig)


def main(dataset_folder=r"C:\Users\This PC\Desktop\Task_1\Test_docs"):
    """Run the paragraph-clustering pipeline over a folder of case documents.

    Args:
        dataset_folder: Directory containing the ``.txt`` files to process.
            Defaults to the location that used to be hard-coded, so calling
            ``main()`` without arguments behaves as before.
    """
    process_case_documents(dataset_folder)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
