# Part 2: Evaluating how well our LLM-based approach clusters similar documents together based on their generated keywords

*To reproduce this output, adjust the file paths below so that they point to the raw `.txt` input files, which are available in the GitHub repository https://github.com/krutik-2-11/LLM-Tagger-Project/tree/main*

In [None]:
# Colab-specific setup: mount Google Drive at /content/drive so the paper text
# files referenced by the '/content/drive/...' paths later in the notebook can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Given below are the sample clusters created by the LLM from the keywords generated in Part 1. The cluster titles were also generated by the LLM on its own. The clusters depend on the prompts given to the LLM.

In [None]:
# LLM-produced clustering: cluster title -> list of paper titles assigned to it.
# The insertion order of the keys determines the numeric cluster ids derived later.
data = {
    "Natural Language Processing": [
        "HyperTuning: Toward Adapting Large Language Models without Back-propagation",
        "Using LLM for Improving Key Event Discovery:Temporal-Guided News Stream Clustering with Event Summaries",
    ],
    "Programming and Software Engineering": [
        "Automated Repair of Programs from Large Language Models",
        "LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS",
    ],
    "Disaster and Emergency Response": [
        "Categorization of disaster‑related deaths in Minamisoma city after the Fukushima nuclear disaster using clustering analysis",
    ],
    "Health and Medicine": [
        "Effects of a Home-Based Stretching Program on Bench Press Maximum Strength and Shoulder Flexibility",
        "The effect of verbal praise on prospective memory",
    ],
    "Quantum Computing": [
        "Implementation of quantum compression on IBM quantum computers",
    ],
    "Astronomy and Cosmology": [
        "The alignment between brightest cluster galaxies and host clusters",
    ],
}


In [None]:
# Directory that holds paper1raw.txt ... paper9raw.txt. This is the only path
# that has to be adjusted when the files live somewhere else.
PAPERS_DIR = '/content/drive/My Drive/BDA 798 LLMs/TestCase 1 Papers text'
N_PAPERS = 9


def read_paper_text(path):
    """Return the whole content of the text file at `path`.

    The encoding is given explicitly so the result does not depend on the
    platform's default locale encoding.
    NOTE(review): assumes the raw files are UTF-8 encoded - confirm.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()


# One path / one text per paper, ordered by paper number (1..N_PAPERS).
file_paths = [f'{PAPERS_DIR}/paper{number}raw.txt' for number in range(1, N_PAPERS + 1)]
papers_raw = [read_paper_text(path) for path in file_paths]

# Later cells refer to the individual names (paper1raw ... paper9raw), so keep
# them available alongside the lists.
(file_path_1, file_path_2, file_path_3,
 file_path_4, file_path_5, file_path_6,
 file_path_7, file_path_8, file_path_9) = file_paths
(paper1raw, paper2raw, paper3raw,
 paper4raw, paper5raw, paper6raw,
 paper7raw, paper8raw, paper9raw) = papers_raw

## Standard Clustering

In [None]:
# Standard
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import MinMaxScaler

import numpy as np


# Mapping paper titles to their raw text. The insertion order of this dict fixes
# the row order of every matrix and label list built from it.
paper_dict = {
    "The alignment between brightest cluster galaxies and host clusters":paper1raw,
    "Automated Repair of Programs from Large Language Models": paper2raw,
    "Using LLM for Improving Key Event Discovery:Temporal-Guided News Stream Clustering with Event Summaries": paper3raw,
    "HyperTuning: Toward Adapting Large Language Models without Back-propagation":paper4raw,
    "LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS":paper5raw,
    "Implementation of quantum compression on IBM quantum computers":paper6raw,
    "The effect of verbal praise on prospective memory":paper7raw,
    "Categorization of disaster‑related deaths in Minamisoma city after the Fukushima nuclear disaster using clustering analysis":paper8raw,
    "Effects of a Home-Based Stretching Program on Bench Press Maximum Strength and Shoulder Flexibility":paper9raw
}

# Titles and texts in the same (dict) order, so row i of every matrix below
# belongs to paper_titles[i].
paper_titles = list(paper_dict.keys())
papers_raw_text = list(paper_dict.values())

# Vectorize the paper texts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(papers_raw_text)

# Calculate cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Convert similarity to cosine distance. Floating-point round-off can leave
# tiny negative entries or a not-exactly-zero diagonal, both of which
# scikit-learn rejects for precomputed distances, so clean them up.
distance_matrix = np.clip(1 - cosine_sim_matrix, 0.0, None)
np.fill_diagonal(distance_matrix, 0.0)

# Clusters to evaluate: the LLM-generated assignment (cluster title -> paper titles)
clusters = data

# Create a mapping of paper title to its cluster number
paper_to_cluster = {}
for cluster_num, (cluster_name, papers) in enumerate(clusters.items()):
    for paper in papers:
        paper_to_cluster[paper] = cluster_num

# Cluster labels in the same order as the rows of `distance_matrix`
cluster_labels = [paper_to_cluster[title] for title in paper_titles]

# `distance_matrix` already holds pairwise distances, so it must be passed with
# metric="precomputed". With metric="euclidean" each row would be treated as a
# feature vector and a second, unintended distance would be computed on top.
silhouette_values = silhouette_samples(distance_matrix, cluster_labels, metric="precomputed")

# Calculate the overall silhouette score (mean of the per-paper values)
overall_silhouette_score = silhouette_score(distance_matrix, cluster_labels, metric="precomputed")

print("Individual Silhouette Scores for Each Paper:")
for idx, (title, score) in enumerate(zip(paper_titles, silhouette_values), start=1):
    print(f"Paper {idx} ({title}): Silhouette Score = {score:.4f}")

print("\nOverall Silhouette Score for the Dataset: {:.4f}".format(overall_silhouette_score))



Individual Silhouette Scores for Each Paper:
Paper 1 (The alignment between brightest cluster galaxies and host clusters): Silhouette Score = 0.0000
Paper 2 (Automated Repair of Programs from Large Language Models): Silhouette Score = -0.0048
Paper 3 (Using LLM for Improving Key Event Discovery:Temporal-Guided News Stream Clustering with Event Summaries): Silhouette Score = -0.1764
Paper 4 (HyperTuning: Toward Adapting Large Language Models without Back-propagation): Silhouette Score = -0.0839
Paper 5 (LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS): Silhouette Score = -0.2910
Paper 6 (Implementation of quantum compression on IBM quantum computers): Silhouette Score = 0.0000
Paper 7 (The effect of verbal praise on prospective memory): Silhouette Score = 0.0091
Paper 8 (Categorization of disaster‑related deaths in Minamisoma city after the Fukushima nuclear disaster using clustering analysis): Silhouette Score = 0.0000
Paper 9 (Effects of a Home-Based Stretching Program on Bench Pre

## Hierarchical Clustering

In [None]:
# Hierarchical clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Titles and texts in paper_dict order; row i of every matrix below belongs to titles[i].
titles = list(paper_dict.keys())
papers_raw_text = list(paper_dict.values())

# Vectorize the paper texts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(papers_raw_text)

# Calculate cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Convert similarity to cosine distance. Round-off can leave tiny negative
# entries or a non-zero diagonal, which scikit-learn rejects for precomputed
# distances, so clip and reset the diagonal.
distance_matrix = np.clip(1 - cosine_sim_matrix, 0.0, None)
np.fill_diagonal(distance_matrix, 0.0)

n_clusters = 6
# The input is a pairwise distance matrix, so the clustering must be told it is
# precomputed; 'euclidean' would treat each row as a feature vector.
# scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed `affinity`,
# so try the current spelling first and fall back for older releases.
try:
    clustering_model = AgglomerativeClustering(n_clusters=n_clusters, metric='precomputed', linkage='complete')
except TypeError:
    clustering_model = AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage='complete')
cluster_labels = clustering_model.fit_predict(distance_matrix)

# Silhouette values are computed on the same precomputed distances that were
# used for clustering.
silhouette_values = silhouette_samples(distance_matrix, cluster_labels, metric="precomputed")

# Calculate the overall silhouette score
overall_silhouette_score = silhouette_score(distance_matrix, cluster_labels, metric="precomputed")

print("Individual Silhouette Scores for Each Paper:")
for idx, (title, score) in enumerate(zip(titles, silhouette_values), start=1):
    print(f"Paper {idx} ({title}): Silhouette Score = {score:.4f}")

print(f"\nOverall Silhouette Score for the Dataset: {overall_silhouette_score:.4f}")

Individual Silhouette Scores for Each Paper:
Paper 1 (The alignment between brightest cluster galaxies and host clusters): Silhouette Score = 0.0867
Paper 2 (Automated Repair of Programs from Large Language Models): Silhouette Score = 0.0000
Paper 3 (Using LLM for Improving Key Event Discovery:Temporal-Guided News Stream Clustering with Event Summaries): Silhouette Score = 0.0707
Paper 4 (HyperTuning: Toward Adapting Large Language Models without Back-propagation): Silhouette Score = 0.2973
Paper 5 (LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS): Silhouette Score = 0.1614
Paper 6 (Implementation of quantum compression on IBM quantum computers): Silhouette Score = 0.0882
Paper 7 (The effect of verbal praise on prospective memory): Silhouette Score = 0.0000
Paper 8 (Categorization of disaster‑related deaths in Minamisoma city after the Fukushima nuclear disaster using clustering analysis): Silhouette Score = 0.0000
Paper 9 (Effects of a Home-Based Stretching Program on Bench Press M



## Kmeans clustering

In [None]:
# Kmeans clustering

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
import numpy as np

# Titles and texts in paper_dict order; row i of the TF-IDF matrix belongs to titles[i].
titles = list(paper_dict.keys())
papers_raw_text = list(paper_dict.values())

# Vectorize the paper texts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(papers_raw_text)

# Apply K-means clustering
n_clusters = 6
# n_init is pinned: its default changed from 10 to 'auto' in scikit-learn 1.4,
# so leaving it implicit makes the clustering depend on the installed version
# (and triggers a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(tfidf_matrix)

# K-means works directly on the TF-IDF features, so the silhouette is computed
# on those same features (Euclidean), NOT on a converted distance matrix.
silhouette_values = silhouette_samples(tfidf_matrix, cluster_labels, metric='euclidean')

# Calculate the overall silhouette score
overall_silhouette_score = silhouette_score(tfidf_matrix, cluster_labels, metric='euclidean')

print("Individual Silhouette Scores for Each Paper:")
for idx, (title, score) in enumerate(zip(titles, silhouette_values), start=1):
    print(f"Paper {idx} ({title}): Silhouette Score = {score:.4f}")

print(f"\nOverall Silhouette Score for the Dataset: {overall_silhouette_score:.4f}")




Individual Silhouette Scores for Each Paper:
Paper 1 (The alignment between brightest cluster galaxies and host clusters): Silhouette Score = 0.0465
Paper 2 (Automated Repair of Programs from Large Language Models): Silhouette Score = 0.0000
Paper 3 (Using LLM for Improving Key Event Discovery:Temporal-Guided News Stream Clustering with Event Summaries): Silhouette Score = 0.0207
Paper 4 (HyperTuning: Toward Adapting Large Language Models without Back-propagation): Silhouette Score = 0.1796
Paper 5 (LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS): Silhouette Score = 0.0993
Paper 6 (Implementation of quantum compression on IBM quantum computers): Silhouette Score = 0.0404
Paper 7 (The effect of verbal praise on prospective memory): Silhouette Score = 0.0000
Paper 8 (Categorization of disaster‑related deaths in Minamisoma city after the Fukushima nuclear disaster using clustering analysis): Silhouette Score = 0.0000
Paper 9 (Effects of a Home-Based Stretching Program on Bench Press M

## Other Index: Word2Vec Embeddings with K-means Clustering

In [None]:
from gensim.models import KeyedVectors
from gensim.downloader import load
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np



# Raw paper texts, ordered by paper number.
papers_raw_text = [
    paper1raw, paper2raw, paper3raw,
    paper4raw, paper5raw, paper6raw,
    paper7raw, paper8raw, paper9raw,
]


def preprocess(text):
    """Run gensim's `preprocess_string` on `text` with the filter chain listed below and return its result."""
    return preprocess_string(
        text,
        [
            strip_tags,
            strip_punctuation,
            strip_multiple_whitespaces,
            strip_numeric,
            remove_stopwords,
            strip_short,
        ],
    )


# One preprocessed document per paper, same order as `papers_raw_text`.
papers_preprocessed = list(map(preprocess, papers_raw_text))

# Pre-trained Word2Vec vectors fetched through gensim's downloader.
model = load('word2vec-google-news-300')

def document_vector(doc):
    """Embed a tokenized document as the mean of its known word vectors.

    Tokens missing from `model.key_to_index` are ignored. When no token is
    known (or `doc` is empty) a zero vector of length `model.vector_size`
    is returned instead.
    """
    known_vectors = [model[token] for token in doc if token in model.key_to_index]
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    # Element-wise average over all in-vocabulary tokens.
    return np.mean(known_vectors, axis=0)

# Vectorize each document: one averaged Word2Vec vector per paper
doc_vectors = np.array([document_vector(doc) for doc in papers_preprocessed])

# Apply K-means clustering
n_clusters = 6  # Adjust
# n_init is pinned: its default changed from 10 to 'auto' in scikit-learn 1.4,
# so leaving it implicit makes the clustering depend on the installed version
# (and triggers a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(doc_vectors)

# Calculate silhouette scores for each paper on the same embedding space
silhouette_values = silhouette_samples(doc_vectors, cluster_labels, metric='euclidean')

# Calculate the overall silhouette score
overall_silhouette_score = silhouette_score(doc_vectors, cluster_labels, metric='euclidean')

print("Individual Silhouette Scores for Each Paper:")
for idx, score in enumerate(silhouette_values, start=1):
    print(f"Paper {idx}: Silhouette Score = {score:.4f}")

print(f"\nOverall Silhouette Score for the Dataset: {overall_silhouette_score:.4f}")


Individual Silhouette Scores for Each Paper:
Paper 1: Silhouette Score = 0.0000
Paper 2: Silhouette Score = 0.1615
Paper 3: Silhouette Score = 0.1758
Paper 4: Silhouette Score = 0.1260
Paper 5: Silhouette Score = -0.0011
Paper 6: Silhouette Score = 0.2294
Paper 7: Silhouette Score = 0.0000
Paper 8: Silhouette Score = 0.0000
Paper 9: Silhouette Score = 0.0000

Overall Silhouette Score for the Dataset: 0.0768




## Davies-Bouldin and Calinski-Harabasz Index

In [None]:
#DBI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score
import numpy as np


# TF-IDF features for every paper, rows in paper_dict order.
papers_raw_text = list(paper_dict.values())
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(papers_raw_text)

# Clusters to evaluate: the LLM-generated assignment (cluster title -> paper titles).
clusters = data

# Paper title -> numeric cluster id (position of its cluster in `clusters`).
paper_to_cluster = {
    paper: cluster_num
    for cluster_num, papers in enumerate(clusters.values())
    for paper in papers
}

# Labels aligned with the rows of `tfidf_matrix`.
cluster_labels = [paper_to_cluster[title] for title in paper_dict]

# Both indices are given the dense form of the matrix; densify once and reuse it.
tfidf_dense = tfidf_matrix.toarray()
dbi_score = davies_bouldin_score(tfidf_dense, cluster_labels)
chi_score = calinski_harabasz_score(tfidf_dense, cluster_labels)

print("Davies-Bouldin Index for the Dataset: {:.4f}".format(dbi_score))
print("Calinski-Harabasz Index for the Dataset: {:.13f}".format(chi_score))


Davies-Bouldin Index for the Dataset: 1.0398
Calinski-Harabasz Index for the Dataset: 1.0040536177716


## Dunn Index

In [None]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def dunn_index(tfidf_matrix, cluster_labels):
    """Compute the Dunn index of a clustering.

    The index is the smallest Euclidean distance between two points that lie
    in different clusters, divided by the largest Euclidean distance between
    two points that share a cluster (the largest cluster diameter).

    Parameters
    ----------
    tfidf_matrix : scipy sparse matrix or array-like of shape (n_samples, n_features)
        Feature matrix; sparse input is densified, anything else is converted
        with ``np.asarray``.
    cluster_labels : array-like of shape (n_samples,)
        Cluster id of every row. Lists are accepted: they are converted to an
        array so that the per-cluster comparison is element-wise.

    Returns
    -------
    float or int
        The Dunn index. ``0`` is returned when the largest cluster diameter is
        zero (e.g. every cluster is a single point); ``inf`` results when there
        is only one cluster with a positive diameter.
    """
    # Local imports keep the function self-contained.
    from scipy.sparse import issparse
    from scipy.spatial.distance import cdist

    if issparse(tfidf_matrix):
        features = tfidf_matrix.toarray()
    else:
        features = np.asarray(tfidf_matrix, dtype=float)

    # A plain list compared with `== label` yields a single bool rather than a
    # mask, which would silently select no points; force an array.
    labels = np.asarray(cluster_labels)

    # Pairwise Euclidean distances between all points.
    distances = cdist(features, features, metric="euclidean")

    # Row indices of the members of each cluster.
    members = [np.flatnonzero(labels == label) for label in np.unique(labels)]

    min_intercluster_distance = np.inf
    max_intracluster_diameter = 0.0

    for position, cluster_i_indices in enumerate(members):
        # Diameter: largest distance inside the cluster (0 for a singleton).
        intra_distances = distances[np.ix_(cluster_i_indices, cluster_i_indices)]
        max_intracluster_diameter = max(max_intracluster_diameter, intra_distances.max())

        # Distances are symmetric, so each unordered pair of clusters is
        # inspected only once.
        for cluster_j_indices in members[position + 1:]:
            inter_distances = distances[np.ix_(cluster_i_indices, cluster_j_indices)]
            min_intercluster_distance = min(min_intercluster_distance, inter_distances.min())

    if max_intracluster_diameter > 0:
        return min_intercluster_distance / max_intracluster_diameter
    return 0

# Dunn index of the current clustering. `tfidf_matrix` and `cluster_labels` are
# not defined in this cell; they are whatever the previously executed cells left
# in the kernel. The labels are handed over as a NumPy array.
labels_array = np.array(cluster_labels)
dunn_score = dunn_index(tfidf_matrix, labels_array)

print(f"Dunn Index for the Dataset: {dunn_score:.4f}")


Dunn Index for the Dataset: 0.8140
