In [1]:
import numpy as np
from sklearn.metrics.cluster import pair_confusion_matrix
from sklearn import metrics
import sklearn.metrics.pairwise as smp
import sklearn.cluster as sc
import sentence_transformers as st
import csv

In [2]:
# Read the comments dataset: column 0 of each row goes to `sentences`,
# column 1 to `truth_labels`.
sentences = []
truth_labels = []
with open("../comments.csv", newline='') as csvfile:
    reader = csv.reader(csvfile)
    # Discard the first row (presumably a header -- confirm against the file).
    # The `None` default keeps an empty file from raising StopIteration.
    next(reader, None)
    for row in reader:
        # csv.reader yields [] for blank lines; skip them instead of failing
        # with IndexError on row[0].
        if not row:
            continue
        sentences.append(row[0])
        truth_labels.append(row[1])
n = len(sentences)

In [3]:
def cluster_and_evaluate(texts, distances, X, truth_labels, distance_threshold=0.1):
    """Cluster with average-linkage agglomerative clustering and print quality metrics.

    The clustering is fitted on a precomputed distance matrix with
    ``n_clusters=None``, so the number of clusters is determined by
    ``distance_threshold``. Results are printed; nothing is returned.

    Parameters
    ----------
    texts :
        Accepted for interface compatibility; not used by this function.
    distances :
        Precomputed pairwise distance matrix passed to ``fit`` and to
        ``silhouette_score(..., metric='precomputed')``.
    X :
        Feature representation of the samples, passed to the
        Calinski-Harabasz and Davies-Bouldin scores.
    truth_labels :
        Reference labels compared against the predicted cluster labels by
        the supervised metrics.
    distance_threshold : float, default 0.1
        Linkage distance threshold forwarded to ``AgglomerativeClustering``.

    Notes
    -----
    The bracketed ranges in the printed headers are kept unchanged so the
    output format stays stable (the ``*`` presumably marks the better end).
    Per the scikit-learn documentation the silhouette coefficient ranges
    over [-1, 1] and the Davies-Bouldin index has a minimum of 0 with no
    fixed upper bound, so read those two labels with care.
    """
    shared_params = dict(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage='average')
    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed
    # `affinity`; older releases reject `metric` with a TypeError at
    # construction time. Try the current spelling first, then fall back.
    try:
        clustering = sc.AgglomerativeClustering(metric='precomputed', **shared_params)
    except TypeError:
        clustering = sc.AgglomerativeClustering(affinity='precomputed', **shared_params)
    clustering.fit(distances)
    labels = clustering.labels_
    n_found = len(set(labels))

    print("\n")
    print("=== Unsupervised Metrics ===\n")
    # The internal metrics below raise ValueError unless the number of
    # clusters lies in [2, n_samples - 1]; report that instead of aborting
    # so the supervised metrics are still printed.
    if 2 <= n_found <= len(labels) - 1:
        print("== Silhouette Coefficient [0,1*] ==")
        print(metrics.silhouette_score(distances, labels, metric='precomputed'),"\n")
        print("== Calinski-Harabasz Index [0,inf*] ==")
        print(metrics.calinski_harabasz_score(X, labels),"\n")
        print("== Davies-Bouldin Index [0*,1] ==")
        print(metrics.davies_bouldin_score(X, labels),"\n")
    else:
        print("Skipped: need between 2 and n_samples - 1 clusters, found", n_found, "\n")
    print("\n")
    print("=== Supervised Metrics ===\n")
    print("== Rand Index [0,1*] ==")
    print(metrics.rand_score(truth_labels, labels),"\n")
    print("== Normalized Mutual Information Score [0,1*] ==")
    print(metrics.normalized_mutual_info_score(truth_labels, labels),"\n")
    print("== Fowlkes-Mallows Score [0,1*] ==")
    print(metrics.fowlkes_mallows_score(truth_labels, labels),"\n")

In [4]:
# Encode every sentence with a pretrained SentenceTransformer model, build the
# pairwise cosine-distance matrix from the embeddings, then cluster on those
# distances and print the evaluation metrics.
print("=== SentenceTransformer ===\n")
model = st.SentenceTransformer('paraphrase-distilroberta-base-v1')
embeddings = model.encode(sentences)
embed_dist = smp.cosine_distances(embeddings)
cluster_and_evaluate(
    texts=sentences,
    distances=embed_dist,
    X=embeddings,
    truth_labels=truth_labels,
)

=== SentenceTransformer ===



=== Unsupervised Metrics ===

== Silhouette Coefficient [0,1*] ==
0.10681761 

== Calinski-Harabasz Index [0,inf*] ==
18.402048298944738 

== Davies-Bouldin Index [0*,1] ==
0.2763499200435646 



=== Supervised Metrics ===

== Rand Index [0,1*] ==
0.45641678206519704 

== Normalized Mutual Information Score [0,1*] ==
0.2779774423229414 

== Fowlkes-Mallows Score [0,1*] ==
0.0391827065695695 

