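First, we demonstrate how to calculate intrinsic cluster-quality metrics (silhouette, Calinski-Harabasz, and Davies-Bouldin), which are computed from the data and the cluster assignments alone.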

In [1]:
#Import required libraries
import os
from sklearn.cluster        import KMeans
from sklearn                import metrics
import numpy as np

In [2]:
# Score a clustering with three intrinsic metrics from the sklearn metrics library
def evaluate_clustering_performance(X, prediction):
    """Compute the intrinsic quality scores for one clustering.

    Args:
        X: The data that was clustered; handed unchanged to each sklearn scorer.
        prediction: The cluster label assigned to each sample in ``X``.

    Returns:
        A 3-tuple ``(silhouette, calinski_harabasz, davies_bouldin)``, in that
        order.
    """
    # Order matters: callers unpack the result positionally.
    scorers = (
        metrics.silhouette_score,
        metrics.calinski_harabasz_score,
        metrics.davies_bouldin_score,
    )
    return tuple(scorer(X, prediction) for scorer in scorers)

In [3]:
# This method takes in an embedding file and loads it in a format to be clustered
def load_embedding(filename):
    """Read a text embedding file into a dense ``(n, d)`` NumPy array.

    Expected layout (whitespace-separated):
        * first line: ``n d`` -- the number of rows and the embedding dimension;
        * next ``n`` lines: ``index v1 v2 ... vd`` -- the row index followed by
          that row's ``d`` values. Tokens beyond the first ``d`` values are
          ignored.

    Each vector is stored at the row given by its ``index`` token, so lines may
    appear in any order. Rows whose index never appears stay all-zero.

    Args:
        filename: Path of the embedding file to read.

    Returns:
        A float array of shape ``(n, d)``.

    Raises:
        ValueError: If the header is not two integers, a token is not numeric,
            or a data line holds fewer than ``d + 1`` values (this includes a
            file with fewer than ``n`` data lines).
        IndexError: If a row index falls outside ``[-n, n)``.
    """
    # The context manager closes the file even when parsing fails part-way.
    with open(filename, "r") as f:
        # split() without an argument tolerates trailing/repeated whitespace,
        # which split(" ") turns into empty tokens that fail to parse.
        n, d = [int(x) for x in f.readline().split()]
        embedding = np.zeros((n, d))
        for _ in range(n):
            values = [float(x) for x in f.readline().split()]
            # Check the length explicitly: a too-short slice could otherwise be
            # silently broadcast across the row by NumPy.
            if len(values) < d + 1:
                raise ValueError(
                    "expected an index followed by {} values, got {} token(s)".format(
                        d, len(values)
                    )
                )
            embedding[int(values[0])] = values[1:d + 1]

    return embedding

In [4]:
# First, read the file of previously generated node embeddings
print("====================\nLoading Graph Embeddings\n")
embedding_file = "TADW.nv"
graph_embedding = load_embedding(embedding_file)
print("Embeddings Loaded.\n====================")

Loading Graph Embeddings

Embeddings Loaded.


In [5]:
# Second, we use K means clustering to generate 9 clusters from our loaded graph embeddings.
# K-means starts from randomly chosen centroids, so the seed is fixed here; without it the
# cluster labels, and every metric computed from them, change from one run to the next.
N_CLUSTERS = 9
KMEANS_SEED = 0
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_SEED).fit(graph_embedding)
kmeans_prediction = kmeans.labels_

# Third, we evaluate the clustering performance through the intrinsic quality of our clusters
kmeans_performance = evaluate_clustering_performance(graph_embedding, kmeans_prediction)
print("[KMeans] Clustering Finished.")

[KMeans] Clustering Finished.


In [6]:
# Finally, unpack the stored evaluation metrics and report each to three decimal places
k_s, k_c, k_d = kmeans_performance
report_template = "Silhouette: {:.3f}\tCH: {:.3f}\tDB: {:.3f}\t"
print(report_template.format(k_s, k_c, k_d))

Silhouette: 0.261	CH: 56.485	DB: 1.683	


Next, we demonstrate how to evaluate extrinsic metrics, which compare the predicted cluster labels against ground-truth labels.

In [7]:
# import additional libraries
import pandas as pd

In [8]:
# Extrinsic evaluation needs two inputs: a csv of ground truth labels and a csv of predicted labels.
# Step 1: Load ground truth data
# Read the labeled ground truth dataset from csv into a pandas dataframe
repos_true_labels = pd.read_csv("nrel_repos_true_labels.csv")

# Work on a copy so the dataframe as loaded is left untouched
repos_true_df = repos_true_labels.copy()

# Select, by column name, the index numbers and the labels of the ground truth dataset
repos_index, repo_gold = repos_true_df['id'], repos_true_df['label']

In [9]:
# Step 2: Load predicted data
# Read in the predicted cluster labels as a dataframe
repos_predicted = pd.read_csv("nrel_labels.csv")

# Slice dataframe to get index number and cluster labels from predicted data. Use column names for selection.
repos_predicted = repos_predicted[['ID', 'VADW_6']]

# Keep only the rows whose ID is present in both the predicted and the ground truth data.
repos_predicted = repos_predicted[repos_predicted['ID'].isin(repos_index)]
repos_true_labels = repos_true_labels[repos_true_labels['id'].isin(repos_predicted['ID'])]

# Filtering leaves each dataframe in its own original row order, but the labels are later
# compared position by position. Sort both by ID so that row k of the predictions and row k
# of the ground truth describe the same item.
repos_predicted = repos_predicted.sort_values('ID', kind='stable')
repos_true_labels = repos_true_labels.sort_values('id', kind='stable')

# Fail loudly if the rows still do not pair up one-to-one (e.g. an ID repeated in one file);
# scoring mismatched pairs would silently produce meaningless metrics.
predicted_ids = repos_predicted['ID'].to_numpy()
true_ids = repos_true_labels['id'].to_numpy()
if len(predicted_ids) != len(true_ids) or (predicted_ids != true_ids).any():
    raise ValueError(
        "Predicted and ground truth IDs do not match one-to-one after filtering; "
        "check both files for duplicated IDs."
    )

# Create copies of each dataframe to manipulate
repos_true_df = repos_true_labels.copy()

# Slice dataframe to get index number from ground truth dataset. Use column names for selection.
repos_index = repos_true_df['id']

# Slice dataframe to get label from ground truth dataset. Use column names for selection.
repo_gold = repos_true_df['label']

# Add final dataframe to list. If there is more than one dataset, just repeat the code above
# for the predicted data and add the dataframes to the appropriate lists below.
repos = [repos_predicted]

In [12]:
# Score every predicted labelling in `repos` against the ground truth labels
for predicted_frame in repos:
    labels_pred = predicted_frame['VADW_6'].to_list()
    labels_true = repo_gold.to_list()

    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    comp = metrics.completeness_score(labels_true, labels_pred)
    homog = metrics.homogeneity_score(labels_true, labels_pred)
    vmeasure = metrics.v_measure_score(labels_true, labels_pred)

    # Print one "<caption><score>" line per metric, in a fixed order
    report_lines = (
        ("ARI: ", ari),
        ("AMI: ", ami),
        ("Completeness: ", comp),
        ("Homogeneity: ", homog),
        ("V-Measure: ", vmeasure),
    )
    for caption, score in report_lines:
        print(caption + str(score))

ARI: 0.012010711577649737
AMI: -0.00023805946030836558
Completeness: 0.0778597178686849
Homogeneity: 0.07467474878292164
V-Measure: 0.07623398173257341
