In [1]:
%pip install python-dotenv datasets transformers mteb scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [37]:
# Helper Functions & Dependencies
from dotenv import load_dotenv

if load_dotenv(".env") == False:
    print("No .env file found")
from datasets import load_dataset
import numpy as np
import pandas as pd
import sklearn
import sklearn.cluster
from sentence_transformers import SentenceTransformer


def evaluator(model, sentences, labels, max_context_size=512, batch_size=1, clustering_batch_size=500, random_state=None):
    """
    Embed one set of sentences, cluster the embeddings and score the result.

    The sentences are encoded with ``model``, grouped with mini-batch k-means
    into as many clusters as there are distinct values in ``labels``, and the
    cluster assignment is compared against ``labels`` with the V-measure.

    Args:
        model: Object exposing ``encode(sentences, batch_size=...)`` and a
            writable ``max_seq_length`` attribute (the notebook passes a
            ``SentenceTransformer``). Its ``max_seq_length`` is overwritten
            as a side effect of this call.
        sentences: Texts to embed.
        labels: Ground-truth label for each entry of ``sentences``.
        max_context_size: Value assigned to ``model.max_seq_length`` before
            encoding.
        batch_size: Batch size passed to ``model.encode``.
        clustering_batch_size: ``batch_size`` passed to ``MiniBatchKMeans``.
        random_state: Optional seed passed to ``MiniBatchKMeans`` so repeated
            runs give the same clustering. ``None`` (default) keeps the
            previous unseeded behaviour.

    Returns:
        dict: ``{"v_measure": score}``.
    """
    # Imported explicitly: the file only imports ``sklearn`` and
    # ``sklearn.cluster``, which does not guarantee ``sklearn.metrics`` is loaded.
    from sklearn.metrics.cluster import v_measure_score

    # Note: this mutates the caller's model object.
    model.max_seq_length = max_context_size
    corpus_embeddings = np.asarray(
        model.encode(sentences, batch_size=batch_size)
    )

    clustering_model = sklearn.cluster.MiniBatchKMeans(
        n_clusters=len(set(labels)),
        batch_size=clustering_batch_size,
        n_init="auto",
        random_state=random_state,
    )
    clustering_model.fit(corpus_embeddings)

    cluster_assignment = clustering_model.labels_
    v_measure = v_measure_score(labels, cluster_assignment)

    return {"v_measure": v_measure}

def evaluate(model, dataset, max_context_size, **kwargs):
    """
    Run ``evaluator`` on every cluster set of ``dataset`` and aggregate scores.

    Args:
        model: Embedding model handed to ``evaluator`` unchanged.
        dataset: Iterable of mappings, each with a ``"sentences"`` and a
            ``"labels"`` entry.
        max_context_size: Forwarded to ``evaluator`` as ``max_context_size``.
        **kwargs: Extra keyword arguments forwarded to ``evaluator``
            (e.g. ``batch_size``, ``clustering_batch_size``). Previously these
            were accepted but silently ignored.

    Returns:
        dict: ``{"v_measure": mean, "v_measure_std": std}`` computed with
        ``np.mean`` / ``np.std`` over the per-set V-measure scores.
    """
    v_measures = [
        evaluator(
            model,
            cluster_set["sentences"],
            cluster_set["labels"],
            max_context_size=max_context_size,
            **kwargs,
        )["v_measure"]
        for cluster_set in dataset
    ]

    v_mean = np.mean(v_measures)
    v_std = np.std(v_measures)
    return {"v_measure": v_mean, "v_measure_std": v_std}


# Clustering benchmark: evaluate at several max sequence lengths, collect the
# results in a DataFrame and write them to CSV.
def run_benchmark(model_id, model, dataset, dataset_id, split="test"):
    """
    Evaluate ``model`` on one dataset split at max lengths 10, 20 and 50.

    Args:
        model_id: Identifier used in the output file name.
        model: Embedding model handed to ``evaluate``.
        dataset: Mapping from split name to the cluster sets of that split.
        dataset_id: Identifier used in the output file name.
        split: Key of ``dataset`` to evaluate (default ``"test"``).

    Returns:
        pandas.DataFrame: One column per max length; the rows are the keys of
        the dict returned by ``evaluate``. The same frame is written to
        ``results/clustering/cutoff_{model_id}_{dataset_id}.csv``.
    """
    import os

    results = {}
    max_lengths = [10, 20, 50]

    for max_length in max_lengths:
        print(f"Running with {max_length} max length.")
        results[max_length] = evaluate(
            model, dataset[split], max_length
        )

    df = pd.DataFrame(results)
    # Create the output directory first: to_csv does not create missing
    # parent directories and would otherwise fail after the benchmark ran.
    os.makedirs("results/clustering", exist_ok=True)
    df.to_csv(f"results/clustering/cutoff_{model_id}_{dataset_id}.csv")
    return df


def decimate_dataset(dataset, decimation_factor=0.1, split="test", seed=None):
    """
    Randomly subsample every cluster set of one split of a clustering dataset.

    Each sentence/label pair is kept when a uniform draw from [0, 1) is below
    ``decimation_factor``; the first pair of every cluster set is always kept.

    Args:
        dataset: Mapping from split name to an indexable sequence of cluster
            sets, each with a ``"sentences"`` and a ``"labels"`` entry.
        decimation_factor: Probability of keeping each pair.
        split: Key of ``dataset`` to decimate (default ``"test"``).
        seed: Optional seed for a private random generator, making the
            selection reproducible. ``None`` (default) draws from NumPy's
            global random state, as before.

    Returns:
        dict: ``{split: [{"sentences": [...], "labels": [...]}, ...]}`` with
        one entry per cluster set of the input split, in the same order.
    """
    data = dataset[split]
    # Both ``np.random`` and ``RandomState`` expose ``rand()``.
    rng = np.random if seed is None else np.random.RandomState(seed)

    decimated = []
    for c in range(len(data)):
        # Look the row up once instead of on every inner-loop access; row
        # lookup may be expensive for dataset objects.
        cluster_set = data[c]
        sentences = cluster_set["sentences"]
        labels = cluster_set["labels"]

        kept_sentences = []
        kept_labels = []
        for i in range(len(sentences)):
            # The draw happens before the ``i == 0`` check so the random
            # stream is consumed once per sentence, first one included.
            if rng.rand() < decimation_factor or i == 0:
                kept_sentences.append(sentences[i])
                kept_labels.append(labels[i])

        decimated.append({"sentences": kept_sentences, "labels": kept_labels})

    return {split: decimated}

In [4]:
# Load the two clustering datasets that the benchmark cells below pass to
# run_benchmark.
big_patent_dataset = load_dataset("jinaai/big-patent-clustering")
wiki_cities_dataset = load_dataset("jinaai/cities_wiki_clustering")

  table = cls._concat_blocks(blocks, axis=0)


In [None]:
# Benchmark thenlper/gte-base on the big-patent dataset; the output CSV is
# tagged with model id "gte_base" and dataset id "big_patent".
model = SentenceTransformer('thenlper/gte-base')
run_benchmark("gte_base", model, big_patent_dataset, "big_patent")

In [None]:
# Same model on the wiki-cities dataset. Relies on `model` still holding the
# gte-base model assigned in the previous cell.
run_benchmark("gte_base", model, wiki_cities_dataset, "wiki_cities")

In [None]:
import torch

# Pick the best available device instead of hard-coding "mps", which fails on
# machines without Apple's Metal backend.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): the Jina v2 checkpoints may require `trust_remote_code=True`
# to load their custom architecture — confirm against the model card.
model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-en"
).to(device)
run_benchmark("jina", model, big_patent_dataset, "big_patent")

In [None]:
# Same model on the wiki-cities dataset. Relies on `model` still holding the
# Jina model assigned in the previous cell.
run_benchmark("jina", model, wiki_cities_dataset, "wiki_cities")