In [1]:
%pip install python-dotenv datasets transformers mteb

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Helper Functions & Dependencies
from dotenv import load_dotenv
# Load variables from a local .env file and report when load_dotenv returns a
# falsy result.
# NOTE(review): this runs ahead of the remaining imports, presumably so that
# values from .env are already set when those libraries are imported — confirm
# before reordering.
if not load_dotenv(".env"):
    print("No .env file found")
from datasets import load_dataset
from transformers import AutoModel
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mteb.evaluation.evaluators import ClusteringEvaluator
from sentence_transformers import SentenceTransformer


def split_dataset_by_word_count(
    dataset, split="test", splits=(500, 1000, 2000, 5000, 10000, 20000, 50000, 100000)
):
    """
    Group the sentences of one dataset split into bins by whitespace word count.

    Every sentence is placed in exactly one bin: the smallest threshold in
    ``splits`` that is greater than or equal to its word count. Sentences with
    more words than the largest threshold are dropped.

    Args:
        dataset: mapping from split name to an iterable of cluster sets, each
            providing parallel ``"sentences"`` and ``"labels"`` sequences.
        split: name of the dataset split to read.
        splits: inclusive maximum word counts that define the bins.

    Returns:
        dict mapping each threshold of ``splits`` (in the order given) to
        ``{"sentences": [...], "labels": [...]}``.
    """
    binned_clusters = {max_words: {"sentences": [], "labels": []} for max_words in splits}
    # Scan thresholds in ascending order so a sentence always lands in the
    # tightest bin, even when the caller passes the thresholds unsorted.
    ascending_thresholds = sorted(binned_clusters)

    for cluster_set in dataset[split]:
        labels = cluster_set["labels"]
        for i, sentence in enumerate(cluster_set["sentences"]):
            label = labels[i]
            word_count = len(sentence.split())
            # A dedicated loop name avoids shadowing the `split` parameter.
            for max_words in ascending_thresholds:
                if word_count <= max_words:
                    target_bin = binned_clusters[max_words]
                    target_bin["sentences"].append(sentence)
                    target_bin["labels"].append(label)
                    break

    return binned_clusters

def evaluate_binned(model, sentences, labels, **kwargs):
    """
    Score one bin of sentences with mteb's ClusteringEvaluator.

    Args:
        model: embedding model handed to the evaluator.
        sentences: texts belonging to the bin.
        labels: cluster labels, parallel to ``sentences``.
        **kwargs: forwarded to ``ClusteringEvaluator``. ``batch_size`` defaults
            to 1 and may be overridden here.

    Returns:
        dict with ``"v_measure"`` and ``"v_measure_std"``. Only one evaluation
        is run per call, so the reported standard deviation is always 0.
    """
    # Merge the default rather than passing batch_size=1 next to **kwargs,
    # which raised TypeError whenever the caller supplied batch_size.
    evaluator_kwargs = {"batch_size": 1, **kwargs}
    evaluator = ClusteringEvaluator(sentences, labels, **evaluator_kwargs)
    metrics = evaluator(model)

    # A single measurement is kept in a list so np.mean/np.std return the same
    # keys and numeric types as before.
    v_measures = [metrics["v_measure"]]
    return {"v_measure": np.mean(v_measures), "v_measure_std": np.std(v_measures)}

def run_binned_evaluation(model, split_dataset):
    """
    Evaluate the model on every bin of a binned dataset.

    Args:
        model: embedding model passed through to ``evaluate_binned``.
        split_dataset: mapping from bin key to a dict holding ``"sentences"``
            and ``"labels"``.

    Returns:
        dict mapping each bin key to the result of ``evaluate_binned``.
    """
    scores_by_bin = {}
    for split, bin_data in split_dataset.items():
        print(f"Running evaluation on {split} ")
        bin_sentences = bin_data["sentences"]
        bin_labels = bin_data["labels"]
        scores_by_bin[split] = evaluate_binned(model, bin_sentences, bin_labels)
    return scores_by_bin

def run_binned_benchmark(model_id, model, binned_dataset, dataset_id):
    """
    Run the clustering evaluation on every bin and store the scores as CSV.

    Args:
        model_id: label for the model, used in the output file name.
        model: embedding model passed through to ``evaluate_binned``.
        binned_dataset: mapping from bin key to a dict holding ``"sentences"``
            and ``"labels"``.
        dataset_id: label for the dataset, used in the output file name.

    Returns:
        pandas DataFrame with one column per bin and one row per metric. The
        same table is written to
        ``results/clustering/binned_{model_id}_{dataset_id}.csv``.
    """
    from pathlib import Path  # local import keeps the function self-contained

    results = {}
    for split, bin_data in binned_dataset.items():
        print(f"Running evaluation on {split} samples")
        results[split] = evaluate_binned(model, bin_data["sentences"], bin_data["labels"])

    df = pd.DataFrame(results)

    output_path = Path(f"results/clustering/binned_{model_id}_{dataset_id}.csv")
    # to_csv does not create missing folders, so make sure the target directory
    # exists (this also covers ids that contain a "/").
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path)
    return df

def decimate_dataset(dataset, decimation_factor=0.1, seed=None):
    """
    Randomly subsample every bin of a binned clustering dataset.

    Each sample is kept when a uniform random draw falls below
    ``decimation_factor``. The first sample of a bin is always kept, so a
    non-empty bin never becomes empty.

    Args:
        dataset: mapping from bin key to a dict holding parallel
            ``"sentences"`` and ``"labels"`` sequences.
        decimation_factor: threshold for the uniform draw, i.e. the fraction of
            samples to keep on average.
        seed: optional seed for a private random generator, which makes the
            selection reproducible. When ``None`` (the default), numpy's global
            random state is used.

    Returns:
        New dict with the same keys as ``dataset``, each holding the retained
        sentences and their labels.
    """
    if seed is None:
        draw = np.random.rand
    else:
        draw = np.random.default_rng(seed).random

    decimated_dataset = {}
    for split, bin_data in dataset.items():
        sentences = bin_data["sentences"]
        labels = bin_data["labels"]
        kept = {"sentences": [], "labels": []}
        for i, sentence in enumerate(sentences):
            # The draw happens before the i == 0 check, so the first sample
            # consumes a random number as well.
            if draw() < decimation_factor or i == 0:
                kept["sentences"].append(sentence)
                kept["labels"].append(labels[i])
        decimated_dataset[split] = kept

    return decimated_dataset

In [2]:
# Load the two clustering datasets by their dataset identifiers.
big_patent_dataset = load_dataset("jinaai/big-patent-clustering")
wiki_cities_dataset = load_dataset("jinaai/cities_wiki_clustering")

  table = cls._concat_blocks(blocks, axis=0)


In [3]:
# Bin the "test" split of each dataset by word count. The wiki cities thresholds
# start lower (20 words) and stop earlier (20000 words) than the big patent ones.
binned_big_patent_dataset = split_dataset_by_word_count(big_patent_dataset, "test", [500, 1000, 2000, 5000, 10000, 20000, 50000, 100000])
binned_wiki_cities_dataset = split_dataset_by_word_count(wiki_cities_dataset, "test", [20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000])

In [4]:
# Fix numpy's global seed so the random decimation below selects the same
# subset on every run (decimate_dataset draws from np.random).
np.random.seed(0)
# Keep roughly 0.1% of each bin, plus the first sample of every bin, which
# decimate_dataset always retains.
decimated_big_patent_split = decimate_dataset(binned_big_patent_dataset, 0.001)
# Display the number of remaining sentences in each bin.
for split in decimated_big_patent_split:
    print(f"{split}: {len(decimated_big_patent_split[split]['sentences'])}")


500: 1
1000: 1
2000: 6
5000: 38
10000: 17
20000: 6
50000: 2
100000: 1


In [23]:
# Benchmark thenlper/gte-base on the decimated big patent bins. The call writes
# the per-bin scores to CSV and returns them as a DataFrame, which is shown as
# the cell output.
model = SentenceTransformer('thenlper/gte-base')
run_binned_benchmark("gte_base", model, decimated_big_patent_split, "big_patent")

Running evaluation on 500 samples
Running evaluation on 1000 samples
Running evaluation on 2000 samples
Running evaluation on 5000 samples
Running evaluation on 10000 samples
Running evaluation on 20000 samples
Running evaluation on 50000 samples
Running evaluation on 100000 samples


Unnamed: 0,500,1000,2000,5000,10000,20000,50000,100000
v_measure,1.0,1.0,0.631307,0.44023,0.539501,1.0,1.0,1.0
v_measure_std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Benchmark on the full (non-decimated) wiki cities bins. This uses whichever
# `model` was assigned most recently, but always writes under the "gte_base"
# file name, so run it right after the gte-base cell.
run_binned_benchmark("gte_base", model, binned_wiki_cities_dataset, "wiki_cities")

In [5]:
# Load the Jina embedding model and benchmark it on the decimated big patent bins.
import torch

# The "mps" (Apple Metal) backend is not available on every machine; fall back
# to CUDA or CPU so the cell does not fail when moving the model.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-en"
).to(device)
run_binned_benchmark("jina", model, decimated_big_patent_split, "big_patent")

Running evaluation on 500 samples


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running evaluation on 1000 samples
Running evaluation on 2000 samples
Running evaluation on 5000 samples


In [None]:
# Benchmark on the full (non-decimated) wiki cities bins. This uses whichever
# `model` was assigned most recently, but always writes under the "jina" file
# name, so run it right after the Jina model cell.
run_binned_benchmark("jina", model, binned_wiki_cities_dataset, "wiki_cities")