diff --git a/clustering/download.py b/clustering/download.py
new file mode 100644
index 0000000..3fdf3ff
--- /dev/null
+++ b/clustering/download.py
@@ -0,0 +1,76 @@
+import requests
+import logging
+
+import trafilatura
+
+from transformers import pipeline
+from transformers import AutoTokenizer
+
+import numpy as np
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+max_embedding_characters = 128 # Deliberately low, as the current model is not intended for document-level embedding
+
+feature_extractor_checkpoint = 'sentence-transformers/LaBSE'
+tokenizer_checkpoint = 'gpt2'
+
+feature_extractor = pipeline('feature-extraction', framework='pt', model=feature_extractor_checkpoint)
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
+
+def fetch_and_parse(url):
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+    except (requests.HTTPError, requests.ConnectionError, requests.Timeout) as error:
+        logging.error(f'Failed to fetch {url}: {error}')
+        return None, None
+
+    content = response.text
+
+    # Plain-text extraction that keeps formatting, tables, images and links
+    markdown = trafilatura.extract(content, output_format='txt', include_formatting=True,
+                                   include_tables=True, include_images=True, no_fallback=True, include_links=True)
+
+    return content, markdown
+
+def embed(text):
+    embedding = feature_extractor(text)
+
+    return embedding
+
+def tokenize(text):
+    tokens = tokenizer.encode(text)
+
+    return tokens
+
+def process_url(url):
+    content, markdown = fetch_and_parse(url)
+
+    if content is None:
+        return None, None, None, None
+
+    content_short = content[:max_embedding_characters]
+
+    tokens = tokenize(content)
+    embedding = embed(content_short)
+
+    embedding = np.array(embedding)
+
+    return content, markdown, tokens, embedding
+
+def main():
+    url = 'https://huggingface.co'
+
+    content, markdown, tokens, embedding = process_url(url)
+
+    if content is None:
+        return
+
+    for current in [content, markdown, embedding.shape]:
+        print(f'{"-" * 32}\n{current}')
+
+    print('-' * 32)
+
+if __name__ == '__main__':
+    main()
diff --git a/clustering/feature_extractor.py b/clustering/feature_extractor.py
new file mode 100644
index 0000000..a873603
--- /dev/null
+++ b/clustering/feature_extractor.py
@@ -0,0 +1,54 @@
+import torch
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+class FeatureExtractor:
+    def __init__(self, device='cpu', model_id='bigscience/bloom-560m', num_decoder_blocks=8):
+        self.device = device
+
+        self.num_decoder_blocks = num_decoder_blocks
+        self.model_id = model_id
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
+
+        # Keep only the first num_decoder_blocks decoder blocks to speed up feature extraction.
+        # Note that the attribute path (model.transformer.h) is BLOOM-specific and will change for other model families.
+        h = self.model.transformer.h[:num_decoder_blocks]
+        self.model.transformer.h = h
+
+        self.model = self.model.to(device)
+        self.model.eval()
+
+
+    def encode(self, text):
+        tokens = self.tokenizer(text, padding=True, return_tensors='pt').to(self.device)
+
+        with torch.no_grad():
+            output = self.model(**tokens, output_hidden_states=True).hidden_states[-1]
+
+        output = output.detach().cpu().numpy()
+
+        return output
+
+
+    def __call__(self, text):
+        output = self.encode(text)
+
+        return output
+
+
+def main():
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f'Using {device} device')
+
+    feature_extractor = FeatureExtractor(device=device)
+
+    output = feature_extractor('Hello world!')
+    print(output)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/clustering/hierarchical_clustering.py b/clustering/hierarchical_clustering.py
new file mode 100644
index 0000000..b5c99ae
--- /dev/null
+++ b/clustering/hierarchical_clustering.py
@@ -0,0 +1,245 @@
+import math
+import random
+
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+from collections import Counter
+
+import numpy as np
+from fast_pytorch_kmeans import KMeans
+
+from memmap_utils import np_memmap, get_np_memmap_length
+
+
+class ClusterAnalysis(nn.Module):
+    def __init__(
+        self,
+        mmap_file=None,
+        embed_dim=128,
+        dtype=np.float32,
+    ):
+        super().__init__()
+
+        self.mmap_file = mmap_file
+        self.embed_dim = embed_dim
+
+        self.dtype = dtype
+
+        self.clusters = {}
+        self.span_to_cluster_label = {}
+
+
+    @staticmethod
+    def _cluster_one_batch(
+        true_k,
+        spans,
+        clusters,
+        span_to_cluster_label,
+        level,
+        cluster_embeddings,
+        min_overlap_merge_cluster,
+        device
+    ):
+        with torch.no_grad():
+            embeddings = torch.from_numpy(cluster_embeddings)
+
+            km = KMeans(n_clusters=true_k, mode='cosine')
+            km_labels = km.fit_predict(embeddings.to(device=device, dtype=torch.float32)).tolist()
+
+            embeddings = None
+
+        if not clusters:
+            # First pass: every k-means label becomes a new cluster, keyed by (first span index, level)
+            label_to_label = {}
+
+            for span, label in zip(spans, km_labels):
+                label = (label, level)
+
+                if label not in label_to_label:
+                    label_to_label[label] = (span[0], level)
+
+                label = label_to_label[label]
+
+                clusters[label] = clusters.get(label, []) + [span]
+                span_to_cluster_label[span] = label
+
+            output = list(clusters.keys())
+
+            return output
+
+        tmp_cluster = {}
+
+        for span, label in zip(spans, km_labels):
+            tmp_cluster[label] = tmp_cluster.get(label, []) + [span]
+
+        new_labels = []
+
+        for a_cluster in tmp_cluster.values():
+            need_labels = [span for span in a_cluster if span not in span_to_cluster_label or span_to_cluster_label[span][1] != level]
+            cluster_labels = [span_to_cluster_label[span] for span in a_cluster if span in span_to_cluster_label and span_to_cluster_label[span][1] == level]
+
+            if not need_labels:
+                continue
+
+            if not cluster_labels:
+                # No existing cluster at this level overlaps: open a new one, keyed by the first unlabeled span's index
+                label = (need_labels[0][0], level)
+            else:
+                most_common = Counter(cluster_labels).most_common(1)[0]
+
+                if most_common[1] < min_overlap_merge_cluster:
+                    label = (need_labels[0][0], level)
+                else:
+                    label = most_common[0]
+
+            new_labels.append(label)
+
+            for span in need_labels:
+                clusters[label] = clusters.get(label, []) + [span]
+                span_to_cluster_label[span] = label
+
+        return new_labels
+
+
+    def create_hierarchical_clusters(
+        self,
+        force_recluster_idxs=None,
+        max_level=4,
+        max_cluster_size=32, # Small value for debug purposes
+        min_overlap_merge_cluster=2,
+        prefered_leaf_node_size=None,
+        kmeans_batch_size=250000,
+        use_tqdm=False,
+        device='cuda:0'
+    ):
+        mmap_file = self.mmap_file
+        embed_dim = self.embed_dim
+        dtype = self.dtype
+
+        mmap_len = get_np_memmap_length(mmap_file, [0, embed_dim], dtype=dtype)
+
+        clusters = self.clusters
+        span_to_cluster_label = self.span_to_cluster_label
+
+        if force_recluster_idxs:
+            force_recluster_idxs = set(force_recluster_idxs)
+        else:
+            force_recluster_idxs = ()
+
+        already_clustered = set([span[0] for span in span_to_cluster_label if span[1] == 0 and span[0] not in force_recluster_idxs])
+
+        idxs = []
+
+        if force_recluster_idxs:
+            idxs = list(force_recluster_idxs)
+            force_recluster_idxs = None
+
+        idxs.extend([idx for idx in range(mmap_len) if idx not in already_clustered])
+
+        if not idxs:
+            return
+
+        already_clustered = list(already_clustered)
+
+        if len(already_clustered) > int(0.5 * kmeans_batch_size):
+            idxs.extend(random.sample(already_clustered, int(0.5 * kmeans_batch_size)))
+        else:
+            idxs.extend(already_clustered)
+
+        already_clustered = None
+
+        idxs.extend([span[0] for span in span_to_cluster_label if span[1] != 0])
+        idxs = list(set(idxs))
+        random.shuffle(idxs)
+
+        if not prefered_leaf_node_size:
+            prefered_leaf_node_size = int(max_cluster_size * 0.7)
+
+        assert prefered_leaf_node_size <= max_cluster_size, 'prefered_leaf_node_size must not exceed max_cluster_size'
+
+        for level in range(max_level):
+            all_spans = [(idx, level) for idx in idxs]
+            len_spans = len(all_spans)
+
+            step_size = int(0.7 * kmeans_batch_size)
+            num_times = max(3, math.ceil(len_spans / step_size))
+
+            if use_tqdm:
+                num_times_2 = tqdm(range(num_times))
+            else:
+                num_times_2 = range(num_times)
+
+            for times in num_times_2:
+                # Slide a step_size window over all_spans; spans before the window are mixed back in below
+                rng = times * step_size
+                max_rng = min(len_spans, rng + step_size)
+
+                spans = all_spans[rng:max_rng]
+
+                not_already_clustered = [span for span in all_spans[:rng] if span not in span_to_cluster_label]
+
+                if len(not_already_clustered) > int(0.5 * kmeans_batch_size):
+                    spans.extend(random.sample(not_already_clustered, int(0.5 * kmeans_batch_size)))
+                else:
+                    spans.extend(not_already_clustered)
+
+                if len(spans) == 0:
+                    break
+
+                already_clustered = [span for span in all_spans[:rng] if span in span_to_cluster_label]
+
+                if len(already_clustered) > int(0.5 * kmeans_batch_size):
+                    spans.extend(random.sample(already_clustered, int(0.5 * kmeans_batch_size)))
+                else:
+                    spans.extend(already_clustered)
+
+                embedding_idxs = [span[0] for span in spans]
+
+                if level == 0:
+                    true_k = max(1, int(len(embedding_idxs) / prefered_leaf_node_size))
+                else:
+                    true_k = max(1, int(len(embedding_idxs) / max_cluster_size))
+
+                cluster_embeddings = np_memmap(mmap_file, shape=[mmap_len, embed_dim], idxs=embedding_idxs, dtype=dtype)
+
+                new_labels = self._cluster_one_batch(true_k, spans, clusters, span_to_cluster_label, level, cluster_embeddings, min_overlap_merge_cluster, device)
+
+                if not new_labels:
+                    break
+
+                need_more = False
+
+                if times <= num_times - 2:
+                    # Except on the last pass, dissolve clusters that are still too small so their spans get reclustered
+                    for label in new_labels:
+                        if len(clusters[label]) < prefered_leaf_node_size:
+                            del clusters[label]
+
+                            need_more = True
+
+                if not need_more:
+                    break
+
+            # The next level clusters one representative span per cluster of the current level
+            idxs = [val[0][0] for key, val in clusters.items() if key[1] == level]
+
+            if len(idxs) < max_cluster_size:
+                break
+
+
+def main():
+    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+
+    cluster_analysis = ClusterAnalysis(
+        mmap_file='output/embeddings.mmap',
+        embed_dim=1024
+    )
+
+    cluster_analysis.create_hierarchical_clusters(device=device)
+
+    print(list(cluster_analysis.clusters.keys()))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/clustering/memmap_utils.py b/clustering/memmap_utils.py
new file mode 100644
index 0000000..65b194e
--- /dev/null
+++ b/clustering/memmap_utils.py
@@ -0,0 +1,71 @@
+import os
+
+import numpy as np
+
+
+def is_contiguous(arr):
+    start = None
+    prev = None
+    contiguous = True
+
+    if len(arr) == 0:
+        return False, None, None
+
+    for idx in arr:
+        if start is None:
+            start = idx
+        if prev is None or idx == prev + 1:
+            prev = idx
+
+            continue
+
+        contiguous = False
+
+        break
+
+    return contiguous, start, idx + 1
+
+
+def np_memmap(file_name, data=None, idxs=None, shape=None, dtype=np.float32, offset=0, order='C'):
+    if not file_name.endswith('.mmap'):
+        file_name += '.mmap'
+
+    if os.path.exists(file_name):
+        mode = 'r+'
+    else:
+        mode = 'w+'
+
+    if shape is None and data is not None:
+        shape = data.shape
+
+    if not shape:
+        shape = [0, 1]
+
+    memmap = np.memmap(file_name, mode=mode, dtype=dtype, shape=tuple(shape), offset=offset, order=order)
+
+    if idxs is not None:
+        contiguous, start, end = is_contiguous(idxs)
+
+    if data is not None:
+        if tuple(shape) == tuple(data.shape):
+            memmap[:] = data
+        elif contiguous:
+            # A contiguous run of rows can be written with a single slice assignment
+            memmap[start:end] = data
+        else:
+            memmap[idxs] = data
+    elif idxs is not None:
+        # Read path: return only the requested rows as a regular in-memory array
+        return np.asarray(memmap[idxs])
+
+    return memmap
+
+
+def get_np_memmap_length(file_name, shape, dtype=np.float32):
+    if not os.path.exists(file_name):
+        return shape[0]
+    else:
+        # Bytes per row: itemsize times the product of the non-leading dimensions
+        size = np.dtype(dtype).itemsize * np.prod(shape[1:])
+
+        return int(os.path.getsize(file_name) / size)
diff --git a/clustering/train_clusterer.py b/clustering/train_clusterer.py
new file mode 100644
index 0000000..de9f373
--- /dev/null
+++ b/clustering/train_clusterer.py
@@ -0,0 +1,165 @@
+import argparse
+
+import numpy as np
+
+import pickle
+from pathlib import Path
+
+import torch
+
+from tqdm.auto import tqdm
+
+from kmeans_pytorch import KMeans as BalancedKMeans
+
+from transformers import pipeline
+
+from datasets import load_dataset
+
+from sklearn.manifold import TSNE
+
+import matplotlib.pyplot as plt
+
+from feature_extractor import FeatureExtractor
+from memmap_utils import np_memmap
+
+
+def load_model(path_to_model: Path):
+    with open(path_to_model, 'rb') as file:
+        output = pickle.load(file)
+
+    return output
+
+
+def extract_features(corpus, feature_extractor, batch_size=32, max_chars=256):
+    corpus = [element[:max_chars] for element in corpus]
+    batches = np.array_split(corpus, max(1, len(corpus) // batch_size), axis=0)
+
+    features = []
+
+    for batch in tqdm(batches):
+        batch = list(batch) # batches is a list of numpy arrays
+
+        features_current = feature_extractor(batch)
+        features_current = np.max(features_current, axis=1) # Max-pool over the token dimension
+
+        features.append(features_current)
+
+    features = np.concatenate(features, axis=0)
+
+    return features
+
+
+def train_kmeans(features, n_clusters, path_to_kmeans, balanced=False, device='cpu'):
+    kmeans = BalancedKMeans(n_clusters=n_clusters, device=device, balanced=balanced)
+
+    batch_size = 512 # Hyperparameter
+    batch_size = min(batch_size, len(features))
+
+    batches = np.array_split(features, features.shape[0] // batch_size, axis=0)
+
+    for idx, batch in tqdm(enumerate(batches)):
+        kmeans.fit(torch.from_numpy(batch), iter_limit=20, online=True, iter_k=idx)
+
+    with open(path_to_kmeans, 'wb+') as file:
+        pickle.dump(kmeans, file)
+
+    return kmeans
+
+
+def main(n_clusters=16, balanced=False, output_dir=Path('cluster_output/'), shuffle_dataset=True, take_sample=None, embed_only=False, seed=42, visualize=False):
+    dataset_name_train = 'JeanKaddour/minipile'
+    content_column_train = 'text'
+
+    subset_train = None # 'p3'
+    split_train = 'train'
+
+    dataset_train = load_dataset(dataset_name_train, subset_train, split=split_train, streaming=(take_sample is not None))
+
+    if shuffle_dataset:
+        dataset_train = dataset_train.shuffle(seed=seed)
+
+    corpus = []
+
+    for idx, element in enumerate(dataset_train):
+        if take_sample and idx >= take_sample:
+            break
+
+        corpus.append(element[content_column_train])
+
+    if not output_dir.is_dir():
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f'Using {device} device')
+
+    feature_extractor_batch_size = 1
+
+    feature_extractor_checkpoint = 'bigscience/bloom-560m' # 'sentence-transformers/LaBSE' # 'xlm-roberta-large'
+    feature_extractor = FeatureExtractor(device=device) # pipeline('feature-extraction', framework='pt', model=feature_extractor_checkpoint)
+
+    features = extract_features(corpus, feature_extractor, batch_size=feature_extractor_batch_size)
+
+    memmap_file_path = 'output/embeddings.mmap' # TODO: Create a configs.py file
+
+    np_memmap(memmap_file_path, data=features)
+
+    if embed_only:
+        return
+
+    path_to_kmeans = output_dir / 'kmeans.pkl'
+    kmeans = train_kmeans(features, n_clusters, path_to_kmeans, balanced=balanced, device=device)
+
+    if visualize:
+        tsne = TSNE(n_components=2)
+        features_2d = tsne.fit_transform(features)
+
+        plt.scatter(features_2d[:, 0], features_2d[:, 1], c=kmeans.predict(torch.from_numpy(features)).cpu())
+        plt.show()
+
+    return kmeans
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    def str2bool(value):
+        return value.lower() in ('1', 'true', 'yes') # argparse's type=bool would treat the string 'False' as True
+
+    parser.add_argument('--n-clusters', required=True, type=int)
+    parser.add_argument('--balanced', action='store_true')
+    parser.add_argument('--output-dir', required=True, type=Path)
+    parser.add_argument('--eval-only', action='store_true')
+    parser.add_argument('--shuffle-dataset', required=False, type=str2bool, default=True)
+    parser.add_argument('--take-sample', required=False, type=int)
+    parser.add_argument('--embed-only', required=False, type=str2bool, default=False)
+    parser.add_argument('--visualize', action='store_true')
+
+    args = parser.parse_args()
+
+    if not args.eval_only:
+        kmeans = main(
+            n_clusters=args.n_clusters,
+            balanced=args.balanced,
+            output_dir=args.output_dir,
+            take_sample=args.take_sample,
+            shuffle_dataset=args.shuffle_dataset,
+            embed_only=args.embed_only,
+            visualize=args.visualize
+        )
+
+    path_to_kmeans = args.output_dir / 'kmeans.pkl'
+
+    if path_to_kmeans.exists(): # The pickle is only written when clustering actually ran (not with --embed-only)
+        kmeans = load_model(path_to_kmeans)
+
+
+# Usage
+
+# python3 train_clusterer.py --n-clusters 4 --output-dir output/ --take-sample 128 --embed-only False --visualize
+
+# Warning! You need to install the balanced k-means package from https://github.com/kernelmachine/balanced-kmeans.git
+
+# cd ..
+# git clone https://github.com/kernelmachine/balanced-kmeans.git
+# cd balanced-kmeans
+# pip3 install -e .
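Follow-up usage (a minimal sketch, under the assumption that train_clusterer.py has already been run so that output/embeddings.mmap holds the 1024-dimensional bloom-560m features that clustering/hierarchical_clustering.py expects):

    # Hypothetical downstream step: build the cluster hierarchy from the saved embeddings
    from hierarchical_clustering import ClusterAnalysis

    cluster_analysis = ClusterAnalysis(mmap_file='output/embeddings.mmap', embed_dim=1024)
    cluster_analysis.create_hierarchical_clusters(device='cpu')  # defaults to 'cuda:0'; pass 'cpu' on machines without a GPU

    # Cluster keys are (representative embedding index, level) tuples
    print(list(cluster_analysis.clusters.keys()))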