<a href="https://colab.research.google.com/github/igor531205/applied_machine_learning_tasks/blob/hw3_vector_quantization/hw3_vector_quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [1]:
!pip install -U -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m798.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Standard libraries

In [2]:
import numpy as np
import torch

Datasets

In [2]:
from datasets import load_dataset

Models

In [4]:
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans

# Product Quantization

Function that computes a mean-pooled BERT embedding for a text

In [None]:
def get_tokens(text):
    """Compute a mean-pooled BERT embedding for ``text``.

    Despite the name, the result is an embedding, not a token sequence: the
    text is tokenized, passed through the model, and the last hidden states
    are averaged over the sequence dimension.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: A string, or a list of strings, to embed. Inputs longer than
            512 tokens are truncated.

    Returns:
        A float64 NumPy array. For a single text this is a 1-D vector whose
        length is the model's hidden size; for a batch of several texts it
        has one row per text.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight every position by the attention mask so that padding added for
    # batched inputs does not dilute the average. For a single text the mask
    # is all ones and this reduces to a plain mean over the sequence.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy().astype(np.float64)

Function to quantize vector

In [None]:
def quantize_vector(vector, models=None, sub_size=None):
    """Quantize a vector into one cluster index per sub-vector.

    The vector is cut into consecutive chunks of ``sub_size`` elements; chunk
    ``i`` is assigned to its nearest centroid using ``models[i].predict``.

    Args:
        vector: 1-D sequence/array to quantize.
        models: Sequence of fitted clustering models, one per sub-vector,
            each exposing ``predict``. Defaults to the notebook-level
            ``kmeans_models``.
        sub_size: Number of elements per sub-vector. Defaults to the
            notebook-level ``sub_vector_size``.

    Returns:
        A list of plain Python ``int`` cluster indices, one per model.
    """
    if models is None:
        models = kmeans_models
    if sub_size is None:
        sub_size = sub_vector_size

    quantized = []
    # Iterate over the models themselves so the number of sub-vectors always
    # matches the number of trained codebooks (no hard-coded count).
    for i, sub_model in enumerate(models):
        sub_vec = vector[i * sub_size:(i + 1) * sub_size]
        # predict expects a 2-D batch; wrap the single sub-vector in a list.
        # Cast to int so the result prints as plain integers.
        quantized.append(int(sub_model.predict([sub_vec])[0]))
    return quantized

Function to reconstruct vector

In [None]:
def reconstruct_vector(quantized, books=None):
    """Rebuild an approximate vector from its quantized indices.

    Args:
        quantized: Iterable of cluster indices, one per codebook.
        books: Sequence of codebooks; ``books[i][k]`` is centroid ``k`` of
            sub-space ``i``. Defaults to the notebook-level ``codebooks``.

    Returns:
        A NumPy array formed by concatenating the selected centroid of each
        sub-space, in order.

    Raises:
        ValueError: If the number of indices differs from the number of
            codebooks (otherwise a truncated vector would be returned
            silently).
    """
    if books is None:
        books = codebooks
    quantized = list(quantized)
    if len(quantized) != len(books):
        raise ValueError(
            f"expected {len(books)} indices, got {len(quantized)}"
        )
    return np.concatenate([book[index] for book, index in zip(books, quantized)])

Functions to compute errors

In [None]:
def absolute_error(original, reconstructed):
    """Return the norm of the difference between two vectors.

    Uses ``np.linalg.norm`` with its default settings, which is the
    Euclidean length for 1-D inputs.
    """
    residual = original - reconstructed
    return np.linalg.norm(residual)

def relative_error(original, reconstructed):
    """Return the reconstruction error relative to the original's norm.

    Computed as ``norm(original - reconstructed) / norm(original)``.

    A zero-norm ``original`` is handled explicitly instead of dividing by
    zero: the result is ``0.0`` when the reconstruction is also exact, and
    ``inf`` otherwise.
    """
    error_norm = np.linalg.norm(original - reconstructed)
    original_norm = np.linalg.norm(original)
    if original_norm == 0:
        # Avoid 0/0 -> nan and x/0 RuntimeWarnings for an all-zero original.
        return 0.0 if error_norm == 0 else float("inf")
    return error_norm / original_norm

Load IMDb dataset from [Hugging Face Hub](https://huggingface.co/docs/datasets/v1.18.2/load_hub.html?highlight=imdb)

In [3]:
# Load only a slice of the IMDb training split ("train[:10000]") to keep the
# embedding step tractable. token=False is passed through to load_dataset
# (presumably to request anonymous access - confirm against the datasets docs).
dataset = load_dataset(
    "imdb",
    split="train[:10000]",
    token=False,
)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Load BERT model and tokenizer

In [6]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(
    checkpoint,
    clean_up_tokenization_spaces=False,
)
model = BertModel.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Compute an embedding vector for every text in the dataset

In [None]:
# Embed the texts one at a time and stack the resulting vectors row-wise,
# so that tokens[i] is the embedding of the i-th text.
embeddings = []
for review in dataset['text']:
    embeddings.append(get_tokens(review))
tokens = np.array(embeddings)

Split each vector into 4 sub-vectors

In [None]:
# Number of sub-spaces the embedding is cut into (one codebook per sub-space).
num_subspaces = 4
sub_vector_size = tokens.shape[1] // num_subspaces
# np.split raises ValueError when the columns cannot be divided into equal
# parts, instead of silently dropping the trailing dimensions as manual
# slicing with the floor-divided size would.
sub_vectors = np.split(tokens, num_subspaces, axis=1)

Cluster each sub-vector space

In [None]:
# Fit an independent 16-cluster KMeans on every sub-space (fit returns the
# estimator itself), then collect each model's centroids as that sub-space's
# codebook.
kmeans_models = [
    KMeans(n_clusters=16, random_state=42).fit(sub_vector)
    for sub_vector in sub_vectors
]
codebooks = [fitted.cluster_centers_ for fitted in kmeans_models]

In [None]:
# Quantize and reconstruct the first few embeddings and report, for each one,
# the codes, a preview of both vectors, and the two error measures.
num_vectors = 3
for i in range(num_vectors):
    source_vec = tokens[i]
    codes = quantize_vector(source_vec)
    restored = reconstruct_vector(codes)
    abs_error = absolute_error(source_vec, restored)
    rel_error = relative_error(source_vec, restored)

    report = [
        f'Vector {i}:',
        f'Quantized vector (indices): {codes}',
        f'Original vector: {source_vec[:5]}...',
        f'Reconstruct vector: {restored[:5]}...',
        f'Absolute error: {abs_error}',
        f'Relative error: {rel_error}',
    ]
    # A trailing blank line separates consecutive reports.
    print('\n'.join(report), end='\n\n')

Vector 0:
Quantized vector (indices): [1, 10, 8, 4]
Original vector: [-0.08982041  0.11553691  0.13738132 -0.04512089  0.19212367]...
Reconstruct vector: [-0.01780938  0.14799872  0.15954467 -0.05591925  0.15473078]...
Absolute error: 1.9992087499919782
Relative error: 0.257449324675138

Vector 1:
Quantized vector (indices): [6, 9, 11, 13]
Original vector: [ 0.0834522   0.13337018  0.14444205 -0.10837111  0.06469466]...
Reconstruct vector: [-0.02588233  0.17954449  0.27947472 -0.12994814  0.17023175]...
Absolute error: 2.755848426345642
Relative error: 0.3344302412748106

Vector 2:
Quantized vector (indices): [6, 2, 10, 12]
Original vector: [-0.11614087  0.0521897   0.45533344 -0.15447225  0.0885984 ]...
Reconstruct vector: [-0.02588233  0.17954449  0.27947472 -0.12994814  0.17023175]...
Absolute error: 3.023846071441988
Relative error: 0.37138642346022377



In [None]:
# Print the centroids of the first few sub-spaces, each followed by a blank line.
num_codebooks = 3
print("Codebook:")
shown = codebooks[:num_codebooks]
for i in range(len(shown)):
    print(f"Subspace {i}:", shown[i], "", sep="\n")

Codebook:
Subspace 0:
[[-0.12397082 -0.0008253   0.1535068  ... -0.07013917 -0.22727505
  -0.14699323]
 [-0.01780938  0.14799872  0.15954467 ... -0.05859479 -0.32382939
  -0.23088411]
 [ 0.03720932  0.07906274  0.1718761  ... -0.02710158 -0.30776504
  -0.12258076]
 ...
 [ 0.02586716  0.07330529  0.26294561 ... -0.01486768 -0.35285923
  -0.149416  ]
 [-0.0238441  -0.04381591  0.14008164 ... -0.0167679  -0.33518584
  -0.1641866 ]
 [-0.16334624  0.06281071  0.25424083 ... -0.08745143 -0.22696291
  -0.11772701]]

Subspace 1:
[[-0.27879947 -0.19763387  0.19154129 ... -0.45958251  0.0534585
  -0.21830159]
 [-0.27655088 -0.13905019  0.29255002 ... -0.51868938  0.00500778
  -0.07313702]
 [-0.26178369 -0.18606756  0.21774697 ... -0.26983715  0.10780738
  -0.31411593]
 ...
 [-0.24198651 -0.12022492  0.30193998 ... -0.34548259  0.0679724
  -0.26806974]
 [-0.28765427 -0.15995447  0.28459638 ... -0.42412193  0.05692881
  -0.1746778 ]
 [-0.23291974 -0.16730227  0.25244743 ... -0.27162378  0.08714467

In [None]:
# Accumulate both error measures over every embedding, then average them.
total_abs_error = total_rel_error = 0
for embedding in tokens:
    restored = reconstruct_vector(quantize_vector(embedding))
    total_abs_error += absolute_error(embedding, restored)
    total_rel_error += relative_error(embedding, restored)

n_vectors = len(tokens)
average_abs_error = total_abs_error / n_vectors
average_rel_error = total_rel_error / n_vectors
print(f"Average absolute error on the dataset: {average_abs_error}")
print(f"Average relative error on the dataset: {average_rel_error}")

Average absolute error on the dataset: 2.4475081558410485
Average relative error on the dataset: 0.30320309683798974


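On average, the reconstruction error is about 30% of the original vector's norm (mean relative error ≈ 0.30), i.e. roughly 30% of the original vector is lost during quantization and reconstruction.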