<a href="https://colab.research.google.com/github/igor531205/applied_machine_learning_tasks/blob/hw3_vector_quantization/hw3_vector_quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [1]:
!pip install -U -q transformers datasets

Core libraries

In [2]:
import numpy as np
import pandas as pd
import torch
from statistics import mean

Datasets

In [3]:
from datasets import load_dataset

Models

In [4]:
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans

# Product Quantization

Function that computes a mean-pooled BERT embedding for a text

In [5]:
def get_tokens(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Despite the name, the result is an embedding vector, not a token list.
    The text is encoded by the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and fed to the module-level ``model`` with gradient
    tracking disabled. The model's ``last_hidden_state`` is then averaged
    over ``dim=1``.

    Returns:
        The averaged hidden state with size-1 dimensions squeezed out,
        converted to a ``float64`` NumPy array.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy().astype(np.float64)

Function to quantize vector

In [6]:
def quantize_vector(vector):
    """Quantize ``vector`` into one cluster index per sub-vector.

    The vector is cut into consecutive chunks of ``sub_vector_size``
    elements, and chunk ``i`` is passed to ``kmeans_models[i].predict`` to
    obtain its cluster index. The number of chunks follows the number of
    fitted models rather than a hard-coded constant, so the function keeps
    working if the number of subspaces changes.

    Relies on the module-level ``kmeans_models`` and ``sub_vector_size``.

    Args:
        vector: 1-D sequence or array holding at least
            ``len(kmeans_models) * sub_vector_size`` elements.

    Returns:
        list: one cluster index per model, in subspace order.
    """
    quantized = []
    for i, kmeans_model in enumerate(kmeans_models):
        sub_vec = vector[i * sub_vector_size:(i + 1) * sub_vector_size]
        # predict() takes a batch of samples, so wrap the single sub-vector
        # in a list and unwrap the single prediction.
        quantized.append(kmeans_model.predict([sub_vec])[0])
    return quantized

Function to reconstruct vector

In [7]:
def reconstruct_vector(quantized):
    """Rebuild an approximate vector from its quantized indices.

    Entry ``i`` of ``quantized`` selects one row of the module-level
    ``codebooks[i]``; the selected rows are joined end to end, in order.

    Returns:
        numpy.ndarray: concatenation of the selected codebook rows.
    """
    selected_rows = [codebooks[pos][code] for pos, code in enumerate(quantized)]
    return np.concatenate(selected_rows)

Functions to compute errors

In [8]:
def absolute_error(original, reconstructed):
    """Return the norm of ``original - reconstructed``.

    Uses ``np.linalg.norm`` with its default arguments on the element-wise
    difference of the two inputs.
    """
    residual = original - reconstructed
    return np.linalg.norm(residual)

def relative_error(original, reconstructed):
    """Return the norm of the residual divided by the norm of ``original``.

    Both norms are computed with ``np.linalg.norm`` and its default
    arguments. No special handling is applied when ``original`` has zero
    norm.
    """
    residual_norm = np.linalg.norm(original - reconstructed)
    reference_norm = np.linalg.norm(original)
    return residual_norm / reference_norm

Load IMDb dataset from [Hugging Face Hub](https://huggingface.co/docs/datasets/v1.18.2/load_hub.html?highlight=imdb)

In [None]:
# Request only the first 10,000 rows of the IMDb train split.
# token=False: presumably prevents any stored Hugging Face auth token from
# being sent -- confirm against the `datasets` documentation.
dataset = load_dataset(
    "imdb",
    split="train[:10000]",
    token=False,
)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Load BERT model and tokenizer

In [None]:
# The tokenizer and the encoder are loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    clean_up_tokenization_spaces=False,
)
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Compute an embedding vector for every review in the dataset (stored in `tokens`)

In [None]:
# One get_tokens() call per review; the per-review vectors are collected
# into a single array with one row per review.
tokens = np.array(list(map(get_tokens, dataset['text'])))

Split each vector into 4 sub-vectors

In [10]:
# Number of equal-width sub-vectors each embedding is split into.
num_sub_vectors = 4

# Fail fast when the embedding width does not divide evenly: plain slicing
# would silently drop the trailing components, and the reconstructed vectors
# would then be shorter than the originals.
if tokens.shape[1] % num_sub_vectors != 0:
    raise ValueError(
        f"Embedding size {tokens.shape[1]} is not divisible by "
        f"{num_sub_vectors} sub-vectors"
    )

sub_vector_size = tokens.shape[1] // num_sub_vectors
# sub_vectors[i] holds the i-th column block of every embedding.
sub_vectors = [
    tokens[:, i * sub_vector_size:(i + 1) * sub_vector_size]
    for i in range(num_sub_vectors)
]

Cluster each sub-vector space

In [11]:
# Fit one independent k-means model per subspace. Each model's 16 cluster
# centers form that subspace's codebook.
kmeans_models = []
codebooks = []
for sub_vector in sub_vectors:
    # fit() returns the fitted estimator, so construction and fitting can be
    # chained; random_state is fixed for reproducible centroids.
    kmeans = KMeans(n_clusters=16, random_state=42).fit(sub_vector)
    kmeans_models.append(kmeans)
    codebooks.append(kmeans.cluster_centers_)

In [18]:
# Build one report row per embedding: its quantized code, summary means and
# reconstruction errors.
data = []
for i, original in enumerate(tokens):
    quantized = quantize_vector(original)
    reconstructed = reconstruct_vector(quantized)
    data.append({
        'Vector_Index': i,
        'Quantized_Indices': quantized,
        # Despite the column names, these two columns hold the scalar mean of
        # each vector's components, not the vectors themselves. np.mean is
        # used instead of statistics.mean, which iterates over the array
        # element by element in pure Python.
        'Original_Vector': np.mean(original),
        'Reconstructed_Vector': np.mean(reconstructed),
        'Absolute_Error': absolute_error(original, reconstructed),
        'Relative_Error': relative_error(original, reconstructed),
    })

pd.DataFrame(data)

Unnamed: 0,Vector_Index,Quantized_Indices,Original_Vector,Reconstructed_Vector,Absolute_Error,Relative_Error
0,0,"[1, 10, 8, 4]",-0.013060,-0.012449,1.999209,0.257449
1,1,"[6, 9, 11, 13]",-0.013599,-0.013005,2.755848,0.334430
2,2,"[6, 2, 10, 12]",-0.014038,-0.014517,3.023846,0.371386
3,3,"[9, 3, 8, 13]",-0.013222,-0.013099,2.550707,0.320993
4,4,"[11, 4, 3, 12]",-0.013512,-0.014906,2.370013,0.307747
...,...,...,...,...,...,...
9995,9995,"[11, 15, 10, 12]",-0.015476,-0.014203,2.429611,0.298753
9996,9996,"[3, 8, 1, 6]",-0.012354,-0.012113,2.386514,0.291047
9997,9997,"[7, 7, 15, 1]",-0.013585,-0.014414,2.935955,0.367882
9998,9998,"[7, 10, 2, 1]",-0.013223,-0.014911,1.839239,0.240572


In [15]:
# Preview the first few codebooks, showing the leading rows of each.
num_codebooks = 3
print("Codebooks:")
for i, codebook in enumerate(codebooks[:num_codebooks]):
    component_names = [f'Component_{j}' for j in range(codebook.shape[1])]
    codebook_df = pd.DataFrame(codebook, columns=component_names)
    # The trailing blank line separates consecutive codebooks.
    print(f"Subspace {i} codebook:\n{codebook_df.head()}", end="\n\n")

Codebooks:
Subspace 0 codebook:
   Component_0  Component_1  Component_2  Component_3  Component_4  \
0    -0.123971    -0.000825     0.153507    -0.059117     0.268319   
1    -0.017809     0.147999     0.159545    -0.055919     0.154731   
2     0.037209     0.079063     0.171876    -0.027905     0.210179   
3     0.128692     0.086029     0.201520    -0.034690     0.162486   
4    -0.086777    -0.024903     0.247149    -0.048851     0.272943   

   Component_5  Component_6  Component_7  Component_8  Component_9  ...  \
0    -0.037508     0.069039     0.545046     0.078734    -0.086820  ...   
1    -0.064150     0.132728     0.636008     0.096023    -0.131347  ...   
2    -0.060985     0.064872     0.542210     0.203767    -0.054786  ...   
3    -0.119507     0.136100     0.728353     0.086460    -0.091128  ...   
4    -0.141947     0.092434     0.598652     0.068393    -0.179315  ...   

   Component_182  Component_183  Component_184  Component_185  Component_186  \
0       0.119809

In [14]:
# Accumulate the reconstruction errors over every embedding, then average.
total_abs_error = 0
total_rel_error = 0
for vec in tokens:
    # Round trip: quantize to codebook indices, then rebuild from centroids.
    reconstructed = reconstruct_vector(quantize_vector(vec))
    total_abs_error = total_abs_error + absolute_error(vec, reconstructed)
    total_rel_error = total_rel_error + relative_error(vec, reconstructed)

num_vectors = len(tokens)
average_abs_error = total_abs_error / num_vectors
average_rel_error = total_rel_error / num_vectors
print(f"Average absolute error on the dataset: {average_abs_error}")
print(f"Average relative error on the dataset: {average_rel_error}")

Average absolute error on the dataset: 2.4475081558410485
Average relative error on the dataset: 0.30320309683798974


On average, the relative reconstruction error is about 30%: the reconstructed vector deviates from the original by roughly 30% of the original's norm.