# LLM-архиватор

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm.notebook import tqdm
import time
import gc

In [None]:
def clear_memory(var_names=None):
    """Delete selected module-level variables and release cached memory.

    Args:
        var_names: Names of global variables to remove. When ``None``, the
            built-in list of names used throughout this notebook (model,
            tokenizer, coder objects and intermediate tensors) is used.
            Names that are not currently bound are ignored.
    """
    default_names = [
        'model', 'tokenizer', 'enc', 'dec', 'logits', 'probs', 'inp', 'ctx',
        'ids', 'recovered_ids', 'all_bits', 'all_original_ids',
    ]
    namespace = globals()
    for name in (default_names if var_names is None else var_names):
        # pop() with a default is a no-op for names that were never bound.
        namespace.pop(name, None)

    # Run the garbage collector before asking CUDA to drop its caches.
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

## Общие и вспомогательные функции кодирования и декодирования

In [None]:
def probs_to_cdf_int(prob, total=1 << 20):
    """Quantise a probability vector into an integer cumulative table.

    The result ``cdf`` has ``len(prob) + 1`` entries with ``cdf[0] == 0`` and
    ``cdf[-1] == total``; symbol ``k`` owns the half-open integer interval
    ``[cdf[k], cdf[k + 1])``.

    Every symbol is guaranteed an interval of width at least 1. An arithmetic
    coder cannot represent a zero-width interval, so without this guarantee a
    symbol whose probability rounds to zero (for example after softmax
    underflow) would silently corrupt the stream.

    Args:
        prob: 1-D array-like of non-negative weights. It does not have to sum
            to exactly 1; it is renormalised in float64. NaN, infinite and
            negative entries are treated as 0.
        total: Integer total of the table. Must be at least ``len(prob)``.

    Returns:
        ``np.ndarray`` of dtype int64, strictly increasing, from 0 to
        ``total``.

    Raises:
        ValueError: If ``prob`` is empty or ``total`` is smaller than the
            number of symbols.
    """
    weights = np.asarray(prob, dtype=np.float64).ravel()
    n = weights.size
    if n == 0:
        raise ValueError("prob must contain at least one symbol")
    total = int(total)
    if total < n:
        raise ValueError(
            f"total={total} is too small to give each of {n} symbols a "
            "non-empty interval"
        )

    # Sanitise and renormalise so the cumulative sum ends at 1.0.
    weights = np.where(np.isfinite(weights) & (weights > 0), weights, 0.0)
    mass = weights.sum()
    if mass > 0:
        weights = weights / mass
    else:
        weights = np.full(n, 1.0 / n)

    # Reserve one count per symbol, then spread the remaining counts in
    # proportion to the probabilities.
    spare = total - n
    scaled = np.floor(np.cumsum(weights) * spare).astype(np.int64)
    scaled = np.minimum(scaled, spare)        # guard against rounding above 1
    scaled = np.maximum.accumulate(scaled)    # enforce monotonicity
    scaled[-1] = spare                        # last boundary is exactly total

    cdf_int = np.empty(n + 1, dtype=np.int64)
    cdf_int[0] = 0
    # Adding 1..n makes the table strictly increasing.
    cdf_int[1:] = scaled + np.arange(1, n + 1, dtype=np.int64)
    return cdf_int

In [None]:
class ArithmeticEncoder:
    """Integer binary arithmetic encoder with E1/E2/E3 renormalisation.

    The coder keeps a ``precision``-bit interval ``[low, high]``. Each call
    to :meth:`update` narrows it to the sub-interval of one symbol and shifts
    out the bits that are already determined. Output bits are collected in
    ``self.out`` as a list of Python ints (0 or 1).

    All arithmetic is done with Python ints, so numpy scalars passed in as
    CDF bounds cannot overflow for larger ``precision`` values.
    """

    def __init__(self, precision=32):
        """Create an encoder whose interval registers are ``precision`` bits wide."""
        self.precision = precision
        self.half = 1 << (precision - 1)
        self.quarter = self.half >> 1
        self.mask = (1 << precision) - 1
        self.low = 0
        self.high = self.mask
        self.pending = 0   # underflow (E3) bits waiting for the next emitted bit
        self.out = []

    def update(self, cdf_low, cdf_high, total):
        """Encode one symbol that owns ``[cdf_low, cdf_high)`` out of ``total``.

        Raises:
            ValueError: If the symbol interval is empty or out of range, or
                if ``total`` is too large for the current range so that the
                narrowed interval would collapse. The encoder state is left
                untouched in that case.
        """
        # Plain ints: numpy int64 would silently wrap in ``rng * cdf``.
        cdf_low, cdf_high, total = int(cdf_low), int(cdf_high), int(total)
        if not 0 <= cdf_low < cdf_high <= total:
            raise ValueError(
                f"invalid symbol interval [{cdf_low}, {cdf_high}) for "
                f"total={total}; every coded symbol needs a non-empty interval"
            )

        # Subdivide the current interval.
        rng = self.high - self.low + 1
        new_high = self.low + (rng * cdf_high) // total - 1
        new_low = self.low + (rng * cdf_low) // total
        if new_high < new_low:
            raise ValueError(
                f"total={total} is too large for precision={self.precision}"
            )
        self.low, self.high = new_low, new_high

        # Renormalise.
        while True:
            if self.high < self.half:
                # E1: both bounds in the lower half, the next bit is 0.
                self._emit(0)
            elif self.low >= self.half:
                # E2: both bounds in the upper half, the next bit is 1.
                self._emit(1)
                self.low -= self.half
                self.high -= self.half
            elif self.low >= self.quarter and self.high < 3 * self.quarter:
                # E3: interval straddles the midpoint; defer the decision.
                self.pending += 1
                self.low -= self.quarter
                self.high -= self.quarter
            else:
                break

            # Shift out the resolved bit: low gets a 0, high gets a 1.
            self.low = (self.low << 1) & self.mask
            self.high = ((self.high << 1) & self.mask) | 1

    def _emit(self, bit):
        """Append ``bit`` followed by all pending underflow bits (inverted)."""
        self.out.append(bit)
        self.out.extend([1 - bit] * self.pending)
        self.pending = 0

    def finish(self):
        """Flush the bits needed to identify the final interval.

        Two disambiguating bits (plus any pending underflow bits) select a
        quarter that lies entirely inside ``[low, high]``. No further bits
        are written: ``ArithmeticDecoder._read`` returns 0 once the stream
        is exhausted, and any continuation stays inside that quarter.
        """
        self.pending += 1
        if self.low < self.quarter:
            self._emit(0)
        else:
            self._emit(1)

In [None]:
class ArithmeticDecoder:
    """Integer binary arithmetic decoder matching ``ArithmeticEncoder``.

    The decoder mirrors the encoder's interval ``[low, high]`` and keeps a
    ``precision``-bit window ``value`` of the bit stream. All arithmetic is
    done with Python ints, so numpy CDF entries cannot overflow for larger
    ``precision`` values.
    """

    def __init__(self, bits, precision=32):
        """Prime the decoder with the first ``precision`` bits of ``bits``.

        Args:
            bits: Indexable sequence of 0/1 values produced by the encoder.
            precision: Register width; must match the encoder's.
        """
        self.bits = bits
        self.precision = precision
        self.half = 1 << (precision - 1)
        self.quarter = self.half >> 1
        self.mask = (1 << precision) - 1

        self.low = 0
        self.high = self.mask
        self.value = 0
        self.idx = 0
        for _ in range(precision):
            self.value = ((self.value << 1) & self.mask) | self._read()

    def _read(self):
        """Return the next stream bit, or 0 once the stream is exhausted."""
        if self.idx < len(self.bits):
            bit = int(self.bits[self.idx])
            self.idx += 1
            return bit
        return 0

    def decode(self, cdf_int, total):
        """Decode one symbol using the cumulative table ``cdf_int``.

        Args:
            cdf_int: Non-decreasing integer table with ``cdf_int[0] == 0``
                and ``cdf_int[-1] == total``; symbol ``k`` owns
                ``[cdf_int[k], cdf_int[k + 1])``.
            total: Total of the table (same value the encoder used).

        Returns:
            The decoded symbol index as a Python ``int``.

        Raises:
            ValueError: If the scaled code value falls outside the table,
                which means the table is malformed.
        """
        total = int(total)

        # Map the code value back into the [0, total) frequency scale.
        rng = self.high - self.low + 1
        scaled = ((self.value - self.low + 1) * total - 1) // rng
        symbol = int(np.searchsorted(cdf_int, scaled, side='right')) - 1
        if not 0 <= symbol < len(cdf_int) - 1:
            raise ValueError(
                "code value falls outside the CDF table; cdf_int must start "
                "at 0 and end at total"
            )

        # Narrow the interval exactly as the encoder did.
        c_lo, c_hi = int(cdf_int[symbol]), int(cdf_int[symbol + 1])
        self.high = self.low + (rng * c_hi) // total - 1
        self.low = self.low + (rng * c_lo) // total

        # Renormalise, consuming one stream bit per shift.
        while True:
            if self.high < self.half:
                # E1: nothing to subtract, just shift.
                pass
            elif self.low >= self.half:
                # E2: drop the resolved top bit.
                self.value -= self.half
                self.low -= self.half
                self.high -= self.half
            elif self.low >= self.quarter and self.high < 3 * self.quarter:
                # E3: undo the encoder's underflow expansion.
                self.value -= self.quarter
                self.low -= self.quarter
                self.high -= self.quarter
            else:
                break

            self.low = (self.low << 1) & self.mask
            self.high = ((self.high << 1) & self.mask) | 1
            self.value = ((self.value << 1) & self.mask) | self._read()

        return symbol

In [None]:
# Compute device shared by the loading/encoding/decoding cells:
# the CUDA GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def load_model(model_name: str):
    """Load a causal language model and its tokenizer.

    Args:
        model_name: Model identifier or path accepted by ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``; the model is in eval mode and has been moved
        to the module-level ``device``.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    lm = AutoModelForCausalLM.from_pretrained(model_name)

    # Inference only: switch to eval mode, then move the weights to ``device``.
    lm.eval()
    lm.to(device)

    return tok, lm

In [None]:
def get_ids(size: int, tokenizer, path: str = 'enwik8') -> tuple[bytes, list[int]]:
    """Read the first ``size`` bytes of ``path`` and tokenize them.

    The bytes are decoded as latin-1, which maps every byte to exactly one
    character and therefore cannot fail, even if the read stops in the
    middle of a multi-byte sequence.

    Args:
        size: Number of bytes to read from the start of the file.
        tokenizer: Object with an ``encode(text, add_special_tokens=...)``
            method.
        path: File to read.

    Returns:
        ``(data, ids)``: the raw bytes that were read and the token ids of
        their latin-1 decoding, encoded without special tokens.
    """
    with open(path, 'rb') as f:
        data = f.read(size)
    text = data.decode('latin-1')
    ids = tokenizer.encode(text, add_special_tokens=False)
    return data, ids

In [None]:
def encode_ids(ids, model, total: int = 1 << 30):
    """Arithmetic-code a token sequence with the model's next-token predictions.

    The first token ``ids[0]`` is not encoded; it only serves as initial
    context. Each later token is coded with the distribution the model
    predicts from at most ``max_position_embeddings - 1`` preceding tokens.

    Args:
        ids: Token ids to compress.
        model: Causal LM exposing ``config.max_position_embeddings`` and
            returning ``.logits`` when called on a batch of ids.
        total: Integer total of the quantised CDF handed to the coder.

    Returns:
        The encoder's output bits (``ArithmeticEncoder.out``).
    """
    t0 = time.perf_counter()
    window = model.config.max_position_embeddings
    coder = ArithmeticEncoder()

    for pos in tqdm(range(1, len(ids)), desc="Encoding", unit="tok"):
        # Sliding context window that ends just before the token being coded.
        first = max(0, pos - window + 1)
        batch = torch.tensor([ids[first:pos]], device=device)

        # Model distribution over the next token.
        with torch.no_grad():
            last_logits = model(batch).logits[0, -1]
            dist = torch.softmax(last_logits, dim=-1).cpu().numpy()

        # Narrow the coder interval to the slot of the actual token.
        table = probs_to_cdf_int(dist, total)
        target = ids[pos]
        coder.update(table[target], table[target + 1], total)

    coder.finish()
    elapsed = time.perf_counter() - t0
    print(f"Total encoding time: {elapsed:.2f} seconds")

    return coder.out

In [None]:
def decode_ids(enc_out, ids, model, total: int = 1 << 30):
    """Decode a bit stream produced by ``encode_ids`` back into token ids.

    The context for every prediction is built from the tokens decoded so
    far, not from the original ``ids``. This is what a real decoder has
    available, and it makes any divergence from the encoder visible instead
    of being masked by ground-truth context.

    Args:
        enc_out: Bit sequence returned by ``encode_ids``.
        ids: Original token ids. Only the first token (which the encoder
            does not code) and the sequence length are taken from it.
        model: Same causal LM that was used for encoding.
        total: Integer total of the quantised CDF; must match the encoder.

    Returns:
        List of recovered token ids with the same length as ``ids`` (empty
        when ``ids`` is empty).
    """
    start_time = time.perf_counter()
    dec = ArithmeticDecoder(enc_out)
    # Slice instead of ids[0] so that an empty input yields an empty result.
    recovered = list(ids[:1])
    max_len = model.config.max_position_embeddings

    for i in tqdm(range(1, len(ids)), desc="Decoding", unit="tok"):
        # Same sliding window as the encoder, taken from the decoded tokens.
        ctx = recovered[max(0, i - max_len + 1):i]
        inp = torch.tensor([ctx], device=device)

        # Model distribution over the next token.
        with torch.no_grad():
            logits = model(inp).logits[0, -1]
            probs = torch.softmax(logits, dim=-1).cpu().numpy()

        # Quantise to a CDF and decode the next token.
        cdf = probs_to_cdf_int(probs, total)
        recovered.append(int(dec.decode(cdf, total)))

    decoding_time = time.perf_counter() - start_time
    print(f"Total decoding time: {decoding_time:.2f} seconds")

    return recovered

## Подсчёт статистики и проверка корректности декодирования

In [None]:
def compare_sequences(orig, dec):
    """Print the first difference between two sequences, if any.

    Reports the first position where the elements differ; if the common
    prefix matches, reports a length mismatch; otherwise reports that the
    sequences are identical.
    """
    first_diff = next(
        ((pos, a, b) for pos, (a, b) in enumerate(zip(orig, dec)) if a != b),
        None,
    )
    if first_diff is not None:
        pos, a, b = first_diff
        print(f"Расхождение на позиции {pos}: orig={a}  decoded={b}")
    elif len(orig) != len(dec):
        print(f"Длины списков отличаются: orig={len(orig)}  decoded={len(dec)}")
    else:
        print("Совпадают полностью!")

In [None]:
def compression_stats(data: bytes, enc_out: list[int]):
    """Print the original size, compressed size and compression ratio.

    Args:
        data: The original, uncompressed bytes.
        enc_out: The coder output as a sequence of bits, one element per
            bit, so its length is the compressed size in bits.

    The ratio is compressed bits divided by original bits. For empty
    ``data`` it is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    original_bits = len(data) * 8
    compressed_bits = len(enc_out)
    ratio = compressed_bits / original_bits if original_bits else float('nan')

    print(f"Исходный размер:   {original_bits} бит")
    print(f"Размер после сжатия: {compressed_bits} бит")
    print(f"Коэффициент сжатия: {ratio:.4f}")

In [None]:
def decode_text_from_ids(ids, tokenizer) -> str:
    """Turn token ids back into text via ``tokenizer.decode``.

    Special tokens are kept and tokenization-space clean-up is disabled.
    """
    decode_options = {
        'clean_up_tokenization_spaces': False,
        'skip_special_tokens': False,
    }
    return tokenizer.decode(ids, **decode_options)

## EleutherAI/pythia-70m

In [None]:
tokenizer, model = load_model('EleutherAI/pythia-70m')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

In [None]:
data, ids = get_ids(50000, tokenizer)

In [None]:
encoded_output = encode_ids(ids, model)

Encoding:   0%|          | 0/13004 [00:00<?, ?tok/s]

Total encoding time: 733.66 seconds


In [None]:
recovered_ids = decode_ids(encoded_output, ids, model)

Decoding:   0%|          | 0/13004 [00:00<?, ?tok/s]

Total decoding time: 731.65 seconds


In [None]:
compare_sequences(ids, recovered_ids)

Совпадают полностью!


In [None]:
compression_stats(data, encoded_output)

Исходный размер:   400000 бит
Размер после сжатия: 60471 бит
Коэффициент сжатия: 0.1512


### Разбиваем на чанки

In [None]:
def get_data(size: int, tokenizer, path: str = 'enwik8') -> bytes:
    """Return the first ``size`` bytes of the file at ``path``.

    Args:
        size: Number of bytes to read from the start of the file.
        tokenizer: Unused; kept so existing call sites keep working.
        path: File to read.

    Returns:
        The raw bytes that were read.
    """
    with open(path, 'rb') as f:
        return f.read(size)

In [None]:
def encode_chunks(data: bytes, tokenizer, model, chunk_size: int = 2000, total: int = 1 << 30):
    """Compress ``data`` chunk by chunk, each chunk with a fresh coder.

    Every ``chunk_size``-byte slice is decoded as latin-1, tokenized without
    special tokens and arithmetic-coded on its own, so the model context
    never crosses a chunk boundary. The first token of a chunk is not coded.

    Args:
        data: Raw bytes to compress.
        tokenizer: Object with ``encode(text, add_special_tokens=...)``.
        model: Causal LM exposing ``config.max_position_embeddings`` and
            returning ``.logits``.
        chunk_size: Chunk length in bytes.
        total: Integer total of the quantised CDF handed to the coder.

    Returns:
        One dict per chunk with keys ``'ids'`` (the chunk's token ids) and
        ``'bits'`` (the coder output for that chunk).
    """
    t0 = time.perf_counter()
    encoded = []

    for offset in tqdm(range(0, len(data), chunk_size), desc='Encoding chunks'):
        piece = data[offset:offset + chunk_size]
        ids = tokenizer.encode(piece.decode('latin-1'), add_special_tokens=False)

        coder = ArithmeticEncoder()
        window = model.config.max_position_embeddings
        for pos in tqdm(range(1, len(ids)), desc='  Tokens in chunk', leave=False):
            # Context window that ends just before the token being coded.
            first = max(0, pos - window + 1)
            batch = torch.tensor([ids[first:pos]], device=device)
            with torch.no_grad():
                last_logits = model(batch).logits[0, -1]
                dist = torch.softmax(last_logits, dim=-1).cpu().numpy()
            table = probs_to_cdf_int(dist, total)
            target = ids[pos]
            coder.update(table[target], table[target + 1], total)

        coder.finish()
        encoded.append({'ids': ids, 'bits': coder.out})

    elapsed = time.perf_counter() - t0
    print(f"Total encoding time: {elapsed:.2f} seconds")

    return encoded

In [None]:
def decode_chunks(chunk_data, model, total: int = 1 << 30):
    """Decode the per-chunk bit streams produced by ``encode_chunks``.

    For each chunk, the first token and the token count are taken from the
    stored ``'ids'``; every further token is decoded from ``'bits'`` using
    context built from the tokens already recovered in that chunk.

    Args:
        chunk_data: List of dicts with keys ``'ids'`` and ``'bits'``.
        model: Same causal LM that was used for encoding.
        total: Integer total of the quantised CDF; must match the encoder.

    Returns:
        Flat list of recovered token ids, all chunks concatenated in order.
    """
    t0 = time.perf_counter()
    recovered_all = []

    for chunk in tqdm(chunk_data, desc='Decoding chunks'):
        chunk_ids, chunk_bits = chunk['ids'], chunk['bits']
        decoder = ArithmeticDecoder(chunk_bits)

        # Rebuild this chunk's tokens, starting from its uncoded first token.
        restored = [chunk_ids[0]]
        window = model.config.max_position_embeddings
        for pos in tqdm(range(1, len(chunk_ids)), desc='  Tokens in chunk', leave=False):
            first = max(0, pos - window + 1)
            batch = torch.tensor([restored[first:pos]], device=device)
            with torch.no_grad():
                last_logits = model(batch).logits[0, -1]
                dist = torch.softmax(last_logits, dim=-1).cpu().numpy()
            table = probs_to_cdf_int(dist, total)
            restored.append(decoder.decode(table, total))

        recovered_all.extend(restored)

    elapsed = time.perf_counter() - t0
    print(f"Total decoding time: {elapsed:.2f} seconds")

    return recovered_all

In [None]:
data = get_data(50000, tokenizer)

In [None]:
chunk_data = encode_chunks(data, tokenizer, model)

Encoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/668 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/732 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/572 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/589 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/577 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/546 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/505 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/568 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/500 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/511 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/487 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/456 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/514 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/426 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/537 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/600 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/473 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/495 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/442 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/452 [00:00<?, ?it/s]

Total encoding time: 138.17 seconds


In [None]:
recovered_ids = decode_chunks(chunk_data, model)

Decoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/668 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/732 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/572 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/589 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/577 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/546 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/505 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/568 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/500 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/511 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/487 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/456 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/514 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/426 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/537 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/600 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/473 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/495 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/442 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/452 [00:00<?, ?it/s]

Total decoding time: 152.33 seconds


In [None]:
# Flatten the per-chunk lists in a single pass; sum(lists, []) re-copies the
# accumulated list for every chunk, which is quadratic in the total length.
orig_ids = [tok for entry in chunk_data for tok in entry['ids']]
encoded_output = [bit for entry in chunk_data for bit in entry['bits']]

In [None]:
compare_sequences(orig_ids, recovered_ids)

Совпадают полностью!


In [None]:
compression_stats(data, encoded_output)

Исходный размер:   400000 бит
Размер после сжатия: 66303 бит
Коэффициент сжатия: 0.1658


In [None]:
recovered_text = decode_text_from_ids(recovered_ids, tokenizer)

## EleutherAI/pythia-160m

In [None]:
clear_memory()

In [None]:
tokenizer, model = load_model('EleutherAI/pythia-160m')

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

In [None]:
data = get_data(50000, tokenizer)

In [None]:
chunk_data = encode_chunks(data, tokenizer, model)

Encoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/668 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/732 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/572 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/589 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/577 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/546 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/505 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/568 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/500 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/511 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/487 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/456 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/514 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/426 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/537 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/600 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/473 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/495 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/442 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/452 [00:00<?, ?it/s]

Total encoding time: 335.48 seconds


In [None]:
recovered_ids = decode_chunks(chunk_data, model)

Decoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/668 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/732 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/572 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/589 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/577 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/546 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/505 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/453 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/568 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/500 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/511 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/487 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/456 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/514 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/426 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/537 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/600 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/473 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/494 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/495 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/442 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/452 [00:00<?, ?it/s]

Total decoding time: 339.62 seconds


In [None]:
# Flatten the per-chunk lists in a single pass; sum(lists, []) re-copies the
# accumulated list for every chunk, which is quadratic in the total length.
orig_ids = [tok for entry in chunk_data for tok in entry['ids']]
encoded_output = [bit for entry in chunk_data for bit in entry['bits']]

In [None]:
compare_sequences(orig_ids, recovered_ids)

Совпадают полностью!


In [None]:
compression_stats(data, encoded_output)

Исходный размер:   400000 бит
Размер после сжатия: 57478 бит
Коэффициент сжатия: 0.1437


## GPT2

In [None]:
clear_memory()

In [None]:
tokenizer, model = load_model('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
data = get_data(50000, tokenizer)

In [None]:
chunk_data = encode_chunks(data, tokenizer, model)

Encoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/896 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/984 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/607 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/597 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/595 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/561 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/459 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/509 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/454 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/563 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/498 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/510 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/502 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/416 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/456 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/497 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/420 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/521 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/486 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/610 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/460 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/500 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/491 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/434 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/461 [00:00<?, ?it/s]

Total encoding time: 379.86 seconds


In [None]:
recovered_ids = decode_chunks(chunk_data, model)

Decoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/896 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/984 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/607 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/597 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/595 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/561 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/459 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/509 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/454 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/563 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/498 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/510 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/502 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/416 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/456 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/497 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/420 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/521 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/486 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/610 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/460 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/500 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/491 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/434 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/461 [00:00<?, ?it/s]

Total decoding time: 380.13 seconds


In [None]:
# Flatten the per-chunk lists in a single pass; sum(lists, []) re-copies the
# accumulated list for every chunk, which is quadratic in the total length.
orig_ids = [tok for entry in chunk_data for tok in entry['ids']]
encoded_output = [bit for entry in chunk_data for bit in entry['bits']]

In [None]:
compare_sequences(orig_ids, recovered_ids)

Расхождение на позиции 6434: orig=30109  decoded=29994


In [None]:
compression_stats(data, encoded_output)

Исходный размер:   400000 бит
Размер после сжатия: 61388 бит
Коэффициент сжатия: 0.1535


## Open_llama_7b

In [None]:
!pip install -U bitsandbytes



In [None]:
def load_quantized_model(model_name: str,
                         load_in_8bit: bool = True,
                         llm_int8_enable_fp32_cpu_offload: bool = True):
    """Load a causal LM with a bitsandbytes quantization config.

    Args:
        model_name: Model identifier or path accepted by ``from_pretrained``.
        load_in_8bit: Forwarded to ``BitsAndBytesConfig``.
        llm_int8_enable_fp32_cpu_offload: Forwarded to ``BitsAndBytesConfig``.

    Returns:
        ``(tokenizer, model)``; the tokenizer is loaded with
        ``use_fast=False`` and the model is in eval mode.
    """
    # Quantization settings (8-bit by default).
    quant_cfg = BitsAndBytesConfig(
        load_in_8bit=load_in_8bit,
        llm_int8_enable_fp32_cpu_offload=llm_int8_enable_fp32_cpu_offload,
    )
    model_kwargs = dict(
        quantization_config=quant_cfg,
        device_map="auto",           # automatic placement of layers
        torch_dtype=torch.float16,   # half precision for the remaining parameters
        low_cpu_mem_usage=True,      # keep CPU memory usage low while loading
    )

    tok = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    lm = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
    lm.eval()
    return tok, lm

In [None]:
clear_memory()

In [None]:
tokenizer, model = load_quantized_model("openlm-research/open_llama_7b")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
data = get_data(50000, tokenizer)

In [None]:
chunk_data = encode_chunks(data, tokenizer, model)

Encoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/677 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/771 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/629 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/647 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/627 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/598 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/472 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/548 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/506 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/620 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/536 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/570 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/574 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/507 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/561 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/441 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/566 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/517 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/654 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/491 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/519 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/513 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/461 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/468 [00:00<?, ?it/s]

Total encoding time: 3971.34 seconds


In [None]:
recovered_ids = decode_chunks(chunk_data, model)

Decoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/677 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/771 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/629 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/647 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/627 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/598 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/472 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/548 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/506 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/620 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/536 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/570 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/574 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/507 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/561 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/441 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/566 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/517 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/654 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/491 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/519 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/513 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/461 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/468 [00:00<?, ?it/s]

Total decoding time: 3957.65 seconds


In [None]:
# Flatten the per-chunk lists in a single pass; sum(lists, []) re-copies the
# accumulated list for every chunk, which is quadratic in the total length.
orig_ids = [tok for entry in chunk_data for tok in entry['ids']]
encoded_output = [bit for entry in chunk_data for bit in entry['bits']]

In [None]:
compare_sequences(orig_ids, recovered_ids)

Расхождение на позиции 12000: orig=31705  decoded=31699


In [None]:
compression_stats(data, encoded_output)

Исходный размер:   400000 бит
Размер после сжатия: 36262 бит
Коэффициент сжатия: 0.0907


## Open_llama_3b

In [None]:
clear_memory()

In [None]:
tokenizer, model = load_quantized_model("openlm-research/open_llama_3b")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


pytorch_model.bin:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
data = get_data(50000, tokenizer)

In [None]:
chunk_data = encode_chunks(data, tokenizer, model)

Encoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/677 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/771 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/629 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/647 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/627 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/598 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/472 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/548 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/506 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/620 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/536 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/570 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/574 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/507 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/561 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/441 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/566 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/517 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/654 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/491 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/519 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/513 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/461 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/468 [00:00<?, ?it/s]

Total encoding time: 3297.45 seconds


In [None]:
recovered_ids = decode_chunks(chunk_data, model)

Decoding chunks:   0%|          | 0/25 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/677 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/771 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/629 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/647 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/627 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/598 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/472 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/548 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/506 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/620 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/536 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/570 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/574 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/465 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/507 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/561 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/441 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/566 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/517 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/654 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/491 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/519 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/513 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/461 [00:00<?, ?it/s]

  Tokens in chunk:   0%|          | 0/468 [00:00<?, ?it/s]

Total decoding time: 3263.61 seconds


In [None]:
orig_ids = sum([entry['ids'] for entry in chunk_data], [])
encoded_output = sum([entry['bits'] for entry in chunk_data], [])

In [None]:
compare_sequences(orig_ids, recovered_ids)

Совпадают полностью!


In [None]:
compression_stats(data, encoded_output)

Исходный размер:   400000 бит
Размер после сжатия: 38099 бит
Коэффициент сжатия: 0.0952
