In [1]:
import math
from collections import Counter
from decimal import Decimal, getcontext
from fractions import Fraction

In [171]:
class FrequencyTable:
    """Symbol statistics for an arithmetic coder, using power-of-two probabilities.

    Attributes:
        counts: ``symbol -> occurrence count`` (a private copy of the input mapping).
        total: sum of all counts.
        freqs: ``symbol -> probability`` as ``Fraction``, in ``symbols`` order.
        freq_ranges: ``symbol -> (lower, width)`` of the symbol's slice of [0, 1).
    """

    def __init__(self, freqs):
        """Build the table from a ``symbol -> count`` mapping.

        Args:
            freqs: mapping of each symbol to its positive occurrence count.

        Raises:
            ValueError: if a count is zero or negative (its probability cannot
                be quantized to a power of two).
        """
        # Copy so that later mutation of the caller's mapping cannot desynchronise
        # counts from the derived freqs / freq_ranges.
        self.counts = dict(freqs)
        self.total = sum(self.counts.values())
        self.freqs = self.__generate_freqs()
        self.freq_ranges = self.__generate_freqs_range()

    def __generate_freqs(self, round=True):
        """Return ``symbol -> probability`` in ``symbols`` order.

        Args:
            round: when false, return the exact ``count / total`` fractions;
                when true (default), round every probability down to a power
                of two and then hand some of the left-over mass back.

        Returns:
            dict of ``Fraction`` probabilities whose sum never exceeds 1.
        """
        exact = {symbol: Fraction(self.counts[symbol], self.total) for symbol in self.symbols}
        if not round:
            return exact

        freqs = {symbol: self.floor_quantize(prob) for symbol, prob in exact.items()}

        # Rounding down leaves slack; double a symbol's probability whenever it is
        # strictly smaller than the largest power of two that still fits.
        # NOTE(review): `<=` would also keep the sum within 1 and waste less of the
        # interval, but it changes every emitted code, so the original test is kept.
        probs_sum = sum(freqs.values(), Fraction(0))
        for symbol, prob in freqs.items():
            if probs_sum >= 1:
                break
            if prob < self.floor_quantize(1 - probs_sum):
                freqs[symbol] = 2 * prob
                probs_sum += prob  # exact arithmetic: same as re-summing the table

        return freqs

    def __generate_freqs_range(self):
        """Lay the probabilities end to end starting at 0.

        Returns:
            dict ``symbol -> (lower, width)`` in the iteration order of ``self.freqs``.
        """
        freq_ranges = {}

        lower = Fraction(0, 1)
        for symbol, prob in self.freqs.items():
            freq_ranges[symbol] = (lower, prob)
            lower += prob

        return freq_ranges

    @property
    def symbols(self):
        """Symbols ordered by descending count; ties keep ``counts`` insertion order."""
        return sorted(self.counts, key=lambda x: self.counts[x], reverse=True)

    @staticmethod
    def from_text(text):
        """Count the symbols of ``text`` and build a table from them.

        Counting is done in a single pass, and symbols are recorded in order of
        first occurrence, so the resulting table does not depend on set
        iteration order.
        """
        return FrequencyTable(freqs=Counter(text))

    @staticmethod
    def floor_quantize(number):
        """Return the largest power of two that is ``<= number`` as a ``Fraction``.

        The exponent is derived from integer bit lengths rather than a
        floating-point logarithm, so the result is exact for any positive
        rational, including values just below a power of two and values >= 2.

        Raises:
            ValueError: if ``number`` is not strictly positive.
        """
        value = Fraction(number)
        if value <= 0:
            raise ValueError(f'floor_quantize requires a positive number, got {number!r}')

        # floor(log2(n / d)) is either this bit-length difference or one less.
        exponent = value.numerator.bit_length() - value.denominator.bit_length()
        if exponent >= 0:
            fits = (value.denominator << exponent) <= value.numerator
        else:
            fits = value.denominator <= (value.numerator << -exponent)
        if not fits:
            exponent -= 1

        return Fraction(2) ** exponent

In [166]:
from bisect import bisect_right
from tqdm import tqdm

class ArithmeticCoder:

    def encode(self, text):
        freqs = FrequencyTable.from_text(text)

        probs_sum = sum(freqs.freqs.values())
        lower, delta = Fraction(0, 1), Fraction(1, 1)
        for symbol in tqdm(text, desc='Encoding'):
            current_lower, current_delta = freqs.freq_ranges[symbol]
            lower += current_lower * delta
            delta *= current_delta

        return lower, len(text), freqs

    def decode(self, encoded, length, freqs):
        range_lower_bounds = [symbol_range[0] for symbol_range in freqs.freq_ranges.values()]

        decoded = ''
        for _ in range(length):
            symbol_idx = bisect_right(range_lower_bounds, encoded) - 1
            symbol = freqs.symbols[symbol_idx]
            decoded += symbol

            lower, delta = freqs.freq_ranges[symbol]
            encoded = (encoded - lower) / delta

        return decoded

In [30]:
# Read the whole corpus file as raw bytes into `data`.
with open('dickens.txt', 'rb') as file:
    data = file.read()

In [174]:
# Round-trip the first 10000 bytes of the corpus through the coder.
# NOTE(review): slicing bytes before decoding can split a multi-byte sequence
# at the cut; presumably the corpus is plain ASCII there — confirm.
text = data[:10000].decode()

enc = ArithmeticCoder()
encoded, length, freqs = enc.encode(text)
# Decode only from what encode() returned, so the check covers the full
# (code, length, table) triple rather than outside knowledge of the input.
decoded = enc.decode(encoded, length, freqs)

print(f'Encoded: {encoded}')
print(f'Decoded: {decoded}')
print(f'Successfully: {decoded == text}')

Encoding: 100%|██████████| 10000/10000 [00:09<00:00, 1036.78it/s]


Encoded: 1708420657063621795593592096828092184078698131252425438356806580280754970807828771206273233667433885508840868206218006320353550625302200490250250469644385435704473320216180445951034516734651039572877217623523678321095540647886521844470732516340592329742580733901037747526577423012962905375439542102158574699788176997207496067909021708934327802510805071214842778560396544016151339431000129794655106965877512129412177856117996380800547297415549641928734219392348379887526014419979285010109389430680886646197354242308337751878659436167857533157617775468831836314715245319954752410336830247714763183154531636247318485029924731810679698302537002472633558390111294984650064158280902275685795689034550463063360628439068894976129651445127862905640518731531536726706903076661926029408015916024200874074868418073171110498531031406348104651339079049934744240022711074457099797097543447866559516544323902505527330142415175921626472513403506994034707308185570984974770736578165038964709122105235504296557

In [18]:
import math

def normalize_number(fraction):
    """Scale a non-negative fraction to an integer with a whole-byte scale factor.

    Args:
        fraction: a rational value exposing ``numerator`` (e.g. ``Fraction``).

    Returns:
        ``(number, precision)`` where ``precision`` is the number of bytes
        needed to hold ``fraction.numerator`` and ``number`` is
        ``int(fraction * 2 ** (8 * precision))``.

    Raises:
        ValueError: if the numerator is negative.
    """
    numerator = fraction.numerator
    if numerator < 0:
        raise ValueError(f'normalize_number requires a non-negative value, got {fraction!r}')

    # Exact ceil(log2(numerator + 1) / 8). The floating-point logarithm loses
    # the low bits of large numerators and can under-count by one byte.
    precision = (numerator.bit_length() + 7) // 8
    number = int((1 << (8 * precision)) * fraction)

    return number, precision

In [175]:
# Scale the encoded fraction to an integer `x`; `precision` is the byte count
# normalize_number used for the scale factor 2 ** (8 * precision).
x, precision = normalize_number(encoded)
# Bit length of x divided by 8, i.e. its approximate size in bytes
# (shown as the cell result).
math.log2(x) / 8

5575.997408150627

In [None]:
class Serializer:
    # NOTE(review): only the constructor is visible in this cell; the intended
    # read/write API is not shown, so nothing beyond it is documented here.

    def __init__(self, file_name):
        """Open ``file_name`` and keep the file object on ``self.stream``."""
        # open() is called without a mode, so the file is opened for reading as text.
        # NOTE(review): nothing visible closes the stream — confirm that a
        # close()/context-manager path exists or is planned.
        self.stream = open(file_name)