# Ejercicio EvCont4 (b) - Compresión de texto con LZZ7

* Óscar Sementé Solà
* Abdelkarim Azzouguagh Ouniri
* Rodrigo Cabezas Quirós

In [50]:
import math, re, time

class LZZ7Compressor:
    
    def __init__(self, win, wsld):
        self.win = win
        self.wsld = wsld
        self.win_b = int(math.log2(win))
        self.wsld_b = int(math.log2(wsld))
    
    def __is_power_of_two(self, v):
        return math.log2(v).is_integer()
    
    def __ld_to_bin(self, l, d, useLD):
        """
        Transforma (L, D) a formato binario de longitud fija.
        """
        b_win = lambda x : ''.join(reversed( [str((x >> i) & 1) for i in range(self.win_b)]))
        b_wsld = lambda x : ''.join(reversed( [str((x >> i) & 1) for i in range(self.wsld_b)]))
        if useLD:
            return (l, d)
        return b_win(l) + b_wsld(d)
    
    def __match_pattern(self, slide, inp, useLD):
        """
        Busca un patrón de la ventana de entrada en la ventana deslizante. Si lo encuentra devuelve
        (L, D) en decimal o en formato binario (según valor de 'useLD') y la longitud del patrón
        encontrado. Si no lo encuentra devuelve False y 1 (incremento del offset).
        """
        occ, seqs = [], [inp[:i] for i in range(len(inp), 0, -1)]
        while seqs:
            current = seqs.pop(0)
            occ = [(x.start(0), x.end(0)) for x in re.finditer(current, slide)]
            if occ:
                si, ei = occ[-1]
                return self.__ld_to_bin(len(slide[si:ei]), len(slide) - si, useLD), ei - si
        return False, 1
    
    def compress_bin(self, m, useLD=False):
        """
        Compresión de una string binaria a partir de la configuración del compresor.
        """
        if not self.win <= self.wsld and not len(m) >= self.win + self.wsld:
            """
            Que los bits de conf. sean potencia de 2 no tiene ningún sentido. El ejercicio evaluado en
            clase tenía por parametros 8 y 6 bits. Aplicando la norma de la potencia de 2, con 6 bits no
            funcionaria el algoritmo. Así que hemos aplicado esta política.
            or \ not self.__is_power_of_two(self.win) \ or not self.__is_power_of_two(self.wsld):
            """
            return -1
        # Se aplica método de inserción de bit, para los casos problematicos.
        offset, m = 0, self.__add_insertion_bits(m)
        ret = [m[:self.wsld]]
        while offset + self.wsld + self.win <= len(m):
            # Búsqueda de patrón de la ventana de entrada en la deslizante.
            slide = m[offset:offset + self.wsld]
            inp = m[offset + self.wsld:offset + self.wsld + self.win]
            found, flen = self.__match_pattern(slide, inp, useLD)
            if found:
                # Si se encuentra se añade al retorno.
                ret.append(found)
            else:
                # Si no se guarda el símbolo (en formato (L, D) tal y como especifica el enunciado).
                ret.append(self.__ld_to_bin(1, 1, useLD))
            offset -= -flen
        # Si al acabar quedan bits fuera de las ventanas, se añaden al final de la cadena.
        if offset < len(m):
            ret.append(m[offset + self.wsld:])
        if useLD:
            return ret
        return "".join(ret)
    
    def uncompress_bin(self, m):
        """
        Descompresión de una string binaria a partir de la configuración del compresor.
        """
        ret = m[:self.wsld]
        div = [m[i:i+self.wsld_b+self.win_b] for i in range(self.wsld, len(m), self.wsld_b + self.win_b)]
        lf = lambda x: 2**self.win_b if x == 0 else x
        df = lambda x: 2**self.wsld_b if x == 0 else x
        while div:
            c = div.pop(0)
            if len(c) < self.wsld_b + self.win_b:
                ret = ret + c
            else:
                l, d = lf(int(c[:self.win_b], 2)), df(int(c[self.win_b:], 2))
                ret = ret + ret[len(ret) - d: len(ret) + l - d]
        return self.__remove_insertion_bits(ret)
    
    def __add_insertion_bits(self, m):
        """
        Aplica método de inserción de bit para las entradas con un solo carácter.
        """
        asd = [m[i:i + self.wsld - 1] for i in range(0, len(m), self.wsld - 1)]
        opposite = lambda x: "0" if x == "1" else "1"
        for i in range(len(asd)):
            if asd[i] == asd[i][0] * len(asd[i]) and len(asd[i]) == self.wsld - 1:
                asd[i] = asd[i] + opposite(asd[i][0])
        return "".join(asd)
    
    def __remove_insertion_bits(self, m):
        """
        Desaplica método de inserción de bit para las entradas con un solo carácter.
        """
        opposite = lambda x: "0" if x == "1" else "1"
        for i in range(0, len(m), self.wsld):
            p = m[i:i + self.wsld]
            if p[:-1] == p[0] * len(p[:-1]) and p[-1] == opposite(p[0]):
                m = m[:i + self.wsld-1] + m[i+self.wsld:]
        return m
    
    def __text_to_bin(self, m):
        return "".join([bin(i)[2:].zfill(8) for i in bytearray(m, "utf-8")])
    
    def __bin_to_text(self, b):
        return "".join([chr(int(b[i:i+8], 2)) for i in range(0, len(b), 8)])
    
    def compress_text(self, m):
        startTime = time.time()
        b = self.__text_to_bin(m)
        c = self.compress_bin(b)
        print("[Info] Compression took {} seconds".format(time.time() - startTime))
        return c
    
    def uncompress_text(self, m):
        startTime = time.time()
        uBin = self.uncompress_bin(m)
        u = self.__bin_to_text(uBin)
        print("[Info] Uncompression took {} seconds".format(time.time() - startTime))
        return u
    
    def compress_text_from_file(self, fn):
        with open(fn, "r") as f:
            text =  f.read()
        f.close()
        return self.compress_text(text)

#### Apartado 1] Modificar el compresor para que sea capaz de leer ficheros de texto y devolverlos como una string binaria procesable por vustro compresor LZZ7. Ha de calcular el tiempo de compresión y descompresión. Comprobad el correcto funcionamiento del programa.

In [65]:
comp = LZZ7Compressor(4, 8)
msg = "It is practically impossible to teach good programming to students that have had a prior exposure to BASIC: as potential programmers they are mentally mutilated beyond hope of regeneration."
asd = comp.compress_text(msg)
print("\nCompressed:\t{}\n\nUncompressed:\t{}".format(asd, comp.uncompress_text(asd)))

[Info] Compression took 0.007001638412475586 seconds
[Info] Uncompression took 0.0010001659393310547 seconds

Compressed:	01001001100100100111100010100100110010001010011110000100101101110000001110010010011100101111101001010010000000000111010000010100111100000011111000001110001010010011100000111111011111000101000001001000000110110101101001000000000011101001111101111011100101111110000100101101100000110111110110100010011001011000110110010111100111010000000000110111001101010110111100000111000001101100000111001001000101101001101101111010010100110100000001111110100101011011010011100101111101001010011010000000111011001110010000001001000000001110000010011010100100110010001100000001111110001010010101110111111100000000000000001011101001001100101001101011101001010010000000000111010000011011001011001111100000001010010101110111110011110000001101100000000000000011101000001101111100000001110000101100111001011111010010100110100000001111110100101011011010011100101111101001010010010100000101000101001001

In [57]:
fname = "hamlet_short.txt"
asd = comp.compress_text_from_file(fname)
comp.uncompress_text(asd)

[Info] Compression took 0.03600263595581055 seconds
[Info] Uncompression took 0.009000539779663086 seconds


'BERNARDO Who\'s there?\nFRANCISCO Nay, answer me: stand, and unfold yourself.\nBERNARDO Long live the king!\nFRANCISCO Bernardo?\nBERNARDO He.\nFRANCISCO You come most carefully upon your hour.\nBERNARDO \'Tis now struck twelve; get thee to bed, Francisco.\nFRANCISCO For this relief much thanks: \'tis bitter cold, And I am sick at heart.\nBERNARDO Have you had quiet guard?\nFRANCISCO Not a mouse stirring. %!"©\' ©"\'\x90+²¶6\x16\x103··²\x1074³´:\x17\x10$³\x10<·º\x9027\x906²²º\x10$7¹0º4·\x900·2\x10&°¹1²¶6:¹\x96\x10*42\x9094»0¶9\x907³\x106¼\x90;°º1´\x16\x1014²\x10:42¶\x906°µ²\x9040¹º2\x97\x05#) §!¤©¡§\x90$\x90:44·5\x90$\x9042°¹\x10:42¶\x97\x10)º0·2\x16\x1047\x90\x90+´7\x93¹\x90:42¹2\x9f\x85\x05"·:2¹\x10$\'© ª$§\x900·2\x10&\xa0©!¢¦&*©\x85\x05$\'© ª$§\x90#94²·29\x90:7\x90:44¹\x903¹7º·2\x17\x05&\xa0©!¢¦&*©\x90 ·2\x1064²³²¶²·\x10:7\x90:42\x90"0·2\x97\x05#) §!¤©¡§\x90#´»2\x90<·º\x903··²\x1074³´:\x17\x05&\xa0©!¢¦&*©\x90\'\x96\x1030¹2»²¶6\x16\x1047·2¹º\x109·¶24²¹\x1d\x10+´7\x9040º4\x1092¶4²»2²

In [53]:
fname = "quijote_short.txt"
asd = comp.compress_text_from_file(fname)
print(comp.uncompress_text(asd))

[Info] Compression took 0.03618574142456055 seconds
[Info] Uncompression took 0.008816957473754883 seconds
En un lugar de la Mancha, de cuyo nombre no quiero acordarme, no ha mucho tiempo que vivÃ­a un hidalgo de los de lanza en astillero, adarga antigua, rocÃ­n flaco y galgo corredor. %*·07¶60220¶3·6áÐ¹;0±°8º²1°¹72¹79°¶84±áÙ·60¹6áÐ¹77±´2¹2:²¶7¹<8º²±90·:7¹67¹9áÐ±0²7¹62·:2µ0¹67¹;4²¹72¹0¶3áÝ780¶7¶´·7220áØ°²4²:¹067¹27¶´·3·¹1··9º¶áÖ°·60¹:92¹80¹:2¹229º40±´²·20                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      