# Debug the Jupyter Notebook

from IPython.core.debugger import set_trace

(https://medium.com/@chrieke/jupyter-tips-and-tricks-994fdddb2057)
# 

In [39]:
#-------------------------------------------------------------------------------
# Trivial compression and decompression class
#-------------------------------------------------------------------------------
class CompressedGene:
  """Store a nucleotide string (A, C, G, T) packed two bits per letter.

  The packed form is a single int, ``bit_str``: a leading sentinel ``1`` bit
  followed by one two-bit code per letter, with the first letter in the most
  significant position. The sentinel keeps leading ``A`` (``00``) codes from
  vanishing, so the letter count can be recovered from ``bit_length()``.
  """

  # Two-bit code assigned to each nucleotide letter.
  _BASE_TO_BITS = {"A": 0b00, "C": 0b01, "G": 0b10, "T": 0b11}
  # Inverse mapping: the letter for code ``i`` is ``_BITS_TO_BASE[i]``.
  _BITS_TO_BASE = "ACGT"

  def __init__(self, gene: str) -> None:
    """Compress *gene* immediately and keep only the packed form.

    Raises:
      ValueError: if *gene* contains a letter other than A, C, G or T
        (checked after upper-casing).
    """
    self._compress(gene)

  def decompress(self) -> str:
    """Return the nucleotide string encoded in ``self.bit_str``.

    Also prints one diagnostic line with the bit length and letter count;
    this happens on every call, including calls made through ``str()``.
    """
    bit_len: int = self.bit_str.bit_length()

    # bit_len is 1 (sentinel) + 2 per letter; integer arithmetic recovers
    # the number of two-bit groups without going through a float.
    shifts: int = (bit_len + 1) // 2 - 1

    print("bit_str.bit_length(): {},  shifts: {}".format(bit_len, shifts))

    # Walk from the most significant pair down to the least significant one
    # so the letters come out in their original order. A value masked with
    # 0b11 is always 0..3, so every code maps to a letter.
    return "".join(
      self._BITS_TO_BASE[(self.bit_str >> (2 * pos)) & 0b11]
      for pos in range(shifts - 1, -1, -1)
    )

  def __str__(self) -> str:
    """Return the decompressed string (delegates to ``decompress``)."""
    return self.decompress()

  def _compress(self, gene: str) -> None:
    """Pack *gene* into ``self.bit_str``, two bits per letter.

    The input is upper-cased first. ``self.bit_str`` is assigned only after
    every letter has been validated.

    Raises:
      ValueError: on the first letter that is not A, C, G or T.
    """
    packed: int = 1  # sentinel bit marking the start of the sequence

    for letter in gene.upper():
      code = self._BASE_TO_BITS.get(letter)
      if code is None:
        raise ValueError("Invalid letter: {}".format(letter))
      # Make room for two more bits, then append this letter's code.
      packed = (packed << 2) | code

    self.bit_str: int = packed

  def getCompressed(self) -> int:
    """Return the packed int (the same value as the ``bit_str`` attribute)."""
    return self.bit_str


#-------------------------------------------------------------------------------
# Program, main function                                                      --
#-------------------------------------------------------------------------------
if __name__ == "__main__":
  from sys import getsizeof

  # Debugger hook for notebook sessions: uncomment set_trace() to break here.
  from IPython.core.debugger import set_trace
  # set_trace()

  # "ATGC" packs to 0b100111001: sentinel 1, then A=00 T=11 G=10 C=01.
  # getsizeof reports the size of the whole Python object, so for a short
  # sequence like this the saving is small; try "ATGTCAT" * 2 for a longer one.
  origin: str = "ATGC"

  compressed: CompressedGene = CompressedGene(origin)
  packed: int = compressed.getCompressed()

  # Report object sizes before/after and show the packed bits.
  size_report: str = (
    "The original Gene Sequence bytes: {}, bytes after compressed: {}, compressed to {}"
    .format(getsizeof(origin), getsizeof(packed), bin(compressed.bit_str))
  )
  print(size_report)

  # Formatting `compressed` goes through __str__, i.e. it decompresses.
  roundtrip_report: str = (
    "decompressing the origin string to {} for the origin string {}"
    .format(compressed, origin)
  )
  print(roundtrip_report)

  # Round-trip check: the decompressed text must equal the input.
  is_same: bool = origin == compressed.decompress()
  print("origin: {}, compressed to {}, after decompression, they are same: {}"
        .format(origin, bin(packed), is_same))


The original Gene Sequence bytes: 53, bytes after compressed: 28, compressed to 0b100111001
bit_str.bit_length(): 9,  shifts: 4
decompressing the origin string to ATGC for the origin string ATGC
bit_str.bit_length(): 9,  shifts: 4
origin: ATGC, compressed to 0b100111001, after decompression, they are same: True
