In [1]:
import sys
from random import choice

# Size in bytes of common data types

In [2]:
# One sample value per built-in type whose object size is inspected below.
integer, flo = 1, 1.0
char, string = 'a', 'hello'
boolean = True

In [3]:
# Size in bytes of the int object 1.
sys.getsizeof(integer)

28

In [4]:
# Size in bytes of the float object 1.0.
sys.getsizeof(flo)

24

In [5]:
# Size in bytes of a one-character str ('a').
sys.getsizeof(char)

50

In [6]:
# Size in bytes of a five-character str ('hello').
sys.getsizeof(string)

54

In [7]:
# Size in bytes of the bool True (the recorded output equals that of the int 1).
sys.getsizeof(boolean)

28

In [8]:
# Size in bytes of a three-nucleotide gene held as a plain str.
sys.getsizeof('ATG')

52

# Binary numbers

In [9]:
# bin() renders an int as a '0b'-prefixed binary string; negatives get a leading '-'.
print(f'bin( 0):   {bin(0)}')
print(f'bin( 1):   {bin(1)}')
print(f'bin( 3):   {bin(3)}')
print(f'bin(-1):  {bin(-1)}')

bin( 0):   0b0
bin( 1):   0b1
bin( 3):   0b11
bin(-1):  -0b1


In [10]:
# bin() returns a str, not an int.
type(bin(1))

str

In [11]:
# Round trip: parse the binary string produced by bin() back into an int (base 2).
int(bin(8), 2)

8

## Bitwise left shift: x << y
This returns x with its bits shifted to the left by y places; the new bits on the right side are zeros.

This is equivalent to $x \thinspace 2^y$.

In [12]:
# Shifting left by one place appends a zero bit on the right.
print(f'bin(5)   :   {bin(5)}')
print(f'bin(5<<1):   {bin(5 << 1)}')

bin(5)   :   0b101
bin(5<<1):   0b1010


## Bitwise right shift: x >> y
This will shift x to the right by y places.

This is equivalent to $x \thinspace // \thinspace 2^y$.

In [13]:
# Shifting right by one place drops the lowest bit.
print(f'bin(5)   :   {bin(5)}')
print(f'bin(5>>1):   {bin(5 >> 1)}')

bin(5)   :   0b101
bin(5>>1):   0b10


## Bitwise AND: x & y
Each bit in the output is 1 if both bits are 1 in the input, else 0.

In [14]:
# Only the bit set in both operands survives the AND.
print(f'bin(5)  :   {bin(5)}')
print(f'bin(6)  :   {bin(6)}')
print(f'bin(5&6):   {bin(5 & 6)}')

bin(5)  :   0b101
bin(6)  :   0b110
bin(5&6):   0b100


## Bitwise OR: x | y
Each bit in the output is 1 if at least one bit in the input is 1, else 0.

In [15]:
# Every bit set in either operand is set in the OR.
print(f'bin(5)  :   {bin(5)}')
print(f'bin(6)  :   {bin(6)}')
print(f'bin(5|6):   {bin(5 | 6)}')

bin(5)  :   0b101
bin(6)  :   0b110
bin(5|6):   0b111


## Bitwise complement: ~x
This returns the complement of x, i.e. the number you get by flipping each 1 to 0 and vice versa. It is the same as $-x - 1$.

In [16]:
# The label names the expression that is actually printed: ~5 == -5 - 1 == -6.
print('bin(~5):', bin(~5))

bin(5) : -0b110


## Bitwise XOR: x ^ y
Each bit in the output is 1 if the corresponding bits in the inputs are different, else 0.

In [17]:
# Bits that differ between the operands are set; the shared high bit cancels out.
print(f'bin(5)  :   {bin(5)}')
print(f'bin(6)  :   {bin(6)}')
print(f'bin(5^6):   {bin(5 ^ 6)}')

bin(5)  :   0b101
bin(6)  :   0b110
bin(5^6):   0b11


# Convert nucleotide string to bit string

In [18]:
class CompressedGene:
    """Nucleotide sequence packed into a single int, two bits per nucleotide.

    The packed value lives in ``bitstr``: a sentinel 1 bit followed by one
    2-bit code per nucleotide in sequence order (A=00, G=01, T=10, C=11).
    The sentinel keeps leading ``A`` codes (00) from disappearing as leading
    zeros. ``str(obj)`` decodes the value back to the upper-case sequence.
    """

    # 2-bit code assigned to each nucleotide.
    _CODES = {'A': 0b00, 'G': 0b01, 'T': 0b10, 'C': 0b11}
    # Inverse table: the character at index k is the nucleotide with code k.
    _NUCLEOTIDES = 'AGTC'

    def __init__(self, gene: str) -> None:
        """Compress ``gene``, a case-insensitive string over A, C, G and T.

        Raises:
            ValueError: if ``gene`` contains any other character.
        """
        self._compress(gene)

    def _compress(self, gene: str) -> None:
        """Encode ``gene`` into ``self.bitstr``.

        ``self.bitstr`` is assigned only after the whole sequence has been
        validated, so a failed call never leaves a half-built value behind.

        Raises:
            ValueError: on the first character that is not A, C, G or T
                (after upper-casing).
        """
        bitstr = 1  # sentinel bit marking the start of the sequence
        for nucleotide in gene.upper():
            code = self._CODES.get(nucleotide)
            if code is None:
                raise ValueError(f'Invalid nucleotide: {nucleotide}')
            # make room for two more bits, then write the code into them
            bitstr = (bitstr << 2) | code
        self.bitstr: int = bitstr

    def _inflate(self) -> str:
        """Decode ``self.bitstr`` back into the upper-case nucleotide string."""
        bitstr = self.bitstr
        # Each even shift below the sentinel bit is the start of one 2-bit code.
        # Walking the shifts from high to low yields the nucleotides in their
        # original order, so no final reversal is needed.
        shifts = reversed(range(0, bitstr.bit_length() - 1, 2))
        # `& 0b11` can only produce 0..3, so the table index is always valid.
        return ''.join(self._NUCLEOTIDES[bitstr >> shift & 0b11] for shift in shifts)

    def __str__(self) -> str:
        """Return the decompressed nucleotide string."""
        return self._inflate()

In [19]:
# Time building a 1,000,000-character random gene with a list comprehension and ''.join.
%timeit gene = ''.join([choice('ATGC') for _ in range(1000000)])

629 ms ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
def random_gene(length: int = 1000000) -> str:
    """Return a random nucleotide string of ``length`` characters from 'ATGC'.

    The string is grown one character at a time with ``+=`` so that this
    construction strategy can be timed against the ``''.join`` one-liner
    used elsewhere in the notebook.

    Args:
        length: number of nucleotides to generate (default 1,000,000).
    """
    gene = ''
    for _ in range(length):
        gene += choice('ATGC')
    return gene

In [21]:
# Time the += loop version (random_gene) for comparison with the ''.join timing.
%timeit gene = random_gene()

685 ms ± 8.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# Random 1,000,000-nucleotide gene used by the size comparison that follows.
gene = ''.join(choice('ATGC') for _ in range(1000000))

In [27]:
# Compress the random gene in this cell so `compressed` exists on a fresh kernel
# (Restart & Run All), then compare the size of the packed int with the plain str.
# NOTE(review): compressing 1,000,000 nucleotides repeatedly shifts a growing int
# and may take noticeably longer than the other cells.
compressed = CompressedGene(gene)
print(f'size of encoded gene (bytes): {sys.getsizeof(compressed.bitstr)/1000}KB')
print(f'size of original gene (bytes): {sys.getsizeof(gene)/1000}KB')

size of encoded gene (bytes): 266.692KB
size of original gene (bytes): 1000.049KB


In [28]:
# Starting from 1, a left shift by two opens two zero bits for the next code.
print(f'bin(1)   : {bin(1)}')
print(f'bin(1<<2): {bin(1 << 2)}')

bin(1)   : 0b1
bin(1<<2): 0b100


In [29]:
# Shift 0: the lowest two bits are read (>> binds tighter than &).
print(f'bin(0b1001011 >> 0): {bin(0b1001011 >> 0)}')
print(f'bin(0b1001011 >> 0 & 0b11): {bin((0b1001011 >> 0) & 0b11)}')

bin(0b1001011 >> 0): 0b1001011
bin(0b1001011 >> 0 & 0b11): 0b11


In [30]:
# Shift 2: the next pair of bits moves into the lowest two positions.
print(f'bin(0b1001011 >> 2): {bin(0b1001011 >> 2)}')
print(f'bin(0b1001011 >> 2 & 0b11): {bin((0b1001011 >> 2) & 0b11)}')

bin(0b1001011 >> 2): 0b10010
bin(0b1001011 >> 2 & 0b11): 0b10


In [31]:
# Shift 4: only the leading 1 bit and the highest pair (00) remain; the mask yields 0.
print(f'bin(0b1001011 >> 4): {bin(0b1001011 >> 4)}')
print(f'bin(0b1001011 >> 4 & 0b11): {bin((0b1001011 >> 4) & 0b11)}')

bin(0b1001011 >> 4): 0b100
bin(0b1001011 >> 4 & 0b11): 0b0
