Krishna Sharma | AP22110010128

In [1]:
documents = {
    1: "data algorithm structure",
    2: "database query system", 
    3: "algorithm analysis optimization",
    4: "machine learning data model",
    5: "network protocol security",
    6: "operating system process scheduling",
    7: "computer architecture hardware",
    8: "data mining machine learning",
}

BLOCKING DICTIONARY STRING COMPRESSION

In [2]:
vocabulary = set()
for text in documents.values():
    words = text.split()
    for word in words:
        vocabulary.add(word)

In [3]:
word_to_index = {}
for i, word in enumerate(vocabulary):
    word_to_index[word] = i

print(f"\nWord to Index Mapping:")
for word, index in word_to_index.items():
    print(f"'{word}' -> {index}")


Word to Index Mapping:
'structure' -> 0
'process' -> 1
'security' -> 2
'system' -> 3
'operating' -> 4
'hardware' -> 5
'architecture' -> 6
'algorithm' -> 7
'protocol' -> 8
'query' -> 9
'optimization' -> 10
'mining' -> 11
'network' -> 12
'database' -> 13
'learning' -> 14
'analysis' -> 15
'model' -> 16
'data' -> 17
'scheduling' -> 18
'machine' -> 19
'computer' -> 20


In [4]:
compressed_docs = {}
for doc_id, text in documents.items():
    words = text.split()
    compressed = []
    for word in words:
        compressed.append(word_to_index[word])
    compressed_docs[doc_id] = compressed

In [None]:
print(f"\nCompressed Documents:")
for doc_id, compressed in compressed_docs.items():
    print(f"Doc {doc_id}: {compressed}")


Compressed Documents:
Doc 1: [17, 7, 0]
Doc 2: [13, 9, 3]
Doc 3: [7, 15, 10]
Doc 4: [19, 14, 17, 16]
Doc 5: [12, 8, 2]
Doc 6: [4, 3, 1, 18]
Doc 7: [20, 6, 5]
Doc 8: [17, 11, 19, 14]


In [6]:
original_size = 0
for text in documents.values():
    original_size += len(text)

compressed_size = 0
for compressed in compressed_docs.values():
    compressed_size += len(compressed)

In [7]:
print(f"Original size (characters): {original_size}")
print(f"Compressed size (indices): {compressed_size}")
print(f"Compression ratio: {original_size/compressed_size:.2f}:1")

Original size (characters): 221
Compressed size (indices): 27
Compression ratio: 8.19:1


VARIABLE BYTE ENCODING

In [8]:
numbers = [1, 5, 127, 128, 255, 256, 1000, 2047]

In [None]:
vb_encoded = {}
for num in numbers:
    if num == 0:
        vb_encoded[num] = [128]  
    else:
        bytes_list = []
        temp_num = num
        
        while temp_num >= 128:
            bytes_list.insert(0, temp_num % 128)  
            temp_num = temp_num // 128
        
        bytes_list.insert(0, temp_num + 128)
        vb_encoded[num] = bytes_list

In [10]:
print("Variable Byte Encoded:")
for num, encoded in vb_encoded.items():
    binary_repr = []
    for byte_val in encoded:
        binary_repr.append(f"{byte_val:08b}")
    print(f"{num:4d} -> {encoded} -> {' '.join(binary_repr)}")

Variable Byte Encoded:
   1 -> [129] -> 10000001
   5 -> [133] -> 10000101
 127 -> [255] -> 11111111
 128 -> [129, 0] -> 10000001 00000000
 255 -> [129, 127] -> 10000001 01111111
 256 -> [130, 0] -> 10000010 00000000
1000 -> [135, 104] -> 10000111 01101000
2047 -> [143, 127] -> 10001111 01111111


In [12]:
print("Variable Byte Decoding:")
for num, encoded in vb_encoded.items():
    decoded = 0
    for byte_val in encoded:
        if byte_val >= 128:
            decoded = decoded * 128 + (byte_val - 128)
        else:
            decoded = decoded * 128 + byte_val
    print(f"{encoded} -> {decoded}")

Variable Byte Decoding:
[129] -> 1
[133] -> 5
[255] -> 127
[129, 0] -> 128
[129, 127] -> 255
[130, 0] -> 256
[135, 104] -> 1000
[143, 127] -> 2047


GAMMA CODES

In [13]:
numbers = [1, 5, 127, 128, 255, 256, 1000, 2047]

In [14]:
gamma_encoded = {}
for num in numbers:
    if num <= 0:
        print(f"Cannot encode {num} with Gamma code (must be positive)")
        continue
    
    temp = num
    bit_length = 0
    while temp > 1:
        temp = temp // 2
        bit_length += 1
    
    unary_prefix = '0' * bit_length + '1'
    binary_suffix = bin(num)[3:]  
    
    gamma_code = unary_prefix + binary_suffix
    gamma_encoded[num] = gamma_code

In [None]:
print("Gamma Encoded:")
for num, encoded in gamma_encoded.items():
    print(f"{num:4d} -> {encoded} (length: {len(encoded)} bits)")

Gamma Encoded:
   1 -> 1 (length: 1 bits)
   5 -> 00101 (length: 5 bits)
 127 -> 0000001111111 (length: 13 bits)
 128 -> 000000010000000 (length: 15 bits)
 255 -> 000000011111111 (length: 15 bits)
 256 -> 00000000100000000 (length: 17 bits)
1000 -> 0000000001111101000 (length: 19 bits)
2047 -> 000000000011111111111 (length: 21 bits)


In [16]:
print("Gamma Decoding:")
for num, encoded in gamma_encoded.items():
    leading_zeros = 0
    for bit in encoded:
        if bit == '0':
            leading_zeros += 1
        else:
            break

    binary_suffix = encoded[leading_zeros + 1:]
    
    if binary_suffix:
        binary_repr = '1' + binary_suffix
        decoded = int(binary_repr, 2)
    else:
        decoded = 1 
    
    print(f"{encoded} -> {decoded}")

Gamma Decoding:
1 -> 1
00101 -> 5
0000001111111 -> 127
000000010000000 -> 128
000000011111111 -> 255
00000000100000000 -> 256
0000000001111101000 -> 1000
000000000011111111111 -> 2047


Krishna Sharma | AP22110010128