## ハフマン符号を生成する


In [16]:
import heapq
from collections import defaultdict


class Node:
    def __init__(self, char, freq):
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):
        return self.freq < other.freq


def calculate_frequency(s):
    total = len(s)
    frequency = defaultdict(int)

    for char in s:
        frequency[char] += 1

    for char, freq in frequency.items():
        frequency[char] = freq / total * 100  # Convert to percentage

    return frequency


def build_huffman_tree(frequency):
    heap = [Node(char, freq) for char, freq in frequency.items()]
    heapq.heapify(heap)

    while len(heap) > 1:
        left = heapq.heappop(heap)
        right = heapq.heappop(heap)

        merged = Node(None, left.freq + right.freq)
        merged.left = left
        merged.right = right

        heapq.heappush(heap, merged)

    return heap[0]


def generate_huffman_codes(node, code="", mapping=None):
    if mapping is None:
        mapping = {}

    if node is not None:
        if node.char is not None:
            mapping[node.char] = code

        generate_huffman_codes(node.left, code + "0", mapping)
        generate_huffman_codes(node.right, code + "1", mapping)

    return mapping


def adjust_codes(huffman_codes, fixed_code):
    for char, code in huffman_codes.items():
        if code.startswith(fixed_code):
            huffman_codes[char] = "1" + code
    return huffman_codes


def calculate_compression_ratio(frequency, huffman_codes):
    huffman_length = sum(
        frequency[char] * len(huffman_codes[char]) for char in huffman_codes
    )
    fixed_length = 3 * 100  # 2 bits for each character, and the total frequency is 100%
    return (huffman_length / fixed_length) * 100


def encode_string(s, huffman_codes):
    return sum(len(huffman_codes[char]) for char in s)


def huffman_coding(frequency):
    huffman_tree = build_huffman_tree(frequency)
    huffman_codes = generate_huffman_codes(huffman_tree)
    return huffman_codes

In [17]:
##各文字とその出現頻度を渡すと、ハフマン符号を返す関数
if __name__ == "__main__":
    frequency = {"A": 10, "C": 15, "G": 15, "H": 5, "P": 5, "T": 50}
    huffman_codes = huffman_coding(frequency)
    print("各文字と頻度:", frequency)
    print("ハフマンコード:", huffman_codes)
    ##ハフマンコードで文字列をエンコードすると、各文字に2bitずつ割り当てた場合の約何%の長さとなるかを返す関数
    compression_ratio = calculate_compression_ratio(frequency, huffman_codes)
    print(f"Compression ratio compared to 2-bit fixed length: {compression_ratio:.2f}%")

各文字と頻度: {'A': 10, 'C': 15, 'G': 15, 'H': 5, 'P': 5, 'T': 50}
ハフマンコード: {'T': '0', 'A': '100', 'P': '1010', 'H': '1011', 'G': '110', 'C': '111'}
Compression ratio compared to 2-bit fixed length: 70.00%


In [18]:
##文字列を与えるとその文字列のハフマン符号を返す
if __name__ == "__main__":
    s = "GGCCGGGCGCGGTGGCTCACGCCTGTAATC"
    frequency2 = calculate_frequency(s)
    huffman_codes2 = huffman_coding(frequency2)
    ##元の文字列を生成した符号でエンコードした結果の bit 長を返す関数
    encoded_length = encode_string(s, huffman_codes2)
    print("各文字と頻度:", frequency2)
    print("ハフマンコード:", huffman_codes2)
    print(f"Encoded string length: {encoded_length} bits")

各文字と頻度: defaultdict(<class 'int'>, {'G': 40.0, 'C': 33.33333333333333, 'T': 16.666666666666664, 'A': 10.0})
ハフマンコード: {'G': '0', 'A': '100', 'T': '101', 'C': '11'}
Encoded string length: 56 bits


In [19]:
## 平均bit数を計算する関数
def average_bits(frequency, huffman_codes):
    return sum(frequency[char] * len(huffman_codes[char]) for char in frequency)


## 文字列全体をハフマン符号化した結果を返す関数
def huffman_code_to_string(s, huffman_codes):
    return "".join(huffman_codes[char] for char in s)


print("文字列全体をハフマン符号化した結果:", huffman_code_to_string(s, huffman_codes2))
print("平均bit数:", average_bits(frequency, huffman_codes))

文字列全体をハフマン符号化した結果: 00111100011011001010011101111001101111101010110010010111
平均bit数: 210


文字に符号の指定がある場合


In [20]:
def huffman_coding_with_fixed_code(frequency, fixed_char, fixed_code):
    total_freq_excluding_fixed = sum(
        freq for char, freq in frequency.items() if char != fixed_char
    )
    adjusted_frequency = {
        char: (freq / total_freq_excluding_fixed) * 100
        for char, freq in frequency.items()
        if char != fixed_char
    }

    # ハフマンツリーを構築
    huffman_tree = build_huffman_tree(adjusted_frequency)

    # ハフマンコードを生成
    huffman_codes = generate_huffman_codes(huffman_tree)

    # 固定されたコードを追加
    huffman_codes[fixed_char] = fixed_code

    return huffman_codes


# 各文字とその出現頻度
frequency = {"A": 10, "C": 15, "G": 15, "H": 5, "P": 5, "T": 50}

# ハフマン符号の生成（Aに000を割り当てる）
huffman_codes = {"A": "100", "C": "010", "G": "110", "H": "1000", "P": "0000", "T": "1"}
print("ハフマンコード:", huffman_codes)
compression_ratio = calculate_compression_ratio(frequency, huffman_codes)
print(f"Compression ratio compared to 2-bit fixed length: {compression_ratio:.2f}%")

ハフマンコード: {'A': '100', 'C': '010', 'G': '110', 'H': '1000', 'P': '0000', 'T': '1'}
Compression ratio compared to 2-bit fixed length: 70.00%
