# notebooks.classification_task

We start with a bit-string similarity problem related to protozoan mitochondrial genetic codes extracted from https://www.bioinformatics.org/sms2/genetic_code.html.

If needed, execute the following commands
```
!git clone https://github.com/leonardoLavagna/qhdc
import os
os.chdir('qhdc')
!pip install -r requirements.txt
```

## Setup and data

In [2]:
import math
import hashlib
import operator
from functions.patterns_utilities import encode_bitstring

In [3]:
# Three 64-symbol masks over the alphabet {"-", "M"}; presumably the
# start-codon rows of the corresponding genetic-code tables (verify against
# the data source).
YEAST = "----------------------------------MM----------------------------"
PROTOZOAN = "--MM---------------M------------MMMM---------------M------------"
BACTERIAL = "---M---------------M------------MMMM---------------M------------"

# Name -> mask, used to translate a recovered mask back to its label.
codes = {
    "YEAST": YEAST,
    "PROTOZOAN": PROTOZOAN,
    "BACTERIAL": BACTERIAL,
}

In [4]:
def compress(binary_string, k):
    """
    Compress a binary string to a fixed length k using SHA-256 hashing.

    The bit string is packed into bytes (big-endian), the bytes are hashed
    with SHA-256, and the first k bits of the digest are returned.

    Args:
        binary_string (str): The binary string to be compressed. Its length
            must be a multiple of 8; the empty string is accepted and is
            hashed as zero bytes.
        k (int): The length of the compressed binary string. Must lie
            between 0 and 256, the number of bits in a SHA-256 digest.

    Returns:
        str: The compressed binary string of length k.

    Raises:
        AssertionError: If the length of the binary string is not a multiple of 8.
        ValueError: If k is negative or larger than the digest size in bits.
    """
    # Raise explicitly instead of using `assert`: assert statements are
    # removed under `python -O`, which would let malformed input through.
    if len(binary_string) % 8 != 0:
        raise AssertionError("Binary string length must be a multiple of 8")

    # A digest only has 256 bits; slicing beyond that would silently return
    # fewer than k characters.
    digest_bits = hashlib.sha256().digest_size * 8
    if not 0 <= k <= digest_bits:
        raise ValueError(f"k must be between 0 and {digest_bits}, got {k}")

    byte_length = len(binary_string) // 8
    if binary_string:
        byte_data = int(binary_string, 2).to_bytes(byte_length, byteorder='big')
    else:
        # int("", 2) is invalid, so the empty string maps to zero bytes.
        byte_data = b""

    hash_digest = hashlib.sha256(byte_data).digest()
    hash_binary_string = ''.join(format(byte, '08b') for byte in hash_digest)
    return hash_binary_string[:k]


def retrieve_original_from_compressed(compressed_string, lookup_table):
    """
    Look up the uncompressed binary string stored for a compressed one.

    Args:
        compressed_string (str): The compressed binary string used as key.
        lookup_table (Dict): Mapping from compressed strings to the original
            binary strings they were derived from.

    Returns:
        str: The original binary string, or None if the compressed string
        is not a key of the lookup table.
    """
    original = lookup_table.get(compressed_string)
    return original


def find_keys_by_value(d, target_value):
    """
    Collect every key of a dictionary whose value equals the target.

    Args:
        d (Dict): The dictionary to search.
        target_value: The value to compare each stored value against.

    Returns:
        List: The matching keys, in the dictionary's iteration order
        (empty if nothing matches).
    """
    return [name for name in d if d[name] == target_value]

## QuAM-based architecture

In [5]:
def _symbols_to_bits(code):
    """Return the symbols of `code` as a list with "-" -> 0 and "M" -> 1.

    Any other symbol is kept unchanged, as in the element-wise replacement
    this helper stands in for.
    """
    symbol_to_bit = {"-": 0, "M": 1}
    return [symbol_to_bit.get(symbol, symbol) for symbol in code]


# One shared helper replaces three copies of the same index-based loop.
YEAST_bin = _symbols_to_bits(YEAST)
PROTOZOAN_bin = _symbols_to_bits(PROTOZOAN)
BACTERIAL_bin = _symbols_to_bits(BACTERIAL)

In [6]:
# Stored patterns are the YEAST and BACTERIAL bit strings; the PROTOZOAN bit
# string is the query that will be compared against them.
patterns = ["".join(map(str, bits)) for bits in (YEAST_bin, BACTERIAL_bin)]
search = "".join(map(str, PROTOZOAN_bin))
search

'0011000000000000000100000000000011110000000000000001000000000000'

**Remark.** If you try the following

```
result = grover_search(qc=None, x=None, c=None, output=None, xc=None, cc=None, R=None, s=search, patterns=patterns, problem="similarity")
max(result.circuit_results[0], key=result.circuit_results[0].get)
```
you will get an error:

```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-182-82261044df0e> in <cell line: 1>()
----> 1 result = grover_search(qc=None, x=None, c=None, output=None, xc=None, cc=None, R=None, s=search, patterns=patterns, problem="similarity")
      2 max(result.circuit_results[0], key=result.circuit_results[0].get)

1 frames
/usr/local/lib/python3.10/dist-packages/qiskit/quantum_info/states/statevector.py in from_label(cls, label)
    698         # Initialize Z eigenstate vector
    699         num_qubits = len(label)
--> 700         data = np.zeros(1 << num_qubits, dtype=complex)
    701         pos = int(z_label, 2)
    702         data[pos] = 1

ValueError: Maximum allowed dimension exceeded
```
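Hashing is needed: a 64-character label requires a state vector of dimension 2^64, which is too large to allocate, so each bit string is first compressed to k bits. Note that a cryptographic hash such as SHA-256 is not designed to preserve similarity between inputs, so similarities measured on the compressed strings need not reflect the similarity of the original strings.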

In [7]:
# Number of bits kept from each SHA-256 digest.
k = 8
patterns_comp = [compress(pattern, k) for pattern in patterns]
search_comp = compress(search, k)
print(patterns_comp, search_comp)

# Compressed string -> original string. If two patterns share the same
# compressed prefix, the later pattern overwrites the earlier one.
lookup_table = dict(zip(patterns_comp, patterns))
lookup_table

['10011000', '01001110'] 10110001


{'10011000': '0000000000000000000000000000000000110000000000000000000000000000',
 '01001110': '0001000000000000000100000000000011110000000000000001000000000000'}

Now we can apply Grover search to the compressed strings.

In [8]:
from functions.QUAM import *

# Search for the compressed query among the compressed patterns.
result = grover_search(
    qc=None,
    x=None,
    c=None,
    output=None,
    xc=None,
    cc=None,
    R=None,
    s=search_comp,
    patterns=patterns_comp,
    problem="similarity",
)

# Outcomes of the first circuit, reordered by decreasing value.
outcomes = result.circuit_results[0]
sorted_results = {
    state: outcomes[state]
    for state in sorted(outcomes, key=outcomes.get, reverse=True)
}


In [9]:
# Original strings of the outcomes that match a stored pattern, kept in the
# order of sorted_results. Each outcome is looked up once, and the key list is
# no longer rebuilt on every iteration.
recovered_answers = []
for state in sorted_results:
    original = retrieve_original_from_compressed(state, lookup_table)
    if original is not None:
        recovered_answers.append(original)

In [10]:
# Outcome with the largest value; computed once and reused for the lookup and
# for the reported score.
item = max(result.circuit_results[0], key=result.circuit_results[0].get)
original_string = retrieve_original_from_compressed(item, lookup_table)
if original_string is None:
    # The top outcome is not guaranteed to be one of the stored patterns;
    # fail with a clear message instead of a TypeError further down.
    raise ValueError(f"Top outcome {item!r} does not match any compressed pattern")

# Translate the bit string back to the "-"/"M" alphabet used in `codes`.
answer = original_string.translate(str.maketrans("01", "-M"))
key = find_keys_by_value(codes, answer)
# The query string `search` is built from PROTOZOAN, so PROTOZOAN (not YEAST)
# is the subject of the comparison.
print(f'PROTOZOAN is most similar to {key[0]} with similarity {sorted_results[item]}')

YEAST is most similar to YEAST with similarity 0.0347909927368156


In [11]:
# recovered_answers follows the order of sorted_results, so index 1 is the
# second-ranked stored pattern.
other_string = recovered_answers[1]
# Translate the bit string back to the "-"/"M" alphabet used in `codes`.
other_answer = other_string.translate(str.maketrans("01", "-M"))
key = find_keys_by_value(codes, other_answer)
item = find_keys_by_value(lookup_table, other_string)[0]
# The query string `search` is built from PROTOZOAN, so PROTOZOAN (not YEAST)
# is the subject of the comparison.
print(f'PROTOZOAN is also similar to {key[0]} with similarity {sorted_results[item]}')

YEAST is also similar to BACTERIAL with similarity 0.0037851333618163


## Circuit-based architecture

In [12]:
# Register width: one more than the number of bits needed to index a symbol
# position of the 64-symbol masks.
n = 1 + math.ceil(math.log2(len(YEAST)))
qr = QuantumRegister(n)
cr = ClassicalRegister(n)

# One encoding circuit per genetic code, all built on the same registers.
circs = {
    "YEAST": encode_bitstring(YEAST, qr, cr),
    "PROTOZOAN": encode_bitstring(PROTOZOAN, qr, cr),
    "BACTERIAL": encode_bitstring(BACTERIAL, qr, cr),
}
qc_yeast = circs["YEAST"]
qc_protozoan = circs["PROTOZOAN"]
qc_bacterial = circs["BACTERIAL"]

In [13]:
# Same encodings requested with inverse=True (presumably the inverse of each
# encoding circuit -- confirm in functions.patterns_utilities).
inverse_circs = {
    "YEAST": encode_bitstring(YEAST, qr, cr, inverse=True),
    "PROTOZOAN": encode_bitstring(PROTOZOAN, qr, cr, inverse=True),
    "BACTERIAL": encode_bitstring(BACTERIAL, qr, cr, inverse=True),
}
inverse_qc_yeast = inverse_circs["YEAST"]
inverse_qc_protozoan = inverse_circs["PROTOZOAN"]
inverse_qc_bacterial = inverse_circs["BACTERIAL"]

In [14]:
# Code to compare against all the others, and the number of shots per run.
key = "PROTOZOAN"
shots = 1024

combined_circs = {}
count = {}

most_similar, most_similar_score = "", -1.0

# The backend and the all-zero outcome label do not depend on the loop
# variable, so they are created once.
backend = Aer.get_backend("qasm_simulator")
zero_state = "0" * n

for other_key in inverse_circs:
    if other_key == key:
        continue
    # Encoding of `key` followed by the inverse-mode circuit of the other code.
    combined_circs[other_key] = circs[key].compose(inverse_circs[other_key])
    t_qc = transpile(combined_circs[other_key], backend=backend)
    # Pass `shots` explicitly so the normalisation below divides by the
    # number of shots that were actually run, not by an assumed default.
    job = backend.run(t_qc, shots=shots)
    st = job.result().get_counts(combined_circs[other_key])
    # Score: fraction of shots that returned the all-zero outcome.
    if zero_state in st:
        sim_score = st[zero_state]/shots
    else:
        sim_score = 0.0

    print("Similarity score of",key,"and",other_key,"is",sim_score)
    if most_similar_score < sim_score:
        most_similar, most_similar_score = other_key, sim_score

print(key,"is most similar to", most_similar)

Similarity score of PROTOZOAN and YEAST is 0.0126953125
Similarity score of PROTOZOAN and BACTERIAL is 0.021484375
PROTOZOAN is most similar to BACTERIAL
