# Dataset Generation
Notebook for generating and saving different quantum circuit datasets.

In [2]:
# Install genQC in editable mode if running locally
# !pip install -e ../genQC

import os
import time
import sys
import torch
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import random
from dataclasses import asdict

from qiskit.circuit.library import get_standard_gate_name_mapping

sys.path.append(".")

import my_genQC

# genQC related imports
from my_genQC.pipeline.multimodal_diffusion_pipeline import MultimodalDiffusionPipeline_ParametrizedCompilation
from my_genQC.utils.misc_utils import infer_torch_device, set_seed
from my_genQC.platform.circuits_generation import generate_circuit_dataset, CircuitConditionType  
from my_genQC.platform.simulation import Simulator, CircuitBackendType  
from my_genQC.platform.tokenizer.circuits_tokenizer import CircuitTokenizer  
from my_genQC.pipeline.diffusion_pipeline import DiffusionPipeline  
from my_genQC.pipeline.compilation_diffusion_pipeline import DiffusionPipeline_Compilation  
from my_genQC.scheduler.scheduler import Scheduler  
from my_genQC.scheduler.scheduler_ddim import DDIMScheduler
from my_genQC.models.config_model import ConfigModel
from my_genQC.models.unet_qc import QC_Cond_UNet, QC_Compilation_UNet
from my_genQC.dataset.mixed_cached_dataset import MixedCachedOpenCLIPDataset  
from my_genQC.platform.simulation import Simulator
from my_genQC.models.frozen_open_clip import CachedFrozenOpenCLIPEmbedderConfig
from my_genQC.utils.misc_utils import DataLoaders
from my_genQC.models.unitary_encoder import Unitary_encoder_config

import my_genQC.dataset.circuits_dataset as circuits_dataset

# Name -> gate lookup from Qiskit's standard gate library, kept as a notebook-level global.
string_to_gate_map = get_standard_gate_name_mapping()

# Resolve the compute device once; later cells read this global.
device = infer_torch_device()
print(f'Using device: {device}')

Using device: cpu


In [3]:
def generate_and_save_dataset(dataset_config, text_encoder_config, dataset_path):
    """
    Generate a quantum circuit dataset with genQC from a configuration and save it to disk.

    Args:
        dataset_config: Mapping with the keys ``"AVAILABLE_GATES"`` (list of gate names),
            ``"CONDITION"`` (``CircuitConditionType.SRV`` or ``CircuitConditionType.UNITARY``),
            ``"NUM_SAMPLES"``, ``"NUM_QUBITS"``, ``"MIN_GATES"`` and ``"MAX_GATES"``.
        text_encoder_config: Config handed to ``ConfigModel.from_config``.
        dataset_path: Output prefix. Tensors are written with the save path
            ``<dataset_path>/dataset/ds`` and the config to ``<dataset_path>/config.yaml``.

    Raises:
        ValueError: If ``dataset_config["CONDITION"]`` is neither SRV nor UNITARY.

    Notes:
        Reads the notebook-level global ``device``.
        In the UNITARY case ``MixedCircuitsConfigDataset.from_datasets`` returns two
        datasets; only the first one is saved, the second is discarded.
    """
    gate_pool = dataset_config["AVAILABLE_GATES"]
    condition = dataset_config["CONDITION"]

    # Decide what the dataset stores (name -> storage type) up front, so that an
    # unsupported condition fails here with a clear message instead of a NameError
    # after the whole dataset has already been generated.
    if condition is CircuitConditionType.SRV:
        store_dict = {'x': 'tensor', 'y': 'numpy'}
    elif condition is CircuitConditionType.UNITARY:
        store_dict = {'x': 'tensor', 'y': 'numpy', 'u': 'tensor'}
    else:
        raise ValueError(
            f"Unsupported CONDITION {condition!r}; expected "
            "CircuitConditionType.SRV or CircuitConditionType.UNITARY."
        )

    # Setup
    vocabulary = {gate: idx for idx, gate in enumerate(gate_pool)}
    simulator = Simulator(CircuitBackendType.QISKIT)
    tokenizer = CircuitTokenizer(vocabulary)

    # Generate dataset with custom gate set (the returned gate parameters are not used here)
    tensors, ys, Us, _params = generate_circuit_dataset(
        backend=simulator.backend,
        tokenizer=tokenizer,
        condition=condition,
        total_samples=dataset_config["NUM_SAMPLES"],
        num_of_qubits=dataset_config["NUM_QUBITS"],
        min_gates=dataset_config["MIN_GATES"],
        max_gates=dataset_config["MAX_GATES"],
        min_sub_gate_pool_cnt=2,  # Minimum gates in pool
        fixed_sub_gate_pool=gate_pool  # Your custom gate set
    )

    # meta-data of dataset
    paras = {}
    paras["store_dict"]     = store_dict   # what is in the dataset, with type
    paras["optimized"]      = True
    # Compare via the string form: this works for plain strings and device objects
    # alike and also matches indexed devices such as "cuda:0".
    paras["dataset_to_gpu"] = str(device).startswith("cuda")
    paras["random_samples"] = dataset_config["NUM_SAMPLES"]
    paras["num_of_qubits"]  = dataset_config["NUM_QUBITS"]
    paras["min_gates"]      = dataset_config["MIN_GATES"]
    paras["max_gates"]      = dataset_config["MAX_GATES"]
    paras["gate_pool"]      = gate_pool
    paras["max_params"]     = 0
    paras["pad_constant"]   = len(vocabulary) + 1

    # Step 2: Create dataset object
    # NOTE(review): `text_encoder` is instantiated but never used below; kept so the
    # config is still exercised exactly as before -- confirm whether it can be dropped.
    text_encoder = ConfigModel.from_config(text_encoder_config, device)  # Text encoder config
    my_dataset = circuits_dataset.CircuitsConfigDataset(device=device, **paras)
    my_dataset.x = tensors
    my_dataset.y = ys

    if condition is CircuitConditionType.SRV:
        mixed_dataset = my_dataset
    else:  # CircuitConditionType.UNITARY (validated above)
        my_dataset.U = Us.float()
        datasets_list = [my_dataset]

        parameters = asdict(my_dataset.params_config)
        parameters["model_scale_factor"] = 4
        # NOTE(review): the second returned dataset (presumably a held-out test split)
        # is not saved anywhere -- confirm this is intended.
        mixed_dataset, mixed_dataset_test = circuits_dataset.MixedCircuitsConfigDataset.from_datasets(
            datasets_list,
            balance_maxes=[int(1e8)],          # what the maximum prompt (y) balance limit is, can be used to balance SRVs for different qubit numbers
            pad_constant=paras["pad_constant"],
            device=device,
            bucket_batch_size=-1,         # if we use bucket padding
            max_samples=[int(1e8)],
            **parameters)

    mixed_dataset.save_dataset(save_path=dataset_path + "/dataset/ds", config_path=dataset_path+"/config.yaml")

## Stabilizer Circuits with Unitaries

In [9]:
# core configurations

# Output prefix for the unitary-conditioned Clifford dataset.
dataset_path = "./datasets/unitary_clifford"

# Circuit sampling settings consumed by generate_and_save_dataset.
dataset_config = dict(
    MIN_GATES=2,
    MAX_GATES=16,
    AVAILABLE_GATES=['h', 'cx', 'cz', 's', 'x', 'y', 'z'],  # clifford gates
    NUM_QUBITS=3,
    NUM_SAMPLES=512,
    CONDITION=CircuitConditionType.UNITARY,
)

# Local wall-clock time, recorded in the encoder config below.
time_stamp = time.strftime('%m/%d/%y %H:%M:%S')

# Text encoder config.
# NOTE(review): the logged class path is `genQC.models.frozen_open_clip...` while 'target'
# names `genQC.training...` -- confirm which one the loader actually resolves.
text_encoder_config = dict(
    target='genQC.training.frozen_open_clip.CachedFrozenOpenCLIPEmbedder',
    save_path=None,
    save_datetime=time_stamp,
    save_type=None,
    params=dict(
        arch='ViT-B-32',
        version='laion2b_s34b_b79k',
        max_length=77,
        freeze=True,
        layer='penultimate',
        enable_cache_token_limit=True,
    ),
)

In [10]:
# Build the unitary-conditioned dataset and write it below `dataset_path`.
generate_and_save_dataset(
    dataset_config=dataset_config,
    text_encoder_config=text_encoder_config,
    dataset_path=dataset_path,
)

[INFO]: Generated 510 valid circuits.
[INFO]: After filtering unique circuits: 510.
[INFO]: `genQC.models.frozen_open_clip.CachedFrozenOpenCLIPEmbedder` instantiated from given `config` on cpu.
[INFO]: `genQC.models.frozen_open_clip.CachedFrozenOpenCLIPEmbedder`. Found no key `save_type` in `config`. No state dict loaded.
[INFO]: `genQC.models.frozen_open_clip.CachedFrozenOpenCLIPEmbedder`. Freeze model: True


  0%|          | 0/1 [00:00<?, ?it/s]

 - balance_tensor_dataset, njobs=1, number of samples=510
 - uniquify_tensor_dataset, number of samples now 510
 - balancing


  0%|          | 0/1 [00:00<?, ?it/s]

 - dataset size after balancing 510
[INFO]: allocate memory for U (510, 2, 8, 8) on cpu approx. 0.000 GB
Split: Train 485 - Test 25 

[INFO]: Saving tensor to `./datasets/unitary_clifford/dataset/ds_x.safetensors`.
[INFO]: Saving tensor to `./datasets/unitary_clifford/dataset/ds_y.safetensors`.
[INFO]: Saving tensor to `./datasets/unitary_clifford/dataset/ds_U.safetensors`.
[INFO]: Saving tensor to `./datasets/unitary_clifford/dataset/ds_z.safetensors`.


## Stabilizer Circuits with SRVs

In [20]:
# core configurations

# Output prefix for the SRV-conditioned Clifford dataset.
dataset_path = "./datasets/srv_clifford"

# Circuit sampling settings consumed by generate_and_save_dataset.
dataset_config = dict(
    MIN_GATES=2,
    MAX_GATES=16,
    AVAILABLE_GATES=['h', 'cx', 'cz', 's', 'x', 'y', 'z'],  # clifford gates
    NUM_QUBITS=3,
    NUM_SAMPLES=512,
    CONDITION=CircuitConditionType.SRV,
)

# Local wall-clock time, recorded in the encoder config below.
time_stamp = time.strftime('%m/%d/%y %H:%M:%S')

# Text encoder config.
# NOTE(review): the logged class path is `genQC.models.frozen_open_clip...` while 'target'
# names `genQC.training...` -- confirm which one the loader actually resolves.
text_encoder_config = dict(
    target='genQC.training.frozen_open_clip.CachedFrozenOpenCLIPEmbedder',
    save_path=None,
    save_datetime=time_stamp,
    save_type=None,
    params=dict(
        arch='ViT-B-32',
        version='laion2b_s34b_b79k',
        max_length=77,
        freeze=True,
        layer='penultimate',
        enable_cache_token_limit=True,
    ),
)

In [21]:
# Build the SRV-conditioned dataset and write it below `dataset_path`.
generate_and_save_dataset(
    dataset_config=dataset_config,
    text_encoder_config=text_encoder_config,
    dataset_path=dataset_path,
)

[INFO]: Generated 512 valid circuits.
[INFO]: After filtering unique circuits: 511.
[INFO]: `genQC.models.frozen_open_clip.CachedFrozenOpenCLIPEmbedder` instantiated from given `config` on cpu.
[INFO]: `genQC.models.frozen_open_clip.CachedFrozenOpenCLIPEmbedder`. Found no key `save_type` in `config`. No state dict loaded.
[INFO]: `genQC.models.frozen_open_clip.CachedFrozenOpenCLIPEmbedder`. Freeze model: True
[INFO]: Saving tensor to `./datasets/srv_clifford/dataset/ds_x.safetensors`.
[INFO]: Saving tensor to `./datasets/srv_clifford/dataset/ds_y.safetensors`.
