In [1]:
import zarr
import pysam
import numpy as np

In [2]:
# Two equal-length 1-D vectors used to check how np.stack orders its inputs.
m = np.array([1, 2, 4])
n = np.array([1, 3, 4])

# Stacking along the default axis (0) makes each input one row, in the order
# given: n becomes row 0 and m becomes row 1.
np.stack([n, m])

array([[1, 3, 4],
       [1, 2, 4]])

In [3]:
# In-memory int32 array with 10 rows and zero columns, so that columns can be
# appended to it afterwards. Chunks span all 10 rows and 100 columns.
store = zarr.storage.MemoryStore()
z = zarr.create_array(
    store=store,
    shape=(10, 0),
    chunks=(10, 100),
    dtype='int32',
)
z.shape



(10, 0)

In [4]:
# Grow z along axis 1 with a 10x20 block holding the values 0..199.
# NOTE: append mutates z, so re-running this cell adds another 20 columns.
m = np.arange(200).reshape(10, 20)
z.append(m, axis=1)
z.shape

(10, 20)

In [5]:

# Same append pattern on a single-row array: start with zero columns, then
# append a (1, 2) block along axis 1 and read everything back.
z2 = zarr.create_array(
    store=zarr.storage.MemoryStore(),
    shape=(1, 0),
    chunks=(1, 10),
    dtype='int32',
)
z2.append(np.array([[1, 2]]), axis=1)
z2[:, :]

array([[1, 2]], dtype=int32)

In [6]:
# Read the entire contents of z back to confirm what the append wrote.
z[:,:]

array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19],
       [ 20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,
         33,  34,  35,  36,  37,  38,  39],
       [ 40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59],
       [ 60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,
         73,  74,  75,  76,  77,  78,  79],
       [ 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
         93,  94,  95,  96,  97,  98,  99],
       [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 118, 119],
       [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
        133, 134, 135, 136, 137, 138, 139],
       [140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
        153, 154, 155, 156, 157, 158, 159],
       [160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 1

In [7]:
# Open the processed subset strictly read-only. Without an explicit mode,
# open_group uses mode='a', which silently creates an empty group when the
# path is wrong and allows accidental writes to the processed data.
da1 = zarr.open_group('../data/01_processed/ssl_sets/da1_subset_10k.zarr', mode='r')

In [8]:
# Pull out the columns of `data` that belong to a single read.
# NOTE(review): `indptr` appears to hold per-read column offsets into `data`
# (CSR-style), so read i spans [indptr[i], indptr[i + 1]) — confirm against
# the code that wrote this store.
read = 50
col_start = int(da1['indptr'][read])
col_stop = int(da1['indptr'][read + 1])
da1['data'][:, col_start:col_stop]

array([[ 0,  2,  2, ...,  2,  1,  1],
       [93, 44, 93, ..., 93, 93, 93],
       [33, 36, 40, ..., 24, 46, 18],
       [11, 15, 18, ...,  7, 30, 33],
       [ 8, 43, 15, ..., 69,  9, 35],
       [33, 33,  8, ..., 20, 14, 15]], shape=(6, 21159), dtype=uint8)

In [9]:
# Inspect the offset stored at indptr[1].
# NOTE(review): presumably the end of read 0 / start of read 1 — confirm.
da1['indptr'][1]


array(20986, dtype=uint32)

In [10]:
# Peek at the first five columns of `data` across all of its rows.
da1['data'][:,0:5]

array([[ 0,  3,  3,  1,  1],
       [93, 70, 93, 76, 93],
       [13, 13, 12, 12, 17],
       [12, 16,  8, 51, 13],
       [ 5, 21, 15, 27, 15],
       [14,  8, 17, 37, 24]], dtype=uint8)

In [12]:
import numpy as np
import timeit

# --- 1. SETUP: Create Dummy Data ---
# Sequence length is chosen to resemble a typical PacBio read (15k - 25k bases).
SEQ_LEN = 20_000

# Integer code for each base symbol; 'N' shares code 0.
seq_map = dict(A=1, C=2, G=3, T=4, N=0)

# Seeded generator so the random DNA string is the same on every run.
rng = np.random.default_rng(42)
bases = np.array(["A", "C", "G", "T", "N"])
seq_str = "".join(rng.choice(bases, size=SEQ_LEN).tolist())

print(f"Testing with sequence length: {SEQ_LEN:,} bases")

# --- 2. OLD METHOD: List Comprehension ---
def slow_method(seq=None, mapping=None):
    """Encode a DNA string one character at a time with a dict lookup.

    Args:
        seq: String to encode. Defaults to the notebook-level ``seq_str`` so
            the function can still be called with no arguments (as timeit
            does).
        mapping: Dict from character to integer code. Defaults to the
            notebook-level ``seq_map``.

    Returns:
        A uint8 array with one code per character of ``seq``; characters
        missing from ``mapping`` are encoded as 0.
    """
    if seq is None:
        seq = seq_str
    if mapping is None:
        mapping = seq_map
    # Loop -> Dictionary Lookup -> Create List -> Create Array
    return np.array([mapping.get(base, 0) for base in seq], dtype=np.uint8)

# --- 3. NEW METHOD: Vectorized Lookup ---
# Translation table indexed by ASCII code point, built ONCE up front (not per
# call). Every slot starts at 0, so any character without an entry in seq_map
# decodes to 0.
lookup_table = np.zeros(128, dtype=np.uint8)
lookup_table[[ord(symbol) for symbol in seq_map]] = list(seq_map.values())

def fast_method(seq=None, table=None):
    """Encode a DNA string with a single vectorized table lookup.

    Args:
        seq: String to encode. Defaults to the notebook-level ``seq_str`` so
            the function can still be called with no arguments (as timeit
            does).
        table: Array indexed by ASCII code point (needs at least 128
            entries). Defaults to the notebook-level ``lookup_table``.

    Returns:
        An array with one entry of ``table`` per character of ``seq``.
    """
    if seq is None:
        seq = seq_str
    if table is None:
        table = lookup_table
    # String -> Bytes -> Ints -> Lookup
    # errors='replace' turns each non-ASCII character into a single '?'
    # instead of raising UnicodeEncodeError. With a table that leaves '?'
    # unmapped (as the zero-initialised lookup_table does) such characters
    # decode to 0, matching how the dict-based encoder treats unknown symbols.
    raw = seq.encode('ascii', errors='replace')
    # np.frombuffer views the bytes as uint8 ASCII values without copying;
    # those values are then used as indices into the table.
    return table[np.frombuffer(raw, dtype=np.uint8)]

# --- 4. VERIFICATION ---
# Both encoders must agree element-for-element before their timings are
# compared. np.testing.assert_array_equal reports which elements differ on
# failure and, unlike a bare assert, is not stripped when Python runs with -O.
np.testing.assert_array_equal(slow_method(), fast_method())
print("Verification: Results match perfectly.\n")

# --- 5. BENCHMARKING ---
# Time n_loops (1,000) calls of each encoder. timeit.timeit returns the total
# seconds for all calls, so the averages below divide by n_loops and multiply
# by 1000 to report milliseconds per call.
n_loops = 1000
t_slow = timeit.timeit(slow_method, number=n_loops)
t_fast = timeit.timeit(fast_method, number=n_loops)

print(f"Slow method avg: {t_slow/n_loops*1000:.3f} ms")
print(f"Fast method avg: {t_fast/n_loops*1000:.3f} ms")
print(f"Speedup: {t_slow/t_fast:.1f}x")

Testing with sequence length: 20,000 bases
Verification: Results match perfectly.

Slow method avg: 1.595 ms
Fast method avg: 0.039 ms
Speedup: 40.5x
