# Ch3: Learning the Logic of DNA

In [1]:
import pandas as pd

# Absolute, machine-specific location of the CTCF training CSV.
# NOTE(review): the same literal is repeated in the cell that builds
# `train_ds`; both have to be edited to run this notebook on another machine.
path1 = r"C:\Users\justc\Documents\ml-study-notes\dlfb-clone\assets\dna\datasets\CTCF_train_sequences.csv"
# Read the whole training table into a DataFrame.
train_df = pd.read_csv(path1)
# Bare last expression so the notebook renders the frame.
train_df

Unnamed: 0,sequence,label,transcription_factor,subset
0,TACCACATGAGTTCTCTTTCAGTTTGCTATGGAAGACACAAAAACC...,1,CTCF,train
1,CATCAACACTCGTGCGACGCCCTCGCATTTTCATTAATGATGGCCT...,0,CTCF,train
2,GCACACAGCGCAGGAACCTGGCACTGGAGAAGCCACCCAGGCTGTG...,1,CTCF,train
3,GCCAGCCGAAGCAGAGAGCAGTGCACATGCGTGCAGCTACCAGCAT...,1,CTCF,train
4,GTCCCGTCCCTTTTCGTGCCTCTGCCCCCACAGCTGCTGCAGTCTG...,1,CTCF,train
...,...,...,...,...
61078,AATATGACCCTGCTGGCCTTAGGCCTACTCCTGTACCACAAGTGCC...,0,CTCF,train
61079,GATAAACCAAGGTCGTAAGTTCAGGCTCCGCCTCCCCGCAGGGCCT...,1,CTCF,train
61080,CCTCCCTCCCATCCCCCACACAGTTTAATGTCTAGAAGGTTGCCTG...,1,CTCF,train
61081,CAGGAATGCACCGGAAGTCCGCCTCCCGGGACCCGCCGCCGGTCCC...,0,CTCF,train


In [2]:
# Class-balance check: how many rows carry each value of the `label` column.
train_df["label"].value_counts()

label
1    30545
0    30538
Name: count, dtype: int64

In [3]:
import numpy as np

def dna_to_one_hot(dna_sequence: str) -> np.ndarray:
    """Convert DNA into a one-hot encoded format with channel ordering ACGT.

    Args:
        dna_sequence: String of bases. ``A``, ``C``, ``G``, ``T`` and ``N``
            are accepted in either case; a lowercase letter is encoded
            exactly like its uppercase counterpart.

    Returns:
        Integer array of shape ``(len(dna_sequence), 4)`` with one row per
        base and columns ordered A, C, G, T. ``N`` is encoded as all ones.
        An empty string yields an array of shape ``(0, 4)``.

    Raises:
        KeyError: If the sequence contains any other character.
    """
    base_to_one_hot = {
        "A": (1, 0, 0, 0),
        "C": (0, 1, 0, 0),
        "G": (0, 0, 1, 0),
        "T": (0, 0, 0, 1),
        "N": (1, 1, 1, 1),  # N represents any unknown or ambiguous base
    }
    try:
        # Upper-case first so lowercase input maps onto the same table.
        rows = [base_to_one_hot[base] for base in dna_sequence.upper()]
    except KeyError as err:
        # Same exception type as a plain dict lookup, but with context.
        raise KeyError(
            f"Unsupported base {err.args[0]!r} in DNA sequence; "
            "expected only A, C, G, T or N"
        ) from None
    # Explicit dtype and reshape keep the (length, 4) integer layout even when
    # `rows` is empty (np.array([]) alone would be float with shape (0,)).
    one_hot_encoded = np.array(rows, dtype=int).reshape(-1, 4)
    return one_hot_encoded

In [4]:
# Quick sanity check: encode a short sequence and display the resulting rows
# (one row per base, columns in A, C, G, T order).
dna_to_one_hot("AAACGT")

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1]])

In [5]:
# Encode every training sequence and collect the per-sequence matrices into
# one array (a regular 3-D array when all sequences share the same length).
x_train = np.array(list(map(dna_to_one_hot, train_df["sequence"])))
# Labels as a column: a trailing axis of length 1 is added to the 1-D labels.
y_train = train_df["label"].to_numpy()[:, np.newaxis]

In [6]:
def load_dataset(sequence_db) -> dict[str, np.ndarray]:
    """Read a sequence CSV and return its labels and one-hot encoded sequences.

    Args:
        sequence_db: Anything ``pd.read_csv`` accepts; the table must provide
            a ``label`` column and a ``sequence`` column.

    Returns:
        Dict with two entries: ``"labels"``, the label column with a trailing
        axis of length 1 added, and ``"sequences"``, an array built from the
        ``dna_to_one_hot`` encoding of every row's sequence.
    """
    table = pd.read_csv(sequence_db)
    label_column = table["label"].to_numpy()[:, None]
    encoded_sequences = np.array(list(map(dna_to_one_hot, table["sequence"])))
    return {"labels": label_column, "sequences": encoded_sequences}

In [8]:
import tensorflow as tf
def convert_to_tfds(
    dataset,
    batch_size: int | None = None,
    is_training: bool = False,
    seed: int | None = None,
):
    """Convert DNA sequences and labels to a TensorFlow dataset.

    Args:
        dataset: Mapping passed to ``tf.data.Dataset.from_tensor_slices``.
            Its ``"sequences"`` entry is used to size the shuffle buffer and
            its ``"labels"`` entry to size the default batch.
        batch_size: Number of examples per batch. A falsy value (``None`` or
            ``0``) puts every example into a single batch.
        is_training: When True, shuffle with a buffer covering the whole
            dataset and repeat indefinitely.
        seed: Optional shuffle seed for a reproducible example order. Only
            used when ``is_training`` is True; ``None`` keeps TensorFlow's
            default, non-deterministic seeding.

    Returns:
        A batched, prefetched ``tf.data.Dataset`` yielding dicts with the
        same keys as ``dataset``.
    """
    ds = tf.data.Dataset.from_tensor_slices(dataset)
    if is_training:
        # A buffer as large as the dataset shuffles over all examples.
        ds = ds.shuffle(buffer_size=len(dataset["sequences"]), seed=seed)
        # No count given, so the shuffled data repeats without end.
        ds = ds.repeat()
    batch_size = batch_size or len(dataset["labels"])
    ds = ds.batch(batch_size)
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE.
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds
# Number of examples per training batch.
batch_size = 32

# Training pipeline: load the CTCF training CSV, then shuffle, repeat and batch.
train_ds = convert_to_tfds(
    dataset=load_dataset(
        sequence_db=r"C:\Users\justc\Documents\ml-study-notes\dlfb-clone\assets\dna\datasets\CTCF_train_sequences.csv"
    ),
    batch_size=batch_size,
    is_training=True,
)

In [9]:
# Pull one batch from the training pipeline as NumPy arrays for inspection.
batch = next(train_ds.as_numpy_iterator())
# Print each array's shape, then a small slice of its contents:
# the first 3 sequences x first 3 positions, and the first 3 labels.
print(f'Batch sequence shape: {batch["sequences"].shape}')
print(f'Batch sequence instances: {batch["sequences"][:3, :3, ]}...')
print(f'Batch labels shape: {batch["labels"].shape}')
print(f'Batch labels instances: {batch["labels"][:3,]}')

Batch sequence shape: (32, 200, 4)
Batch sequence instances: [[[0 1 0 0]
  [0 0 0 1]
  [0 0 0 1]]

 [[0 0 1 0]
  [1 0 0 0]
  [1 0 0 0]]

 [[1 0 0 0]
  [0 0 1 0]
  [0 0 0 1]]]...
Batch labels shape: (32, 1)
Batch labels instances: [[1]
 [0]
 [0]]
