In [1]:
import numpy as np
import requests
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import random
import os
import hashlib

Function to fetch CRY1 gene sequence from UCSC API

In [2]:
def get_CRY1_gene(timeout=30):
    """Download the hg38 chr12:106991364-107093549 sequence from the UCSC REST API.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely.

    Returns:
        The upper-cased string found under the "dna" key of the JSON reply
        (an empty string when the key is missing), or None when the request
        fails, the status code is not 200, or the body is not valid JSON.
    """
    url = "https://api.genome.ucsc.edu/getData/sequence?genome=hg38;chrom=chr12;start=106991364;end=107093549"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failure.
        return None

    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page).
        return None
    return payload.get("dna", "").upper()

One-hot encode sequence and return a 1D array

In [None]:
def onehotencoder(fasta_sequence, max_length=102500):
    """One-hot encode a DNA sequence into a flat float32 vector of fixed size.

    The encoding uses a fixed alphabet with columns ordered A, C, G, T, so
    every call returns exactly ``max_length * 4`` values regardless of which
    characters occur in the input. (Fitting an encoder per sequence makes the
    column count and meaning depend on the input, which yields rows of
    different widths.) Any character outside A/C/G/T, such as N, becomes an
    all-zero row. Lower-case input is treated as upper-case.

    Args:
        fasta_sequence: The sequence as a string (or an iterable of
            single-character strings).
        max_length: Number of positions in the output. Longer sequences are
            truncated; shorter ones are right-padded with all-zero rows.

    Returns:
        A 1-D ``np.float32`` array of length ``max_length * 4``.
    """
    alphabet = "ACGT"
    if not isinstance(fasta_sequence, str):
        fasta_sequence = "".join(fasta_sequence)

    # Upper-case before slicing so the window can never exceed max_length.
    window = fasta_sequence.upper()[:max_length]

    # Byte value -> column index; -1 marks characters outside the alphabet.
    column_of = np.full(256, -1, dtype=np.int64)
    for column, base in enumerate(alphabet):
        column_of[ord(base)] = column

    # "replace" maps every non-ASCII character to a single '?' byte, keeping
    # one byte per sequence position.
    raw = np.frombuffer(window.encode("ascii", "replace"), dtype=np.uint8)
    columns = column_of[raw]

    # Allocate the padded result directly as float32 so the dtype is the same
    # whether the sequence was padded or truncated.
    onehot_sequence = np.zeros((max_length, len(alphabet)), dtype=np.float32)
    rows = np.flatnonzero(columns >= 0)
    onehot_sequence[rows, columns[rows]] = 1.0
    return onehot_sequence.flatten()

Augment sequence by introducing substitutions, deletions, or insertions

In [None]:
def augment_sequence(seq, mutation_rate=0.1, rng=None):
    """
    Randomly mutate a sequence with substitutions, deletions and insertions.

    - `seq`: Sequence to mutate (string of nucleotides).
    - `mutation_rate`: Number of mutation events as a fraction of the input
      length (default 10% of sequence). Each event picks a random position and
      one of the three mutation types with equal probability.
    - `rng`: Optional `random.Random` instance to draw from. When omitted the
      module-level `random` generator is used.

    Returns the mutated sequence as a string.
    """
    # The generator is intentionally NOT reseeded here. Python already seeds
    # `random` from OS entropy at import time, and reseeding on every call
    # would silently discard any seed the caller set for reproducibility.
    rng = random if rng is None else rng
    bases = ["A", "G", "C", "T"]

    seq_list = list(seq)
    num_mutations = int(len(seq_list) * mutation_rate)

    for _ in range(num_mutations):
        if not seq_list:
            # Everything has been deleted: an insertion is the only mutation
            # that is still possible (randint(0, -1) would raise).
            seq_list.append(rng.choice(bases))
            continue

        mutation_type = rng.choice(["substitution", "deletion", "insertion"])

        if mutation_type == "insertion":
            # Upper bound len(seq_list) allows inserting after the last base.
            idx = rng.randint(0, len(seq_list))
            seq_list.insert(idx, rng.choice(bases))
            continue

        idx = rng.randint(0, len(seq_list) - 1)
        if mutation_type == "substitution":
            seq_list[idx] = rng.choice(bases)
        else:
            del seq_list[idx]

    return ''.join(seq_list)

Process augmented sequence and save to output file

In [3]:
def hash_sequence(sequence):
    """Return the SHA-256 hex digest of a text sequence (encoded with the default UTF-8)."""
    digest = hashlib.sha256()
    digest.update(sequence.encode())
    return digest.hexdigest()

# Ensure sequence is unique before adding
def is_duplicate(sequence, existing_hashes):
    """Tell whether the digest of `sequence` (via `hash_sequence`) is already in `existing_hashes`."""
    fingerprint = hash_sequence(sequence)
    return fingerprint in existing_hashes

# Process augmented sequences and save to output file
def _hash_encoded_row(encoded_row):
    """Return a SHA-256 hex digest of an encoded row, normalised to float32 bytes."""
    return hashlib.sha256(np.ascontiguousarray(encoded_row, dtype=np.float32).tobytes()).hexdigest()


def process_data_augmentation(cry1_seq, output_path, num_augmented_sequences=6000, batch_size=250):
    """
    Augments sequences, ensuring diversity, and saves them in batches.

    Uniqueness is tracked on the *encoded* rows. The archive on disk only
    holds encoded rows, so hashing that representation is the only way rows
    loaded from a previous run and rows generated now can be compared.

    - `cry1_seq`: Reference sequence passed to `augment_sequence`.
    - `output_path`: `.npz` archive to create or extend; rows are stored under
      the key "arr_0".
    - `num_augmented_sequences`: Total number of augmented sequences to generate.
    - `batch_size`: Number of sequences to save in one batch. The whole
      accumulated array is rewritten each time a batch completes.
    """
    # Give up instead of spinning forever when augmentation cannot produce
    # anything new (e.g. an input too short to receive any mutation).
    max_consecutive_duplicates = 1000

    existing_data = []
    existing_hashes = set()

    # Load existing data if available
    if os.path.exists(output_path):
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code; only point this at trusted files.
        with np.load(output_path, allow_pickle=True) as archive:
            loaded_rows = archive["arr_0"]
        # Keep rows as numpy arrays; converting the matrix to nested Python
        # lists would multiply its memory footprint.
        existing_data.extend(loaded_rows)
        existing_hashes.update(_hash_encoded_row(row) for row in loaded_rows)

    seq_count = 0
    rows_save = []
    consecutive_duplicates = 0

    while seq_count < num_augmented_sequences:
        augmented_seq = augment_sequence(cry1_seq)
        encoded_seq = onehotencoder(augmented_seq)
        row_hash = _hash_encoded_row(encoded_seq)

        # Ensure uniqueness
        if row_hash in existing_hashes:
            consecutive_duplicates += 1
            if consecutive_duplicates >= max_consecutive_duplicates:
                print(f"Stopped after {consecutive_duplicates} consecutive duplicate sequences.")
                break
            continue
        consecutive_duplicates = 0

        print(f"Processed augmented sequence {seq_count + 1}.")
        seq_count += 1
        rows_save.append(encoded_seq)
        existing_hashes.add(row_hash)

        if len(rows_save) >= batch_size:
            existing_data.extend(rows_save)
            np.savez_compressed(output_path, arr_0=np.array(existing_data))
            rows_save = []

    # Save remaining data
    if rows_save:
        existing_data.extend(rows_save)
        np.savez_compressed(output_path, arr_0=np.array(existing_data))

    print(f"Processed {seq_count} augmented sequences and saved to {output_path}.")

# Path to output file
output_file = "AUGMENTED_DATA_TRAINING_6000.npz"

# Download the reference sequence; augment only when the fetch returned something
cry1_seq = get_CRY1_gene()
if not cry1_seq:
    print("Failed to fetch CRY1 gene sequence.")
else:
    process_data_augmentation(cry1_seq, output_file)

NameError: name 'augment_sequence' is not defined

Path to output file

Load the augmented dataset and split it into test, validation, and training subsets

In [4]:
# Load the augmented dataset archive. The file name matches the `output_file`
# written by the last cell of this notebook, so presumably that cell must have
# been run (and its output placed under /content) before this one.
# NOTE(review): "/content/" is a hard-coded absolute path (looks like a Colab
# working directory — confirm). allow_pickle=True permits unpickling, which is
# only safe for trusted files.
nonmutated_data = np.load("/content/AUGMENTED_DATA_TRAINING_6000_TRUE.npz", allow_pickle=True)
# Test split: the first 1000 rows of 'arr_0'.
nonmutated_test = nonmutated_data['arr_0'][:1000]
output_test = "AUGMENTED_DATA_TEST_1000_6000"
# np.savez_compressed appends ".npz" because the name has no extension.
np.savez_compressed(output_test, arr_0=np.array(nonmutated_test))


In [5]:
# Validation split: every row of 'arr_0' from index 5000 onward.
# Relies on `nonmutated_data` having been loaded by an earlier cell.
nonmutated_val = nonmutated_data['arr_0'][5000:]
output_val = "AUGMENTED_DATA_TEST_VAL_1000_6000"
# np.savez_compressed appends ".npz" because the name has no extension.
np.savez_compressed(output_val, arr_0=np.array(nonmutated_val))


In [6]:
# Training split: rows 1000 up to (not including) 5000 of 'arr_0'.
# Relies on `nonmutated_data` having been loaded by an earlier cell.
# NOTE(review): this slice holds at most 4000 rows although the file name
# below says "5000" — confirm which is intended.
nonmutated_train = nonmutated_data['arr_0'][1000:5000]
output_train = "AUGMENTED_DATA_TRAIN_5000_6000"
# np.savez_compressed appends ".npz" because the name has no extension.
np.savez_compressed(output_train, arr_0=np.array(nonmutated_train))


In [None]:
import hashlib
import os
import numpy as np
import random
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Hash function to ensure uniqueness
def hash_sequence(sequence):
    """Compute the SHA-256 hex digest of a string sequence (default UTF-8 encoding)."""
    hasher = hashlib.sha256()
    hasher.update(sequence.encode())
    return hasher.hexdigest()

# Ensure sequence is unique before adding
def is_duplicate(sequence, existing_hashes):
    """Return True when the `hash_sequence` digest of `sequence` is found in `existing_hashes`."""
    digest = hash_sequence(sequence)
    return digest in existing_hashes

# Apply random mutations to a sequence
def apply_random_mutations(sequence, mutation_rate=0.05, insertion_rate=0.02, deletion_rate=0.02):
    """
    Mutate a sequence in three passes: substitutions, then insertions, then deletions.

    - `mutation_rate`: Per-position probability that the nucleotide is replaced
      by a randomly chosen one (which may equal the original).
    - `insertion_rate`: Number of insertions as a fraction of the sequence
      length after the substitution pass.
    - `deletion_rate`: Number of deletions as a fraction of the sequence length
      after the insertion pass.

    Returns the mutated sequence as a string.
    """
    nucleotides = ['A', 'T', 'C', 'G']
    bases = list(sequence)  # mutable working copy

    # Pass 1 - substitutions: visit every position once
    for index, _ in enumerate(bases):
        if random.random() < mutation_rate:
            bases[index] = random.choice(nucleotides)

    # Pass 2 - insertions: count is fixed before the list starts growing
    insertion_count = int(len(bases) * insertion_rate)
    for _ in range(insertion_count):
        slot = random.randint(0, len(bases))  # len(bases) means "append"
        bases.insert(slot, random.choice(nucleotides))

    # Pass 3 - deletions: count is fixed before the list starts shrinking
    deletion_count = int(len(bases) * deletion_rate)
    for _ in range(deletion_count):
        victim = random.randint(0, len(bases) - 1)
        bases.pop(victim)

    return "".join(bases)

# Add jittering functions
def jitter_shift(sequence, max_shift=5):
    """Circularly rotate the sequence by a random offset drawn from [-max_shift, max_shift]."""
    offset = random.randint(-max_shift, max_shift)
    # The same slicing expression rotates in either direction, so no branch
    # on the sign of the offset is needed.
    head, tail = sequence[:offset], sequence[offset:]
    return tail + head

def jitter_insertion(sequence, max_insertions=3, insertion_rate=0.05):
    """Insert random nucleotides at random positions.

    The number of insertions is ``int(len(sequence) * insertion_rate)``.
    `max_insertions` is accepted but not used by the body.
    """
    nucleotides = ['A', 'T', 'C', 'G']
    bases = list(sequence)
    insertion_count = int(len(sequence) * insertion_rate)
    for _ in range(insertion_count):
        slot = random.randint(0, len(bases))  # len(bases) means "append"
        bases.insert(slot, random.choice(nucleotides))
    return ''.join(bases)

def jitter_deletion(sequence, max_deletions=3, deletion_rate=0.05):
    """Remove nucleotides at random positions.

    The number of deletions is ``int(len(sequence) * deletion_rate)``.
    `max_deletions` is accepted but not used by the body.
    """
    bases = list(sequence)
    deletion_count = int(len(sequence) * deletion_rate)
    for _ in range(deletion_count):
        victim = random.randint(0, len(bases) - 1)
        bases.pop(victim)
    return ''.join(bases)

def jitter_substitution(sequence, substitution_rate=0.05):
    """Replace each position, with probability `substitution_rate`, by a random nucleotide.

    The replacement is drawn from A/T/C/G and may equal the original base.
    """
    nucleotides = ['A', 'T', 'C', 'G']
    bases = list(sequence)
    for index, _ in enumerate(bases):
        if random.random() < substitution_rate:
            bases[index] = random.choice(nucleotides)
    return ''.join(bases)

def jitter_rotation(sequence, max_rotation=5):
    """Rotate the sequence left by a random number of positions in [1, max_rotation]."""
    steps = random.randint(1, max_rotation)
    moved_prefix = sequence[:steps]
    remainder = sequence[steps:]
    return remainder + moved_prefix

# Dummy augmentation function
def augment_sequence(sequence):
    """Run the sequence through every jitter step in order, then apply random mutations.

    Order: shift, insertion, deletion, substitution, rotation, and finally
    `apply_random_mutations`. Every step is called with its default settings.
    """
    jitter_steps = (
        jitter_shift,
        jitter_insertion,
        jitter_deletion,
        jitter_substitution,
        jitter_rotation,
    )
    for step in jitter_steps:
        sequence = step(sequence)
    return apply_random_mutations(sequence)

# One-hot encoding for DNA sequences
def onehotencoder(fasta_sequence, max_length=13000):
    """One-hot encode a DNA sequence into a flat float32 vector of fixed size.

    The encoding uses a fixed alphabet with columns ordered A, C, G, T, so
    every call returns exactly ``max_length * 4`` values regardless of which
    characters occur in the input. (Fitting an encoder per sequence makes the
    column count and meaning depend on the input, which yields rows of
    different widths.) Any character outside A/C/G/T, such as N, becomes an
    all-zero row. Lower-case input is treated as upper-case.

    Args:
        fasta_sequence: The sequence as a string (or an iterable of
            single-character strings).
        max_length: Number of positions in the output. Longer sequences are
            truncated; shorter ones are right-padded with all-zero rows.

    Returns:
        A 1-D ``np.float32`` array of length ``max_length * 4``.
    """
    alphabet = "ACGT"
    if not isinstance(fasta_sequence, str):
        fasta_sequence = "".join(fasta_sequence)

    # Upper-case before slicing so the window can never exceed max_length.
    window = fasta_sequence.upper()[:max_length]

    # Byte value -> column index; -1 marks characters outside the alphabet.
    column_of = np.full(256, -1, dtype=np.int64)
    for column, base in enumerate(alphabet):
        column_of[ord(base)] = column

    # "replace" maps every non-ASCII character to a single '?' byte, keeping
    # one byte per sequence position.
    raw = np.frombuffer(window.encode("ascii", "replace"), dtype=np.uint8)
    columns = column_of[raw]

    # Allocate the padded result directly as float32 so the dtype is the same
    # whether the sequence was padded or truncated.
    onehot_sequence = np.zeros((max_length, len(alphabet)), dtype=np.float32)
    rows = np.flatnonzero(columns >= 0)
    onehot_sequence[rows, columns[rows]] = 1.0
    return onehot_sequence.flatten()

# Process augmented sequences and save to output file
def _hash_encoded_row(encoded_row):
    """Return a SHA-256 hex digest of an encoded row, normalised to float32 bytes."""
    return hashlib.sha256(np.ascontiguousarray(encoded_row, dtype=np.float32).tobytes()).hexdigest()


def process_data_augmentation(cry1_seq, output_path, num_augmented_sequences=6000, batch_size=3000):
    """
    Augments sequences, ensuring diversity, and saves them in batches.

    Uniqueness is tracked on the *encoded* rows. The archive on disk only
    holds encoded rows, so hashing that representation is the only way rows
    loaded from a previous run and rows generated now can be compared.

    - `cry1_seq`: Reference sequence passed to `augment_sequence`.
    - `output_path`: `.npz` archive to create or extend; rows are stored under
      the key "arr_0".
    - `num_augmented_sequences`: Total number of augmented sequences to generate.
    - `batch_size`: Number of sequences to save in one batch. The whole
      accumulated array is rewritten each time a batch completes.
    """
    # Give up instead of spinning forever when augmentation cannot produce
    # anything new (e.g. an input too short to receive any mutation).
    max_consecutive_duplicates = 1000

    existing_data = []
    existing_hashes = set()

    # Load existing data if available
    if os.path.exists(output_path):
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code; only point this at trusted files.
        with np.load(output_path, allow_pickle=True) as archive:
            loaded_rows = archive["arr_0"]
        # Keep rows as numpy arrays; converting the matrix to nested Python
        # lists would multiply its memory footprint.
        existing_data.extend(loaded_rows)
        existing_hashes.update(_hash_encoded_row(row) for row in loaded_rows)

    seq_count = 0
    rows_save = []
    consecutive_duplicates = 0

    while seq_count < num_augmented_sequences:
        augmented_seq = augment_sequence(cry1_seq)
        encoded_seq = onehotencoder(augmented_seq)
        row_hash = _hash_encoded_row(encoded_seq)

        # Ensure uniqueness
        if row_hash in existing_hashes:
            consecutive_duplicates += 1
            if consecutive_duplicates >= max_consecutive_duplicates:
                print(f"Stopped after {consecutive_duplicates} consecutive duplicate sequences.")
                break
            continue
        consecutive_duplicates = 0

        print(f"Processed augmented sequence {seq_count + 1}.")
        seq_count += 1
        rows_save.append(encoded_seq)
        existing_hashes.add(row_hash)

        if len(rows_save) >= batch_size:
            existing_data.extend(rows_save)
            np.savez_compressed(output_path, arr_0=np.array(existing_data))
            rows_save = []

    # Save remaining data
    if rows_save:
        existing_data.extend(rows_save)
        np.savez_compressed(output_path, arr_0=np.array(existing_data))

    print(f"Processed {seq_count} augmented sequences and saved to {output_path}.")

# Path to output file
output_file = "AUGMENTED_DATA_TRAINING_6000_TRUE.npz"

# Obtain the reference sequence (swap in another loader here if the sequence
# comes from elsewhere); augment only when something was returned.
cry1_seq = get_CRY1_gene()
if not cry1_seq:
    print("Failed to fetch CRY1 gene sequence.")
else:
    process_data_augmentation(cry1_seq, output_file)
