In [None]:
import numpy as np
import pandas as pd
import os
import requests
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Fetch the CRY1 gene sequence (hg38, chr12:106991364-107093549) from the UCSC Genome Browser REST API

In [None]:
def get_CRY1_gene(genome="hg38", chrom="chr12", start=106991364, end=107093549, timeout=30):
    """Download a DNA sequence from the UCSC REST API (the CRY1 locus by default).

    Args:
        genome: UCSC assembly name placed in the ``genome`` query parameter.
        chrom: Chromosome name placed in the ``chrom`` query parameter.
        start: Value for the ``start`` query parameter.
        end: Value for the ``end`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook forever.

    Returns:
        The upper-cased content of the ``"dna"`` field of the JSON reply (an
        empty string when the field is absent), or ``None`` when the request
        fails or the server answers with a status other than 200.
    """
    # Query parameters are joined with ';' so the default call issues exactly
    # the same URL as the previously hard-coded request.
    url = (
        "https://api.genome.ucsc.edu/getData/sequence"
        f"?genome={genome};chrom={chrom};start={start};end={end}"
    )
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported through the same
        # "None means failure" contract as a non-200 status.
        print(f"Request to UCSC API failed: {exc}")
        return None
    if response.status_code != 200:
        return None
    return response.json().get("dna", "").upper()

One-hot encode sequence and return a 1D array

In [None]:
def onehotencoder(fasta_sequence, max_length = 102500, alphabet = "ACGT"):
    """One-hot encode a nucleotide sequence into a fixed-length flat float32 vector.

    Columns follow the order of ``alphabet``, so the layout does not depend on
    which symbols happen to occur in a particular sequence. Every call
    therefore returns exactly ``max_length * len(alphabet)`` values, and the
    encodings of different sequences can be stacked into one 2D array.

    Args:
        fasta_sequence: Sequence as a string (or an iterable of one-character
            strings). Matching is case-insensitive.
        max_length: Number of positions in the output. Longer sequences are
            truncated, shorter ones are padded with all-zero rows.
        alphabet: ASCII symbols that get a column each. Any other symbol
            (for example ``N``) is encoded as an all-zero row.

    Returns:
        1D ``np.float32`` array of length ``max_length * len(alphabet)``,
        laid out position by position.
    """
    text = fasta_sequence if isinstance(fasta_sequence, str) else "".join(fasta_sequence)
    # Encode before upper-casing: bytes.upper() only touches ASCII letters, so
    # the number of positions can never change. Non-ASCII characters become '?'.
    raw = text.encode("ascii", "replace").upper()
    symbols = np.frombuffer(raw, dtype=np.uint8)[:max_length]

    # Byte value -> column index; -1 marks symbols outside the alphabet.
    lookup = np.full(256, -1, dtype=np.intp)
    for column, byte in enumerate(alphabet.upper().encode("ascii")):
        lookup[byte] = column
    columns = lookup[symbols]

    # Allocating the padded matrix up front keeps the dtype float32; rows
    # beyond the sequence end stay zero and act as padding.
    encoded = np.zeros((max_length, len(alphabet)), dtype=np.float32)
    positions = np.flatnonzero(columns >= 0)
    encoded[positions, columns[positions]] = 1.0
    return encoded.reshape(-1)
# Function to add mutation to the CRY1 sequence
def add_mutation(seq, start, end, refnuc, altnuc):
    """Replace ``seq[start:end]`` with ``altnuc`` after validating the reference allele.

    Args:
        seq: Sequence string to mutate.
        start: 0-based index of the first replaced character.
        end: 0-based index one past the last replaced character.
        refnuc: Allele expected at ``seq[start:end]``; an empty string together
            with ``start == end`` describes a pure insertion.
        altnuc: Allele written in place of ``refnuc``.

    Returns:
        The mutated sequence, or ``None`` (after printing a message) when the
        coordinates fall outside ``seq`` or the reference allele does not match.
    """
    # Negative indices would silently wrap around to the end of the sequence
    # and could apply the variant at the wrong place, so reject them up front.
    if not 0 <= start <= end <= len(seq):
        print(f"Mutation out of range at positions {start}-{end}.")
        return None
    if seq[start:end] != refnuc:
        print(f"Mutation mismatch at positions {start}-{end}.")
        return None
    return seq[:start] + altnuc + seq[end:]

Process the mutation CSV and apply mutations to the CRY1 sequence

In [None]:
def process_mutations(csv_path, cry1_seq, output_path, gene_start=106991364, batch=500, max_seq_count=5000):
    """Apply intronic variants from a CSV to a reference sequence and save the encodings.

    Rows whose ``variation_type`` contains ``intron_variant`` are converted to
    sequence-relative coordinates, applied with ``add_mutation`` and encoded
    with ``onehotencoder``. The encodings are appended to whatever is already
    stored under ``arr_0`` in ``output_path``. The archive is rewritten after
    every ``batch`` new rows and once more at the end.

    Args:
        csv_path: CSV with the columns ``chromEnd``, ``ref``, ``alt``, ``AF``,
            ``genes``, ``variation_type`` and ``_displayName``.
        cry1_seq: Reference sequence whose first character sits at genomic
            offset ``gene_start``.
        output_path: ``.npz`` archive to create or extend. ``np.savez_compressed``
            appends ``.npz`` to names lacking it, so pass the full file name or
            the existence check below will not find an earlier run.
        gene_start: Genomic coordinate subtracted from ``chromEnd`` to obtain
            an index into ``cry1_seq``.
        batch: Number of new encodings accumulated between checkpoint saves.
        max_seq_count: Maximum number of new encodings produced by this call.
    """
    df = pd.read_csv(csv_path, usecols=['chromEnd', 'ref', 'alt', 'AF', 'genes', 'variation_type', '_displayName'])
    df = df[df["variation_type"].str.contains("intron_variant", na=False, case=False)]

    # Initialize existing data for appending
    if os.path.exists(output_path):
        # NOTE(review): allow_pickle=True is kept so archives written by earlier
        # runs stay readable; only load archives that this notebook produced.
        with np.load(output_path, allow_pickle=True) as archive:
            # Keep each row as an ndarray; .tolist() would expand every value
            # into a separate Python float object.
            existing_data = list(archive["arr_0"])
    else:
        existing_data = []

    # Process each row in the CSV
    seq_count = 0
    rows_save = []
    for _, row in df.iterrows():
        if seq_count >= max_seq_count:
            print("Reached maximum sequence count.")
            break
        if pd.isna(row["chromEnd"]):
            # No coordinate, so the variant cannot be placed on the sequence.
            continue
        gnomAD_ID = row["_displayName"]
        refnuc = str(row["ref"]) if pd.notna(row["ref"]) else ""
        altnuc = str(row["alt"]) if pd.notna(row["alt"]) else ""
        # chromEnd is used as the exclusive end of the reference allele. The
        # length comes from the normalised refnuc, because a missing ref is a
        # float NaN and has no len().
        end = int(row["chromEnd"]) - gene_start
        start = end - len(refnuc)

        # Add mutation to the sequence
        mutated_seq = add_mutation(cry1_seq, start, end, refnuc, altnuc)
        print(f"{seq_count}: Processed mutation {gnomAD_ID} at positions {start}-{end}.")
        if mutated_seq:
            rows_save.append(onehotencoder(mutated_seq))
            seq_count += 1
            if len(rows_save) >= batch:
                # Checkpoint so an interrupted run keeps the finished batches.
                existing_data.extend(rows_save)
                np.savez_compressed(output_path, arr_0=np.array(existing_data))
                rows_save = []

    if rows_save:
        existing_data.extend(rows_save)
        np.savez_compressed(output_path, arr_0=np.array(existing_data))
    print(f"Processed {seq_count} mutated sequences and saved to {output_path}.")

Paths to the input CSV file with mutations and to the output `.npz` archive

In [None]:
# Destination archive for the encoded mutated sequences.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
output_file = "E:\\datasets\\processeddata\\MUTATION_DATA_TRAINING_5000_4.npz"
# Variant table read by process_mutations (resolved relative to the working directory).
csv_file = "cry1realvariations (1).csv"

Fetch CRY1 sequence and process mutations

In [None]:
# Download the reference first; an empty or missing sequence means there is
# nothing to mutate, so report the failure instead of processing the CSV.
cry1_seq = get_CRY1_gene()
if not cry1_seq:
    print("Failed to fetch CRY1 gene sequence.")
else:
    process_mutations(csv_file, cry1_seq, output_file)

0: Processed mutation chr12-106992013-G-A at positions 648-649.
1: Processed mutation chr12-106992014-G-A at positions 649-650.
2: Processed mutation chr12-106992015-G-A at positions 650-651.
3: Processed mutation chr12-106992017-G-A at positions 652-653.
4: Processed mutation chr12-106992017-G-GA at positions 652-653.
5: Processed mutation chr12-106992018-A-C at positions 653-654.
6: Processed mutation chr12-106992018-A-G at positions 653-654.
7: Processed mutation chr12-106992017-GA-G at positions 652-654.
8: Processed mutation chr12-106992031-G-A at positions 666-667.
9: Processed mutation chr12-106992037-C-G at positions 672-673.
10: Processed mutation chr12-106992042-T-G at positions 677-678.
11: Processed mutation chr12-106992042-T-C at positions 677-678.
12: Processed mutation chr12-106992045-C-A at positions 680-681.
13: Processed mutation chr12-106992045-C-T at positions 680-681.
14: Processed mutation chr12-106992051-A-G at positions 686-687.
15: Processed mutation chr12-1069

In [None]:
import tensorflow as tf
# Enable memory growth on every GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Report the failure and carry on without memory growth.
        print(e)
    else:
        # Reached only when every device accepted the setting.
        print("GPU Memory Growth Enabled")


GPU Memory Growth Enabled


In [1]:
import numpy as np
# Split the encoded dataset into test / validation / train archives by row range.
mutated_data = np.load("/content/MUTATION_DATA_TRAINING_6000.npz", allow_pickle=True)
# Every ['arr_0'] lookup on the archive re-reads and decompresses the whole
# array, so read it once and slice the in-memory copy.
mutated_rows = mutated_data['arr_0']

# Rows 0-999 -> test set.
mutated_test = mutated_rows[:1000]
output_test = "MUTATED_DATA_TEST_1000_6000"
np.savez_compressed(output_test, arr_0=mutated_test)

# Rows 5000 onwards -> validation set.
mutated_val = mutated_rows[5000:]
output_val = "MUTATED_DATA_VAL_1000_6000"
np.savez_compressed(output_val, arr_0=mutated_val)

# Rows 1000-4999 -> training set.
# NOTE(review): the file name says 5000 but this slice holds at most 4000 rows - confirm intent.
mutated_train = mutated_rows[1000:5000]
output_train = "MUTATED_DATA_TRAIN_5000_6000"
np.savez_compressed(output_train, arr_0=mutated_train)



FileNotFoundError: [Errno 2] No such file or directory: '/content/MUTATION_DATA_TRAINING_6000.npz'