## Define helper functions and load modules

In [1]:
import keras
import keras_tuner as kt
import tensorflow as tf
import Bio.SeqIO as SeqIO
import random
import numpy as np
import sys
import pandas as pd
import tqdm
from keras.models import Sequential 
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout, Bidirectional, BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


def balanced_accuracy(y_true, y_pred):
    """
    Balanced accuracy: the unweighted mean of per-class recall.

    Works for any number of classes. The computation is done with NumPy, so
    the inputs must hold concrete values (lists, arrays, or eager tensors);
    it is meant for evaluating predictions after the fact, not for use as a
    compiled Keras metric.

    Parameters
    ----------
    y_true : array-like
        Integer class labels, shape (n,) or (n, 1), or one-hot rows (n, k).
    y_pred : array-like
        Predicted class labels, shape (n,) or (n, 1), or per-class scores
        (n, k), in which case the arg-max of each row is used.

    Returns
    -------
    float
        Mean recall over the classes that occur in ``y_true``; 0.0 when
        there are no samples.

    Raises
    ------
    ValueError
        If the two inputs do not describe the same number of samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Collapse score / one-hot matrices down to class indices.
    if y_pred.ndim > 1 and y_pred.shape[-1] > 1:
        y_pred = y_pred.argmax(axis=-1)
    if y_true.ndim > 1 and y_true.shape[-1] > 1:
        y_true = y_true.argmax(axis=-1)
    y_true = y_true.ravel()
    y_pred = y_pred.ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must contain the same number of samples, got "
            f"{y_true.shape[0]} and {y_pred.shape[0]}"
        )
    if y_true.size == 0:
        return 0.0

    # Recall is only defined for classes that actually occur in y_true, so
    # iterating over np.unique(y_true) also rules out division by zero.
    recalls = [np.mean(y_pred[y_true == cls] == cls) for cls in np.unique(y_true)]
    return float(np.mean(recalls))

def remove_N(seq):
    """
    Upper-case ``seq`` and strip every "N" from it.

    Because the sequence is upper-cased first, lower-case "n" characters are
    removed as well, and the returned sequence is entirely upper-case.
    """
    uppercased = seq.upper()
    without_n = uppercased.replace("N", "")
    return without_n

def onehote(seq):
    """
    One-hot encode a nucleotide sequence.

    Each character becomes a row of four floats in the column order
    A, C, G, T. Any other character (including "N" and lower-case letters;
    matching is case-sensitive) becomes an all-zero row.

    Parameters
    ----------
    seq : iterable of single-character strings
        The sequence to encode.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (len(seq), 4). An empty sequence yields shape
        (0, 4) so the result is always two-dimensional.
    """
    mapping = {
        "A": (1., 0., 0., 0.),
        "C": (0., 1., 0., 0.),
        "G": (0., 0., 1., 0.),
        "T": (0., 0., 0., 1.),
    }
    unknown = (0., 0., 0., 0.)
    rows = [mapping.get(base, unknown) for base in seq]
    # reshape keeps the (n, 4) layout even when there are no rows at all.
    return np.array(rows, dtype=float).reshape(-1, 4)


2024-01-02 10:19:54.881821: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-02 10:19:54.881880: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-02 10:19:54.881913: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-02 10:19:54.888493: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


In [10]:
# Length window for the records kept from the FASTA file, and the number of
# most frequent labels to retain.
MAX_LEN = 3000
MIN_LEN = 0
n_classes = 15

# Keep records whose length lies strictly between MIN_LEN and MAX_LEN + 500.
LTRs = [
    rec
    for rec in SeqIO.parse("/var/tmp/xhorvat9/ltr_bert/FASTA_files/train_LTRs.fasta", "fasta")
    if MIN_LEN < len(rec.seq) < MAX_LEN + 500
]
n_sequences = len(LTRs)

# 20 % / 50 % / 30 % shares of the record count (not used by the cells shown here).
generated = int(n_sequences*0.2)
genomic = int(n_sequences*0.5)
markov = int(n_sequences*0.3)

# The label is the fifth space-separated field of the FASTA description line.
sequence_column = [str(rec.seq) for rec in LTRs]
label_column = [rec.description.split(" ")[4] for rec in LTRs]
d = pd.DataFrame({'sequence': sequence_column, 'label': label_column})

# Drop every label containing "copia", then keep only the n_classes most
# frequent of the remaining labels.
is_copia = d['label'].str.contains("copia")
d = d[~is_copia]
top_labels = d["label"].value_counts()[:n_classes].index.tolist()
d = d[d["label"].isin(top_labels)]

In [11]:
import pickle

# Map the string labels to integer class ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(d['label'])

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed (a bare open() handle was previously left dangling).
with open("/var/tmp/xhorvat9/ltr_bert/NewClassifiers/Lineage/label_encoder.b", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

# Upper-case, strip Ns, and one-hot encode every sequence.
sequences = [onehote(remove_N(seq)) for seq in tqdm.tqdm(d["sequence"])]

# Left-pad every sequence to exactly MAX_LEN positions. Sequences longer than
# MAX_LEN are truncated by pad_sequences (from the front by default).
paddedDNA = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="pre", maxlen=MAX_LEN)

# Hold out 10 % of the samples for validation; the split is seeded.
trainX, valX, trainY, valY = train_test_split(paddedDNA, encoded_labels, test_size=0.1, random_state=42)



  0%|          | 0/134878 [00:00<?, ?it/s]

100%|██████████| 134878/134878 [00:28<00:00, 4701.44it/s]


In [12]:
# "balanced" class weights from scikit-learn, computed over the encoded
# labels and stored as a {class id: weight} dict for Keras' class_weight.
present_classes = np.unique(encoded_labels)
label_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=encoded_labels,
)
weights = dict(zip(present_classes, label_weights))


## Train the CNN-LSTM lineage classifiers (all-length model, then medium-length model)

In [14]:
# Conv1D -> Dropout -> MaxPooling1D -> LSTM -> softmax classifier over the
# one-hot encoded, padded sequences.
model2 = Sequential()

model2.add(Conv1D(filters=128, kernel_size=8, padding='same', activation='relu', input_shape=trainX[0].shape))
model2.add(Dropout(0.2))  # You can adjust the dropout rate as needed
model2.add(MaxPooling1D(pool_size=4))  # shortens the sequence axis 4x before the LSTM

model2.add(LSTM(150))
model2.add(Dense(units=n_classes, activation='softmax'))

# Sparse loss/metric because the targets are integer class ids, not one-hot rows.
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'], weighted_metrics=["sparse_categorical_accuracy"])

# restore_best_weights=True makes the model that gets saved below the one from
# the epoch with the lowest val_loss; without it, an early stop leaves the
# weights from `patience` epochs after the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model2.fit(trainX, trainY, epochs=15, batch_size=64,verbose = 1,validation_data=(valX, valY), callbacks=[early_stopping], class_weight=weights)

model2.save("all_length_cnn_lstm.h5")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15


  saving_api.save_model(


In [43]:
# ---- Medium-length pipeline: same steps as the cells above, restricted to
# ---- records whose length lies strictly between MIN_LEN and MAX_LEN.
MAX_LEN=3000
MIN_LEN=700

n_classes = 15
LTRs = [rec for rec in SeqIO.parse("/var/tmp/xhorvat9/ltr_bert/FASTA_files/train_LTRs.fasta", "fasta") if len(rec.seq) < MAX_LEN and len(rec.seq) > MIN_LEN]
n_sequences = len(LTRs)

# 20 % / 50 % / 30 % shares of the record count (not used further in this cell).
generated, genomic, markov = int(n_sequences*0.2), int(n_sequences*0.5), int(n_sequences*0.3)

# The label is the fifth space-separated field of the FASTA description line.
d = pd.DataFrame({'sequence':[str(rec.seq) for rec in LTRs], 'label':[rec.description.split(" ")[4] for rec in LTRs]})

# Drop labels containing "copia", then keep the n_classes most frequent labels.
d = d[~d['label'].str.contains("copia")]
d = d[d["label"].isin(d["label"].value_counts()[:n_classes].index.tolist())]

# Encode labels using LabelEncoder.
# NOTE(review): this encoder is re-fitted on the medium-length subset and is
# not written to disk in this cell, so the class-id -> label mapping of the
# model saved below is not persisted here.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(d['label'])

# Upper-case, strip Ns, and one-hot encode every sequence.
sequences = [onehote(remove_N(seq)) for seq in tqdm.tqdm(d["sequence"])]

# Left-pad to MAX_LEN, then hold out 15 % of the samples for validation.
paddedDNA = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="pre", maxlen=MAX_LEN)
trainX, valX, trainY, valY = train_test_split(paddedDNA, encoded_labels, test_size=0.15, random_state=42)

# "balanced" class weights as a {class id: weight} dict for Keras.
label_weights = class_weight.compute_class_weight( class_weight='balanced', classes=np.unique(encoded_labels), y=encoded_labels)
weights = {c:w for c, w in zip(np.unique(encoded_labels), label_weights)}

# Conv1D -> Dropout -> MaxPooling1D -> LSTM -> softmax classifier.
model2 = Sequential()

model2.add(Conv1D(filters=128, kernel_size=32, padding='same', activation='relu', input_shape=trainX[0].shape))
model2.add(Dropout(0.2))  # You can adjust the dropout rate as needed
model2.add(MaxPooling1D(pool_size=4))
model2.add(LSTM(100))
model2.add(Dense(units=n_classes, activation='softmax'))

# Sparse loss/metric because the targets are integer class ids, not one-hot rows.
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'], weighted_metrics=["sparse_categorical_accuracy"])

# restore_best_weights=True makes the model that gets saved below the one from
# the epoch with the lowest val_loss; without it, an early stop leaves the
# weights from `patience` epochs after the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model2.fit(trainX, trainY, epochs=15, batch_size=64,verbose = 1,validation_data=(valX, valY), callbacks=[early_stopping], class_weight=weights)

model2.save("medium_seq_cnn_lstm.h5")

100%|██████████| 31706/31706 [00:34<00:00, 913.94it/s]


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15


  saving_api.save_model(
