In [None]:
import numpy as np
from scipy.stats import entropy
from Bio.Seq import Seq
from Bio import motifs
from itertools import product
from collections import Counter
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix, classification_report
import random
import os

# Seed Python's and NumPy's global RNGs so the random fallback vectors drawn
# with np.random.normal in build_embedding_matrix repeat between runs.
random.seed(42)
np.random.seed(42)
# NOTE(review): hash randomization is presumably fixed when the interpreter
# starts, so this assignment likely only reaches child processes launched
# afterwards — confirm whether it must be exported before Python is started.
os.environ['PYTHONHASHSEED'] = '42'

def _read_fasta_sequences(path):
    """Return the sequences stored in a FASTA-style text file.

    Lines starting with ">" are treated as record headers and discarded. All
    other lines are stripped and joined, in order, into the current record's
    sequence. Records that contain no sequence characters are skipped.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of sequence strings in file order.
    """
    records = []
    pieces = []
    with open(path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith(">"):
                # A header closes the record collected so far (if any).
                if pieces:
                    records.append("".join(pieces))
                    pieces = []
            elif text:
                pieces.append(text)
    # The last record has no trailing header to flush it.
    if pieces:
        records.append("".join(pieces))
    return records


# Training ("B_") set: enhancers are labelled 1 and come first, non-enhancers
# are labelled 0 and follow, so `sequences` and `labels` stay index-aligned.
_train_pos = _read_fasta_sequences("B_Enhancer.txt")
_train_neg = _read_fasta_sequences("B_NonEnhancer.txt")
sequences = _train_pos + _train_neg
labels = [1] * len(_train_pos) + [0] * len(_train_neg)

# Held-out ("I_") set, built the same way.
_test_pos = _read_fasta_sequences("I_Enhancer.txt")
_test_neg = _read_fasta_sequences("I_NonEnhancer.txt")
new_sequences = _test_pos + _test_neg
new_labels = [1] * len(_test_pos) + [0] * len(_test_neg)


def load_dna2vec(path, k=3):
    """Load the vectors of all k-mers of length ``k`` from a text embedding file.

    Each data line is expected to be ``<kmer> <v1> <v2> ...`` separated by
    whitespace. Blank lines, lines without any vector component, and the
    numeric ``<vocab_size> <dim>`` header line are ignored.

    Args:
        path: Path of the whitespace-separated embedding file.
        k: Only k-mers with exactly this many characters are kept.

    Returns:
        Dict mapping each kept k-mer string to a 1-D ``np.float32`` array.
    """
    embedding_dict = {}
    with open(path, 'r') as f:
        for line in f:
            values = line.split()
            # Blank line or a token with no vector: nothing usable to store.
            if len(values) < 2:
                continue
            kmer = values[0]
            # An all-digit first token is the header count, not a k-mer; without
            # this check it would be stored whenever it happens to have k digits.
            if len(kmer) != k or kmer.isdigit():
                continue
            embedding_dict[kmer] = np.array(values[1:], dtype=np.float32)
    return embedding_dict

def kmer_tokenize_str(sequences, k=3):
    """Split every sequence into its overlapping, upper-cased k-mers.

    Args:
        sequences: Iterable of sequence strings.
        k: Window length; windows advance one character at a time.

    Returns:
        A list with one list of k-mer strings per input sequence. A sequence
        shorter than ``k`` yields an empty list.
    """
    def _windows(text):
        return [text[start:start + k] for start in range(len(text) - k + 1)]

    return [_windows(raw.upper()) for raw in sequences]

def build_kmer_index(kmer_seqs):
    """Assign a 1-based integer id to every distinct k-mer, in sorted order.

    Id 0 is left unassigned by this function.

    Args:
        kmer_seqs: Iterable of k-mer lists (one per sequence).

    Returns:
        Dict mapping k-mer string to its integer id.
    """
    vocabulary = sorted({token for tokens in kmer_seqs for token in tokens})
    return dict(zip(vocabulary, range(1, len(vocabulary) + 1)))

def encode_kmers(kmer_seqs, kmer_to_idx):
    """Convert tokenised sequences into a 2-D array of integer k-mer ids.

    The array is as wide as the longest tokenised sequence; shorter rows are
    right-padded with 0. K-mers missing from ``kmer_to_idx`` are also encoded
    as 0.

    Args:
        kmer_seqs: List of k-mer lists (one per sequence).
        kmer_to_idx: Dict mapping k-mer string to integer id.

    Returns:
        ``np.int32`` array of shape ``(len(kmer_seqs), longest_row)``; shape
        ``(0, 0)`` when ``kmer_seqs`` is empty.
    """
    if not kmer_seqs:
        return np.zeros((0, 0), dtype=np.int32)

    # Size by the longest row, not the first one, so a longer later sequence
    # does not index past the end of the array.
    width = max(len(tokens) for tokens in kmer_seqs)
    encoded = np.zeros((len(kmer_seqs), width), dtype=np.int32)
    for row, tokens in enumerate(kmer_seqs):
        if tokens:
            encoded[row, :len(tokens)] = [kmer_to_idx.get(token, 0) for token in tokens]
    return encoded

def build_embedding_matrix(kmer_to_idx, embedding_dict, embedding_dim):
    """Build the id -> vector lookup table for the k-mer vocabulary.

    Row 0 stays all zeros. Every other row holds the vector from
    ``embedding_dict`` when the k-mer is present there, otherwise a vector
    drawn from ``np.random.normal`` with scale 0.1.

    Args:
        kmer_to_idx: Dict mapping k-mer string to row index.
        embedding_dict: Dict mapping k-mer string to its vector.
        embedding_dim: Number of columns of the table.

    Returns:
        Array of shape ``(len(kmer_to_idx) + 1, embedding_dim)``.
    """
    table = np.zeros((len(kmer_to_idx) + 1, embedding_dim))
    for token, row in kmer_to_idx.items():
        has_pretrained = token in embedding_dict
        table[row] = (
            embedding_dict[token]
            if has_pretrained
            else np.random.normal(scale=0.1, size=(embedding_dim,))
        )
    return table


# k-mer length used both for tokenising sequences and for selecting vectors
# from the dna2vec file.
k = 3
# Column count of the embedding matrix.
# NOTE(review): presumably must equal the vector length stored in the dna2vec
# file ("100d" in its name) — confirm.
embedding_dim = 100
dna2vec_path = '/content/drive/MyDrive/dna2vec-20250728-1713-k3to8-100d-10c-20Mbp-sliding-5B7.w2v'

# The vocabulary is built from the training sequences only.
kmer_seqs = kmer_tokenize_str(sequences, k=k)
kmer_to_idx = build_kmer_index(kmer_seqs)

embedding_dict = load_dna2vec(dna2vec_path, k=k)
embedding_matrix = build_embedding_matrix(kmer_to_idx, embedding_dict, embedding_dim)
encoded_seqs = encode_kmers(kmer_seqs, kmer_to_idx)

# The held-out sequences reuse the training vocabulary; a k-mer that is not in
# kmer_to_idx is encoded as 0 by encode_kmers.
kmer_seqs_new = kmer_tokenize_str(new_sequences, k=k)
encoded_seqs_new = encode_kmers(kmer_seqs_new, kmer_to_idx)

def seqs_to_embedded_vectors(encoded_seqs, embedding_matrix):
    """Represent each encoded sequence by the mean of its k-mer vectors.

    Args:
        encoded_seqs: Iterable of integer id rows (one per sequence).
        embedding_matrix: Lookup table indexed by k-mer id.

    Returns:
        Array with one averaged vector per sequence. Every id in a row
        contributes to the mean, including id 0.
    """
    return np.array(
        [np.mean(embedding_matrix[ids], axis=0) for ids in encoded_seqs]
    )

# Collapse every encoded sequence to a single vector (mean of its k-mer
# embeddings); further feature groups are concatenated onto these later.
X_train = seqs_to_embedded_vectors(encoded_seqs, embedding_matrix)
X_test = seqs_to_embedded_vectors(encoded_seqs_new, embedding_matrix)


def performance(labelArr, predictArr):
    """Compute accuracy, sensitivity, specificity and MCC for binary labels.

    Args:
        labelArr: True labels (0 or 1).
        predictArr: Predicted labels (0 or 1).

    Returns:
        Tuple ``(ACC, SN, SP, MCC)``. ``SP`` is 0.0 when ``labelArr`` contains
        no negative samples.
    """
    # labels=[0, 1] forces a 2x2 matrix; without it a single-class input gives
    # a 1x1 matrix and the four-way unpacking below raises ValueError.
    TN, FP, FN, TP = metrics.confusion_matrix(
        labelArr, predictArr, labels=[0, 1]
    ).ravel()
    ACC = metrics.accuracy_score(labelArr, predictArr)
    SN = metrics.recall_score(labelArr, predictArr)
    # Avoid dividing by zero when there are no negatives to score.
    negatives = FP + TN
    SP = TN / negatives if negatives else 0.0
    MCC = matthews_corrcoef(labelArr, predictArr)
    return ACC, SN, SP, MCC



def compute_gc_content(seq):
    """Return the fraction of G and C characters in ``seq`` (case-insensitive).

    An empty sequence yields 0.
    """
    if len(seq) == 0:
        return 0
    upper = seq.upper()
    gc_total = sum(upper.count(base) for base in ('G', 'C'))
    return gc_total / len(upper)

def compute_sequence_entropy(seq, k=3):
    """Return the Shannon entropy (natural log) of the k-mer distribution.

    The sequence is upper-cased and split into overlapping k-mers; the entropy
    of their relative frequencies is computed with ``scipy.stats.entropy``.

    Args:
        seq: Sequence string.
        k: k-mer length.

    Returns:
        The entropy as a float. 0.0 when the sequence is shorter than ``k``
        and therefore has no k-mers.
    """
    seq = seq.upper()
    window_count = len(seq) - k + 1
    # With no k-mers there is no distribution; return 0.0 explicitly instead
    # of handing an empty list to scipy.
    if window_count <= 0:
        return 0.0
    kmer_counts = Counter(seq[i:i + k] for i in range(window_count))
    probs = [count / window_count for count in kmer_counts.values()]
    return entropy(probs)

def extract_stat_features(seqs, k=3):
    """Compute ``[GC content, k-mer entropy]`` for every sequence.

    Args:
        seqs: Iterable of sequence strings.
        k: k-mer length forwarded to ``compute_sequence_entropy``.

    Returns:
        Array with one ``[gc, entropy]`` row per sequence.
    """
    rows = [
        [compute_gc_content(item), compute_sequence_entropy(item, k=k)]
        for item in seqs
    ]
    return np.array(rows)


def make_pwm_from_consensus(consensus):
    """Build a normalised position-weight matrix from one consensus string.

    The motif is created from the single consensus instance and its counts are
    normalised with a pseudocount of 0.1.
    """
    single_site_motif = motifs.create([Seq(consensus)])
    return single_site_motif.counts.normalize(pseudocounts=0.1)

def pwm_score_features(sequences, motif_list):
    """Score every sequence against every motif's log-odds matrix.

    For each motif a log-odds matrix is derived from its consensus string.
    Each sequence is upper-cased and searched with every matrix; the feature
    is the highest score among the hits returned, or 0.0 when the search
    returns no hit.

    Args:
        sequences: Iterable of sequence strings.
        motif_list: Iterable of consensus strings.

    Returns:
        Array with one row per sequence and one column per motif.
    """
    scoring_matrices = [
        make_pwm_from_consensus(consensus).log_odds() for consensus in motif_list
    ]

    rows = []
    for raw in sequences:
        target = Seq(raw.upper())
        rows.append([
            max((hit_score for _, hit_score in matrix.search(target)), default=0.0)
            for matrix in scoring_matrices
        ])
    return np.array(rows)



def get_all_kmers(k):
    """Return every length-``k`` string over ``ACGT`` in lexicographic order."""
    return list(map(''.join, product('ACGT', repeat=k)))

def kmer_frequency_features(sequences, k=3):
    """Compute the relative frequency of every ACGT k-mer in each sequence.

    Frequencies are counts divided by the total number of overlapping windows
    in the upper-cased sequence. A sequence with no windows gets an all-zero
    row.

    Args:
        sequences: Iterable of sequence strings.
        k: k-mer length.

    Returns:
        Array with one row per sequence and one column per k-mer, in the
        order returned by ``get_all_kmers(k)``.
    """
    vocabulary = get_all_kmers(k)
    rows = []
    for raw in sequences:
        text = raw.upper()
        tally = Counter(text[pos:pos + k] for pos in range(len(text) - k + 1))
        window_total = sum(tally.values())
        if window_total > 0:
            rows.append([tally[token] / window_total for token in vocabulary])
        else:
            rows.append([0.0] * len(vocabulary))
    return np.array(rows)


def dinuc_freq(sequences):
    """Compute the relative frequency of the 16 ACGT dinucleotides.

    Each count is divided by ``len(seq) - 1`` (the number of overlapping
    pairs). Sequences with fewer than two characters get an all-zero row.

    Args:
        sequences: Iterable of sequence strings.

    Returns:
        Array with one 16-column row per sequence; columns are ordered
        AA, AC, AG, AT, CA, ..., TT.
    """
    pairs = [first + second for first, second in product('ACGT', repeat=2)]

    rows = []
    for raw in sequences:
        text = raw.upper()
        pair_total = len(text) - 1
        if pair_total > 0:
            tally = Counter(text[pos:pos + 2] for pos in range(pair_total))
            rows.append([tally[pair] / pair_total for pair in pairs])
        else:
            rows.append([0] * len(pairs))

    return np.array(rows)


# Dinucleotide frequencies.
X_train_dinuc = dinuc_freq(sequences)
X_test_dinuc = dinuc_freq(new_sequences)

# Best log-odds hit for each consensus motif.
# motifs_list = ['TATA', 'CAAT','GATA']
motifs_list = ['TATA','CCCTG','CCTGG','GCCTG','CCCAGG','CCCAGCC','TTGGGAG']
motif_features_train = pwm_score_features(sequences, motifs_list)
motif_features_test = pwm_score_features(new_sequences, motifs_list)

# GC content and 3-mer entropy.
stat_features_train = extract_stat_features(sequences, k=3)
stat_features_test = extract_stat_features(new_sequences, k=3)

# 3-mer frequencies.
kmer_freq_train = kmer_frequency_features(sequences, k=3)
kmer_freq_test = kmer_frequency_features(new_sequences, k=3)

# Append every feature group to the averaged embeddings, column-wise, in the
# order: k-mer frequencies, statistics, motif scores, dinucleotide frequencies.
X_train = np.concatenate(
    [X_train, kmer_freq_train, stat_features_train, motif_features_train, X_train_dinuc],
    axis=1,
)
X_test = np.concatenate(
    [X_test, kmer_freq_test, stat_features_test, motif_features_test, X_test_dinuc],
    axis=1,
)

In [None]:
CNN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, matthews_corrcoef


y_train = np.array(labels)
y_test = np.array(new_labels)

# The model in this cell is built with input_shape=(X_train.shape[1], 1), so
# the feature matrices need a trailing channel axis. reshape (rather than
# expand_dims) is idempotent: re-running this cell, or running it after
# another model cell already added the axis, still yields (samples, features, 1)
# instead of stacking extra axes.
X_train = np.array(X_train)
X_test = np.array(X_test)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

def build_cnn_model(input_shape):
    """Build and compile the two-block 1-D CNN binary classifier.

    Each convolution block is Conv1D (kernel 3, ReLU) -> BatchNormalization ->
    MaxPooling1D(2) -> Dropout(0.3), with 128 then 64 filters. The flattened
    output feeds Dense(128, ReLU) -> Dropout(0.4) -> Dense(1, sigmoid).

    Args:
        input_shape: Shape of one sample, passed to the first Conv1D layer.

    Returns:
        A Sequential model compiled with Adam, binary cross-entropy and the
        accuracy metric.
    """
    model = Sequential()

    conv_blocks = [(128, {'input_shape': input_shape}), (64, {})]
    for filters, first_layer_kwargs in conv_blocks:
        model.add(Conv1D(filters, 3, activation='relu', **first_layer_kwargs))
        model.add(BatchNormalization())
        model.add(MaxPooling1D(2))
        model.add(Dropout(0.3))

    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Seed Python, NumPy and the Keras backend so weight initialisation repeats.
tf.keras.utils.set_random_seed(42)
model = build_cnn_model((X_train.shape[1], 1))

# validation_split holds out the LAST 20% of the arrays before any shuffling,
# and the training arrays are laid out as all enhancers followed by all
# non-enhancers. Shuffle once up front so the validation fold contains both
# classes.
shuffle_idx = np.random.RandomState(42).permutation(len(X_train))
X_fit, y_fit = X_train[shuffle_idx], y_train[shuffle_idx]

# Stop on the held-out loss; monitoring the training loss ignores the
# validation fold and restores the most over-fitted weights.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_fit, y_fit, validation_split=0.2, epochs=30,
    batch_size=64, callbacks=[early_stop],
    verbose=1
)

# Flatten the (n, 1) sigmoid output so the metrics receive 1-D label arrays.
y_pred_test = (model.predict(X_test) > 0.5).astype(int).ravel()

# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
ACC = accuracy_score(y_test, y_pred_test)
SN = recall_score(y_test, y_pred_test)
SP = TN / (TN + FP)
MCC = matthews_corrcoef(y_test, y_pred_test)

print("\nTest Set Results:")
print(f"Accuracy: {ACC:.4f}")
print(f"Sensitivity (Recall): {SN:.4f}")
print(f"Specificity: {SP:.4f}")
print(f"MCC: {MCC:.4f}")


CNN+attention

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, MaxPooling1D, Dense, Dropout, BatchNormalization,
    MultiHeadAttention, Add, LayerNormalization, GlobalAveragePooling1D
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, matthews_corrcoef

y_train = np.array(labels)
y_test = np.array(new_labels)

# The model in this cell is built with input_shape=(X_train.shape[1], 1).
# X_train/X_test may already carry the trailing channel axis from an earlier
# model cell; expand_dims would then produce a 4-D array. reshape always
# yields (samples, features, 1) regardless of how often it is applied.
X_train = np.array(X_train)
X_test = np.array(X_test)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

def build_advanced_cnn_transformer(input_shape,
                                   num_heads=8,
                                   key_dim=64,
                                   dropout_rate=0.4,
                                   dense_units=256,
                                   lr=1e-4):
    """Build and compile a CNN followed by one self-attention encoder block.

    Two convolution blocks (256 filters / kernel 7, then 128 / kernel 5, each
    with BatchNormalization, MaxPooling1D(2) and Dropout) feed a
    MultiHeadAttention layer with a residual connection and layer
    normalisation, then a position-wise feed-forward sub-block with its own
    residual connection and normalisation. The result is average-pooled and
    classified by Dense(128, ReLU) -> Dropout(0.5) -> Dense(1, sigmoid).

    Args:
        input_shape: Shape of one sample.
        num_heads: Number of attention heads.
        key_dim: Per-head key/query size.
        dropout_rate: Dropout used after each convolution block and inside
            the feed-forward sub-block.
        dense_units: Hidden width of the feed-forward sub-block.
        lr: Adam learning rate.

    Returns:
        A compiled functional Model (binary cross-entropy, accuracy metric).
    """
    inputs = Input(shape=input_shape)

    # Convolutional front end.
    x = inputs
    for filters, kernel_size in ((256, 7), (128, 5)):
        x = Conv1D(filters, kernel_size, activation='relu', padding='same')(x)
        x = BatchNormalization()(x)
        x = MaxPooling1D(2)(x)
        x = Dropout(dropout_rate)(x)

    # Channel count that both residual branches are projected back to.
    channels = x.shape[-1]

    # Self-attention sub-block with residual connection.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(x, x)
    attended = Dense(channels)(attended)
    x = Add()([x, attended])
    x = LayerNormalization()(x)

    # Feed-forward sub-block with residual connection.
    hidden = Dense(dense_units, activation='relu')(x)
    hidden = Dropout(dropout_rate)(hidden)
    hidden = Dense(channels)(hidden)
    x = Add()([x, hidden])
    x = LayerNormalization()(x)

    # Classification head.
    x = GlobalAveragePooling1D()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Seed Python, NumPy and the Keras backend so weight initialisation repeats.
tf.keras.utils.set_random_seed(42)
model = build_advanced_cnn_transformer(
    input_shape=(X_train.shape[1], 1),
    num_heads=11,
    key_dim=64,
    dropout_rate=0.3,
    dense_units=256,
    lr=1e-4
)
model.summary()

# validation_split holds out the LAST 20% of the arrays before any shuffling,
# and the training arrays are laid out as all enhancers followed by all
# non-enhancers. Shuffle once up front so the validation fold contains both
# classes.
shuffle_idx = np.random.RandomState(42).permutation(len(X_train))
X_fit, y_fit = X_train[shuffle_idx], y_train[shuffle_idx]

# Drive both callbacks from the held-out accuracy; the training accuracy
# ignores the validation fold and rewards over-fitting.
early_stop = EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=3, verbose=1)

history = model.fit(
    X_fit, y_fit,
    validation_split=0.2,
    epochs=50, batch_size=64, callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Flatten the (n, 1) sigmoid output so the metrics receive 1-D label arrays.
y_pred_test = (model.predict(X_test) > 0.5).astype(int).ravel()

# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
ACC = accuracy_score(y_test, y_pred_test)
SN = recall_score(y_test, y_pred_test)
SP = TN / (TN + FP)
MCC = matthews_corrcoef(y_test, y_pred_test)

print("\n Test Set Results:")
print(f"Accuracy: {ACC:.4f}")
print(f"Sensitivity (SN): {SN:.4f}")
print(f"Specificity (SP): {SP:.4f}")
print(f"MCC: {MCC:.4f}")


CNN + BiLSTM

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv1D, MaxPooling1D, Flatten, Dense, Dropout,
    BatchNormalization, LSTM, Bidirectional
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, matthews_corrcoef

y_train = np.array(labels)
y_test = np.array(new_labels)

# The model in this cell is built with input_shape=(X_train.shape[1], 1).
# X_train/X_test may already carry the trailing channel axis from an earlier
# model cell; expand_dims would then produce a 4-D array. reshape always
# yields (samples, features, 1) regardless of how often it is applied.
X_train = np.array(X_train)
X_test = np.array(X_test)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

def build_cnn_lstm_model(input_shape, lstm_units=128, dense_units=128,
                         dropout_rate=0.4, lr=1e-4):
    """Build and compile the CNN -> bidirectional LSTM binary classifier.

    Two 'same'-padded convolution blocks (128 filters / kernel 5, then 64 /
    kernel 3, each with BatchNormalization, MaxPooling1D(2) and Dropout(0.3))
    feed a bidirectional LSTM that returns only its final state, followed by
    Dense(dense_units, ReLU) -> Dropout(dropout_rate) -> Dense(1, sigmoid).

    Args:
        input_shape: Shape of one sample, passed to the first Conv1D layer.
        lstm_units: Units of the LSTM in each direction.
        dense_units: Width of the dense layer before the output.
        dropout_rate: Dropout applied after that dense layer (the convolution
            blocks always use 0.3).
        lr: Adam learning rate.

    Returns:
        A compiled Sequential model (binary cross-entropy, accuracy metric).
    """
    model = Sequential()

    model.add(Conv1D(128, 5, activation='relu', padding='same', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.3))

    model.add(Conv1D(64, 3, activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.3))

    recurrent = LSTM(lstm_units, return_sequences=False, dropout=0.3, recurrent_dropout=0.2)
    model.add(Bidirectional(recurrent))

    model.add(Dense(dense_units, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Seed Python, NumPy and the Keras backend so weight initialisation repeats.
tf.keras.utils.set_random_seed(42)
model = build_cnn_lstm_model(
    input_shape=(X_train.shape[1], 1), lstm_units=32,
    dense_units=128,
    dropout_rate=0.3, lr=1e-4 
)

# validation_split holds out the LAST 20% of the arrays before any shuffling,
# and the training arrays are laid out as all enhancers followed by all
# non-enhancers. Shuffle once up front so the validation fold contains both
# classes.
shuffle_idx = np.random.RandomState(42).permutation(len(X_train))
X_fit, y_fit = X_train[shuffle_idx], y_train[shuffle_idx]

# Drive both callbacks from the held-out accuracy; the training accuracy
# ignores the validation fold and rewards over-fitting.
early_stop = EarlyStopping(
    monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.5, patience=3, verbose=1
)

history = model.fit(
    X_fit, y_fit,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Flatten the (n, 1) sigmoid output so the metrics receive 1-D label arrays.
y_pred_test = (model.predict(X_test) > 0.5).astype(int).ravel()
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
ACC = accuracy_score(y_test, y_pred_test)
SN = recall_score(y_test, y_pred_test)
SP = TN / (TN + FP)
MCC = matthews_corrcoef(y_test, y_pred_test)

print("\n Test Set Results:")
print(f"Accuracy: {ACC:.4f}")
print(f"Sensitivity (SN): {SN:.4f}")
print(f"Specificity (SP): {SP:.4f}")
print(f"MCC: {MCC:.4f}")


BiLSTM

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, matthews_corrcoef

y_train = np.array(labels)
y_test = np.array(new_labels)

# The model in this cell is built with input_shape=(X_train.shape[1], 1).
# X_train/X_test may already carry the trailing channel axis from an earlier
# model cell; expand_dims would then produce a 4-D array. reshape always
# yields (samples, features, 1) regardless of how often it is applied.
X_train = np.array(X_train)
X_test = np.array(X_test)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

def build_bilstm_model(input_shape, lr=0.001):
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    A sequence-returning BiLSTM(32) and a final-state BiLSTM(64), each
    followed by BatchNormalization and Dropout(0.3), feed
    Dense(128, ReLU) -> Dropout(0.4) -> Dense(1, sigmoid).

    Args:
        input_shape: Shape of one sample, passed to the first layer.
        lr: Adam learning rate.

    Returns:
        A compiled Sequential model (binary cross-entropy, accuracy metric).
    """
    model = Sequential()

    # First recurrent stage keeps the full sequence for the second stage.
    model.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Second recurrent stage emits only its final state.
    model.add(Bidirectional(LSTM(64)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Seed Python, NumPy and the Keras backend so weight initialisation repeats.
tf.keras.utils.set_random_seed(42)
model = build_bilstm_model((X_train.shape[1], 1))

# validation_split holds out the LAST 20% of the arrays before any shuffling,
# and the training arrays are laid out as all enhancers followed by all
# non-enhancers. Shuffle once up front so the validation fold contains both
# classes.
shuffle_idx = np.random.RandomState(42).permutation(len(X_train))
X_fit, y_fit = X_train[shuffle_idx], y_train[shuffle_idx]

# Drive both callbacks from the held-out accuracy; the training accuracy
# ignores the validation fold and rewards over-fitting.
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=3, min_lr=1e-4, verbose=1)
history = model.fit(
    X_fit, y_fit,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


# Flatten the (n, 1) sigmoid output so the metrics receive 1-D label arrays.
y_pred_test = (model.predict(X_test) > 0.5).astype(int).ravel()
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
ACC = accuracy_score(y_test, y_pred_test)
SN = recall_score(y_test, y_pred_test)
SP = TN / (TN + FP)
MCC = matthews_corrcoef(y_test, y_pred_test)

print("\n Test Set Results:")
print(f"Accuracy: {ACC:.4f}")
print(f"Sensitivity (Recall): {SN:.4f}")
print(f"Specificity: {SP:.4f}")
print(f"MCC: {MCC:.4f}")


CNN + BiLSTM + attention

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, MaxPooling1D, BatchNormalization, Dropout,
    Bidirectional, LSTM, Dense, Flatten, Layer
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, matthews_corrcoef


class Attention(Layer):
    """Additive attention pooling over axis 1 of its input.

    For an input ``x`` the layer computes ``e = tanh(x . W + b)``, turns ``e``
    into weights with a softmax along axis 1, and returns the weighted sum of
    ``x`` along axis 1.

    ``W`` has shape ``(input_shape[-1], 1)`` and ``b`` has shape
    ``(input_shape[1], 1)``, so the size of axis 1 must be known when the
    layer is built.
    # NOTE(review): assumes x is (batch, steps, features) — confirm.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer keyword arguments (name, dtype,
        # trainable, ...) so the layer can be named and re-created from its
        # config.
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection vector ``W`` and the per-position bias ``b``."""
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` along axis 1."""
        # One score per position along axis 1.
        e = tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        # Normalise the scores across axis 1 so they sum to one.
        a = tf.keras.backend.softmax(e, axis=1)
        output = tf.keras.backend.sum(x * a, axis=1)
        return output


# The model in this cell is built with input_shape=(X_train.shape[1], 1).
# X_train/X_test may already carry the trailing channel axis from an earlier
# model cell; expand_dims would then produce a 4-D array. reshape always
# yields (samples, features, 1) regardless of how often it is applied.
X_train = np.array(X_train)
X_test = np.array(X_test)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
y_train = np.array(labels)
y_test = np.array(new_labels)
def build_cnn_lstm_attention(input_shape, lstm_units=32, dense_units=128,
                             dropout_rate=0.3, lr=1e-4):
    """Build and compile the CNN -> BiLSTM -> attention binary classifier.

    Two 'same'-padded convolution blocks (128 filters / kernel 5, then 64 /
    kernel 3, each with BatchNormalization, MaxPooling1D(2) and Dropout(0.3))
    feed a sequence-returning bidirectional LSTM. The ``Attention`` layer
    pools that sequence, followed by
    Dense(dense_units, ReLU) -> Dropout(dropout_rate) -> Dense(1, sigmoid).

    Args:
        input_shape: Shape of one sample.
        lstm_units: Units of the LSTM in each direction.
        dense_units: Width of the dense layer before the output.
        dropout_rate: Dropout applied after that dense layer (the convolution
            blocks always use 0.3).
        lr: Adam learning rate.

    Returns:
        A compiled functional Model (binary cross-entropy, accuracy metric).
    """
    inputs = Input(shape=input_shape)

    # Convolutional front end.
    features = inputs
    for filters, kernel_size in ((128, 5), (64, 3)):
        features = Conv1D(filters, kernel_size, activation='relu', padding='same')(features)
        features = BatchNormalization()(features)
        features = MaxPooling1D(2)(features)
        features = Dropout(0.3)(features)

    # Recurrent stage keeps every step so attention can weight them.
    recurrent = LSTM(lstm_units, return_sequences=True,
                     dropout=0.3, recurrent_dropout=0.2)
    features = Bidirectional(recurrent)(features)
    pooled = Attention()(features)

    # Classification head.
    hidden = Dense(dense_units, activation='relu')(pooled)
    hidden = Dropout(dropout_rate)(hidden)
    outputs = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


# Seed Python, NumPy and the Keras backend so weight initialisation repeats.
tf.keras.utils.set_random_seed(42)
model = build_cnn_lstm_attention(
    input_shape=(X_train.shape[1], 1),
    lstm_units=32,
    dense_units=128,
    dropout_rate=0.3,
    lr=1e-4
)

# validation_split holds out the LAST 20% of the arrays before any shuffling,
# and the training arrays are laid out as all enhancers followed by all
# non-enhancers. Shuffle once up front so the validation fold contains both
# classes.
shuffle_idx = np.random.RandomState(42).permutation(len(X_train))
X_fit, y_fit = X_train[shuffle_idx], y_train[shuffle_idx]

# Drive both callbacks from the held-out accuracy; the training accuracy
# ignores the validation fold and rewards over-fitting.
early_stop = EarlyStopping(
    monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_accuracy', factor=0.5, patience=3, verbose=1
)

history = model.fit(
    X_fit, y_fit,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


# Flatten the (n, 1) sigmoid output so the metrics receive 1-D label arrays.
y_pred_test = (model.predict(X_test) > 0.5).astype(int).ravel()
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test, labels=[0, 1]).ravel()
ACC = accuracy_score(y_test, y_pred_test)
SN = recall_score(y_test, y_pred_test)
SP = TN / (TN + FP)
MCC = matthews_corrcoef(y_test, y_pred_test)

print("\n Test Set Results:")
print(f"Accuracy: {ACC:.4f}")
print(f"Sensitivity (SN): {SN:.4f}")
print(f"Specificity (SP): {SP:.4f}")
print(f"MCC: {MCC:.4f}")
