In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import os
from transformers import TFEsmModel, EsmTokenizer

# Enumerate every file mounted under the Kaggle input directory
input_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)


2025-09-02 12:56:08.869878: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756817769.030709      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756817769.080975      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/pdb14189/PDB14189_N.txt
/kaggle/input/pdb14189/PDB14189_P.txt


In [2]:
# Single seed shared by every stochastic step in the notebook
# (framework RNGs here, `random_state` of the train/val/test split later).
RANDOM_SEED = 229

# Seeds Python's built-in `random`, NumPy and TensorFlow in one call.
# Seeding only NumPy and TensorFlow leaves Python's `random` unseeded,
# so anything drawing from it would differ between kernel restarts.
tf.keras.utils.set_random_seed(RANDOM_SEED)

In [3]:
def load_fasta_data(positive_file, negative_file):
    """Read two FASTA files and return their sequences with binary labels.

    Args:
        positive_file: Path to the FASTA file whose records are labelled 1.
        negative_file: Path to the FASTA file whose records are labelled 0.

    Returns:
        A tuple ``(sequences, labels)`` of two equally long lists. Positive
        records come first, followed by the negative ones.
    """
    def read_records(fasta_path):
        # Lines starting with '>' are headers; everything between two headers
        # is concatenated into one sequence. Records with no residues are
        # skipped.
        records = []
        pieces = []
        with open(fasta_path, 'r') as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text.startswith('>'):
                    pieces.append(text)
                    continue
                # A header closes whatever record was being accumulated.
                finished = ''.join(pieces)
                if finished:
                    records.append(finished)
                pieces = []
        # The last record has no header after it, so flush it explicitly.
        trailing = ''.join(pieces)
        if trailing:
            records.append(trailing)
        return records

    pos_records = read_records(positive_file)
    neg_records = read_records(negative_file)

    all_sequences = pos_records + neg_records
    all_labels = [1] * len(pos_records) + [0] * len(neg_records)

    print(f"Loaded {len(pos_records)} positive and {len(neg_records)} negative sequences")
    print(f"Total sequences: {len(all_sequences)}")
    return all_sequences, all_labels

def evaluate_model(model, test_data, y_test, threshold=0.5):
    """Score a binary classifier on held-out data and print a summary.

    Args:
        model: Object exposing ``predict(test_data)`` that returns one
            probability per sample (any shape that flattens to ``n`` values).
        test_data: Input(s) passed straight to ``model.predict``; either a
            single array or a list of arrays for multi-input models.
        y_test: Ground-truth binary labels, one per sample.
        threshold: Probabilities strictly greater than this value are
            predicted as the positive class. Defaults to 0.5.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``
        and ``auc``.
    """
    # `predict` is invoked identically for single- and multi-input models, so
    # no branching on the type of `test_data` is required. Flatten once so the
    # thresholded labels and the AUC computation both see a 1-D score vector.
    y_pred_proba = np.asarray(model.predict(test_data)).ravel()
    y_pred = (y_pred_proba > threshold).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # AUC is threshold-independent, so it is computed from the raw scores.
    auc = roc_auc_score(y_test, y_pred_proba)

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1-Score: {f1:.4f}")
    print(f"Test AUC-ROC: {auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Non-DNA-binding', 'DNA-binding']))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1, 'auc': auc}

In [4]:
class PretrainedTransformerCNN:
    """Pretrained ESM transformer feeding a multi-scale 1D-CNN binary classifier.

    The class bundles three things: loading the tokenizer/transformer pair,
    building the Keras model that stacks a CNN head on the transformer's
    ``last_hidden_state``, and tokenizing raw protein sequences into the
    fixed-length inputs that model expects.
    """

    def __init__(self, transformer_model="facebook/esm2_t33_650M_UR50D",
                 max_seq_len=1024, freeze_transformer=True):
        """Load the tokenizer and transformer weights.

        Args:
            transformer_model: Hugging Face model identifier passed to
                ``EsmTokenizer.from_pretrained`` and
                ``TFEsmModel.from_pretrained``.
            max_seq_len: Fixed token length used both for the model's input
                layers and for padding/truncation during tokenization.
            freeze_transformer: When True, every layer of the transformer is
                marked non-trainable.
        """
        self.transformer_model = transformer_model
        self.max_seq_len = max_seq_len
        self.freeze_transformer = freeze_transformer

        print(f"Loading transformer model: {transformer_model}")
        self.tokenizer = EsmTokenizer.from_pretrained(transformer_model)
        self.transformer = TFEsmModel.from_pretrained(transformer_model)

        if freeze_transformer:
            for layer in self.transformer.layers:
                layer.trainable = False
            print("Transformer weights frozen for feature extraction")

    def build_model(self):
        """Build the Transformer → CNN → Classification pipeline.

        Returns:
            An uncompiled Keras ``Model`` named ``'TransformerCNN'`` taking
            ``[input_ids, attention_mask]`` (each of length ``max_seq_len``)
            and producing a single sigmoid output per sample.
        """
        input_ids = Input(shape=(self.max_seq_len,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_seq_len,), dtype=tf.int32, name='attention_mask')

        # The transformer is invoked through a Lambda layer. The attention
        # mask is only passed to the transformer; the convolutional layers
        # below are applied to every position of its output.
        def transformer_call(inputs):
            ids, mask = inputs
            return self.transformer(input_ids=ids, attention_mask=mask).last_hidden_state

        sequence_embeddings = Lambda(transformer_call)([input_ids, attention_mask])

        # Multi-scale CNN: three parallel branches with different kernel
        # widths, each followed by batch-norm and a stride-2 max pool.
        conv_layers = []
        for kernel_size, filters in [(3, 256), (5, 256), (7, 256)]:
            conv = Conv1D(filters=filters, kernel_size=kernel_size, padding='same', activation='relu')(sequence_embeddings)
            conv = BatchNormalization()(conv)
            conv = MaxPooling1D(pool_size=2)(conv)
            conv_layers.append(conv)

        # Merge the branches along the channel axis, refine with two more
        # convolutions, then collapse the sequence axis with a global max.
        cnn_features = Concatenate(axis=-1)(conv_layers)
        cnn_features = Conv1D(512, 3, padding='same', activation='relu')(cnn_features)
        cnn_features = BatchNormalization()(cnn_features)
        cnn_features = Dropout(0.2)(cnn_features)
        cnn_features = Conv1D(256, 3, padding='same', activation='relu')(cnn_features)
        cnn_features = BatchNormalization()(cnn_features)
        cnn_features = GlobalMaxPooling1D()(cnn_features)

        # Classification head: 512 → 256 → 128 → 1 with decreasing dropout.
        x = Dense(512, activation='relu')(cnn_features)
        x = Dropout(0.3)(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.1)(x)
        outputs = Dense(1, activation='sigmoid')(x)

        model = Model(inputs=[input_ids, attention_mask], outputs=outputs, name='TransformerCNN')
        return model

    def preprocess_sequences(self, sequences):
        """Tokenize protein sequences into fixed-length transformer inputs.

        Args:
            sequences: Iterable of amino-acid strings.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` as returned by
            the tokenizer (TensorFlow tensors, since ``return_tensors="tf"``).
        """
        # Residues are separated by spaces before being handed to the tokenizer.
        spaced_sequences = [' '.join(seq) for seq in sequences]
        # Pad to exactly `max_seq_len` rather than to the longest sequence in
        # this call: `build_model` declares inputs of shape (max_seq_len,), so
        # a shorter padded length would not match the model's input layers.
        encoded = self.tokenizer(spaced_sequences, padding='max_length', truncation=True,
                                 max_length=self.max_seq_len, return_tensors="tf")
        return {'input_ids': encoded['input_ids'], 'attention_mask': encoded['attention_mask']}


In [5]:
# Read the positive (DNA-binding) and negative FASTA files into one labelled dataset
sequences, labels = load_fasta_data(
    negative_file='/kaggle/input/pdb14189/PDB14189_N.txt',
    positive_file='/kaggle/input/pdb14189/PDB14189_P.txt',
)
print(f"Using dataset: {len(sequences)} sequences for Transformer CNN model")

# Labels as an array so they can be fancy-indexed by the split indices later
y_full = np.array(labels)
num_positive = np.sum(y_full)
num_negative = len(y_full) - num_positive
print(f"Positive samples: {num_positive}, Negative samples: {num_negative}")

Loaded 7129 positive and 7060 negative sequences
Total sequences: 14189
Using dataset: 14189 sequences for Transformer CNN model
Positive samples: 7129, Negative samples: 7060


In [6]:
# Instantiate the ESM-backed wrapper and assemble the Keras graph
transformer_cnn = PretrainedTransformerCNN()
transformer_model = transformer_cnn.build_model()

# AdamW with binary cross-entropy; accuracy, precision and recall are tracked per epoch
transformer_model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall'],
    optimizer=tf.keras.optimizers.AdamW(weight_decay=1e-5, learning_rate=1e-4),
)

# Tokenize the whole dataset once, before splitting
print("Preprocessing sequences...")
transformer_data = transformer_cnn.preprocess_sequences(sequences)

# Stratified 60/20/20 split on sample indices: hold out 40 %, then halve the hold-out
indices = np.arange(len(sequences))
train_idx, temp_idx = train_test_split(
    indices,
    test_size=0.4,
    stratify=y_full,
    random_state=RANDOM_SEED,
)
val_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.5,
    stratify=y_full[temp_idx],
    random_state=RANDOM_SEED,
)

def safe_gather(tensor, indices):
    """Select entries of ``tensor`` with ``tf.gather``.

    NumPy index arrays are converted to an int32 tensor first; any other
    kind of ``indices`` is handed to ``tf.gather`` unchanged.
    """
    gather_idx = (
        tf.convert_to_tensor(indices, dtype=tf.int32)
        if isinstance(indices, np.ndarray)
        else indices
    )
    return tf.gather(tensor, gather_idx)

def _gather_inputs(idx):
    """Return [input_ids, attention_mask] restricted to the rows in ``idx``."""
    return [safe_gather(transformer_data[key], idx) for key in ('input_ids', 'attention_mask')]

# Model inputs for each split, in the order the model's two Input layers expect
train_data, val_data, test_data = (
    _gather_inputs(idx) for idx in (train_idx, val_idx, test_idx)
)

# Matching label arrays
y_train_full, y_val_full, y_test_full = (
    y_full[idx] for idx in (train_idx, val_idx, test_idx)
)

print(f"Training set: {len(train_idx)} samples")
print(f"Validation set: {len(val_idx)} samples")
print(f"Test set: {len(test_idx)} samples")

# Callbacks used by the training cell: stop early on stalled val_loss,
# halve the learning rate on plateaus, and checkpoint the best val_accuracy.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=8, restore_best_weights=True, verbose=1
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=4, min_lr=1e-7, verbose=1
)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_transformer_model.h5', monitor='val_accuracy', save_best_only=True, verbose=1
)
transformer_callbacks = [early_stopping, reduce_lr, checkpoint]

Loading transformer model: facebook/esm2_t33_650M_UR50D


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

I0000 00:00:1756817799.937655      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFEsmModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'esm.embeddings.position_ids']
- This IS expected if you are initializing TFEsmModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFEsmModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFEsmModel were not initialized fr

Transformer weights frozen for feature extraction
Preprocessing sequences...
Training set: 8513 samples
Validation set: 2838 samples
Test set: 2838 samples


In [7]:
print("=== Training Transformer CNN Model ===")

# Training options gathered in one place; validation runs on the held-out
# validation split after every epoch.
fit_options = dict(
    batch_size=8,
    epochs=20,
    validation_data=(val_data, y_val_full),
    callbacks=transformer_callbacks,
    verbose=1,
)
transformer_history = transformer_model.fit(train_data, y_train_full, **fit_options)

=== Training Transformer CNN Model ===
Epoch 1/20


I0000 00:00:1756817903.196904     166 service.cc:148] XLA service 0x7ca95400d140 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1756817903.197568     166 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
W0000 00:00:1756817905.716874     166 assert_op.cc:38] Ignoring Assert operator TransformerCNN_1/lambda_1/tf_esm_model/esm/embeddings/assert_less/Assert/Assert
I0000 00:00:1756817911.128054     166 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1756817922.650615     166 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1064/1065[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1s[0m 2s/step - accuracy: 0.7560 - loss: 0.5461 - precision: 0.7561 - recall: 0.7618

W0000 00:00:1756819823.686766     165 assert_op.cc:38] Ignoring Assert operator TransformerCNN_1/lambda_1/tf_esm_model/esm/embeddings/assert_less/Assert/Assert


[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.7561 - loss: 0.5459 - precision: 0.7562 - recall: 0.7619

W0000 00:00:1756819855.323027     163 assert_op.cc:38] Ignoring Assert operator TransformerCNN_1/lambda_1/tf_esm_model/esm/embeddings/assert_less/Assert/Assert
W0000 00:00:1756820485.344132     165 assert_op.cc:38] Ignoring Assert operator TransformerCNN_1/lambda_1/tf_esm_model/esm/embeddings/assert_less/Assert/Assert



Epoch 1: val_accuracy improved from -inf to 0.91226, saving model to best_transformer_model.h5
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2650s[0m 2s/step - accuracy: 0.7562 - loss: 0.5457 - precision: 0.7562 - recall: 0.7620 - val_accuracy: 0.9123 - val_loss: 0.2247 - val_precision: 0.8639 - val_recall: 0.9797 - learning_rate: 1.0000e-04
Epoch 2/20
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9228 - loss: 0.2189 - precision: 0.9180 - recall: 0.9298
Epoch 2: val_accuracy improved from 0.91226 to 0.92600, saving model to best_transformer_model.h5
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2526s[0m 2s/step - accuracy: 0.9228 - loss: 0.2189 - precision: 0.9180 - recall: 0.9298 - val_accuracy: 0.9260 - val_loss: 0.1908 - val_precision: 0.8873 - val_recall: 0.9769 - learning_rate: 1.0000e-04
Epoch 3/20
[1m1065/1065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9421 - loss: 0.16

In [8]:
# Final metrics on the untouched test split
print("=== Transformer CNN Model Results ===")
transformer_results = evaluate_model(
    model=transformer_model,
    test_data=test_data,
    y_test=y_test_full,
)

=== Transformer CNN Model Results ===


W0000 00:00:1756850814.627444     163 assert_op.cc:38] Ignoring Assert operator TransformerCNN_1/lambda_1/tf_esm_model/esm/embeddings/assert_less/Assert/Assert


[1m88/89[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m6s[0m 7s/step 

W0000 00:00:1756851448.817858     166 assert_op.cc:38] Ignoring Assert operator TransformerCNN_1/lambda_1/tf_esm_model/esm/embeddings/assert_less/Assert/Assert


[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 7s/step
Test Accuracy: 0.9316
Test Precision: 0.9005
Test Recall: 0.9712
Test F1-Score: 0.9345
Test AUC-ROC: 0.9786

Classification Report:
                 precision    recall  f1-score   support

Non-DNA-binding       0.97      0.89      0.93      1412
    DNA-binding       0.90      0.97      0.93      1426

       accuracy                           0.93      2838
      macro avg       0.93      0.93      0.93      2838
   weighted avg       0.93      0.93      0.93      2838


Confusion Matrix:
[[1259  153]
 [  41 1385]]


In [None]:
# # Save transformer results
# with open('transformer_cnn_results.pkl', 'wb') as f:
#     pickle.dump(transformer_results, f)

# print("\nTransformer results saved to 'transformer_cnn_results.pkl'")

In [None]:
# # Load simple CNN results for comparison
# try:
#     import pickle
#     with open('simple_cnn_results.pkl', 'rb') as f:
#         simple_results = pickle.load(f)
    
#     print("\n=== Model Comparison ===")
#     print("Simple CNN+BiLSTM vs Transformer CNN")
#     print(f"Accuracy: {simple_results['accuracy']:.4f} vs {transformer_results['accuracy']:.4f}")
#     print(f"Precision: {simple_results['precision']:.4f} vs {transformer_results['precision']:.4f}")
#     print(f"Recall: {simple_results['recall']:.4f} vs {transformer_results['recall']:.4f}")
#     print(f"F1-Score: {simple_results['f1_score']:.4f} vs {transformer_results['f1_score']:.4f}")
#     print(f"AUC-ROC: {simple_results['auc']:.4f} vs {transformer_results['auc']:.4f}")
    
# except FileNotFoundError:
#     print("CNN results not found")