In [13]:
import ast
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [14]:
# Seed every random number generator used in this notebook with one shared value
SEED = 42
for seed_rng in (
    random.seed,                     # Python's built-in RNG
    np.random.seed,                  # NumPy's legacy global RNG
    tf.random.set_seed,              # TensorFlow's global seed
    tf.keras.utils.set_random_seed,  # Keras helper that seeds the RNGs Keras relies on
):
    seed_rng(SEED)

# Ask TensorFlow to use deterministic op implementations for reproducible runs
tf.config.experimental.enable_op_determinism()

In [15]:
def _parse_angle_column(column):
    """Parse a column of stringified numeric lists into a 2-D float array.

    Each cell is parsed with ``ast.literal_eval`` so that only Python literals
    are accepted; unlike ``eval`` this cannot execute code embedded in the file
    the column was loaded from.

    Raises:
        ValueError: if a cell is not a valid literal, or if the cells do not
            all hold numeric sequences of the same length.
    """
    parsed = np.array([ast.literal_eval(cell) for cell in column], dtype=float)
    if parsed.ndim != 2:
        raise ValueError(
            "Expected every angle cell to hold a list of numbers of equal length; "
            f"got an array of shape {parsed.shape}"
        )
    return parsed


def prepare_structure_data(df):
    """Prepare and normalize structural features.

    Reads the columns 'sasa', 'phi_small', 'psi_small', 'E', 'H' and 'L' from
    ``df`` and returns them as a single 2-D feature array.

    Args:
        df: DataFrame holding the columns listed above. 'phi_small' and
            'psi_small' must contain string representations of numeric lists
            (e.g. "[1.0, 2.0, 3.0]").

    Returns:
        A NumPy array with one row per row of ``df`` and the columns ordered as
        standardized SASA, standardized phi angles, standardized psi angles,
        then the 'E', 'H', 'L' columns unchanged.

    Raises:
        ValueError: if an angle cell cannot be parsed as a numeric list.

    Note:
        A fresh StandardScaler is fitted on every call, so the scaling
        statistics always come from the DataFrame that is passed in.
    """
    # SASA as a single-column 2-D array, the layout StandardScaler expects
    sasa = df['sasa'].values.reshape(-1, 1)

    # Angle columns are stored as strings and must be parsed into numbers
    phi_small = _parse_angle_column(df['phi_small'])
    psi_small = _parse_angle_column(df['psi_small'])

    # Secondary-structure columns (presumably one-hot indicators) are used as-is
    ss = np.column_stack((df['E'], df['H'], df['L']))

    # Standardize SASA and each angle block independently
    sasa_normalized = StandardScaler().fit_transform(sasa)
    phi_normalized = StandardScaler().fit_transform(phi_small)
    psi_normalized = StandardScaler().fit_transform(psi_small)

    # Combine all features column-wise
    features = np.concatenate([
        sasa_normalized,
        phi_normalized,
        psi_normalized,
        ss
    ], axis=1)

    return features

In [16]:
def create_structure_model():
    """Build the baseline feed-forward classifier for the structural features.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model that takes a 10-value
        feature vector and outputs a single sigmoid probability.
    """
    keras_layers = tf.keras.layers

    # 10 inputs: 1 SASA + 3 phi + 3 psi + 3 SS
    layer_stack = [keras_layers.Input(shape=(10,))]

    # Two hidden blocks, each a ReLU dense layer followed by 20% dropout
    layer_stack.append(keras_layers.Dense(32, activation='relu'))
    layer_stack.append(keras_layers.Dropout(0.2))
    layer_stack.append(keras_layers.Dense(16, activation='relu'))
    layer_stack.append(keras_layers.Dropout(0.2))

    # Single-unit sigmoid head for binary classification
    layer_stack.append(keras_layers.Dense(1, activation='sigmoid'))

    return tf.keras.Sequential(layer_stack)

In [None]:
def create_enhanced_structure_model():
    """Build the larger functional-API classifier for the structural features.

    The network has a shared trunk (a residual block followed by a dense block)
    and three small branches that each see only a slice of the raw input:
    column 0, columns 1-6 and columns 7-9. The trunk and branch outputs are
    concatenated and passed through a final dense block to a sigmoid unit.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a 10-value feature vector to a
        single sigmoid probability.
    """
    kl = tf.keras.layers

    feature_input = kl.Input(shape=(10,))

    # Batch-normalize the raw features before the trunk
    normed = kl.BatchNormalization()(feature_input)

    # Residual block: two dense layers on the main path, a linear projection on the skip path
    residual_main = kl.Dense(64, activation='relu')(normed)
    residual_main = kl.BatchNormalization()(residual_main)
    residual_main = kl.Dropout(0.3)(residual_main)
    residual_main = kl.Dense(64)(residual_main)
    residual_skip = kl.Dense(64)(normed)
    trunk = kl.Add()([residual_main, residual_skip])
    trunk = kl.Activation('relu')(trunk)

    # Second trunk block
    trunk = kl.Dense(32, activation='relu')(trunk)
    trunk = kl.BatchNormalization()(trunk)
    trunk = kl.Dropout(0.2)(trunk)

    # Per-feature-type branches, each fed directly from the un-normalized input
    # Column 0: SASA
    sasa_branch = kl.Lambda(lambda t: t[:, 0:1])(feature_input)
    sasa_branch = kl.Dense(8, activation='relu')(sasa_branch)

    # Columns 1-6: phi/psi angles
    angle_branch = kl.Lambda(lambda t: t[:, 1:7])(feature_input)
    angle_branch = kl.Dense(16, activation='relu')(angle_branch)

    # Columns 7-9: secondary structure
    ss_branch = kl.Lambda(lambda t: t[:, 7:10])(feature_input)
    ss_branch = kl.Dense(8, activation='relu')(ss_branch)

    # Merge the trunk with the three branches
    merged = kl.Concatenate()([trunk, sasa_branch, angle_branch, ss_branch])

    # Classification head
    head = kl.Dense(32, activation='relu')(merged)
    head = kl.BatchNormalization()(head)
    head = kl.Dropout(0.2)(head)
    probability = kl.Dense(1, activation='sigmoid')(head)

    return tf.keras.Model(inputs=feature_input, outputs=probability)

In [None]:
def _binary_metrics(y_true, y_prob, threshold=0.5):
    """Compute binary-classification metrics from predicted probabilities.

    Args:
        y_true: array-like of 0/1 ground-truth labels.
        y_prob: array-like of predicted probabilities; any shape that flattens
            to one value per sample (e.g. the (n, 1) output of ``model.predict``).
        threshold: probabilities strictly above this value are labelled 1.

    Returns:
        Dict with keys 'acc', 'balanced_acc', 'mcc', 'sn' (sensitivity),
        'sp' (specificity) and 'cm' (the 2x2 confusion matrix). 'sn' / 'sp' are
        NaN when the corresponding class has no samples in ``y_true``.
    """
    y_true = np.asarray(y_true).ravel()
    y_hat = (np.asarray(y_prob).ravel() > threshold).astype(int)

    # Fixing the label set keeps the matrix 2x2 even when a class is absent,
    # so the tn/fp/fn/tp unpacking below can never fail.
    cm = confusion_matrix(y_true, y_hat, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    return {
        'acc': accuracy_score(y_true, y_hat),
        'balanced_acc': balanced_accuracy_score(y_true, y_hat),
        'mcc': matthews_corrcoef(y_true, y_hat),
        'sn': tp / (tp + fn) if (tp + fn) else float('nan'),
        'sp': tn / (tn + fp) if (tn + fp) else float('nan'),
        'cm': cm,
    }


def train_and_evaluate(train_path="../new/processed_data_train.csv",
                       test_path="../new/processed_data_test.csv",
                       n_splits=5,
                       seed=42):
    """Cross-validate the structure model and score a fold ensemble on the test set.

    Loads and shuffles the train/test CSVs, builds features with
    ``prepare_structure_data``, trains one ``create_structure_model`` per
    K-fold split with early stopping on the validation fold, reports per-fold
    and averaged validation metrics, then averages the fold models' test
    probabilities and reports test metrics.

    Args:
        train_path: CSV with the training rows (needs a 'label' column plus the
            columns ``prepare_structure_data`` reads).
        test_path: CSV with the test rows, same schema.
        n_splits: number of cross-validation folds.
        seed: random state used for shuffling the frames and for the K-fold split.

    Returns:
        The model trained in the LAST fold only (not the ensemble).

    Raises:
        ValueError: if the training labels do not contain both class 0 and class 1.
    """
    # Load data
    print("Loading data...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Shuffle both training and test data
    print("Shuffling data...")
    train_df = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Print class distribution
    print("\nTraining set distribution:")
    print(train_df['label'].value_counts())
    print("\nTest set distribution:")
    print(test_df['label'].value_counts())

    # Prepare structure data
    # NOTE(review): prepare_structure_data fits new scalers on whatever frame it
    # receives, so the test features are standardized with the test set's own
    # statistics rather than the training set's. Consider fitting the scalers
    # on the training data once and reusing them here.
    print("\nPreparing structure data...")
    X_train = prepare_structure_data(train_df)
    X_test = prepare_structure_data(test_df)

    y_train = train_df['label'].values
    y_test = test_df['label'].values

    print(f"\nTraining data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}")

    # Calculate class weights (inverse class frequency, balanced around 1.0)
    total_samples = len(y_train)
    pos_samples = int(np.sum(y_train == 1))
    neg_samples = int(np.sum(y_train == 0))
    if pos_samples == 0 or neg_samples == 0:
        # Without this guard the weight of the missing class would be infinite
        raise ValueError(
            "Training labels must contain both classes 0 and 1 "
            f"(found {neg_samples} negatives and {pos_samples} positives)"
        )

    class_weights = {
        0: total_samples / (2 * neg_samples),
        1: total_samples / (2 * pos_samples)
    }

    print("\nClass weights:", class_weights)

    # Initialize cross-validation
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Initialize metrics storage
    metrics = {'acc': [], 'balanced_acc': [], 'mcc': [], 'sn': [], 'sp': []}
    test_predictions = []

    # Cross-validation loop
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train), 1):
        print(f"\nFold {fold}/{n_splits}")

        # Stop when validation loss stalls and roll back to the best epoch
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # Create and compile a fresh model for this fold
        model = create_structure_model()
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Train model
        print("Training model...")
        history = model.fit(
            X_train[train_idx], y_train[train_idx],
            batch_size=32,
            epochs=50,
            validation_data=(X_train[val_idx], y_train[val_idx]),
            callbacks=[early_stopping],
            class_weight=class_weights,
            verbose=1
        )

        # Plot training history; close the figure so repeated folds don't accumulate open figures
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(history.history['accuracy'], label='Train Accuracy')
        ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
        ax.set_title(f'Model Accuracy - Fold {fold}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Accuracy')
        ax.legend()
        plt.show()
        plt.close(fig)

        # Evaluate on the validation fold
        # NOTE(review): this fold also drove early stopping, so these scores
        # are likely to be somewhat optimistic.
        fold_scores = _binary_metrics(y_train[val_idx], model.predict(X_train[val_idx]))
        for key in metrics:
            metrics[key].append(fold_scores[key])

        # Keep this fold's test probabilities for the ensemble
        test_predictions.append(model.predict(X_test))

        print(f"\nFold {fold} Results:")
        print(f"Accuracy: {metrics['acc'][-1]:.4f}")
        print(f"Balanced Accuracy: {metrics['balanced_acc'][-1]:.4f}")
        print(f"MCC: {metrics['mcc'][-1]:.4f}")
        print(f"Sensitivity: {metrics['sn'][-1]:.4f}")
        print(f"Specificity: {metrics['sp'][-1]:.4f}")

    # Print average cross-validation results (NaN folds, if any, are ignored)
    print("\nAverage Cross-validation Results:")
    for metric in metrics:
        print(f"{metric.upper()}: {np.nanmean(metrics[metric]):.4f} ± {np.nanstd(metrics[metric]):.4f}")

    # Ensemble: average the fold models' predicted probabilities on the test set
    test_pred_avg = np.mean(test_predictions, axis=0)
    test_scores = _binary_metrics(y_test, test_pred_avg)

    print("\nFinal Test Set Results:")
    print(f"Accuracy: {test_scores['acc']:.4f}")
    print(f"Balanced Accuracy: {test_scores['balanced_acc']:.4f}")
    print(f"MCC: {test_scores['mcc']:.4f}")
    print(f"Sensitivity: {test_scores['sn']:.4f}")
    print(f"Specificity: {test_scores['sp']:.4f}")
    print("Confusion Matrix:")
    print(test_scores['cm'])

    return model

In [None]:
if __name__ == "__main__":
    # Run the full train/cross-validate/test pipeline; `model` holds the
    # network returned by train_and_evaluate (the one trained in the last fold).
    model = train_and_evaluate()

Loading data...
Shuffling data...

Training set distribution:
label
1    4591
0    4259
Name: count, dtype: int64

Test set distribution:
label
0    2497
1     240
Name: count, dtype: int64

Preparing structure data...

Training data shape: (8850, 10)
Test data shape: (2737, 10)

Class weights: {0: 1.0389762855130311, 1: 0.9638423001524722}

Fold 1/5
Training model...
Epoch 1/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.5072 - loss: 0.7055 - val_accuracy: 0.5249 - val_loss: 0.6920
Epoch 2/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.5205 - loss: 0.6935 - val_accuracy: 0.5294 - val_loss: 0.6932
Epoch 3/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.5272 - loss: 0.6900 - val_accuracy: 0.5345 - val_loss: 0.6923
Epoch 4/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5381 - loss: 0.6889 - val_accuracy: 0.5