In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import random

2025-04-08 16:57:03.649442: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744131423.667578  992553 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744131423.673098  992553 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-08 16:57:03.692136: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Seed every random number generator used in this notebook so repeated runs give the same results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE(review): per the Keras docs, set_random_seed seeds Python `random`, NumPy and TensorFlow
# in one call, so the three explicit calls above look redundant -- confirm before removing them.
tf.keras.utils.set_random_seed(SEED)  # Keras helper that sets the framework-level seeds
tf.config.experimental.enable_op_determinism()  # Ask TensorFlow to run ops deterministically

In [3]:
def prepare_structure_data(df, middle_pos=16, scalers=None):
    """Build the structural feature matrix (without contact features).

    Every per-residue column of ``df`` holds one window per row; only the
    residue at ``middle_pos`` (plus its two neighbours for SASA) is used.
    The output columns are, in order:

    * phi, psi, omega    -> (sin, cos) of the centre angle, 2 columns each
    * sasa               -> robust-scaled values at centre-1, centre, centre+1
    * chi1 .. chi4       -> (sin, cos) of the centre angle, 2 columns each
    * ss                 -> one-hot of the centre state over 'H', 'E', 'L'
    * plDDT              -> robust-scaled centre value

    which gives 21 columns in total.

    Args:
        df: DataFrame with the columns 'phi', 'psi', 'omega', 'sasa',
            'chi1'..'chi4' and 'plDDT' stored as stringified sequences, and
            'ss' stored as a string with one character per residue.
        middle_pos: Index of the central residue inside each window.
            Defaults to 16, the value that was previously hard-coded.
        scalers: Optional dict used to share fitted ``RobustScaler`` objects
            between calls. Scalers missing from the dict are fitted on ``df``
            and stored in it; scalers already present are only applied.
            Pass the same (initially empty) dict for the training frame and
            then for the test frame so that the test features are scaled
            with statistics learned on the training data. With the default
            ``None`` every call fits its own scalers, exactly as before.

    Returns:
        2-D NumPy array with one row per row of ``df``.

    Raises:
        KeyError: If the centre secondary-structure state is not one of
            'H', 'E' or 'L'.
    """
    features_list = []

    def parse_column(column):
        # SECURITY(review): eval() executes whatever text is stored in the
        # CSV cell. Only use this on trusted files; if the cells are plain
        # Python literals, ast.literal_eval would be the safe replacement.
        return np.array([np.array(eval(cell)) for cell in df[column]])

    def values_at(arrays, pos):
        # Pick the value at window position `pos` from every row.
        return np.array([row[pos] for row in arrays])

    def sin_cos(arrays, pos):
        # Angles are in degrees; the (sin, cos) pair keeps -180 and 180 adjacent.
        radians = np.pi * values_at(arrays, pos) / 180.0
        return np.stack([np.sin(radians), np.cos(radians)], axis=-1)

    def robust_scale(values, key):
        # Reuse a scaler that was fitted by an earlier call when one is shared.
        if scalers is not None and key in scalers:
            return scalers[key].transform(values)
        scaler = RobustScaler()
        scaled = scaler.fit_transform(values)
        if scalers is not None:
            scalers[key] = scaler
        return scaled

    def add_angle_features(columns):
        for name in columns:
            encoded = sin_cos(parse_column(name), middle_pos)
            features_list.append(encoded)
            print(f"{name} features shape: {encoded.shape}")

    # 1. Backbone angles
    add_angle_features(['phi', 'psi', 'omega'])

    # 2. SASA at the centre residue and its two neighbours, scaled per position
    sasa_arrays = parse_column('sasa')
    sasa_features = np.concatenate(
        [
            robust_scale(
                values_at(sasa_arrays, middle_pos + offset).reshape(-1, 1),
                f"sasa_{offset:+d}",
            )
            for offset in (-1, 0, 1)
        ],
        axis=1,
    )
    features_list.append(sasa_features)
    print(f"SASA features shape: {sasa_features.shape}")

    # 3. Side-chain chi angles
    add_angle_features(['chi1', 'chi2', 'chi3', 'chi4'])

    # 4. Secondary structure of the centre residue, one-hot encoded
    ss_arrays = np.array([list(seq) for seq in df['ss']])
    ss_center = ss_arrays[:, middle_pos]
    ss_map = {'H': 0, 'E': 1, 'L': 2}
    ss_encoded = np.zeros((len(ss_arrays), 3))
    ss_encoded[np.arange(len(ss_center)), [ss_map[state] for state in ss_center]] = 1
    features_list.append(ss_encoded)
    print(f"SS features shape: {ss_encoded.shape}")

    # 5. plDDT of the centre residue
    plddt_center = values_at(parse_column('plDDT'), middle_pos).reshape(-1, 1)
    plddt_scaled = robust_scale(plddt_center, "plDDT")
    features_list.append(plddt_scaled)
    print(f"plDDT features shape: {plddt_scaled.shape}")

    # Combine all feature groups column-wise
    features = np.concatenate(features_list, axis=1)
    print(f"\nFinal combined features shape: {features.shape}")
    print("Feature list lengths:", [f.shape[1] for f in features_list])

    return features

In [4]:
def create_structure_model(input_dim):
    """Build the feed-forward network used on the structural features alone.

    The network is two hidden blocks (Dense + ReLU, batch normalisation,
    30% dropout) of width 64 and 32, followed by a single sigmoid unit.

    Args:
        input_dim: Number of structural features per sample.

    Returns:
        A ``tf.keras.Sequential`` model. It is returned uncompiled; the
        caller has to call ``compile`` before training.
    """
    layers = tf.keras.layers

    # Layers are created in the same order in which they are stacked.
    stack = [layers.Input(shape=(input_dim,))]
    for width in (64, 32):
        stack.append(layers.Dense(width, activation='relu'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(0.3))
    stack.append(layers.Dense(1, activation='sigmoid'))

    return tf.keras.Sequential(stack)

In [5]:
def print_metrics(y_true, y_pred):
    """
    Print a summary of binary-classification metrics.

    Reports accuracy, balanced accuracy, MCC, sensitivity, specificity and
    the confusion matrix itself. Sensitivity and specificity are read from
    rows 1 and 0 of the confusion matrix respectively.

    Parameters:
    y_true: array-like of true labels
    y_pred: array-like of predicted labels
    """
    cm = confusion_matrix(y_true, y_pred)

    # Rows are true classes, columns are predicted classes.
    true_neg, false_pos = cm[0][0], cm[0][1]
    false_neg, true_pos = cm[1][0], cm[1][1]

    report = [
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Balanced Accuracy", balanced_accuracy_score(y_true, y_pred)),
        ("MCC", matthews_corrcoef(y_true, y_pred)),
        ("Sensitivity", true_pos / (true_pos + false_neg)),  # True Positive Rate
        ("Specificity", true_neg / (true_neg + false_pos)),  # True Negative Rate
    ]

    for label, value in report:
        print(f"{label}: {value:.4f}")
    print("Confusion Matrix:")
    print(cm)

In [None]:
from sklearn.ensemble import RandomForestClassifier


def train_and_evaluate_seq_only_rf(
    train_path="../../../../data/train/structure/processed_features_train.csv",
    test_path="../../../../data/test/structure/processed_features_test.csv",
    n_splits=5,
    random_state=42,
):
    """Train and evaluate a Random Forest on the structural features.

    Despite the ``seq_only`` in its name, this function uses the features
    produced by ``prepare_structure_data``. It runs stratified k-fold
    cross-validation on the training set, prints per-fold and averaged
    validation metrics, and scores the test set with the average of the
    per-fold predicted probabilities.

    Args:
        train_path: CSV file with the training features and a 'label' column.
        test_path: CSV file with the test features and a 'label' column.
        n_splits: Number of cross-validation folds.
        random_state: Seed used for the initial shuffle, the fold split and
            every Random Forest.

    Returns:
        Tuple ``(model, test_pred_avg)``:
        ``model`` is the Random Forest fitted in the LAST fold only (not an
        ensemble of all folds); ``test_pred_avg`` is the array of positive
        class probabilities on the test set, averaged over all folds.
    """
    print("Loading structural data...")

    # Load data
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Extract labels
    y_train = train_df['label'].values
    y_test = test_df['label'].values

    # Prepare feature data
    # NOTE(review): each call fits its own RobustScaler objects, so the SASA and
    # plDDT columns of the train and test sets are scaled with different
    # statistics, and the training scalers see the validation folds as well.
    X_train = prepare_structure_data(train_df)
    X_test = prepare_structure_data(test_df)

    # Shuffle training data
    shuffle_idx = np.random.RandomState(random_state).permutation(len(y_train))
    X_train = X_train[shuffle_idx]
    y_train = y_train[shuffle_idx]

    print(f"Training data shape: {X_train.shape}")
    print(f"Testing data shape: {X_test.shape}")

    # Cross-validation setup
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    metrics = {'acc': [], 'balanced_acc': [], 'mcc': [], 'sn': [], 'sp': []}
    fold_report = [
        ("Accuracy", 'acc'),
        ("Balanced Accuracy", 'balanced_acc'),
        ("MCC", 'mcc'),
        ("Sensitivity", 'sn'),
        ("Specificity", 'sp'),
    ]
    test_predictions = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        print(f"\nFold {fold}/{n_splits}")

        # A fresh forest per fold; class_weight='balanced' reweights the classes
        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            class_weight='balanced',
            random_state=random_state,
            n_jobs=-1  # Use all processors
        )

        # Train model
        model.fit(X_train[train_idx], y_train[train_idx])

        # Evaluate on validation set (positive-class probability thresholded at 0.5)
        y_val = y_train[val_idx]
        val_pred = model.predict_proba(X_train[val_idx])[:, 1]
        val_pred_binary = (val_pred > 0.5).astype(int)

        # Calculate metrics
        cm = confusion_matrix(y_val, val_pred_binary)
        metrics['acc'].append(accuracy_score(y_val, val_pred_binary))
        metrics['balanced_acc'].append(balanced_accuracy_score(y_val, val_pred_binary))
        metrics['mcc'].append(matthews_corrcoef(y_val, val_pred_binary))
        metrics['sn'].append(cm[1][1]/(cm[1][1]+cm[1][0]))
        metrics['sp'].append(cm[0][0]/(cm[0][0]+cm[0][1]))

        # Predict on test set; the per-fold probabilities are averaged later
        test_predictions.append(model.predict_proba(X_test)[:, 1])

        print(f"\nFold {fold} Results:")
        for label, key in fold_report:
            print(f"{label}: {metrics[key][-1]:.4f}")

        # Feature importance for this fold (indices refer to feature matrix columns)
        feature_importance = model.feature_importances_
        print(f"\nTop 5 important features for fold {fold}:")
        top_indices = np.argsort(feature_importance)[-5:]
        for i in top_indices[::-1]:
            print(f"Feature {i}: {feature_importance[i]:.4f}")

    # Print average cross-validation results
    print("\nAverage Cross-validation Results:")
    for metric in metrics:
        print(f"{metric.upper()}: {np.mean(metrics[metric]):.4f} ± {np.std(metrics[metric]):.4f}")

    # Ensemble predictions on test set
    test_pred_avg = np.mean(test_predictions, axis=0)
    test_pred_binary = (test_pred_avg > 0.5).astype(int)

    # Final test metrics: print_metrics emits the same report that used to be
    # written out by hand here.
    print("\nFinal Test Set Results:")
    print_metrics(y_test, test_pred_binary)

    return model, test_pred_avg

In [None]:
# Run the cross-validated Random Forest experiment. `model` is the forest fitted in the
# last fold only; `test_probs` holds the fold-averaged positive-class probabilities on the test set.
model, test_probs = train_and_evaluate_seq_only_rf()

Loading structural data...
phi features shape: (8853, 2)
psi features shape: (8853, 2)
omega features shape: (8853, 2)
SASA features shape: (8853, 3)
chi1 features shape: (8853, 2)
chi2 features shape: (8853, 2)
chi3 features shape: (8853, 2)
chi4 features shape: (8853, 2)
SS features shape: (8853, 3)
plDDT features shape: (8853, 1)

Final combined features shape: (8853, 21)
Feature list lengths: [2, 2, 2, 3, 2, 2, 2, 2, 3, 1]
phi features shape: (2737, 2)
psi features shape: (2737, 2)
omega features shape: (2737, 2)
SASA features shape: (2737, 3)
chi1 features shape: (2737, 2)
chi2 features shape: (2737, 2)
chi3 features shape: (2737, 2)
chi4 features shape: (2737, 2)
SS features shape: (2737, 3)
plDDT features shape: (2737, 1)

Final combined features shape: (2737, 21)
Feature list lengths: [2, 2, 2, 3, 2, 2, 2, 2, 3, 1]
Training data shape: (8853, 21)
Testing data shape: (2737, 21)

Fold 1/5

Fold 1 Results:
Accuracy: 0.6302
Balanced Accuracy: 0.6277
MCC: 0.2577
Sensitivity: 0.6931
