In [3]:
"""
Ablation Study - 消融实验
测试不同特征组合对模型性能的影响
"""

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import (roc_auc_score, accuracy_score, average_precision_score,
                             precision_score, recall_score)
import matplotlib.pyplot as plt
import seaborn as sns

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


# ==================== 数据集变体 ====================

class AblationDataset(Dataset):
    """支持不同特征组合的数据集"""
    
    def __init__(self, dataframe, aaindex_path="./data/aaindex1_pca.csv",
                 use_onehot=True, use_aa_features=True, 
                 use_tap=True, use_rank=True):
        self.data = dataframe.reset_index(drop=True)
        self.aaindex = pd.read_csv(aaindex_path)
        self.amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
        
        # 控制使用哪些特征
        self.use_onehot = use_onehot
        self.use_aa_features = use_aa_features
        self.use_tap = use_tap
        self.use_rank = use_rank
        
        # 计算特征维度
        self.feature_dim = 0
        if use_onehot:
            self.feature_dim += 20
        if use_aa_features:
            self.feature_dim += 22
        
        print(f"Feature configuration:")
        print(f"  - One-hot encoding: {use_onehot}")
        print(f"  - AA features: {use_aa_features}")
        print(f"  - TAP score: {use_tap}")
        print(f"  - Rank score: {use_rank}")
        print(f"  - Sequence feature dim: {self.feature_dim}")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        pseudosequence = row['pseudosequence']
        peptide = row['Peptide']
        label = row['Label']
        tap = row['tap_prediction_score']
        rank = row['%Rank_EL']
        
        # 构建序列特征
        features_list = []
        
        if self.use_onehot:
            pseudo_onehot = self._onehot_encoding(pseudosequence, 34)
            peptide_onehot = self._onehot_encoding(peptide, 11)
            features_list.append((pseudo_onehot, peptide_onehot))
        
        if self.use_aa_features:
            pseudo_aa = self._get_AA_features(pseudosequence, 34)
            peptide_aa = self._get_AA_features(peptide, 11)
            features_list.append((pseudo_aa, peptide_aa))
        
        # 拼接特征
        if len(features_list) > 0:
            pseudo_features = torch.cat([f[0] for f in features_list], dim=1)
            peptide_features = torch.cat([f[1] for f in features_list], dim=1)
        else:
            # 如果没有序列特征，使用零向量
            pseudo_features = torch.zeros(34, 1)
            peptide_features = torch.zeros(11, 1)
        
        # 构建全局特征
        global_features = []
        if self.use_tap:
            global_features.append(tap)
        if self.use_rank:
            global_features.append(rank)
        
        if len(global_features) > 0:
            global_features = torch.tensor(global_features, dtype=torch.float32)
        else:
            global_features = torch.zeros(1, dtype=torch.float32)
        
        return (pseudo_features.float(), 
                peptide_features.float(), 
                global_features, 
                torch.tensor(label, dtype=torch.long))
    
    def _onehot_encoding(self, sequence, maxlen):
        sequence = sequence.upper()[:maxlen]
        enc_seq = torch.zeros((maxlen, 20), dtype=torch.float32)
        for i, aa in enumerate(sequence):
            if aa in self.amino_acids:
                enc_seq[i, self.amino_acids.index(aa)] = 1
        return enc_seq
    
    def _get_AA_features(self, sequence, maxlen):
        sequence = sequence.ljust(maxlen, 'X')[:maxlen]
        all_node_feats = []
        for index, aa in enumerate(sequence):
            node_feats = self.aaindex[aa].to_list()
            anchar = [0, maxlen]
            seq_onehot = [0, 0]
            seq_onehot[sum([index >= i for i in anchar]) - 1] = 1
            node_feats.extend(seq_onehot)
            all_node_feats.append(node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float32)


# ==================== 灵活的CNN模型 ====================

class FlexibleCNN(nn.Module):
    """支持不同输入维度的CNN模型"""
    
    def __init__(self, seq_feature_dim, global_feature_dim):
        super(FlexibleCNN, self).__init__()
        
        self.seq_feature_dim = seq_feature_dim
        self.global_feature_dim = global_feature_dim
        
        # MHC分支
        self.pseudo_branch = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=2, stride=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=2)
        )
        
        # 肽段分支
        self.peptide_branch = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=2, stride=1, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=2)
        )
        
        # 动态计算卷积输出维度
        self.conv_output_dim = self._get_conv_output()
        
        # 全连接层
        fc_input_dim = self.conv_output_dim + global_feature_dim
        self.classifier = nn.Sequential(
            nn.Linear(fc_input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)
        )
    
    def _get_conv_output(self):
        with torch.no_grad():
            x1 = torch.zeros(1, 1, 34, self.seq_feature_dim)
            x2 = torch.zeros(1, 1, 11, self.seq_feature_dim)
            x1 = self.pseudo_branch(x1)
            x2 = self.peptide_branch(x2)
            x1_flat = x1.view(1, -1).size(1)
            x2_flat = x2.view(1, -1).size(1)
            return x1_flat + x2_flat
    
    def forward(self, pseudo, peptide, global_feat):
        x1 = pseudo.unsqueeze(1)
        x2 = peptide.unsqueeze(1)
        
        x1 = self.pseudo_branch(x1)
        x2 = self.peptide_branch(x2)
        
        x1 = x1.view(x1.size(0), -1)
        x2 = x2.view(x2.size(0), -1)
        
        x = torch.cat([x1, x2, global_feat], dim=1)
        x = self.classifier(x)
        return x


# ==================== 训练和评估 ====================

def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []
    
    for pseudo, peptide, global_feat, labels in tqdm(loader, desc="Training", leave=False):
        pseudo = pseudo.to(device)
        peptide = peptide.to(device)
        global_feat = global_feat.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(pseudo, peptide, global_feat)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(loader)
    accuracy = accuracy_score(all_labels, all_preds)
    return avg_loss, accuracy


def evaluate_model(model, criterion, dataloader, device):
    model.eval()
    all_labels = []
    all_predictions = []
    all_probs = []
    running_loss = 0.0
    
    with torch.no_grad():
        for pseudo, peptide, global_feat, labels in tqdm(dataloader, desc="Evaluation", leave=False):
            pseudo = pseudo.to(device)
            peptide = peptide.to(device)
            global_feat = global_feat.to(device)
            labels = labels.to(device)
            
            outputs = model(pseudo, peptide, global_feat)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            
            probs = F.softmax(outputs, dim=1)[:, 1]
            predictions = torch.argmax(outputs, dim=1)
            
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
    
    avg_loss = running_loss / len(dataloader)
    
    precision = precision_score(all_labels, all_predictions, zero_division=0)
    recall = recall_score(all_labels, all_predictions, zero_division=0)
    auc = roc_auc_score(all_labels, all_probs)
    aupr = average_precision_score(all_labels, all_probs)
    acc = accuracy_score(all_labels, all_predictions)
    
    return {
        'loss': avg_loss,
        'auc': auc,
        'aupr': aupr,
        'acc': acc,
        'precision': precision,
        'recall': recall
    }


def train_ablation_model(config_name, dataset_class, seq_dim, global_dim,
                        train_data, val_data, num_epochs=50):
    """训练一个消融实验配置"""
    
    print(f"\n{'='*60}")
    print(f"Training: {config_name}")
    print(f"{'='*60}\n")
    
    # 创建数据集
    train_dataset = dataset_class(train_data)
    val_dataset = dataset_class(val_data)
    
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)
    
    # 创建模型
    model = FlexibleCNN(seq_dim, global_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001)
    
    best_auc = 0
    best_metrics = None
    
    for epoch in range(num_epochs):
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        val_metrics = evaluate_model(model, criterion, val_loader, device)
        
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}")
            print(f"  Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
            print(f"  Val   - AUC: {val_metrics['auc']:.4f}, AUPR: {val_metrics['aupr']:.4f}, "
                  f"Acc: {val_metrics['acc']:.4f}")
        
        if val_metrics['auc'] > best_auc:
            best_auc = val_metrics['auc']
            best_metrics = val_metrics.copy()
    
    print(f"\nBest Results:")
    print(f"  AUC: {best_metrics['auc']:.4f}")
    print(f"  AUPR: {best_metrics['aupr']:.4f}")
    print(f"  Accuracy: {best_metrics['acc']:.4f}")
    print(f"  Precision: {best_metrics['precision']:.4f}")
    print(f"  Recall: {best_metrics['recall']:.4f}")
    
    return best_metrics


# ==================== 主实验 ====================

if __name__ == "__main__":
    print("="*80)
    print("ABLATION STUDY - 消融实验")
    print("="*80)
    print("\n目标: 评估每个特征对模型性能的贡献\n")
    
    # 加载数据
    print("Loading data...")
    data = pd.read_csv("data/output_binding_tap.csv")
    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
    
    print(f"Train samples: {len(train_data)}")
    print(f"Val samples: {len(val_data)}")
    
    # 定义消融实验配置
    ablation_configs = [
        # 完整模型（基线）
        {
            'name': 'Full Model (Baseline)',
            'dataset': lambda df: AblationDataset(df, use_onehot=True, use_aa_features=True, 
                                                  use_tap=True, use_rank=True),
            'seq_dim': 42,
            'global_dim': 2
        },
        
        # 移除One-Hot编码
        {
            'name': 'Without One-Hot',
            'dataset': lambda df: AblationDataset(df, use_onehot=False, use_aa_features=True,
                                                  use_tap=True, use_rank=True),
            'seq_dim': 22,
            'global_dim': 2
        },
        
        # 移除AA特征
        {
            'name': 'Without AA Features',
            'dataset': lambda df: AblationDataset(df, use_onehot=True, use_aa_features=False,
                                                  use_tap=True, use_rank=True),
            'seq_dim': 20,
            'global_dim': 2
        },
        
        # 移除TAP分数
        {
            'name': 'Without TAP Score',
            'dataset': lambda df: AblationDataset(df, use_onehot=True, use_aa_features=True,
                                                  use_tap=False, use_rank=True),
            'seq_dim': 42,
            'global_dim': 1
        },
        
        # 移除Rank分数
        {
            'name': 'Without Rank Score',
            'dataset': lambda df: AblationDataset(df, use_onehot=True, use_aa_features=True,
                                                  use_tap=True, use_rank=False),
            'seq_dim': 42,
            'global_dim': 1
        },
        
        # 移除所有全局特征
        {
            'name': 'Without Global Features',
            'dataset': lambda df: AblationDataset(df, use_onehot=True, use_aa_features=True,
                                                  use_tap=False, use_rank=False),
            'seq_dim': 42,
            'global_dim': 0
        },
        
        # 只用序列特征（One-Hot）
        {
            'name': 'Only One-Hot',
            'dataset': lambda df: AblationDataset(df, use_onehot=True, use_aa_features=False,
                                                  use_tap=False, use_rank=False),
            'seq_dim': 20,
            'global_dim': 0
        },
        
        # 只用AA特征
        {
            'name': 'Only AA Features',
            'dataset': lambda df: AblationDataset(df, use_onehot=False, use_aa_features=True,
                                                  use_tap=False, use_rank=False),
            'seq_dim': 22,
            'global_dim': 0
        },
    ]
    
    # 运行所有配置
    results = {}
    
    for config in ablation_configs:
        try:
            metrics = train_ablation_model(
                config['name'],
                config['dataset'],
                config['seq_dim'],
                config['global_dim'],
                train_data,
                val_data,
                num_epochs=50
            )
            results[config['name']] = metrics
        except Exception as e:
            print(f"Error in {config['name']}: {e}")
            continue
    
    # 打印对比结果
    print("\n" + "="*80)
    print("ABLATION STUDY RESULTS")
    print("="*80)
    print(f"{'Configuration':<30} {'AUC':<8} {'AUPR':<8} {'Acc':<8} {'Prec':<8} {'Recall':<8}")
    print("-"*80)
    
    baseline_auc = results.get('Full Model (Baseline)', {}).get('auc', 0)
    
    for config_name, metrics in results.items():
        auc_diff = metrics['auc'] - baseline_auc if config_name != 'Full Model (Baseline)' else 0
        marker = "  (baseline)" if config_name == 'Full Model (Baseline)' else f"  ({auc_diff:+.4f})"
        
        print(f"{config_name:<30} "
              f"{metrics['auc']:<8.4f} "
              f"{metrics['aupr']:<8.4f} "
              f"{metrics['acc']:<8.4f} "
              f"{metrics['precision']:<8.4f} "
              f"{metrics['recall']:<8.4f}"
              f"{marker}")
    
    # 保存结果
    results_df = pd.DataFrame(results).T
    results_df.to_csv('ablation_study_results.csv')
    print("\nResults saved to 'ablation_study_results.csv'")
    
    # 计算特征重要性
    print("\n" + "="*80)
    print("FEATURE IMPORTANCE ANALYSIS")
    print("="*80)
    
    baseline = results['Full Model (Baseline)']
    
    feature_importance = {
        'One-Hot Encoding': baseline['auc'] - results.get('Without One-Hot', {}).get('auc', 0),
        'AA Features': baseline['auc'] - results.get('Without AA Features', {}).get('auc', 0),
        'TAP Score': baseline['auc'] - results.get('Without TAP Score', {}).get('auc', 0),
        'Rank Score': baseline['auc'] - results.get('Without Rank Score', {}).get('auc', 0),
    }
    
    print("\nΔAUC when removing each feature:")
    for feature, importance in sorted(feature_importance.items(), key=lambda x: x[1], reverse=True):
        print(f"  {feature:<20}: {importance:+.4f}")
    
    print("\n" + "="*80)
    print("CONCLUSIONS")
    print("="*80)


Using device: cpu
ABLATION STUDY - 消融实验

目标: 评估每个特征对模型性能的贡献

Loading data...
Train samples: 6267
Val samples: 1567

Training: Full Model (Baseline)

Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: True
  - Rank score: True
  - Sequence feature dim: 42
Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: True
  - Rank score: True
  - Sequence feature dim: 42


                                                         

Epoch 5/50
  Train - Loss: 0.4749, Acc: 0.7627
  Val   - AUC: 0.8712, AUPR: 0.8041, Acc: 0.8079


                                                         

Epoch 10/50
  Train - Loss: 0.4412, Acc: 0.7881
  Val   - AUC: 0.8839, AUPR: 0.8229, Acc: 0.8003


                                                         

Epoch 15/50
  Train - Loss: 0.4315, Acc: 0.7929
  Val   - AUC: 0.8876, AUPR: 0.8295, Acc: 0.7900


                                                         

Epoch 20/50
  Train - Loss: 0.4086, Acc: 0.8033
  Val   - AUC: 0.8914, AUPR: 0.8372, Acc: 0.8149


                                                         

Epoch 25/50
  Train - Loss: 0.3981, Acc: 0.8114
  Val   - AUC: 0.8945, AUPR: 0.8432, Acc: 0.8232


                                                         

Epoch 30/50
  Train - Loss: 0.3836, Acc: 0.8157
  Val   - AUC: 0.8978, AUPR: 0.8488, Acc: 0.8200


                                                         

Epoch 35/50
  Train - Loss: 0.3701, Acc: 0.8261
  Val   - AUC: 0.9005, AUPR: 0.8540, Acc: 0.8258


                                                         

Epoch 40/50
  Train - Loss: 0.3484, Acc: 0.8398
  Val   - AUC: 0.8999, AUPR: 0.8537, Acc: 0.8239


                                                         

Epoch 45/50
  Train - Loss: 0.3269, Acc: 0.8462
  Val   - AUC: 0.9025, AUPR: 0.8568, Acc: 0.8232


                                                         

Epoch 50/50
  Train - Loss: 0.3146, Acc: 0.8567
  Val   - AUC: 0.9016, AUPR: 0.8569, Acc: 0.8200

Best Results:
  AUC: 0.9027
  AUPR: 0.8591
  Accuracy: 0.8181
  Precision: 0.7189
  Recall: 0.8365

Training: Without One-Hot

Feature configuration:
  - One-hot encoding: False
  - AA features: True
  - TAP score: True
  - Rank score: True
  - Sequence feature dim: 22
Feature configuration:
  - One-hot encoding: False
  - AA features: True
  - TAP score: True
  - Rank score: True
  - Sequence feature dim: 22


                                                         

Epoch 5/50
  Train - Loss: 0.4679, Acc: 0.7702
  Val   - AUC: 0.8699, AUPR: 0.8020, Acc: 0.7907


                                                         

Epoch 10/50
  Train - Loss: 0.4426, Acc: 0.7891
  Val   - AUC: 0.8829, AUPR: 0.8238, Acc: 0.8054


                                                         

Epoch 15/50
  Train - Loss: 0.4189, Acc: 0.7980
  Val   - AUC: 0.8910, AUPR: 0.8369, Acc: 0.8111


                                                         

Epoch 20/50
  Train - Loss: 0.3989, Acc: 0.8106
  Val   - AUC: 0.8924, AUPR: 0.8410, Acc: 0.8143


                                                         

Epoch 25/50
  Train - Loss: 0.3914, Acc: 0.8200
  Val   - AUC: 0.8934, AUPR: 0.8432, Acc: 0.8156


                                                         

Epoch 30/50
  Train - Loss: 0.3696, Acc: 0.8281
  Val   - AUC: 0.9013, AUPR: 0.8557, Acc: 0.8264


                                                         

Epoch 35/50
  Train - Loss: 0.3612, Acc: 0.8313
  Val   - AUC: 0.9022, AUPR: 0.8567, Acc: 0.8264


                                                         

Epoch 40/50
  Train - Loss: 0.3438, Acc: 0.8390
  Val   - AUC: 0.9020, AUPR: 0.8583, Acc: 0.8168


                                                         

Epoch 45/50
  Train - Loss: 0.3010, Acc: 0.8621
  Val   - AUC: 0.9018, AUPR: 0.8559, Acc: 0.8226


                                                         

Epoch 50/50
  Train - Loss: 0.2740, Acc: 0.8798
  Val   - AUC: 0.9008, AUPR: 0.8554, Acc: 0.8220

Best Results:
  AUC: 0.9056
  AUPR: 0.8618
  Accuracy: 0.8296
  Precision: 0.7698
  Recall: 0.7711

Training: Without AA Features

Feature configuration:
  - One-hot encoding: True
  - AA features: False
  - TAP score: True
  - Rank score: True
  - Sequence feature dim: 20
Feature configuration:
  - One-hot encoding: True
  - AA features: False
  - TAP score: True
  - Rank score: True
  - Sequence feature dim: 20


                                                         

Epoch 5/50
  Train - Loss: 0.4426, Acc: 0.7871
  Val   - AUC: 0.8851, AUPR: 0.8230, Acc: 0.8003


                                                         

Epoch 10/50
  Train - Loss: 0.4097, Acc: 0.8076
  Val   - AUC: 0.8949, AUPR: 0.8395, Acc: 0.8207


                                                         

Epoch 15/50
  Train - Loss: 0.3786, Acc: 0.8206
  Val   - AUC: 0.8941, AUPR: 0.8400, Acc: 0.8117


                                                         

Epoch 20/50
  Train - Loss: 0.3421, Acc: 0.8398
  Val   - AUC: 0.9019, AUPR: 0.8490, Acc: 0.8264


                                                         

Epoch 25/50
  Train - Loss: 0.3051, Acc: 0.8610
  Val   - AUC: 0.8998, AUPR: 0.8481, Acc: 0.8188


                                                         

Epoch 30/50
  Train - Loss: 0.2505, Acc: 0.8920
  Val   - AUC: 0.9017, AUPR: 0.8498, Acc: 0.8188


                                                         

Epoch 35/50
  Train - Loss: 0.1976, Acc: 0.9256
  Val   - AUC: 0.9004, AUPR: 0.8501, Acc: 0.8226


                                                         

Epoch 40/50
  Train - Loss: 0.1470, Acc: 0.9478
  Val   - AUC: 0.9008, AUPR: 0.8479, Acc: 0.8245


                                                         

Epoch 45/50
  Train - Loss: 0.0953, Acc: 0.9684
  Val   - AUC: 0.8966, AUPR: 0.8459, Acc: 0.8124


                                                         

Epoch 50/50
  Train - Loss: 0.0756, Acc: 0.9762
  Val   - AUC: 0.9006, AUPR: 0.8447, Acc: 0.8315

Best Results:
  AUC: 0.9059
  AUPR: 0.8543
  Accuracy: 0.8392
  Precision: 0.7692
  Recall: 0.8090

Training: Without TAP Score

Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: False
  - Rank score: True
  - Sequence feature dim: 42
Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: False
  - Rank score: True
  - Sequence feature dim: 42


                                                         

Epoch 5/50
  Train - Loss: 0.4777, Acc: 0.7584
  Val   - AUC: 0.8690, AUPR: 0.8019, Acc: 0.7996


                                                         

Epoch 10/50
  Train - Loss: 0.4409, Acc: 0.7855
  Val   - AUC: 0.8812, AUPR: 0.8195, Acc: 0.7983


                                                         

Epoch 15/50
  Train - Loss: 0.4295, Acc: 0.7924
  Val   - AUC: 0.8872, AUPR: 0.8294, Acc: 0.8156


                                                         

Epoch 20/50
  Train - Loss: 0.4078, Acc: 0.8104
  Val   - AUC: 0.8918, AUPR: 0.8370, Acc: 0.8194


                                                         

Epoch 25/50
  Train - Loss: 0.4021, Acc: 0.8076
  Val   - AUC: 0.8947, AUPR: 0.8440, Acc: 0.8162


                                                         

Epoch 30/50
  Train - Loss: 0.3847, Acc: 0.8216
  Val   - AUC: 0.8912, AUPR: 0.8394, Acc: 0.7977


                                                         

Epoch 35/50
  Train - Loss: 0.3742, Acc: 0.8227
  Val   - AUC: 0.8982, AUPR: 0.8506, Acc: 0.8251


                                                         

Epoch 40/50
  Train - Loss: 0.3542, Acc: 0.8363
  Val   - AUC: 0.8998, AUPR: 0.8525, Acc: 0.8188


                                                         

Epoch 45/50
  Train - Loss: 0.3398, Acc: 0.8451
  Val   - AUC: 0.9033, AUPR: 0.8586, Acc: 0.8328


                                                         

Epoch 50/50
  Train - Loss: 0.3192, Acc: 0.8580
  Val   - AUC: 0.9026, AUPR: 0.8592, Acc: 0.8341

Best Results:
  AUC: 0.9033
  AUPR: 0.8586
  Accuracy: 0.8328
  Precision: 0.7843
  Recall: 0.7573

Training: Without Rank Score

Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: True
  - Rank score: False
  - Sequence feature dim: 42
Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: True
  - Rank score: False
  - Sequence feature dim: 42


                                                         

Epoch 5/50
  Train - Loss: 0.4685, Acc: 0.7731
  Val   - AUC: 0.8702, AUPR: 0.8016, Acc: 0.7920


                                                         

Epoch 10/50
  Train - Loss: 0.4403, Acc: 0.7906
  Val   - AUC: 0.8818, AUPR: 0.8188, Acc: 0.8041


                                                         

Epoch 15/50
  Train - Loss: 0.4262, Acc: 0.7972
  Val   - AUC: 0.8882, AUPR: 0.8302, Acc: 0.8086


                                                         

Epoch 20/50
  Train - Loss: 0.4098, Acc: 0.8041
  Val   - AUC: 0.8880, AUPR: 0.8320, Acc: 0.8086


                                                         

Epoch 25/50
  Train - Loss: 0.3984, Acc: 0.8104
  Val   - AUC: 0.8942, AUPR: 0.8411, Acc: 0.8188


                                                         

Epoch 30/50
  Train - Loss: 0.3959, Acc: 0.8066
  Val   - AUC: 0.8939, AUPR: 0.8406, Acc: 0.8130


                                                         

Epoch 35/50
  Train - Loss: 0.3691, Acc: 0.8281
  Val   - AUC: 0.8985, AUPR: 0.8474, Acc: 0.8207


                                                         

Epoch 40/50
  Train - Loss: 0.3417, Acc: 0.8423
  Val   - AUC: 0.9025, AUPR: 0.8540, Acc: 0.8213


                                                         

Epoch 45/50
  Train - Loss: 0.3258, Acc: 0.8516
  Val   - AUC: 0.9026, AUPR: 0.8556, Acc: 0.8277


                                                         

Epoch 50/50
  Train - Loss: 0.3020, Acc: 0.8655
  Val   - AUC: 0.9016, AUPR: 0.8548, Acc: 0.8302

Best Results:
  AUC: 0.9035
  AUPR: 0.8548
  Accuracy: 0.8220
  Precision: 0.7807
  Recall: 0.7229

Training: Without Global Features

Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: False
  - Rank score: False
  - Sequence feature dim: 42
Feature configuration:
  - One-hot encoding: True
  - AA features: True
  - TAP score: False
  - Rank score: False
  - Sequence feature dim: 42


                                                

Error in Without Global Features: mat1 and mat2 shapes cannot be multiplied (256x14113 and 14112x1024)

Training: Only One-Hot

Feature configuration:
  - One-hot encoding: True
  - AA features: False
  - TAP score: False
  - Rank score: False
  - Sequence feature dim: 20
Feature configuration:
  - One-hot encoding: True
  - AA features: False
  - TAP score: False
  - Rank score: False
  - Sequence feature dim: 20


                                                

Error in Only One-Hot: mat1 and mat2 shapes cannot be multiplied (256x6721 and 6720x1024)

Training: Only AA Features

Feature configuration:
  - One-hot encoding: False
  - AA features: True
  - TAP score: False
  - Rank score: False
  - Sequence feature dim: 22
Feature configuration:
  - One-hot encoding: False
  - AA features: True
  - TAP score: False
  - Rank score: False
  - Sequence feature dim: 22


                                                

Error in Only AA Features: mat1 and mat2 shapes cannot be multiplied (256x7393 and 7392x1024)

ABLATION STUDY RESULTS
Configuration                  AUC      AUPR     Acc      Prec     Recall  
--------------------------------------------------------------------------------
Full Model (Baseline)          0.9027   0.8591   0.8181   0.7189   0.8365    (baseline)
Without One-Hot                0.9056   0.8618   0.8296   0.7698   0.7711    (+0.0029)
Without AA Features            0.9059   0.8543   0.8392   0.7692   0.8090    (+0.0032)
Without TAP Score              0.9033   0.8586   0.8328   0.7843   0.7573    (+0.0005)
Without Rank Score             0.9035   0.8548   0.8220   0.7807   0.7229    (+0.0007)

Results saved to 'ablation_study_results.csv'

FEATURE IMPORTANCE ANALYSIS

ΔAUC when removing each feature:
  TAP Score           : -0.0005
  Rank Score          : -0.0007
  One-Hot Encoding    : -0.0029
  AA Features         : -0.0032

CONCLUSIONS


