conda create -n protein_gcn python=3.9
conda activate protein_gcn

In [None]:
# Bring in the project's protein data-processing entry point.
from pdb_data_processing import process_proteins

# Input locations and the distance cutoff that are handed to process_proteins.
csv_path = 'proteins.csv'
pdb_folder = './pdb_files'
all_assign_path = 'all_assign.txt'
distance_threshold = 7.5

# Process the proteins and report how many entries came back.
data_list = process_proteins(
    csv_path,
    pdb_folder,
    all_assign_path,
    distance_threshold,
)
print(f"Processed {len(data_list)} proteins.")

In [None]:
# torch is required for the device selection below; no earlier cell imports
# it, so import it here to avoid a NameError.
import torch

from GCN_model import GCN

# Model hyperparameters.
input_dim = 7  # adjust to match your feature dimension
hidden_dim = 128
output_dim = 128
pooling_ratio = 0.5
dropout_rate = 0.5

# Build the model on the first CUDA device when one is available, else on CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = GCN(input_dim, hidden_dim, output_dim, pooling_ratio, dropout_rate).to(device)
# Put the model in evaluation mode (presumably GCN is a torch.nn.Module).
# NOTE(review): no weights are trained or loaded in this notebook, so unless
# GCN restores a checkpoint internally the model runs with its initial
# parameters -- confirm this is intended.
model.eval()

In [None]:
# numpy is used by np.save below; import it in this cell so the cell does not
# depend on an import that only appears in a later cell.
import numpy as np

from protein_feature_extraction import extract_features

# Extract features for every entry in data_list using the model built above.
extracted_features = extract_features(model, data_list, device, batch_size=32)

# Save the extracted features to disk so later cells can reload them.
np.save('features.npy', extracted_features)
print("Saved extracted features to features.npy")

In [None]:
import numpy as np

from GCN_evaluation import plot_silhouette_score, visualize_features

# Reload the features written by the extraction cell. allow_pickle=True lets
# numpy restore object entries; only load files from a trusted source this way.
extracted_features = np.load('features.npy', allow_pickle=True)

# Collect the 'feature' entry of every record into a single array.
feature_rows = [item['feature'] for item in extracted_features]
protein_features = np.array(feature_rows)

# Visualize the features, first with PCA and then with t-SNE.
for projection in ('pca', 'tsne'):
    visualize_features(protein_features, method=projection)

# Compute the silhouette score for a 10-cluster partition.
sil_score = plot_silhouette_score(protein_features, n_clusters=10)

In [None]:
from hyperparameter_tuning import grid_search
# Import GCN from the same module as the model-initialisation cell so both
# cells refer to the same class (the original imported it from `model`).
from GCN_model import GCN

# Hyperparameter grid to search over.
param_grid = {
    'hidden_dim': [64, 128, 256],
    'dropout_rate': [0.3, 0.5, 0.7],
    'pooling_ratio': [0.3, 0.5, 0.7]
}

# Run the grid search; data_list and device come from earlier cells.
best_params, best_score = grid_search(
    model_class=GCN,
    data_list=data_list,
    device=device,
    param_grid=param_grid,
    n_clusters=10,
    batch_size=32
)
print(f"Best Parameters: {best_params}, Best Silhouette Score: {best_score:.4f}")

In [None]:
# 5. Apply the best parameters and re-extract features.
# Relies on names defined by earlier cells: GCN, device, data_list,
# extract_features, np, visualize_features, plot_silhouette_score, best_params.
# The original guard also tested `grid_search_enabled`, which is never defined
# in this notebook; the check on best_params alone is sufficient.
if best_params is not None:
    # Same batch size as the earlier extraction and grid-search cells.
    batch_size = 32

    print("Applying best parameters to GCN model...")
    # Positional order follows the GCN(...) call in the model-initialisation
    # cell: input_dim, hidden_dim, output_dim, pooling_ratio, dropout_rate.
    optimized_gcn = GCN(
        7,  # input_dim: adjust to match your feature dimension
        best_params['hidden_dim'],
        128,  # output_dim: adjust as needed
        best_params['pooling_ratio'],
        best_params['dropout_rate'],
    ).to(device)
    optimized_gcn.eval()

    print("Extracting features with optimized GCN...")
    optimized_features = extract_features(optimized_gcn, data_list, device, batch_size=batch_size)

    # Save the optimized features.
    optimized_output_features_path = 'optimized_features.npy'
    np.save(optimized_output_features_path, optimized_features)
    print(f"Saved optimized features to {optimized_output_features_path}")

    # Re-evaluate and visualize.
    optimized_protein_features = np.array([item['feature'] for item in optimized_features])

    print("Visualizing optimized features with PCA...")
    visualize_features(optimized_protein_features, method='pca')

    print("Visualizing optimized features with t-SNE...")
    visualize_features(optimized_protein_features, method='tsne')

    # Compute the silhouette score.
    optimized_sil_score = plot_silhouette_score(optimized_protein_features, n_clusters=10)
    print(f"Optimized Silhouette Score: {optimized_sil_score:.4f}")