# In [None]:
import requests
import os
import time
from urllib.parse import quote
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json
import csv

# Configuration
# NOTE(review): the input path uses "ASD_Kidney" while the output path uses
# "aASD_Kidney" — confirm the differing directory name is intentional.
GENE_TXT_PATH = '/Users/lfluo/Desktop/网络药理学/cgmega/ASD_Kidney/nodes.txt'    # path to the gene-name text file
OUTPUT_PDB_DIR = '/Users/lfluo/Desktop/网络药理学/cgmega/aASD_Kidney/PDB_file'      # download directory for PDB files
LOG_FILE = os.path.join(OUTPUT_PDB_DIR, 'download_log.txt')      # log file path
INDEX_FILE = os.path.join(OUTPUT_PDB_DIR, 'structure_index.csv')  # index file path
UNI_API_URL = 'https://rest.uniprot.org/uniprotkb/search'
PDB_API_URL = 'https://data.rcsb.org/graphql'
ALPHAFOLD_URL_TEMPLATE = 'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'

# Create the directory that stores the PDB files
os.makedirs(OUTPUT_PDB_DIR, exist_ok=True)

# Open the log file (mode 'w' truncates any log left by a previous run)
try:
    log = open(LOG_FILE, 'w', encoding='utf-8')
except OSError as e:
    print(f"无法打开日志文件 {LOG_FILE}，错误: {e}")
    # `exit` is a helper injected by the `site` module for interactive use and
    # is not guaranteed to exist; raising SystemExit is the portable equivalent.
    raise SystemExit(1)

# Initialise the index list
structure_sources = []

def log_message(message):
    """Echo *message* to stdout and append it, newline-terminated, to the open log file."""
    print(message)
    record = message + '\n'
    log.write(record)

def read_gene_names(file_path):
    """
    Read gene names from a text file, one name per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped.

    Args:
        file_path: Path to the gene-name text file (UTF-8, with or without BOM).

    Returns:
        The gene names in file order. An empty list is returned when reading
        fails; the failure is logged rather than raised.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 identically but also drops a leading
        # byte-order mark. str.strip() does not remove U+FEFF, so with plain
        # 'utf-8' the BOM would stay attached to the first gene name.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            genes = [name for name in (line.strip() for line in f) if name]
        log_message(f"成功读取 {len(genes)} 个基因名称。")
        return genes
    except Exception as e:
        # Deliberately broad: any read failure means "no genes to process".
        log_message(f"读取文件 {file_path} 时发生异常: {e}")
        return []

def create_session():
    """
    Build a requests Session whose HTTP and HTTPS adapters retry automatically.

    Returns:
        A requests.Session with one shared retrying HTTPAdapter mounted for
        both the 'http://' and 'https://' prefixes.
    """
    retry_policy = Retry(
        total=5,  # at most 5 retries in total
        backoff_factor=1,  # back-off factor between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes that trigger a retry
        allowed_methods=["GET", "POST"],  # allowed_methods replaces the older method_whitelist
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

def get_uniprot_ids(gene_names, session, timeout=30):
    """
    Map gene names to human, reviewed UniProt accessions via the UniProt API.

    One search request is sent per gene, with a 0.5 s pause after each one.
    Failures for a single gene are logged and do not stop the loop.

    Args:
        gene_names: Iterable of gene-name strings.
        session: requests Session used to send the queries.
        timeout: Value passed to requests as the per-request timeout (seconds),
            so a stalled connection cannot block the run indefinitely.

    Returns:
        Dict mapping each gene that produced at least one hit to the list of
        its primary accessions. Genes without hits, or whose query failed,
        are absent from the dict.
    """
    uniprot_ids = {}
    headers = {
        'Accept': 'application/json'
    }
    for gene in tqdm(gene_names, desc="查询UniProt IDs"):
        # Species filter: human only (taxonomy ID 9606), reviewed entries only
        query = f'gene:"{gene}" AND organism_id:9606 AND reviewed:true'
        params = {
            'query': query,
            'fields': 'accession',
            'size': 100  # fetch at most 100 UniProt entries per gene
        }
        try:
            response = session.get(
                UNI_API_URL, headers=headers, params=params, timeout=timeout
            )
            log_message(f"发送的查询语句: {query}")
            if response.status_code == 200:
                data = response.json()
                results = data.get('results', [])
                ids = [entry['primaryAccession'] for entry in results]
                if ids:
                    uniprot_ids[gene] = ids
                    log_message(f"基因 {gene} 对应的UniProt ID: {', '.join(ids)}")
                else:
                    log_message(f"基因 {gene} 未找到对应的人类 Reviewed UniProt ID。")
            else:
                log_message(f"查询基因 {gene} 时出错，状态码: {response.status_code}")
                log_message(f"响应内容: {response.text}")
        except Exception as e:
            # Deliberately broad: one failing gene must not abort the batch.
            log_message(f"查询基因 {gene} 时发生异常: {e}")
        time.sleep(0.5)  # avoid sending requests too quickly
    return uniprot_ids

def get_pdb_ids(uniprot_id, session, timeout=30):
    """
    Look up the PDB entry IDs associated with a UniProt accession.

    The lookup goes through the RCSB Search API. The Data API GraphQL root
    field `polymer_entity` is addressed by entry_id/entity_id and offers no
    UniProt argument, so a query filtered by accession there is rejected and
    can never yield PDB IDs.

    Args:
        uniprot_id: UniProt accession string.
        session: requests Session used to send the query.
        timeout: Value passed to requests as the per-request timeout (seconds).

    Returns:
        List of PDB entry ID strings; empty when there are no hits or when the
        query fails (failures are logged, not raised).
    """
    search_url = 'https://search.rcsb.org/rcsbsearch/v2/query'
    ref_attr = (
        'rcsb_polymer_entity_container_identifiers.'
        'reference_sequence_identifiers'
    )
    payload = {
        'query': {
            'type': 'group',
            'logical_operator': 'and',
            'nodes': [
                {
                    'type': 'terminal',
                    'service': 'text',
                    'parameters': {
                        'attribute': f'{ref_attr}.database_accession',
                        'operator': 'exact_match',
                        'value': uniprot_id,
                    },
                },
                {
                    'type': 'terminal',
                    'service': 'text',
                    'parameters': {
                        'attribute': f'{ref_attr}.database_name',
                        'operator': 'exact_match',
                        'value': 'UniProt',
                    },
                },
            ],
        },
        'return_type': 'entry',
        # Without this the Search API pages its results and returns only the
        # first page of hits.
        'request_options': {'return_all_hits': True},
    }
    try:
        response = session.post(search_url, json=payload, timeout=timeout)
        if response.status_code == 204:
            # The Search API reports "no hits" as 204 No Content (empty body).
            log_message(f"UniProt ID {uniprot_id} 未找到对应的PDB条目。")
            return []
        if response.status_code != 200:
            log_message(f"查询UniProt ID {uniprot_id} 的PDB时出错，状态码: {response.status_code}")
            log_message(f"响应内容: {response.text}")
            return []

        hits = response.json().get('result_set') or []
        pdb_ids = []
        for hit in hits:
            # Hits are normally objects carrying an 'identifier'; tolerate the
            # compact form where each hit is the bare identifier string.
            identifier = hit if isinstance(hit, str) else hit.get('identifier')
            if identifier:
                pdb_ids.append(identifier)

        if pdb_ids:
            log_message(f"UniProt ID {uniprot_id} 对应的PDB ID: {', '.join(pdb_ids)}")
        else:
            log_message(f"UniProt ID {uniprot_id} 未找到对应的PDB条目。")
        return pdb_ids
    except Exception as e:
        # Deliberately broad: a failed lookup is treated as "no PDB entries".
        log_message(f"查询UniProt ID {uniprot_id} 的PDB时发生异常: {e}")
        return []

def download_pdb(gene_name, pdb_id, output_dir, session, timeout=60):
    """
    Download the PDB-format file for *pdb_id* and save it under the gene name.

    The file is stored as "<gene_name>_<pdb_id>.pdb" inside *output_dir*.

    Args:
        gene_name: Gene name used as the filename prefix.
        pdb_id: PDB entry ID to download.
        output_dir: Directory that receives the file.
        session: requests Session used for the download.
        timeout: Value passed to requests as the per-request timeout (seconds).

    Returns:
        True when the file was downloaded and written, False otherwise
        (failures are logged, not raised).
    """
    pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    try:
        response = session.get(pdb_url, timeout=timeout)
        if response.status_code != 200:
            log_message(f"无法下载PDB文件: {pdb_id}, 状态码: {response.status_code}")
            return False

        # Build the file name
        filename = f"{gene_name}_{pdb_id}.pdb"
        filepath = os.path.join(output_dir, filename)
        # Write the raw bytes: going through response.text and a text-mode
        # file would re-encode the payload with a guessed charset and the
        # platform's default encoding / newline translation.
        with open(filepath, 'wb') as file:
            file.write(response.content)
        log_message(f"下载成功: {filename} (来源: PDB)")
        return True
    except Exception as e:
        # Deliberately broad: one failed download must not abort the batch.
        log_message(f"下载PDB文件 {pdb_id} 时发生异常: {e}")
        return False

def download_alphafold(gene_name, uniprot_id, output_dir, session, timeout=60):
    """
    Download the AlphaFold model for *uniprot_id* and save it under the gene name.

    The file is stored as "<gene_name>.pdb" inside *output_dir*.

    Args:
        gene_name: Gene name used as the filename.
        uniprot_id: UniProt accession substituted into the AlphaFold URL template.
        output_dir: Directory that receives the file.
        session: requests Session used for the download.
        timeout: Value passed to requests as the per-request timeout (seconds).

    Returns:
        True when the file was downloaded and written, False otherwise
        (failures are logged, not raised).
    """
    alpha_url = ALPHAFOLD_URL_TEMPLATE.format(uniprot_id=uniprot_id)
    try:
        response = session.get(alpha_url, timeout=timeout)
        if response.status_code != 200:
            log_message(f"无法下载AlphaFold文件: {uniprot_id}, 状态码: {response.status_code}")
            log_message(f"响应内容: {response.text}")
            return False

        # NOTE(review): the name contains only the gene, so a second accession
        # for the same gene overwrites the first file — confirm this is intended.
        filename = f"{gene_name}.pdb"
        filepath = os.path.join(output_dir, filename)
        # Write the raw bytes: going through response.text and a text-mode
        # file would re-encode the payload with a guessed charset and the
        # platform's default encoding / newline translation.
        with open(filepath, 'wb') as file:
            file.write(response.content)
        log_message(f"下载成功: {filename} (来源: AlphaFold)")
        return True
    except Exception as e:
        # Deliberately broad: one failed download must not abort the batch.
        log_message(f"下载AlphaFold文件 {uniprot_id} 时发生异常: {e}")
        return False

def _write_structure_index(rows, index_path):
    """Write the collected structure records to a CSV index; log I/O failures instead of raising."""
    try:
        # newline='' is what the csv module expects so it controls line endings itself.
        with open(index_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(
                f, fieldnames=['gene_name', 'uniprot_id', 'pdb_id', 'source']
            )
            writer.writeheader()
            writer.writerows(rows)
        log_message(f"索引文件已保存: {index_path} (共 {len(rows)} 条记录)")
    except OSError as e:
        log_message(f"写入索引文件 {index_path} 时发生异常: {e}")


def main():
    """
    Run the pipeline: gene names -> UniProt IDs -> PDB / AlphaFold downloads.

    For every UniProt ID, all associated PDB entries are downloaded; when an ID
    has no PDB entries, the AlphaFold model is downloaded instead. Every
    successful download is appended to ``structure_sources``, and that list is
    written to INDEX_FILE as CSV once the download loop ends (also when the
    loop is interrupted, so partial results are kept). The log file and the
    HTTP session are always closed on exit.
    """
    session = create_session()
    try:
        # Read the gene names
        gene_names = read_gene_names(GENE_TXT_PATH)
        if not gene_names:
            log_message("没有基因名称可处理。")
            return

        # Resolve UniProt IDs
        uniprot_mapping = get_uniprot_ids(gene_names, session)
        if not uniprot_mapping:
            log_message("没有找到任何人类 Reviewed UniProt ID。")
            return

        try:
            # Walk the UniProt IDs, look up their PDB IDs and download the files
            for gene, uniprot_ids in uniprot_mapping.items():
                for uniprot_id in uniprot_ids:
                    pdb_ids = get_pdb_ids(uniprot_id, session)
                    if pdb_ids:
                        for pdb_id in pdb_ids:
                            success = download_pdb(gene, pdb_id, OUTPUT_PDB_DIR, session)
                            if success:
                                # Record the source as PDB
                                structure_sources.append({
                                    'gene_name': gene,
                                    'uniprot_id': uniprot_id,
                                    'pdb_id': f"{gene}_{pdb_id}",
                                    'source': 'PDB'
                                })
                            time.sleep(0.3)  # throttle downloads
                    else:
                        # No PDB entries: fall back to AlphaFold
                        success = download_alphafold(gene, uniprot_id, OUTPUT_PDB_DIR, session)
                        if success:
                            # Record the source as AlphaFold
                            structure_sources.append({
                                'gene_name': gene,
                                'uniprot_id': uniprot_id,
                                'pdb_id': f"{gene}_AlphaFold",
                                'source': 'AlphaFold'
                            })
                    time.sleep(0.5)  # throttle API requests
        finally:
            # The records were previously collected but never persisted; write
            # them here so the index exists even after an interrupted run.
            _write_structure_index(structure_sources, INDEX_FILE)

    finally:
        log.close()
        session.close()

# Call main() only when this file is executed directly.
if __name__ == "__main__":
    main()