In [6]:
import os, re, json, ast
import subprocess, sys, getopt, time
import math,copy
import linecache
import random
import warnings
from collections import defaultdict,Counter
from itertools import combinations
from multiprocessing.dummy import Pool
from string import ascii_uppercase
from string import ascii_lowercase

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import joblib
from joblib import Parallel, delayed
from scipy import stats
from bs4 import BeautifulSoup
# Notebook-wide setting: suppress every warning category for the rest of the
# kernel session. Narrow or remove this filter when debugging, because it also
# hides deprecation and runtime warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [1]:
def get_uniprot_infos(unp_id, timeout=60):
    """Fetch accession, entry name and sequence for one UniProt record.

    Sends a GET request to the EBI Proteins API
    (``https://www.ebi.ac.uk/proteins/api/proteins/<unp_id>``) and keeps only
    a small subset of the fields found in the JSON response.

    Parameters
    ----------
    unp_id : str
        UniProt accession to look up, e.g. ``'Q13131'``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``unp_id``, ``entry`` and
        ``sequence``. ``unp_id`` is taken from the ``accession`` field of the
        response, not from the argument.

    Raises
    ------
    KeyError
        If the response JSON lacks ``accession``, ``id`` or ``sequence``.
    """
    url = 'https://www.ebi.ac.uk/proteins/api/proteins/{}'.format(unp_id)
    response = requests.get(url, timeout=timeout)
    record = json.loads(response.text)
    # Only part of the record is extracted here: the accession, the entry
    # name and the amino-acid sequence string.
    return pd.DataFrame({
        'unp_id': [record['accession']],
        'entry': [record['id']],
        'sequence': [record['sequence']['sequence']],
    })

In [2]:
def map_uniprot_to_pdb(unp_id, timeout=60):
    """List the PDB chain segments mapped to a UniProt accession.

    Sends a GET request to the PDBe API
    (``https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/<unp_id>``) and
    flattens the ``[unp_id]['PDB']`` part of the JSON response into one row
    per mapped segment.

    Parameters
    ----------
    unp_id : str
        UniProt accession, e.g. ``'Q13131'``. It must match the top-level key
        of the response exactly.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``pdb_id``, ``entity_id``, ``chain_id``, ``is_canonical``,
        ``identity``, ``unp_range`` (``[unp_start, unp_end]``), ``pdb_range``
        (``[start, end]`` residue numbers), ``unp_id``, ``len_unp_range`` and
        ``len_pdb_range``. The two ``len_*`` columns hold ``end - start``.
        An empty mapping yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If the response has no ``unp_id`` entry or no ``'PDB'`` section.
    """
    url = 'https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/{}'.format(unp_id)
    response = requests.get(url, timeout=timeout)
    pdb_mappings = json.loads(response.text)[unp_id]['PDB']

    columns = ['pdb_id', 'entity_id', 'chain_id', 'is_canonical', 'identity',
               'unp_range', 'pdb_range', 'unp_id', 'len_unp_range', 'len_pdb_range']
    rows = []
    for pdb, segments in pdb_mappings.items():
        # One PDB entry can contribute several segments (original note:
        # "insdel" - presumably split by insertions/deletions; TODO confirm).
        for segment in segments:
            unp_start, unp_end = segment['unp_start'], segment['unp_end']
            pdb_start = segment['start']['residue_number']
            pdb_end = segment['end']['residue_number']
            rows.append({
                'pdb_id': pdb,
                'entity_id': segment['entity_id'],
                'chain_id': segment['chain_id'],
                'is_canonical': segment['is_canonical'],
                'identity': segment['identity'],
                'unp_range': [unp_start, unp_end],
                'pdb_range': [pdb_start, pdb_end],
                'unp_id': unp_id,
                # NOTE(review): end - start is one less than the residue count
                # if both ends are inclusive - kept as in the original.
                'len_unp_range': unp_end - unp_start,
                'len_pdb_range': pdb_end - pdb_start,
            })
    # Building the frame once with explicit columns keeps the schema stable
    # even when there are no segments; a row-wise apply on an empty frame
    # does not return a Series and cannot be assigned to a single column.
    return pd.DataFrame(rows, columns=columns)

In [3]:
def get_pdb_summary(pdb_id, timeout=60):
    """Fetch selected summary fields of a PDB entry from the PDBe API.

    Sends a GET request to
    ``https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/<pdb_id>`` and keeps
    the ``title``, ``experimental_method``, ``number_of_entities`` and
    ``assemblies`` fields of every summary record listed under ``pdb_id``.

    Parameters
    ----------
    pdb_id : str
        PDB identifier, e.g. ``'6c9j'``. It must match the top-level key of
        the response exactly.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per (summary record, experimental method) pair, with the four
        fields above plus a ``pdb_id`` column. Fields missing from a record
        are filled with ``NaN``. The index is not reset after the explode, so
        rows from the same record share an index label.

    Raises
    ------
    KeyError
        If the response has no entry for ``pdb_id``.
    """
    url = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/{}'.format(pdb_id)
    response = requests.get(url, timeout=timeout)
    records = json.loads(response.text)[pdb_id]

    wanted_keys = ('title', 'experimental_method', 'number_of_entities', 'assemblies')
    collected = {
        key: [record.get(key, np.nan) for record in records]
        for key in wanted_keys
    }
    # experimental_method is exploded so that each method gets its own row.
    pdb_summary = pd.DataFrame(collected).explode('experimental_method')
    pdb_summary['pdb_id'] = pdb_id
    # print is used on purpose: this notebook silences the warnings module.
    if pdb_summary.shape[0] != 1:
        print('This pdb_id has multiple summary, need check.')
    return pdb_summary

In [4]:
def get_pdb_molecules(pdb_id, timeout=60):
    """Fetch per-entity molecule information of a PDB entry from the PDBe API.

    Sends a GET request to
    ``https://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/<pdb_id>`` and
    returns one row per (entity, chain) pair.

    Parameters
    ----------
    pdb_id : str
        PDB identifier, e.g. ``'6c9j'``. It must match the top-level key of
        the response exactly.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity_id``, ``molecule_type``, ``mutation_flag``,
        ``molecule_name``, ``raw_pdb_sequence`` (API field ``sequence``),
        ``real_pdb_sequence`` (API field ``pdb_sequence``), ``chain_id``
        (from ``in_chains``),
        ``pdb_sequence_indices_with_multiple_residues``, ``struct_asym_id``
        (from ``in_struct_asyms``), ``gene_name``, ``chem_comp_ids`` and
        ``pdb_id``. Fields missing from an entity are filled with ``NaN``.
        The index is not reset after the explode.

    Raises
    ------
    KeyError
        If the response has no entry for ``pdb_id``.
    """
    url = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/{}'.format(pdb_id)
    response = requests.get(url, timeout=timeout)
    entities = json.loads(response.text)[pdb_id]

    # Note: the selected fields were chosen with polypeptide chains in mind.
    # Information for small molecules and nucleic acids is incomplete because
    # no example was available to decide which fields to extract.
    wanted_keys = ('entity_id', 'molecule_type', 'mutation_flag', 'molecule_name',
                   'sequence', 'pdb_sequence', 'in_chains',
                   'pdb_sequence_indices_with_multiple_residues', 'in_struct_asyms',
                   'gene_name', 'chem_comp_ids')
    collected = {
        key: [entity.get(key, np.nan) for entity in entities]
        for key in wanted_keys
    }
    pdb_molecule = pd.DataFrame(collected).rename(
        columns={'sequence': 'raw_pdb_sequence', 'pdb_sequence': 'real_pdb_sequence'})
    pdb_molecule['pdb_id'] = pdb_id
    # The two list columns are exploded together, pairing the i-th chain with
    # the i-th struct_asym.
    # NOTE(review): pandas requires both lists to have the same length in
    # every row - confirm this holds for non-polymer entities.
    pdb_molecules = pdb_molecule.explode(['in_chains', 'in_struct_asyms']).rename(
        columns={'in_chains': 'chain_id', 'in_struct_asyms': 'struct_asym_id'})
    return pdb_molecules

In [5]:
def get_pdb_residues(pdb_id, timeout=60):
    """Fetch the residue listing of every chain of a PDB entry from the PDBe API.

    Sends a GET request to
    ``https://www.ebi.ac.uk/pdbe/api/pdb/entry/residue_listing/<pdb_id>`` and
    stacks the ``residues`` records of all chains of all molecules into one
    table.

    Parameters
    ----------
    pdb_id : str
        PDB identifier, e.g. ``'6c9j'``. It must match the top-level key of
        the response exactly.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per residue record, holding the fields of that record plus
        ``struct_asym_id`` and ``chain_id`` of the owning chain and a
        ``pdb_id`` column. The index runs from 0 to n-1.

    Raises
    ------
    KeyError
        If the response has no entry for ``pdb_id`` or no ``'molecules'`` key.
    ValueError
        Raised by ``pandas.concat`` when the entry lists no chains at all.
    """
    url = 'https://www.ebi.ac.uk/pdbe/api/pdb/entry/residue_listing/{}'.format(pdb_id)
    response = requests.get(url, timeout=timeout)
    molecules = json.loads(response.text)[pdb_id]['molecules']

    per_chain_frames = []
    for molecule in molecules:
        for chain in molecule['chains']:
            chain_frame = pd.DataFrame(chain['residues'])
            # Tag each residue row with the identifiers of its chain.
            chain_frame['struct_asym_id'] = chain['struct_asym_id']
            chain_frame['chain_id'] = chain['chain_id']
            per_chain_frames.append(chain_frame)
    # ignore_index=True renumbers the stacked rows 0..n-1 in one step.
    pdb_residues = pd.concat(per_chain_frames, axis=0, ignore_index=True)
    pdb_residues['pdb_id'] = pdb_id
    return pdb_residues