This notebook is for combining the protein lists. As I've already combined Jorge's and my files, I'm just going to add Dokyun's. 

In [13]:
import pandas as pd
import csv
import requests
import numpy as np

In [14]:
# Some helper functions
def query_rcsb(uniprot_id, url, timeout=30):
    """Ask the RCSB search service for polymer instances mapped to a UniProt accession.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession number to search for.
    url : str
        Endpoint the JSON query is POSTed to.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error. Without a timeout a stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    str or float
        On HTTP 200: every returned identifier followed by a single space,
        concatenated in response order (so a non-empty result ends with a
        trailing space and an empty result set gives ''). On any other
        status code: ``np.nan``; callers detect failure by checking for a
        float.
    """
    # Two exact-match terminals combined with AND: the accession itself, and
    # the requirement that the accession comes from the UniProt database.
    query_text = {
        "query": {
            "type": "group",
            "logical_operator": "and",
            "nodes": [
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "operator": "exact_match",
                        "value": uniprot_id,
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession"
                    }
                },
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {
                        "operator": "exact_match",
                        "value": "UniProt",
                        "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name"
                    }
                }
            ]
        },
        "request_options": {
            "return_all_hits": True
        },
        "return_type": "polymer_instance"
    }

    print("Querying RCSB PDB REST API for Uniprot_ID: %s" % uniprot_id)

    # `json=` makes requests serialise the body and set the JSON content type,
    # so no hand-built header is needed.
    response = requests.post(url, json=query_text, timeout=timeout)

    if response.status_code != 200:
        # NOTE(review): the RCSB Search API is documented to answer HTTP 204
        # when a query has no hits; that case lands here as well. Confirm that
        # reporting it as a failure (NaN) is the intended behaviour.
        print("Failed to retrieve results for Uniprot_ID: %s" % uniprot_id)
        return np.nan

    # Tolerate a missing/null result_set rather than raising KeyError/TypeError.
    hits = response.json().get('result_set') or []

    # Each identifier is followed by one space, matching the historical output
    # format that downstream code strips and splits.
    return ''.join(hit['identifier'] + ' ' for hit in hits)

def prune_extra_chains(pdb_ids_str):
    """Keep one chain per PDB entry from a whitespace-separated ID list.

    Parameters
    ----------
    pdb_ids_str : str
        Identifiers of the form ``ENTRY.CHAIN`` separated by whitespace,
        e.g. ``'1A2K.A 1A2K.B 4CJ0.A '``. Leading/trailing/repeated
        whitespace is ignored.

    Returns
    -------
    str
        One ``entry.CHAIN`` token per distinct entry, in order of first
        appearance, with the entry lower-cased and the first chain seen for
        that entry kept in full. Every token is preceded by a single space
        (so a non-empty result starts with ' '); empty input gives ''.

    Raises
    ------
    IndexError
        If a token contains no '.' separator.
    """
    # Maps lower-cased entry ID -> first chain label encountered for it.
    first_chain_by_entry = {}

    # str.split() with no argument discards empty tokens, so '' or runs of
    # spaces no longer produce a bogus '' identifier.
    for token in pdb_ids_str.split():
        parts = token.split('.')
        entry = parts[0].lower()
        chain = parts[1]

        # setdefault keeps the first chain and ignores later ones. The whole
        # label is stored, so multi-character chain IDs (e.g. 'AA') are not
        # cut down to their first character.
        first_chain_by_entry.setdefault(entry, chain)

    # Dicts preserve insertion order, so output follows first appearance.
    return ''.join(
        ' ' + entry + '.' + chain
        for entry, chain in first_chain_by_entry.items()
    )

In [15]:
# Retrieves PDB IDs for each protein in the dataframe in the form of ID.chain (e.g. 1A2K.A),
# drops proteins for which nothing was retrieved, and writes the result to CSV.

# Search endpoint; constant for every row, so defined once outside the loop.
url = 'https://search.rcsb.org/rcsbsearch/v2/query'

df = pd.read_csv("./Compiling_Protein_Lists/Final_compilation/no_autoinhibited_structures_dokyun_na.csv").astype('object')

# Create a column to store the PDB IDs for each protein
df['pdb'] = ''

# Row labels with no usable query result. They are dropped in a single call
# after the loop instead of rebuilding the frame once per failed row.
rows_without_pdb = []

for i in df.index:
    uniprot_id = df.loc[i, 'Uniprot ID']

    pdb_ids = query_rcsb(uniprot_id, url)

    # query_rcsb returns NaN (a float) on failure. A successful response with
    # zero hits comes back as an empty string; it is treated the same way so
    # that prune_extra_chains is never handed an empty ID list.
    # NOTE(review): confirm that dropping zero-hit rows (rather than keeping
    # them with an empty 'pdb' value) is the desired outcome.
    if not isinstance(pdb_ids, str) or not pdb_ids.strip():
        rows_without_pdb.append(i)
        continue

    pdb_ids_pruned = prune_extra_chains(pdb_ids)
    df.loc[i, 'pdb'] = pdb_ids_pruned.strip()

df = df.drop(index=rows_without_pdb).reset_index(drop=True)

# Create a number of pdbs column
df['number_of_pdbs'] = df['pdb'].apply(lambda x: len(x.split()))

# Save the dataframe to a csv file
df.to_csv("./curated_protein_lists/no_autoinhibited_structures_dokyun_pdb.csv", index=False)

Querying RCSB PDB REST API for Uniprot_ID: Q7XEK4
Failed to retrieve results for Uniprot_ID: Q7XEK4
Querying RCSB PDB REST API for Uniprot_ID: Q12774
Failed to retrieve results for Uniprot_ID: Q12774
Querying RCSB PDB REST API for Uniprot_ID: A0A044RE18
Failed to retrieve results for Uniprot_ID: A0A044RE18
Querying RCSB PDB REST API for Uniprot_ID: Q39253
Failed to retrieve results for Uniprot_ID: Q39253
Querying RCSB PDB REST API for Uniprot_ID: Q5VT25
Failed to retrieve results for Uniprot_ID: Q5VT25
Querying RCSB PDB REST API for Uniprot_ID: O15078
Failed to retrieve results for Uniprot_ID: O15078
Querying RCSB PDB REST API for Uniprot_ID: O15078
Failed to retrieve results for Uniprot_ID: O15078
Querying RCSB PDB REST API for Uniprot_ID: Q9JK25
Failed to retrieve results for Uniprot_ID: Q9JK25
Querying RCSB PDB REST API for Uniprot_ID: Q8K382
Failed to retrieve results for Uniprot_ID: Q8K382
Querying RCSB PDB REST API for Uniprot_ID: P48608
Failed to retrieve results for Uniprot_ID: