In [4]:
import requests
import pandas as pd
import numpy as np
import random

In [5]:
'''
Build the PDB Search API request
'''

# Endpoint that receives the JSON search request
url = 'https://search.rcsb.org/rcsbsearch/v2/query'

# Attributes of the polymer-instance annotation that every clause filters on
_LINEAGE_ID_ATTRIBUTE = "rcsb_polymer_instance_annotation.annotation_lineage.id"
_ANNOTATION_TYPE_ATTRIBUTE = "rcsb_polymer_instance_annotation.type"


def _exact_match_node(attribute, **parameters):
    '''
    Build one terminal text-service node doing an exact match on `attribute`.

    Extra keyword arguments (value / negation) are appended to the node's
    parameters in the order they are passed.
    '''
    return {
        "type": "terminal",
        "service": "text",
        "parameters": {
            "attribute": attribute,
            "operator": "exact_match",
            **parameters,
        },
    }


def _scop_lineage_group(**lineage_parameters):
    '''
    Build a nested-attribute group: a lineage-id clause AND-ed with a clause
    requiring the annotation type to be SCOP.
    '''
    return {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            _exact_match_node(_LINEAGE_ID_ATTRIBUTE, **lineage_parameters),
            _exact_match_node(_ANNOTATION_TYPE_ATTRIBUTE, value="SCOP", negation=False),
        ],
        "label": "nested-attribute",
    }


# Set the search parameters
query = {
    "query": {
        "type": "group",
        "nodes": [
            # Must carry SCOP lineage id 56992 ...
            _scop_lineage_group(value="56992", negation=False),
            # ... AND-ed with a negated exact match on SCOP lineage id 144255
            _scop_lineage_group(negation=True, value="144255"),
        ],
        "logical_operator": "and",
        "label": "text",
    },
    "return_type": "polymer_entity",
    "request_options": {
        # Return groups (one per matching UniProt accession) instead of
        # individual polymer entities
        "group_by_return_type": "groups",
        "group_by": {
            "aggregation_method": "matching_uniprot_accession",
            "ranking_criteria_type": {
                "sort_by": "rcsb_entry_info.resolution_combined",
                "direction": "asc",
            },
        },
        "return_all_hits": True,
        "results_content_type": ["experimental"],
        # Sort by score first, then by size, both descending
        "sort": [
            {"sort_by": sort_key, "direction": "desc"}
            for sort_key in ("score", "size")
        ],
        "scoring_strategy": "combined",
    },
}

In [6]:
# Send the search request. The timeout keeps a stalled connection from
# hanging the notebook forever; raise_for_status() turns an HTTP error
# (malformed query, server failure) into an immediate, explicit exception
# instead of a confusing KeyError / JSON decode error further down.
response = requests.post(url, json=query, timeout=120)
response.raise_for_status()

# The RCSB Search API answers a valid query that has no hits with
# HTTP 204 and an empty body, which response.json() cannot parse.
if response.status_code == 204:
    response_dic = {'group_set': []}
else:
    response_dic = response.json()

# One group per UniProt accession (the query groups polymer entities by
# "matching_uniprot_accession")
group_set = response_dic['group_set']

# Get the list of uniprot ids (each group's identifier)
uniprots = [group['identifier'] for group in group_set]


# Optional: persist the accession list, one per row under a 'uniprot' header.
# pd.Series(uniprots, name='uniprot').to_csv(
#     './project_pipeline/data/single_domain_uniprots.csv', index=False)


In [9]:
def get_domains(uniprot_id):
    '''
    Get the domain information from UniProtKB

    Sends one search request to the UniProt REST API for the given
    accession, asking only for the ``ft_domain`` field, and joins the
    location of every feature whose type is 'Domain' into a single string.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession to look up.

    Returns
    -------
    str or None
        Comma-separated ``start-end`` ranges, e.g. '44-89,108-186'.
        '' when the entry was found but has no 'Domain' features.
        None when the response contains no usable entry (no results, or an
        expected key is missing).

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    '''
    print(f'Getting domains for {uniprot_id}')
    url = f'https://rest.uniprot.org/uniprotkb/search?query=accession:{uniprot_id}&fields=ft_domain'
    # Bound the wait so a single stalled request cannot hang a whole batch
    response = requests.get(url, timeout=30)
    response_dic = response.json()

    try:
        features = response_dic['results'][0]['features']
        # Get the start and end of any domains. The type check is made on
        # each feature individually, not only on the first one.
        domains = [
            f"{feature['location']['start']['value']}-{feature['location']['end']['value']}"
            for feature in features
            if feature['type'] == 'Domain'
        ]
        domains_string = ','.join(domains)

    except (KeyError, IndexError):
        # KeyError: an expected key is absent from the payload.
        # IndexError: 'results' is an empty list (accession not found).
        print(f'No domains found for {uniprot_id}')
        domains_string = None

    return domains_string

def single_domains(uniprots):
    '''
    Get the single domains from UniProtKB

    Calls get_domains() once per accession, in order, and tabulates the
    results.

    Parameters
    ----------
    uniprots : list of str
        UniProt accessions to look up.

    Returns
    -------
    pandas.DataFrame
        One row per accession with columns 'uniprot' (the accession) and
        'region' (whatever get_domains() returned for it).
    '''
    accessions = list(uniprots)
    # Get domains for the uniprot ids
    regions = [get_domains(accession) for accession in accessions]

    # Convert to pandas dataframe
    return pd.DataFrame({'uniprot': accessions, 'region': regions})

# Look up the domain ranges for every accession returned by the search.
# single_domains() calls get_domains() per accession, i.e. one HTTP request each.
domains_df = single_domains(uniprots)

Getting domains for P0DTD1
Getting domains for P00734
Getting domains for P01308
Getting domains for P03366
Getting domains for P03372
Getting domains for P04585
Getting domains for P12497
Getting domains for P0A7N4
Getting domains for P0A7N9
Getting domains for P0DOY6
Getting domains for P0A7Q6
Getting domains for P00742
Getting domains for P00533
Getting domains for Q9UIF8
Getting domains for P40422
Getting domains for P08709
Getting domains for P05067
Getting domains for Q32ZE1
Getting domains for P00749
Getting domains for P27999
Getting domains for P03069
Getting domains for Q00987
Getting domains for P0C6X7
Getting domains for P00974
Getting domains for Q92793
Getting domains for P08581
Getting domains for P09945
Getting domains for P41182
Getting domains for P01024
Getting domains for P19793
Getting domains for P05106
Getting domains for P01050
Getting domains for Q8NBP7
Getting domains for P00428
Getting domains for P01315
Getting domains for P06213
Getting domains for P01317
G

In [15]:
# Keep only proteins with at least one annotated domain.
# get_domains() yields '' when an entry has no domains and None when the
# lookup itself failed; both must go, otherwise failed lookups are written
# to the CSV as rows with a blank region.
has_region = domains_df['region'].notna() & (domains_df['region'] != '')
domains_df = domains_df.loc[has_region].copy()
print(domains_df.head())
# Save the dataframe
domains_df.to_csv('./project_pipeline/data/single_domain_domains.csv', index=False)

  uniprot                                             region
0  P0DTD1  12-127,148-179,183-456,458-688,690-818,821-929...
1  P00734                      44-89,108-186,213-291,364-618
3  P03366                520-589,643-833,1033-1156,1213-1363
4  P03372                                            311-547
5  P04585                508-577,631-821,1021-1144,1201-1351
