In [29]:
import requests
import pandas as pd
import numpy as np
import random

In [30]:
# ---------------------------------------------------------------------------
# Build the PDB Search API request
#
# `url` is the RCSB search endpoint and `query` is the JSON payload posted to
# it. The payload combines two "nested-attribute" groups over SCOP polymer
# instance annotations: the first matches lineage id 56992, the second negates
# the match on lineage id 144255. Hits are returned as polymer entities grouped
# by matching UniProt accession.
# ---------------------------------------------------------------------------

# Set the search parameters
url = 'https://search.rcsb.org/rcsbsearch/v2/query'


def _exact_match_node(attribute, value, negation=False):
    """Return a terminal text-service node doing an exact match on `attribute`."""
    return {
        "type": "terminal",
        "service": "text",
        "parameters": {
            "attribute": attribute,
            "operator": "exact_match",
            "value": value,
            "negation": negation,
        },
    }


def _scop_lineage_group(lineage_id, negation=False):
    """Return a nested-attribute group pairing a lineage-id match with type == SCOP.

    `negation` applies only to the lineage-id node; the annotation-type node is
    always a positive match.
    """
    lineage_node = _exact_match_node(
        "rcsb_polymer_instance_annotation.annotation_lineage.id",
        lineage_id,
        negation,
    )
    type_node = _exact_match_node("rcsb_polymer_instance_annotation.type", "SCOP")
    return {
        "type": "group",
        "logical_operator": "and",
        "nodes": [lineage_node, type_node],
        "label": "nested-attribute",
    }


# How the service should group, rank and sort what it returns.
_request_options = {
    "group_by_return_type": "groups",
    "group_by": {
        "aggregation_method": "matching_uniprot_accession",
        "ranking_criteria_type": {
            "sort_by": "rcsb_entry_info.resolution_combined",
            "direction": "asc",
        },
    },
    "return_all_hits": True,
    "results_content_type": ["experimental"],
    "sort": [
        {"sort_by": "score", "direction": "desc"},
        {"sort_by": "size", "direction": "desc"},
    ],
    "scoring_strategy": "combined",
}

query = {
    "query": {
        "type": "group",
        "nodes": [
            _scop_lineage_group("56992"),
            _scop_lineage_group("144255", negation=True),
        ],
        "logical_operator": "and",
        "label": "text",
    },
    "return_type": "polymer_entity",
    "request_options": _request_options,
}

In [35]:
# Post the search query. The timeout stops a stalled connection from hanging
# the kernel indefinitely.
response = requests.post(url, json=query, timeout=60)

# Surface HTTP errors here, with the status code, instead of failing later
# with an opaque JSON decode error or KeyError.
response.raise_for_status()

# NOTE(review): the RCSB Search API documents HTTP 204 (empty body) for a
# query with no hits; treat that as an empty group set rather than letting
# response.json() fail on the missing body.
if response.status_code == 204:
    response_dic = {'group_set': []}
else:
    response_dic = response.json()

# Get the PDB IDs: map each group identifier (the groups are aggregated by
# UniProt accession in the request) to the identifiers of its hits.
group_set = response_dic['group_set']

results = {
    group['identifier']: [hit['identifier'] for hit in group['result_set']]
    for group in group_set
}

keys = list(results)

# Draw up to 100 groups at random without replacement. random.sample raises
# ValueError when asked for more items than exist, so cap the request at the
# number of groups actually returned.
# For a reproducible draw, call random.seed(<int>) before running this cell.
n_samples = min(100, len(keys))
if n_samples < 100:
    print(f'Only {len(keys)} groups returned; sampling all of them.')

indices = random.sample(range(len(keys)), n_samples)

uniprots = [keys[i] for i in indices]

# Optional export (requires `import csv`). Each accession is wrapped in its
# own one-element row; passing the bare strings to writerows would split every
# accession into one character per column.
# with open('./project_pipeline/data/single_domain_uniprots.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerow(['uniprot'])
#     writer.writerows([u] for u in uniprots)
