Code to retrieve multi-domain proteins for analysis of domain to domain predicted aligned error.

In [1]:
import requests
import pandas as pd
import numpy as np
import random
import csv

In [2]:
'''
Build the PDB Search API request
'''

# Set the search parameters
url = 'https://search.rcsb.org/rcsbsearch/v2/query'
query = {
  "query": {
    "type": "group",
    "nodes": [
      {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
          {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "attribute": "rcsb_polymer_instance_annotation.annotation_lineage.id",
              "operator": "exact_match",
              "value": "56572",
              "negation": False
            }
          },
          {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "attribute": "rcsb_polymer_instance_annotation.type",
              "operator": "exact_match",
              "value": "SCOP",
              "negation": False
            }
          }
        ],
        "label": "nested-attribute"
      },
    ],
    "logical_operator": "and",
    "label": "text"
  },
  "return_type": "polymer_entity",
  "request_options": {
    "group_by_return_type": "groups",
    "group_by": {
      "aggregation_method": "matching_uniprot_accession",
      "ranking_criteria_type": {
        "sort_by": "rcsb_entry_info.resolution_combined",
        "direction": "asc"
      }
    },
    "return_all_hits": True,
    "results_content_type": [
      "experimental"
    ],
    "sort": [
      {
        "sort_by": "score",
        "direction": "desc"
      },
      {
        "sort_by": "size",
        "direction": "desc"
      }
    ],
    "scoring_strategy": "combined"
  }
}

In [3]:
response = requests.post(url, json=query)

response_dic = response.json()

# Get the PDB IDs
group_set = response_dic['group_set']

# Get the list of uniprot ids
uniprots = [group_set[i]['identifier'] for i in range(len(group_set))]


# with open('./project_pipeline/data/multi_domain_uniprots.csv', 'w') as f:
#     writer = csv.writer(f)
#     writer.writerow(['uniprot'])
#     writer.writerows(uniprots)


In [4]:
def get_domains(uniprot_id):
    '''
    Get the domain information from UniProtKB
    '''
    print(f'Getting domains for {uniprot_id}')
    url = f'https://rest.uniprot.org/uniprotkb/search?query=accession:{uniprot_id}&fields=ft_domain'
    response = requests.get(url)
    response_dic = response.json()
    domains = []
    try:
        features = response_dic['results'][0]['features']
        # Get the start and end of any domains
        for i in range(len(features)):
            if response_dic['results'][0]['features'][0]['type'] == 'Domain':
                start = str(features[i]['location']['start']['value'])
                end = str(features[i]['location']['end']['value'])
                domains.append((start + '-' + end))

        domains_string = ','.join(domains)

    except KeyError:
        print(f'No domains found for {uniprot_id}')
        domains_string = None

    return domains_string

def single_domains(uniprots):
    '''
    Get the single domains from UniProtKB
    '''
    domains = {'uniprot': [], 'region': []}
    # Get domains for the uniprot ids
    for i in range(len(uniprots)):
        uniprot_id = uniprots[i]
        region = get_domains(uniprot_id)
        domains['uniprot'].append(uniprot_id)
        domains['region'].append(region)

    # Convert to pandas dataframe
    domains_df = pd.DataFrame.from_dict(domains, orient='columns')

    return domains_df

domains_df = single_domains(uniprots)

Getting domains for P03366
Getting domains for P33334
Getting domains for P04585
Getting domains for P12497
Getting domains for P04050
Getting domains for P08518
Getting domains for P03300
Getting domains for P03303
Getting domains for P03367
Getting domains for Q97W02
Getting domains for Q38087
Getting domains for P00811
Getting domains for P26663
Getting domains for Q8RQE9
Getting domains for Q8RQE8
Getting domains for P62593
Getting domains for P19821
Getting domains for Q99ZW2
Getting domains for Q5SLP7
Getting domains for P03313
Getting domains for Q9F663
Getting domains for P00636
Getting domains for P27958
Getting domains for P0AES6
Getting domains for O92972
Getting domains for Q12306
Getting domains for P09467
Getting domains for P01011
Getting domains for P01009
Getting domains for P03355
Getting domains for P14489
Getting domains for O94925
Getting domains for P21179
Getting domains for Q9UNA4
Getting domains for P0AD64
Getting domains for G3XD46
Getting domains for Q82122
G

In [17]:
domains_df = domains_df.drop(domains_df[domains_df['region'] == ''].index).reset_index(drop=True)
domains_df = domains_df.dropna().reset_index(drop=True)

# Remove any proteins with only one annotated domain
for i in range(len(domains_df)):
    region = domains_df.loc[i, 'region']
    count = region.count('-')
    if count <= 1:
        domains_df = domains_df.drop(i)

# remove any proteins that may overlap with my autoinhibited set
autoinhibited = pd.read_csv('./project_pipeline/data/classified_files_3.tsv', sep='\t').astype('object')
common = domains_df['uniprot'].isin(autoinhibited['uniprot'])
print(common)
domains_df = domains_df.drop(domains_df[common].index).reset_index(drop=True)

# Save the dataframe
domains_df.to_csv('./project_pipeline/data/multi_domain_domains.csv', index=False)

0      False
1      False
2      False
3      False
4      False
       ...  
132    False
133    False
134    False
135    False
136    False
Name: uniprot, Length: 137, dtype: bool


In [30]:
single_df = pd.read_csv('./project_pipeline/data/single_domain_domains.csv')
common2 = single_df['uniprot'].isin(autoinhibited['uniprot'])


KeyError: 'uniprot'