In [84]:
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import requests
import warnings
from typing import Optional
import time

In [85]:
def request_limited(url: str,
                    rtype: str = "GET",
                    num_attempts: int = 3,
                    sleep_time=0.5,
                    **kwargs) -> "Optional[requests.models.Response]":
    """
    HTTP request with retries and rate-limiting based on the response code

    The request is sent once and then retried up to ``num_attempts`` more
    times. Rate-limit responses (429), server errors (5xx) and network-level
    failures are retried after a linearly growing pause. Other client errors
    (4xx) cannot succeed on retry, so they end the call immediately.

    Parameters
    ----------
    url : str
        The url for the request
    rtype : str
        The request type (one of ["GET", "POST"])
    num_attempts : int
        In case of a failed retrieval, the number of attempts to try again
        (so at most ``num_attempts + 1`` requests are sent)
    sleep_time : float
        The base amount of time (in seconds) to wait between requests, in
        case of API rate limits. The n-th retry waits ``n * sleep_time``.
    **kwargs : dict
        The keyword arguments to pass to the request. If no ``timeout`` is
        given, a default of 30 seconds is used so that a stalled connection
        cannot block forever.

    Returns
    -------

    response : requests.models.Response
        The server response object. Only returned if request was successful
        (status code 200), otherwise returns None.

    """

    if rtype not in ["GET", "POST"]:
        warnings.warn("Request type not recognized")
        return None

    # requests waits indefinitely by default; callers can still override
    # this (including with timeout=None) through **kwargs.
    kwargs.setdefault("timeout", 30)

    total_attempts = 0
    while total_attempts <= num_attempts:
        # Linear back-off: wait longer after each consecutive failure.
        curr_sleep = (1 + total_attempts) * sleep_time

        try:
            if rtype == "GET":
                response = requests.get(url, **kwargs)
            else:
                response = requests.post(url, **kwargs)
        except requests.exceptions.RequestException as err:
            # Connection errors, timeouts, etc. are treated like a
            # retryable server failure instead of crashing the caller.
            warnings.warn("Request failed (" + str(err) + "). Retrying")
        else:
            status = response.status_code
            if status == 200:
                return response

            if status == 429:
                warnings.warn("Too many requests, waiting " +
                              str(curr_sleep) + " s")
            elif 500 <= status < 600:
                warnings.warn("Server error encountered. Retrying")
            elif 400 <= status < 500:
                # e.g. 404 for an unknown entry: repeating the identical
                # request would only add load without changing the answer.
                warnings.warn("Client error " + str(status) +
                              " encountered. Not retrying")
                return None
            else:
                # Any other non-200 status is retried without a pause.
                curr_sleep = 0

        total_attempts += 1
        # No point in sleeping after the final attempt.
        if total_attempts <= num_attempts and curr_sleep > 0:
            time.sleep(curr_sleep)

    warnings.warn("Too many failures on requests. Exiting...")
    return None


def get_nonpolymer_bound_components(entry_id='7rfs', 
                                    url_root='https://data.rcsb.org/rest/v1/core/entry/'):
    """
    Look up the non-polymer component list reported for one entry

    Requests ``url_root + entry_id`` via ``request_limited``, parses the
    JSON body and returns the value stored under
    ``['pdbx_vrpt_summary']['restypes_notchecked_for_bond_angle_geometry']``.

    Parameters
    ----------
    entry_id : str
        The entry identifier appended to ``url_root``
    url_root : str
        The base url of the entry endpoint

    Returns
    -------

    ligands
        The value found under the keys above (presumably a list of ligand
        residue names -- verify against the API schema). None is returned,
        with a warning, if the request fails, the body is not valid JSON,
        or the keys are missing.

    """
    url = url_root + entry_id
    response = request_limited(url)
    if response is None or response.status_code != 200:
        warnings.warn("Retrieval failed, returning None")
        return None

    try:
        all_info = json.loads(response.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError; a malformed body is treated
        # as a failed retrieval rather than crashing the caller.
        warnings.warn("Retrieval failed, returning None")
        return None

    try:
        return all_info['pdbx_vrpt_summary']['restypes_notchecked_for_bond_angle_geometry']
    except (KeyError, TypeError):
        # KeyError: a key is absent; TypeError: an intermediate value is
        # not a mapping (e.g. null). Other exceptions are not swallowed.
        warnings.warn("No ligands found, returning None")
        return None

In [99]:
# Load the comma-separated PDB IDs that the rest of the notebook iterates over.
with open('./gett_pdb/pdbbid.csv', 'r') as f:
    text = f.read()
# Strip whitespace/newlines around each ID and drop empty fields (e.g. from a
# trailing comma or final newline); otherwise entries such as '1abc\n' or ''
# would be looked up as directory names and misclassified.
pdb_ids = [pdb_id.strip() for pdb_id in text.split(',') if pdb_id.strip()]

In [100]:
# Sort every PDB ID into exactly one of four buckets, depending on how much
# of its expected directory layout exists under ./gett_pdb:
#   ./gett_pdb/<id>                        missing -> no_ligand_stru_list
#   ./gett_pdb/<id>/<id>_mol2              missing -> no_pdb_file_list
#   ./gett_pdb/<id>/<id>_mol2/ligand.mol2  missing -> cavity_empty_list
#   all three present                              -> true_list
# NOTE(review): the bucket names do not obviously match the paths being
# checked -- confirm the intended mapping.
no_ligand_stru_list = []
no_pdb_file_list = []
cavity_empty_list = []
true_list = []
for pdb_id in pdb_ids:
    if not os.path.exists(f'./gett_pdb/{pdb_id}'):
        no_ligand_stru_list.append(pdb_id)
    elif not os.path.exists(f'./gett_pdb/{pdb_id}/{pdb_id}_mol2'):
        no_pdb_file_list.append(pdb_id)
    elif not os.path.exists(f'./gett_pdb/{pdb_id}/{pdb_id}_mol2/ligand.mol2'):
        cavity_empty_list.append(pdb_id)
    else:
        true_list.append(pdb_id)

In [101]:
# Save the IDs collected in no_ligand_stru_list, each followed by one space.
# NOTE(review): this file is space-separated while the other list files in
# this notebook are comma-separated -- confirm that is intended.
with open('./error_list/no_ligand_stru_list.txt', 'w') as out_file:
    out_file.writelines(pdb_id + ' ' for pdb_id in no_ligand_stru_list)

In [102]:
# Save the IDs collected in no_pdb_file_list, each followed by a comma
# (so the file ends with a trailing comma).
with open('./error_list/no_pdb_file.txt', 'w') as out_file:
    out_file.writelines(pdb_id + ',' for pdb_id in no_pdb_file_list)

In [106]:
# Save the IDs collected in cavity_empty_list, each followed by a comma
# (so the file ends with a trailing comma).
with open('./error_list/no_cavity_list.txt', 'w') as out_file:
    out_file.writelines(pdb_id + ',' for pdb_id in cavity_empty_list)

In [105]:
# Save the IDs collected in true_list (entries whose ligand.mol2 was found),
# each followed by a comma (so the file ends with a trailing comma).
with open('./error_list/true_pdb_id.txt', 'w') as out_file:
    out_file.writelines(pdb_id + ',' for pdb_id in true_list)