In [4]:
import numpy as np
import os
import requests
import json
from tqdm import tqdm

path = os.getcwd()

In [5]:
def get_alphafold_db_pdb(protein_id: str, out_path: str) -> bool:

    """
    With the uniprot id, get the AF PDB from the DB.
    """

    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    requestURL = f"https://alphafold.ebi.ac.uk/files/AF-{protein_id}-F1-model_v4.pdb"
    r = requests.get(requestURL)

    if r.status_code == 200:
        with open(out_path, "wb") as f:
            f.write(r.content)
            return True
    else:
        return False

In [6]:
proteinIDs = []

with open("DeepTMHMM.partitions.json","r") as FileObj:
    CVs = json.loads(FileObj.read())
    for cv in CVs.keys():
        cvProteins = CVs[cv]
        for protein in cvProteins:
            proteinIDs += [protein["id"]]

In [7]:
print("Number of protein IDs in total: ", proteinIDs.__len__())
print("Number of unique protein IDs in total",np.unique(proteinIDs).__len__())

Number of protein IDs in total:  3576
Number of unique protein IDs in total 3576


In [65]:
for proteinID in tqdm(proteinIDs):
    get_alphafold_db_pdb(proteinID,path + "/AlphaFoldDB/" + proteinID+".pdb")

100%|██████████| 3576/3576 [30:07<00:00,  1.98it/s]


In [8]:
AlphaFoldResults = [f[:-4] for f in os.listdir(path + "/AlphaFoldDB") if (os.path.isfile(path + "/AlphaFoldDB/" + f) & f.__contains__(".pdb") )]
AlphaMissing = set(proteinIDs) - set(AlphaFoldResults)

print("Found", AlphaFoldResults.__len__(),"of",proteinIDs.__len__(),"proteins in AlphaFold DB")
print("Missing the following:")
for missing in AlphaMissing:
    print(missing)

Found 3544 of 3576 proteins in AlphaFold DB
Missing the following:
P36022
Q7TMY8
Q05470
P04875
Q8IZQ1
P98161
Q9VDW6
Q9VKA4
O83774
O83276
Q5VT06
Q9P2D1
Q01484
P29994
Q14315
P14217
F8VPN2
Q6KC79
Q61001
Q9UKN1
Q9U943
Q96T58
P69332
Q5I6C7
Q3KNY0
Q9VC56
Q96Q15
Q9SMH5
Q868Z9
Q14789
Q8WXX0
P0DTC2
