In [7]:
import csv
import codecs
from tqdm import tqdm
from urllib.request import urlretrieve

In [8]:
rows = []
with codecs.open('../data/HomologyTAPE/pdb_chain_scop_uniprot.tsv', encoding='utf-8') as f:
    for row in csv.DictReader(f, skipinitialspace=True, delimiter='\t'):
        rows.append(row)
print(len(rows))
print(rows[0])

104840
{'PDB': '101m', 'CHAIN': 'A', 'SP_PRIMARY': 'P02185', 'SUNID': '15125', 'SCOP_ID': 'd101ma_'}


In [9]:
fold_dic = {}
for row in rows:
    fold_dic[row['SCOP_ID']] = {}
    fold_dic[row['SCOP_ID']]['PDB-chain'] = row['PDB'].upper() + '-' + row['CHAIN']
    fold_dic[row['SCOP_ID']]['uniprot'] = row['SP_PRIMARY']

In [10]:
fold_dic['d101ma_']

{'PDB-chain': '101M-A', 'uniprot': 'P02185'}

In [11]:
with open('../data/HomologyTAPE/test_fold.txt') as f:
    test_file_lines = f.readlines()
test_lines = {}
test_pdb = {}
failed_test_scop = []
for line in test_file_lines:
    data = line.split('\t')
    data = [d.strip() for d in data]
    test_lines[data[0]] = data[-1]
    try:
        if fold_dic[data[0]]['PDB-chain'] not in test_pdb.keys():
            test_pdb[fold_dic[data[0]]['PDB-chain']] = []
        test_pdb[fold_dic[data[0]]['PDB-chain']].append(data[-1])
    except:
        failed_test_scop.append(data[0])

In [12]:
with open('../data/HomologyTAPE/training.txt') as f:
    train_file_lines = f.readlines()
train_lines = {}
train_pdb = {}
failed_train_scop = []
for line in train_file_lines:
    data = line.split('\t')
    data = [d.strip() for d in data]
    train_lines[data[0]] = data[-1]
    try:
        if fold_dic[data[0]]['PDB-chain'] not in train_pdb.keys():
            train_pdb[fold_dic[data[0]]['PDB-chain']] = []
        train_pdb[fold_dic[data[0]]['PDB-chain']].append(data[-1])
    except:
        failed_train_scop.append(data[0])

In [13]:
with open('../data/HomologyTAPE/validation.txt') as f:
    val_file_lines = f.readlines()
val_lines = {}
val_pdb = {}
failed_val_scop = []
for line in val_file_lines:
    data = line.split('\t')
    data = [d.strip() for d in data]
    val_lines[data[0]] = data[-1]
    try:
        if fold_dic[data[0]]['PDB-chain'] not in val_pdb.keys():
            val_pdb[fold_dic[data[0]]['PDB-chain']] = []
        val_pdb[fold_dic[data[0]]['PDB-chain']].append(data[-1])
    except:
        failed_val_scop.append(data[0])

In [8]:
fold_dic['d1ysya1']

{'PDB-chain': '1YSY-A', 'uniprot': 'P0C6X7'}

In [9]:
for key, value in test_pdb.items():
    test_pdb[key] = list(set(value))
for key, value in train_pdb.items():
    train_pdb[key] = list(set(value))
for key, value in val_pdb.items():
    val_pdb[key] = list(set(value))

In [10]:
print(len(test_file_lines))
print(len(train_file_lines))
print(len(val_file_lines))
print(len(failed_test_scop))
print(len(failed_train_scop))
print(len(failed_val_scop))

718
12312
736
17
550
11


In [11]:
print(len(test_pdb))
print(len(train_pdb))
print(len(val_pdb))
print(len(test_pdb))

667
9952
706
667


In [12]:
all_pdb = list(test_pdb.keys()) + list(train_pdb.keys()) + list(val_pdb.keys())

In [13]:
all_pdb = [pdb[:4] for pdb in all_pdb]
print(len(all_pdb))

11325


In [14]:
import json
no_download = []
with open('../data/HomologyTAPE/all_download.json') as f:
    download_pdb = json.load(f)
with open('../data/HomologyTAPE/download_pdb.json') as f:
    download_fold = json.load(f)

In [15]:
download_fold = [i.upper() for i in download_fold]

In [15]:
for pdb in tqdm(all_pdb):
    if pdb not in download_pdb:
        no_download.append(pdb)

  2%|▏         | 171/11325 [00:00<00:32, 340.42it/s]

100%|██████████| 11325/11325 [00:48<00:00, 231.54it/s]


In [17]:
pdb_chains = list(test_pdb.keys()) + list(train_pdb.keys()) + list(val_pdb.keys())

In [1]:
with open('../data/EnzymeCommission/nrPDB-EC_annot_copy.tsv') as f:
    ec_lines = f.readlines()
with open('../data/GeneOntology/nrPDB-GO_annot.tsv') as f:
    go_lines = f.readlines()
download_chains = []
download_ec_chains = [ec.split('\t')[0] for ec in ec_lines[1:]]
download_go_chains = [ec.split('\t')[0] for ec in go_lines[12:]]
download_chains = download_ec_chains + download_go_chains

In [20]:
import os
download_fold = os.listdir('../data/HomologyTAPE/all/')
download_fold = [i.split('.')[0] for i in download_fold]

In [23]:
download_fold[0]

'2QN6'

In [24]:
download_chains = list(set(download_chains))
need_download = []
for pdb in tqdm(pdb_chains):
    if pdb not in download_chains and pdb[:4] not in download_pdb and pdb[:4] not in download_fold:
        need_download.append(pdb)

  0%|          | 19/11325 [00:00<01:01, 182.37it/s]

100%|██████████| 11325/11325 [01:03<00:00, 178.69it/s]


In [25]:
print(need_download)

[]


In [19]:
need_download = [pdb[:4].lower() for pdb in need_download]

In [36]:
len(list(set(need_download)))

9273

In [27]:
import os
downloaded = os.listdir('../data/HomologyTAPE/all')
downloaded = [pdb[:4].lower() for pdb in downloaded]

In [35]:
len(list(set(downloaded)))

6556

In [34]:
n_d = []
for pdb in need_download:
    if pdb not in downloaded:
        n_d.append(pdb)

In [32]:
n_d = list(set(n_d))
print(len(n_d))

2717


In [37]:
failed = 0
root_dir = '../data/HomologyTAPE/'
idx = 0
download_pdb = downloaded
failed_pdb = []
for pdbid in tqdm(n_d):
    
    # if idx == 10:
    #     break
    try:
        
        url = 'https://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/'+pdbid[1:3]+'/pdb' + pdbid + '.ent.gz'
        # print(url)
        urlretrieve(url, root_dir +'all/' + pdbid.upper() + '.pdb.gz')
        download_pdb.append(pdbid.lower())
    except:
        print("cannot download:", pdbid)
        failed += 1
        failed_pdb.append(pdbid.lower())
    idx += 1
with open('../data/HomologyTAPE/download_pdb.json', 'w') as f:
    json.dump(download_pdb, f)
with open('../data/HomologyTAPE/failed_pdb.json', 'w') as f:
    json.dump(failed_pdb, f)

  0%|          | 0/2916 [00:00<?, ?it/s]

100%|██████████| 2916/2916 [57:56<00:00,  1.19s/it] 


In [10]:
import h5py

In [15]:
f = h5py.File("../data/HomologyTAPE/d2gyco1.hdf5","r")
print(f.keys())

<KeysViewHDF5 ['amino_chains', 'amino_neighs', 'amino_neighs_hb', 'amino_neighs_sindices', 'amino_neighs_sindices_hb', 'amino_pos', 'amino_types', 'atom_amino_id', 'atom_chain_ids', 'atom_chain_names', 'atom_names', 'atom_pos', 'atom_residue_id', 'atom_residue_names', 'atom_types', 'cov_bond_list', 'cov_bond_list_hb', 'cov_bond_list_sindices', 'cov_bond_list_sindices_hb', 'pos_center']>


In [22]:
f['amino_chains'][()]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int64)