In [3]:
import os
import pandas as pd
import pickle
from ABDB import database as db

In [4]:
# Select the Chothia numbering scheme before any structure is fetched, and
# fail fast (with the offending value) if the database did not accept it.
db.set_numbering_scheme("chothia")

active_scheme = db.get_numbering_scheme()
if active_scheme != "chothia":
    raise ValueError(
        f"ABDB numbering scheme is {active_scheme!r}, expected 'chothia'"
    )

### Get current data

In [5]:
# Everything is resolved relative to the directory the notebook is run from.
parapred_dir = os.path.abspath("")
data_dir, precomputed_dir = (
    os.path.join(parapred_dir, subdir) for subdir in ("data", "precomputed")
)

# The "_BIG" splits are the ones in use; drop the suffix from the three file
# names below to switch back to dataset_train.csv / dataset_val.csv /
# dataset_test.csv.
train_csv, val_csv, test_csv = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        "dataset_train_BIG.csv",
        "dataset_val_BIG.csv",
        "dataset_test_BIG.csv",
    )
)

# Read each split, then stack them (row indices of the individual frames are
# kept as-is) so the pickle can be checked against every pdb at once.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_csv, val_csv, test_csv)
)
all_data_df = pd.concat([train_df, val_df, test_df])

# Location of the pickle holding the previously downloaded sequences.
seqs_pickle = os.path.join(precomputed_dir, "downloaded_seqs.p")

##### CSVs

In [6]:
# One pdb id per CSV row (train, val and test combined); then preview the frame.
train_val_test_pdbs = all_data_df.loc[:, "pdb"].tolist()
all_data_df.head(n=5)

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain
0,4aei,H,L,0,A
1,7mzg,H,L,0,A
2,6ddm,B,A,0,C
3,4tqe,H,L,0,A
4,6q0e,H,L,0,A


##### Pickle

In [7]:
# Load the existing sequence pickle, or start from an empty one when no file
# has been written yet so that the cells below still have `current_pickle`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load a pickle that this project produced itself.
if os.path.isfile(seqs_pickle):
    with open(seqs_pickle, "rb") as f:
        current_pickle = pickle.load(f)
else:
    print(f"No pickle found at {seqs_pickle}; starting from an empty one")
    current_pickle = {}

current_pickle_pdbs = list(current_pickle.keys())
# Preview the first entry; an empty pickle has nothing to preview.
pdb0 = current_pickle_pdbs[0] if current_pickle_pdbs else None

print(pdb0)
current_pickle.get(pdb0)

2hh0


{'H': {(4, ''): 'L',
  (5, ''): 'E',
  (6, ''): 'Q',
  (7, ''): 'S',
  (8, ''): 'G',
  (9, ''): 'A',
  (10, ''): 'E',
  (11, ''): 'L',
  (12, ''): 'V',
  (13, ''): 'K',
  (14, ''): 'P',
  (15, ''): 'G',
  (16, ''): 'A',
  (17, ''): 'S',
  (18, ''): 'V',
  (19, ''): 'K',
  (20, ''): 'L',
  (21, ''): 'S',
  (22, ''): 'C',
  (23, ''): 'T',
  (24, ''): 'A',
  (25, ''): 'S',
  (26, ''): 'G',
  (27, ''): 'F',
  (28, ''): 'N',
  (29, ''): 'I',
  (30, ''): 'E',
  (31, ''): 'D',
  (32, ''): 'S',
  (33, ''): 'Y',
  (34, ''): 'I',
  (35, ''): 'H',
  (36, ''): 'W',
  (37, ''): 'V',
  (38, ''): 'K',
  (39, ''): 'Q',
  (40, ''): 'R',
  (41, ''): 'P',
  (42, ''): 'E',
  (43, ''): 'Q',
  (44, ''): 'G',
  (45, ''): 'L',
  (46, ''): 'E',
  (47, ''): 'W',
  (48, ''): 'I',
  (49, ''): 'G',
  (50, ''): 'R',
  (51, ''): 'I',
  (52, ''): 'D',
  (52, 'A'): 'P',
  (53, ''): 'E',
  (54, ''): 'D',
  (55, ''): 'G',
  (56, ''): 'E',
  (57, ''): 'T',
  (58, ''): 'K',
  (59, ''): 'Y',
  (60, ''): 'A',
  (61, ''): 'P

### Compare the different data

In [8]:
# Refresh both id collections: a live view of the pickle's keys and the list
# of pdb ids taken row by row from the combined CSV frame.
train_val_test_pdbs = all_data_df["pdb"].tolist()
current_pickle_pdbs = current_pickle.keys()

print(
    f"Number of pdbs in pickle: \t{len(current_pickle_pdbs)}",
    f"Number of pdbs in csvs: \t{len(train_val_test_pdbs)}",
    sep="\n",
)

Number of pdbs in pickle: 	1053
Number of pdbs in csvs: 	1105


##### Check if all pdbs in my csv have entries in the pickle file

In [9]:
# Count CSV rows whose pdb id has no entry at all in the pickle; chain-level
# coverage is not looked at here.
csv_but_not_pickle_pdbs = list(
    filter(lambda pdb: pdb not in current_pickle_pdbs, train_val_test_pdbs)
)
print(f"Number of pdbs in csvs but not pickle: \t{len(csv_but_not_pickle_pdbs)}")

Number of pdbs in csvs but not pickle: 	497


##### Add new entries to pickle as needed

In [10]:
# Copy two levels deep (pdb -> chain -> numbering). A plain `.copy()` is
# shallow: the per-pdb chain dicts would be shared with `current_pickle`, so
# adding a chain to `new_pickle[pdb]` would silently alter the original that
# is later written out as the back-up. The innermost numbering dicts are only
# read, never modified, so they can safely stay shared.
new_pickle = {pdb: dict(chains) for pdb, chains in current_pickle.items()}

In [11]:
# For every CSV row make sure the pickle has an entry for the pdb and a
# numbering dict for both its heavy and its light chain.
for pdb, Hchain, Lchain in zip(
    all_data_df["pdb"], all_data_df["Hchain"], all_data_df["Lchain"]
):
    # Existing entry if there is one, otherwise a fresh empty dict is stored.
    chains_for_pdb = new_pickle.setdefault(pdb, {})

    missing_chains = [
        chain for chain in (Hchain, Lchain) if chain not in chains_for_pdb
    ]
    if not missing_chains:
        continue

    # Query the database once per structure rather than once per missing
    # chain; both chains come from the same fetched entry.
    numbering = db.fetch(pdb).get_numbering()
    for chain in missing_chains:
        chains_for_pdb[chain] = dict(numbering[chain][0][0])

##### Recheck - all pdbs should now have an entry in the new pickle

In [12]:
# Same comparison as before, now against the updated pickle: the number of
# CSV pdbs without a pickle entry is expected to be zero.
new_pickle_pdbs = new_pickle.keys()
print(
    f"Number of pdbs in new pickle: \t{len(new_pickle_pdbs)}",
    f"Number of pdbs in csvs: \t{len(train_val_test_pdbs)}",
    sep="\n",
)

csv_but_not_pickle_pdbs = list(
    filter(lambda pdb: pdb not in new_pickle_pdbs, train_val_test_pdbs)
)
print(f"\nNumber of pdbs in csvs but not pickle: \t{len(csv_but_not_pickle_pdbs)}")

Number of pdbs in new pickle: 	1550
Number of pdbs in csvs: 	1105

Number of pdbs in csvs but not pickle: 	0


### Create back-up of original pickle and overwrite previous one with new version

In [13]:
# Write the contents of `current_pickle` next to the original file under a
# ".bu" suffix before the original is overwritten. For this to be a true
# back-up, `current_pickle` must still be unmodified at this point.
back_up_pickle = os.path.join(precomputed_dir, "downloaded_seqs.p.bu")

with open(back_up_pickle, mode="wb") as f:
    pickle.dump(current_pickle, f)

In [14]:
# Replace the pickle on disk with the extended version built above.
with open(seqs_pickle, mode="wb") as f:
    pickle.dump(new_pickle, f)