# Introduction

In this notebook, we document how to check if a new dataset is built with consistent parameters and how to lock the dataset in case we need to update it.


## Basic Checksum
The cell below checks whether a new dataset was built with parameters consistent with the standard dataset that is downloaded when you clone the GitHub repo. For example, suppose you are building a new set of landmarks and want to confirm that the same parameters were used as for the provided dataset. Follow these steps:
 
1. Move a known-good landmark file, e.g. `4f3t00000000.nucsite.landmark`, somewhere else, e.g. into a junk folder such as `/home/Downloads/`.
2. Build the new set together with 4f3t, so that the 4f3t landmark is regenerated alongside the new entries.
3. Because 4f3t is recorded in `../Database-PDB/CHECKSUM_LandmarkNucsiteIntegrity.pkl`, comparing the regenerated file against the recorded checksum tells you whether the same protocol was used to generate the new set.

The whole check takes more than 20 minutes to complete. Each check prints a list of discrepancies and also returns it. Refer to the script `commandChecksum.py` for details. Further remarks:
* Halo/Feature/Landmark contain extra files compared to Typi (class labels). These are entries that were rejected after the landmark building process (e.g. because of an artificial base): `['1a1v00000000', '1kdh00000000', '1m0600000000', '2f5500000000', '3af600000000', '3c5f00000001', '3f2100000000', '3f2200000000', '3f2300000000', '3iem00000000', '3vaf00000000', '3vaf00000001', '3vak00000000', '3vak00000001', '4r8i00000000', '4tu700000000', '4tu700000001', '4wb200000000', '4wb200000001', '4wb300000000', '4wb300000001', '6idg00000000', '6kcp00000000', '6kdi00000000', '6l9700000000', '6mdx00000000', '6mdx00000001', '6u6x00000000', '6u6x00000001', '6ycs00000000', '6ycs00000001']`



In [None]:
# Standard library
import glob
import multiprocessing
import os
import pickle
import sys  # was missing: sys.path.append below raised NameError on a fresh kernel

# Third-party
import numpy as np
import pandas as pd
import tqdm
from scipy import sparse

# Put the parent directory on the import path before importing the
# NucleicNet package (presumably the repository root when the notebook is
# run from its own folder -- confirm if the notebook is relocated).
sys.path.append('../')
from NucleicNet.DatasetBuilding.util import *
from NucleicNet.DatasetBuilding.commandChecksum import Checksum

# Run each integrity check in turn. As described in the introduction, every
# check prints its discrepancies and also returns them, so the results are
# kept in named variables for later inspection.
ChecksumC = Checksum()
Discrepancy_Typi = ChecksumC.CheckTypi()
Discrepancy_Halo = ChecksumC.CheckHalo()
Discrepancy_Apo = ChecksumC.CheckApo()
Discrepancy_Cleansed = ChecksumC.CheckCleansed()
Discrepancy_LandmarkNucsite = ChecksumC.CheckLandmarkNucsite()
Discrepancy_LandmarkFpocket = ChecksumC.CheckLandmarkFpocket()
# The feature check is left disabled, as in the original notebook.
#Discrepancy_Feature = ChecksumC.CheckFeature()



## ==========================================================================================
## :x: WARNING: DO NOT RUN ANY CELL BELOW THIS LINE UNLESS YOU WANT TO OVERWRITE THE STORED CHECKSUMS.
## ===========================================================================================

# Number of Datapoints

In [7]:
import glob
import pickle

import numpy as np
import tqdm
from scipy import sparse
# TODO load all typi and make a dictionary for it

# Entry ids are the part of each file name before the first dot.
typi_paths = sorted(glob.glob("../Database-PDB/typi/*.typi.npz"))
PdbidList = [path.split("/")[-1].split(".")[0] for path in typi_paths]

# Map every entry id to the number of rows in its typi matrix.
datasizedf = {}
for pdbid in tqdm.tqdm(PdbidList):
    # The .npz stores the CSR components; rebuild the matrix to read its shape.
    with np.load("../Database-PDB/typi/%s.typi.npz" %(pdbid)) as npz:
        typi = sparse.csr_matrix(
            (npz['data'], npz['indices'], npz['indptr']),
            shape=npz['shape'],
        )
    datasizedf[pdbid] = typi.shape[0]

# Overwrites the stored datapoint-count checksum.
with open("../Database-PDB/CHECKSUM_Datapoint.pkl", "wb") as fn:
    pickle.dump(datasizedf, fn, protocol=4)


100%|██████████| 12530/12530 [01:32<00:00, 135.51it/s]


# Checksum Typi

In [2]:
import glob
import pickle

import numpy as np
import tqdm
from scipy import sparse

# Entry ids are the part of each file name before the first dot.
typi_paths = sorted(glob.glob("../Database-PDB/typi/*.typi.npz"))
PdbidList = [path.split("/")[-1].split(".")[0] for path in typi_paths]

# Map every entry id to the per-column totals of its typi matrix.
typiintegrity = {}
for pdbid in tqdm.tqdm(PdbidList):
    with np.load("../Database-PDB/typi/%s.typi.npz" %(pdbid)) as npz:
        typi = sparse.csr_matrix(
            (npz['data'], npz['indices'], npz['indptr']),
            shape=npz['shape'],
        )
    # axis=0 collapses the rows, leaving one total per column. Per the
    # original note, row sums would be uninformative because a row almost
    # always sums to 1 when there is no ambiguity.
    typiintegrity[pdbid] = np.sum(typi, axis = 0)

# Overwrites the stored typi checksum.
with open("../Database-PDB/CHECKSUM_TypiIntegrity.pkl", "wb") as fn:
    pickle.dump(typiintegrity, fn, protocol=4)

100%|██████████| 12530/12530 [01:48<00:00, 115.46it/s]


# Checksum Halo


In [12]:
import glob
import os
import pickle  # imported up front: pickle.load is called inside the loop below

import numpy as np
import tqdm
from scipy import sparse
# TODO load all typi and make a dictionary for it
# TODO typi, halo, feature
# TODO Report as a list

# The glob matches both halo file types, so ids are de-duplicated with a set.
PdbidList = sorted(glob.glob("../Database-PDB/halo/*.halo*"))
PdbidList = sorted(set([i.split("/")[-1].split(".")[0] for i in PdbidList]))

# Map every entry id to a 6-element summary vector of its halo files.
Integrity_Halo = {}
for pdbid in tqdm.tqdm(PdbidList):
    xyz_path = "../Database-PDB/halo/%s.haloxyz" %(pdbid)
    tup_path = "../Database-PDB/halo/%s.halotup" %(pdbid)

    # File sizes in bytes act as a cheap content fingerprint.
    ContentCheckXyz = os.path.getsize(xyz_path)
    ContentCheckTup = os.path.getsize(tup_path)

    # The .halotup pickle unpacks into four objects; only the length of the
    # first and the sums of the other three are recorded.
    with open(tup_path, 'rb') as fn:
        halonum, e1, e2, e3 = pickle.load(fn)
    Integrity_Halo[pdbid] = np.array([halonum.shape[0], sum(e1), sum(e2), sum(e3), ContentCheckXyz, ContentCheckTup])

# Overwrites the stored halo checksum.
with open("../Database-PDB/CHECKSUM_HaloIntegrity.pkl", "wb") as fn:
    pickle.dump(Integrity_Halo, fn, protocol=4)

100%|██████████| 12561/12561 [01:13<00:00, 170.77it/s]


# Checksum Feature

In [17]:
import glob
import multiprocessing
import os
import pickle

import numpy as np
import tqdm
from scipy import sparse


def OOC_IntegrityFeature(pdbid):
    """Compute the per-column totals of one stored feature matrix.

    Parameters
    ----------
    pdbid : str
        File name (including extension) of an ``.npz`` archive inside
        ``../Database-PDB/feature/`` that holds the CSR components
        ``data``, ``indices``, ``indptr`` and ``shape``.

    Returns
    -------
    tuple
        ``(pdbid, column_sums)`` so that the results of ``pool.map`` can be
        turned into a dict directly.
    """
    with np.load("../Database-PDB/feature/%s" %(pdbid)) as f:
        feat = sparse.csr_matrix((f['data'], f['indices'], f['indptr']), shape= f['shape'])
    # axis=0 collapses the rows, leaving one total per column.
    Integrity_Feature_ = np.sum(feat, axis = 0)

    return (pdbid, Integrity_Feature_)


# Keys are the full file names here (no extension stripping).
PdbidList = sorted(glob.glob("../Database-PDB/feature/*.npz*"))
PdbidList = sorted(set([i.split("/")[-1] for i in PdbidList]))

# The pool is created only after the worker function exists: with the fork
# start method the workers are copies of this process taken at pool creation,
# so a function defined afterwards cannot be resolved by them. The context
# manager also shuts the workers down once map() has returned.
with multiprocessing.Pool(12) as pool:
    Integrity_Feature = pool.map(OOC_IntegrityFeature, PdbidList)
Integrity_Feature = dict(Integrity_Feature)

# Overwrites the stored feature checksum.
with open("../Database-PDB/CHECKSUM_FeatureIntegrity.pkl", "wb") as fn:
    pickle.dump(Integrity_Feature, fn, protocol=4)


'\nIntegrity_Feature = {}\nfor pdbid_i in tqdm.tqdm(range(len(PdbidList))):\n        pdbid = PdbidList[pdbid_i]\n        with np.load("../Database-PDB/feature/%s" %(pdbid)) as f:\n                feat = sparse.csr_matrix((f[\'data\'], f[\'indices\'], f[\'indptr\']), shape= f[\'shape\'])\n        Integrity_Feature[pdbid] = np.sum(feat, axis = 0) # TODO This should be done columnwise, because the row almost always sum to 1 when there is no ambiguity\n\n\nimport pickle\nwith open("../Database-PDB/CHECKSUM_FeatureIntegrity.pkl", "wb") as fn:\n    pickle.dump(Integrity_Feature, fn,protocol=4)\n'

# Checksum Landmarks

In [23]:
import glob
import os
import pickle

import numpy as np
import pandas as pd
import tqdm
from scipy import sparse


def _landmark_integrity(kind):
    """Summarise every ``*.<kind>.landmark`` file in the landmark folder.

    Parameters
    ----------
    kind : str
        Middle part of the file name, e.g. ``"nucsite"`` or ``"fpocket"``.

    Returns
    -------
    dict
        Maps each entry id (file name up to the first dot) to a 5-element
        array: sum of ``centroid_id``, number of rows, and the sums of
        ``x_coord``, ``y_coord`` and ``z_coord`` rounded to 3 decimals.
    """
    paths = sorted(glob.glob("../Database-PDB/landmark/*.%s.landmark" %(kind)))
    pdbids = sorted(set([p.split("/")[-1].split(".")[0] for p in paths]))
    integrity = {}
    for pdbid in tqdm.tqdm(pdbids):
        # The file size was computed here before but never stored; the dead
        # call is dropped and the stored 5-element vector is unchanged.
        df = pd.read_pickle("../Database-PDB/landmark/%s.%s.landmark" %(pdbid, kind))
        integrity[pdbid] = np.array([df['centroid_id'].sum() , df.shape[0], np.around(df['x_coord'].sum(),3), np.around(df['y_coord'].sum(),3), np.around(df['z_coord'].sum(),3)])
    return integrity


# Overwrites the stored nucsite landmark checksum.
Integrity_LandmarkNucsite = _landmark_integrity("nucsite")
with open("../Database-PDB/CHECKSUM_LandmarkNucsiteIntegrity.pkl", "wb") as fn:
    pickle.dump(Integrity_LandmarkNucsite, fn, protocol=4)

# NOTE Fpocket is stochastic and I cannot fix the random seed.
# Overwrites the stored fpocket landmark checksum.
Integrity_LandmarkFpocket = _landmark_integrity("fpocket")
with open("../Database-PDB/CHECKSUM_LandmarkFpocketIntegrity.pkl", "wb") as fn:
    pickle.dump(Integrity_LandmarkFpocket, fn, protocol=4)

100%|██████████| 12533/12533 [00:31<00:00, 403.23it/s]
100%|██████████| 12561/12561 [00:34<00:00, 361.27it/s]


# Checksum Cleansed

In [27]:
import glob
import os
import pickle

import numpy as np
import pandas as pd
import tqdm
from scipy import sparse

# Entry ids are the part of each file name before the first dot.
cleansed_paths = sorted(glob.glob("../Database-PDB/cleansed/*.pdb"))
PdbidList = sorted(set([path.split("/")[-1].split(".")[0] for path in cleansed_paths]))

# The checksum of a cleansed structure is simply its file size in bytes.
Integrity_Cleansed = {
    pdbid: os.path.getsize("../Database-PDB/cleansed/%s.pdb" %(pdbid))
    for pdbid in tqdm.tqdm(PdbidList)
}

# Overwrites the stored cleansed checksum.
with open("../Database-PDB/CHECKSUM_CleansedIntegrity.pkl", "wb") as fn:
    pickle.dump(Integrity_Cleansed, fn, protocol=4)

100%|██████████| 12563/12563 [00:00<00:00, 123044.72it/s]


# Checksum Apo

In [29]:
import glob
import os
import pickle

import numpy as np
import pandas as pd
import tqdm
from scipy import sparse

# Entry ids are the part of each file name before the first dot.
apo_paths = sorted(glob.glob("../Database-PDB/apo/*.pdb"))
PdbidList = sorted(set([path.split("/")[-1].split(".")[0] for path in apo_paths]))

# The checksum of an apo structure is simply its file size in bytes.
Integrity_Apo = {
    pdbid: os.path.getsize("../Database-PDB/apo/%s.pdb" %(pdbid))
    for pdbid in tqdm.tqdm(PdbidList)
}

# Overwrites the stored apo checksum.
with open("../Database-PDB/CHECKSUM_ApoIntegrity.pkl", "wb") as fn:
    pickle.dump(Integrity_Apo, fn, protocol=4)

100%|██████████| 12563/12563 [00:00<00:00, 120620.72it/s]
