def resenv(grid, ipdb, resn, opdb, threshold=0.2, lt=False, env_distance=4, n_jobs=1):
    """
    Run ``wrapper`` on every model of every input PDB and save the results.

    Each input file is read lazily through ``uPDB.MultiModelPDBReader``;
    the per-model results of one file are collected by joblib and then
    written out through a single ``uPDB.PDBIOhelper`` before the next
    file is opened.

    Parameters
    ----------
    grid
        Passed to ``gridData.Grid`` to load the grid handed to ``wrapper``.
    ipdb
        Iterable of inputs, each given to ``uPDB.MultiModelPDBReader``.
    resn, threshold, lt, env_distance
        Forwarded unchanged to ``wrapper`` for every model.
    opdb
        Output destination given to ``uPDB.PDBIOhelper``.
    n_jobs
        Number of joblib workers used for the per-model calls.
    """
    density = gridData.Grid(grid)
    writer = uPDB.PDBIOhelper(opdb)

    for pdb_path in ipdb:
        # The progress bar wraps the lazy reader, so models are pulled
        # from disk only as joblib dispatches them.
        progress = tqdm(
            uPDB.MultiModelPDBReader(pdb_path),
            desc="[extract res. env.]",
            disable=not (VERBOSE or DEBUG),
        )
        jobs = (
            delayed(wrapper)(mdl, density, resn, threshold, lt, env_distance)
            for mdl in progress
        )

        # One iterable of structures comes back per model; write them in order.
        for extracted in Parallel(n_jobs=n_jobs)(jobs):
            for struct in extracted:
                writer.save(struct)
class MultiModelPDBReader(object):
    """
    Helper class for reading a PDB file that contains many models
    without loading the whole file into memory.

    It supports the iterator protocol and hands out one model at a time.
    ``get_model`` gives indexed access as well, but it works by scanning
    the file sequentially, so treat it as a convenience rather than a
    fast random-access API.

    The file is opened in binary mode so that every stored offset is an
    exact byte position; this keeps seeking and slicing correct whatever
    the encoding or newline convention of the file is.

    Attributes
    ----------
    file
        The open binary file handle (``None`` if opening failed).
    model_positions
        Byte offsets of the model starts discovered so far. Once the end
        of the file has been reached, the last entry is the end offset
        of the final model.
    fileend
        True once a scan has hit the end of the file.
    header
        Prefix that marks the first line of a model.
    """

    def __init__(self, file, header="MODEL"):
        # Define every attribute before open() so that close()/__del__
        # stay safe even when the file cannot be opened.
        self.file = None
        self.model_positions = []
        self.fileend = False
        self.header = header
        self.file = open(file, "rb")
        self._init_fileobj()

    def _header_bytes(self):
        """Return the header prefix as bytes for matching binary lines."""
        marker = self.header
        return marker if isinstance(marker, bytes) else marker.encode()

    def _init_fileobj(self):
        """
        Reset the index and locate the start of the first model.

        If no line starts with the header, the index stays empty and
        ``fileend`` is set, so the reader simply yields no models
        instead of scanning past the end of the file forever.
        """
        self.model_positions = []
        self.fileend = False
        self.file.seek(0)

        marker = self._header_bytes()
        offset = 0
        for line in iter(self.file.readline, b""):
            if line.startswith(marker):
                self.model_positions.append(offset)
                return
            offset += len(line)
        # Reached EOF without seeing a single header line.
        self.fileend = True

    def close(self):
        """Close the underlying file. Safe to call more than once."""
        handle = getattr(self, "file", None)
        if handle is not None:
            handle.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        self.close()

    def _next(self):
        """
        Return the byte offset where the model after the last indexed
        one starts.

        When no further header line exists, ``fileend`` is set and the
        end-of-file offset is returned, which serves as the end boundary
        of the final model. Returns None if the end of the file had
        already been reached before the call.
        """
        if self.fileend:
            return None

        start = self.model_positions[-1]
        self.file.seek(start)

        marker = self._header_bytes()
        offset = start
        for line in iter(self.file.readline, b""):
            # The line at `start` is the header of the current model
            # itself, so only a header found further on counts.
            if offset != start and line.startswith(marker):
                return offset
            offset += len(line)

        self.fileend = True
        return offset

    def _ensure_indexed(self, idx):
        """
        Extend ``model_positions`` until both boundaries of model
        ``idx`` are known.

        Raises
        ------
        IndexError
            If ``idx`` is negative or the file holds no such model.
        """
        if idx < 0:
            raise IndexError(f"{self.header} index out of range")

        while len(self.model_positions) <= idx + 1:
            if self.fileend:
                raise IndexError(f"{self.header} index out of range")
            self.model_positions.append(self._next())

    def _read_model_bytes(self, idx):
        """
        Return the raw bytes of the ``idx``-th model (0-origin).

        Raises IndexError when the model does not exist.
        """
        self._ensure_indexed(idx)
        begin = self.model_positions[idx]
        end = self.model_positions[idx + 1]
        self.file.seek(begin)
        return self.file.read(end - begin)

    def _parse(self, raw):
        """Write one model to a temporary file and parse it with get_structure."""
        # get_structure is called with a file name, hence the temp file.
        with tempfile.NamedTemporaryFile("wb") as f:
            f.write(raw)
            f.flush()
            return get_structure(f.name)

    def get_model(self, idx):
        """
        Get a model by 0-origin position.

        Note that ``idx`` is the position within the file, not the
        MODEL ID. Models not indexed yet are found by a sequential
        scan, so the cost is O(N) rather than O(1).

        Raises
        ------
        IndexError
            If ``idx`` is negative or beyond the last model.
        """
        return self._parse(self._read_model_bytes(idx))

    def __iter__(self):
        # Restart from the first model every time iteration begins.
        self._init_fileobj()
        return self

    def __next__(self):
        # The next model to hand out is the last one whose start is
        # known but whose end has not been indexed yet.
        idx = len(self.model_positions) - 1
        try:
            raw = self._read_model_bytes(idx)
        except IndexError:
            raise StopIteration from None
        # Parsing happens outside the try block so that an IndexError
        # raised by the parser is not mistaken for the end of iteration.
        return self._parse(raw)