Skip to content

Commit

Permalink
drastically reduce the memory usage for pdb snapshots
Browse files Browse the repository at this point in the history
  • Loading branch information
keisuke-yanagisawa committed Jul 3, 2022
1 parent 3f95277 commit 2345ab6
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 23 deletions.
34 changes: 13 additions & 21 deletions script/resenv_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,28 +78,20 @@ def wrapper(model, dx, resn, threshold, lt, env_distance):
def resenv(grid, ipdb, resn, opdb,
           threshold=0.2, lt=False, env_distance=4, n_jobs=1):
    """Extract residue environments matching a density-grid criterion.

    For every model in every input PDB file, ``wrapper`` selects the
    residues named *resn* whose grid value passes *threshold*, together
    with their surroundings within *env_distance*; every match is
    appended to a single output PDB file.

    Parameters
    ----------
    grid : str
        Path to a density grid file readable by ``gridData.Grid``.
    ipdb : list of str
        Paths to the input (multi-model) PDB files.
    resn : str
        Residue name to search for.
    opdb : str
        Path of the output PDB file.
    threshold : float
        Grid-value threshold forwarded to ``wrapper``.
    lt : bool
        Presumably selects values *below* the threshold when True —
        TODO confirm against ``wrapper``.
    env_distance : float
        Radius of the environment kept around each matching residue.
    n_jobs : int
        Degree of parallelism for ``joblib.Parallel``.
    """
    dx = gridData.Grid(grid)

    # Stream models one at a time (uPDB.MultiModelPDBReader) and write
    # matches incrementally (uPDB.PDBIOhelper) so that neither the full
    # set of input models nor the whole output structure is ever held
    # in memory at once.
    out_helper = uPDB.PDBIOhelper(opdb)
    for path in ipdb:
        reader = uPDB.MultiModelPDBReader(path)

        lst_of_lst = Parallel(n_jobs=n_jobs)(
            delayed(wrapper)(model, dx, resn, threshold, lt, env_distance)
            for model in tqdm(reader, desc="[extract res. env.]",
                              disable=not (VERBOSE or DEBUG))
        )

        for lst in lst_of_lst:
            for struct in lst:
                out_helper.save(struct)

# Verbosity flags controlling the tqdm progress bars above; presumably
# assigned by the CLI entry point before resenv() runs — TODO confirm
# against the caller.
VERBOSE = None
DEBUG = None
Expand Down
90 changes: 88 additions & 2 deletions script/utilities/Bio/PDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"""
BioPythonの Bio.PDB モジュールに関係する関数群。
version: 1.2.0
last update: 14 Sep, 2021
version: 1.3.0
last update: 1 Jul, 2022
Authors: Keisuke Yanagisawa
"""
import gzip
Expand All @@ -14,6 +14,92 @@
import tempfile
import io

class MultiModelPDBReader(object):
    """Memory-efficient reader for multi-model PDB files.

    Instead of parsing every model into memory at once, this class
    records the byte offset of each ``MODEL`` record and parses one
    model at a time.  Supports the iterator protocol; ``get_model``
    also exists, but it performs a lazy sequential scan, so its
    worst-case cost is O(N), not O(1).
    """

    def _init_fileobj(self):
        """Rewind the file and locate the first ``MODEL`` record."""
        self.model_positions = []  # byte offsets of MODEL records found so far
        self.fileend = False
        self.file.seek(0)
        while True:
            line = self.file.readline()
            if line == "":
                # No MODEL record at all (empty file, or a PDB without
                # MODEL/ENDMDL sections).  Mark EOF instead of spinning
                # forever — the original loop never terminated here.
                self.fileend = True
                break
            if line.startswith(self.header):
                # tell() is just past the line we read; subtract the
                # line's byte length to get the record's first byte.
                self.model_positions.append(
                    self.file.tell() - len(line.encode())
                )
                break

    def __init__(self, file, header="MODEL"):
        """
        Parameters
        ----------
        file : str
            Path to a (possibly multi-model) PDB file.
        header : str
            Record name that starts a new model (default ``"MODEL"``).
        """
        self.file = open(file)
        self.header = header
        self._init_fileobj()

    def __del__(self):
        # self.file may not exist if open() raised inside __init__.
        f = getattr(self, "file", None)
        if f is not None:
            f.close()

    def get_model(self, idx):
        """Return the *idx*-th model (0-origin) as a parsed structure.

        *idx* is a positional index, not the MODEL serial number.
        Boundaries are discovered lazily by sequential scan (O(N)).

        Raises
        ------
        IndexError
            If *idx* is negative or beyond the last model in the file.
        """
        if idx < 0:
            raise IndexError(f"{self.header} index out of range")

        # Need offsets for models idx and idx+1 (start and end boundary).
        while len(self.model_positions) <= idx + 1:
            if self.fileend:
                raise IndexError(f"{self.header} index out of range")
            self.model_positions.append(self._next())

        # Copy only this model's bytes into a temp file and parse that.
        # NOTE(review): reopening NamedTemporaryFile by name is
        # POSIX-only; this would fail on Windows.
        with tempfile.NamedTemporaryFile("w") as f:
            n_bytes_to_be_read = (
                self.model_positions[idx + 1] - self.model_positions[idx]
            )
            self.file.seek(self.model_positions[idx])
            f.write(self.file.read(n_bytes_to_be_read))
            f.flush()
            return get_structure(f.name)

    def _next(self) -> int:
        """Advance past the last known boundary; return the next one.

        The returned offset is either the start of the next MODEL
        record or, at EOF, the end of the file — which serves as the
        end boundary of the final model.  Returns None if EOF was
        already reached.
        """
        if self.fileend:
            return None

        # Restart the scan from the most recently recorded boundary.
        self.file.seek(self.model_positions[-1])

        while True:
            line = self.file.readline()
            if line == "":
                self.fileend = True
                break
            elif line.startswith(self.header):
                cur = self.file.tell() - len(line.encode())
                if cur == self.model_positions[-1]:
                    # This is the boundary we started from; keep going.
                    continue
                self.file.seek(cur)
                break
        return self.file.tell()

    def __iter__(self):
        self._init_fileobj()
        return self

    def __next__(self):
        try:
            return self.get_model(len(self.model_positions) - 1)
        except IndexError:
            raise StopIteration

class PDBIOhelper():
"""
多数のmodelを単一のPDBに登録する時のヘルパクラスです。
Expand Down

0 comments on commit 2345ab6

Please sign in to comment.