Skip to content

Commit

Permalink
drastically reduce the memory usage for pdb snapshots
Browse files Browse the repository at this point in the history
  • Loading branch information
keisuke-yanagisawa committed Jul 3, 2022
1 parent 3f95277 commit 2345ab6
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 23 deletions.
34 changes: 13 additions & 21 deletions script/resenv_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,28 +78,20 @@ def wrapper(model, dx, resn, threshold, lt, env_distance):
def resenv(grid, ipdb, resn, opdb,
           threshold=0.2, lt=False, env_distance=4, n_jobs=1):
    """Extract residue environments matching a density-grid criterion.

    For every model in every input PDB file, ``wrapper`` selects the
    residues named *resn* whose grid value passes *threshold*, together
    with their surroundings within *env_distance*; every match is
    appended to a single output PDB file.

    Parameters
    ----------
    grid : str
        Path to a density grid file readable by ``gridData.Grid``.
    ipdb : list of str
        Paths to the input (multi-model) PDB files.
    resn : str
        Residue name to search for.
    opdb : str
        Path of the output PDB file.
    threshold : float
        Grid-value threshold forwarded to ``wrapper``.
    lt : bool
        Presumably selects values *below* the threshold when True —
        TODO confirm against ``wrapper``.
    env_distance : float
        Radius of the environment kept around each matching residue.
    n_jobs : int
        Degree of parallelism for ``joblib.Parallel``.
    """
    dx = gridData.Grid(grid)

    # Stream models one at a time (uPDB.MultiModelPDBReader) and write
    # matches incrementally (uPDB.PDBIOhelper) so that neither the full
    # set of input models nor the whole output structure is ever held
    # in memory at once.
    out_helper = uPDB.PDBIOhelper(opdb)
    for path in ipdb:
        reader = uPDB.MultiModelPDBReader(path)

        lst_of_lst = Parallel(n_jobs=n_jobs)(
            delayed(wrapper)(model, dx, resn, threshold, lt, env_distance)
            for model in tqdm(reader, desc="[extract res. env.]",
                              disable=not (VERBOSE or DEBUG))
        )

        for lst in lst_of_lst:
            for struct in lst:
                out_helper.save(struct)

# Verbosity flags controlling the tqdm progress bars above; presumably
# assigned by the CLI entry point before resenv() runs — TODO confirm
# against the caller.
VERBOSE = None
DEBUG = None
Expand Down
90 changes: 88 additions & 2 deletions script/utilities/Bio/PDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"""
BioPythonの Bio.PDB モジュールに関係する関数群。
version: 1.2.0
last update: 14 Sep, 2021
version: 1.3.0
last update: 1 Jul, 2022
Authors: Keisuke Yanagisawa
"""
import gzip
Expand All @@ -14,6 +14,92 @@
import tempfile
import io

class MultiModelPDBReader(object):
    """Memory-efficient reader for multi-model PDB files.

    Instead of parsing every model into memory at once, this class
    records the byte offset of each ``MODEL`` record and parses one
    model at a time.  Supports the iterator protocol; ``get_model``
    also exists, but it performs a lazy sequential scan, so its
    worst-case cost is O(N), not O(1).
    """

    def _init_fileobj(self):
        """Rewind the file and locate the first ``MODEL`` record."""
        self.model_positions = []  # byte offsets of MODEL records found so far
        self.fileend = False
        self.file.seek(0)
        while True:
            line = self.file.readline()
            if line == "":
                # No MODEL record at all (empty file, or a PDB without
                # MODEL/ENDMDL sections).  Mark EOF instead of spinning
                # forever — the original loop never terminated here.
                self.fileend = True
                break
            if line.startswith(self.header):
                # tell() is just past the line we read; subtract the
                # line's byte length to get the record's first byte.
                self.model_positions.append(
                    self.file.tell() - len(line.encode())
                )
                break

    def __init__(self, file, header="MODEL"):
        """
        Parameters
        ----------
        file : str
            Path to a (possibly multi-model) PDB file.
        header : str
            Record name that starts a new model (default ``"MODEL"``).
        """
        self.file = open(file)
        self.header = header
        self._init_fileobj()

    def __del__(self):
        # self.file may not exist if open() raised inside __init__.
        f = getattr(self, "file", None)
        if f is not None:
            f.close()

    def get_model(self, idx):
        """Return the *idx*-th model (0-origin) as a parsed structure.

        *idx* is a positional index, not the MODEL serial number.
        Boundaries are discovered lazily by sequential scan (O(N)).

        Raises
        ------
        IndexError
            If *idx* is negative or beyond the last model in the file.
        """
        if idx < 0:
            raise IndexError(f"{self.header} index out of range")

        # Need offsets for models idx and idx+1 (start and end boundary).
        while len(self.model_positions) <= idx + 1:
            if self.fileend:
                raise IndexError(f"{self.header} index out of range")
            self.model_positions.append(self._next())

        # Copy only this model's bytes into a temp file and parse that.
        # NOTE(review): reopening NamedTemporaryFile by name is
        # POSIX-only; this would fail on Windows.
        with tempfile.NamedTemporaryFile("w") as f:
            n_bytes_to_be_read = (
                self.model_positions[idx + 1] - self.model_positions[idx]
            )
            self.file.seek(self.model_positions[idx])
            f.write(self.file.read(n_bytes_to_be_read))
            f.flush()
            return get_structure(f.name)

    def _next(self) -> int:
        """Advance past the last known boundary; return the next one.

        The returned offset is either the start of the next MODEL
        record or, at EOF, the end of the file — which serves as the
        end boundary of the final model.  Returns None if EOF was
        already reached.
        """
        if self.fileend:
            return None

        # Restart the scan from the most recently recorded boundary.
        self.file.seek(self.model_positions[-1])

        while True:
            line = self.file.readline()
            if line == "":
                self.fileend = True
                break
            elif line.startswith(self.header):
                cur = self.file.tell() - len(line.encode())
                if cur == self.model_positions[-1]:
                    # This is the boundary we started from; keep going.
                    continue
                self.file.seek(cur)
                break
        return self.file.tell()

    def __iter__(self):
        self._init_fileobj()
        return self

    def __next__(self):
        try:
            return self.get_model(len(self.model_positions) - 1)
        except IndexError:
            raise StopIteration

class PDBIOhelper():
"""
多数のmodelを単一のPDBに登録する時のヘルパクラスです。
Expand Down

0 comments on commit 2345ab6

Please sign in to comment.