In [1]:
!pwd

/mnt/ssd_elecom_black_c2c/ssd_elecom_black_c2c-script


In [2]:
import itertools
import time
from pathlib import Path
import re
from tqdm import tqdm
from tqdm.notebook import tqdm
import subprocess
from multiprocessing import Pool, cpu_count

In [3]:
def get_subdir_list(p_sub_list):
    """
    Collect the immediate children of every directory in ``p_sub_list``.

    param: p_sub_list: iterable of directory paths (objects providing
        ``iterdir()``, e.g. ``pathlib.Path``) whose children are listed.
    return: one flat list holding the entries yielded by ``iterdir()`` for
        each directory, grouped in the order the directories were given.
    """
    # List and flatten in a single pass. Flattening with sum(lists, [])
    # re-copies the accumulated list for every directory, which becomes
    # quadratic once hundreds of thousands of entries are involved.
    return [
        child
        for p_sub in tqdm(p_sub_list)  # progress bar over the directories
        for child in p_sub.iterdir()
    ]


# Root directory to scan, given relative to the working directory.
p = Path('../cif/')
# Entries directly under the root whose name is a single digit from 1 to 9.
p_sub_list = list(p.glob('[1-9]'))

In [4]:
# Apply get_subdir_list three times in a row: each pass lists the children
# of every path produced by the previous pass, starting from p_sub_list.
cif_path_list = p_sub_list
for _level in range(3):
    cif_path_list = get_subdir_list(cif_path_list)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/6927 [00:00<?, ?it/s]

In [5]:
def poscar_dir_filter(cif_file_path):
    """
    Tell whether a path is something other than a ``.cif`` file.

    param: cif_file_path: path (``pathlib.Path`` or ``str``) to test.
    return: ``False`` when the path ends with ``.cif``, ``True`` otherwise.
    """
    # Test the end of the path only: a plain substring test would also
    # reject every entry located below a directory whose name happens to
    # contain ".cif".
    return not str(cif_file_path).endswith('.cif')

# Keep only the entries accepted by poscar_dir_filter.
poscar_dir_list = list(filter(poscar_dir_filter, cif_path_list))

In [6]:
def iterdir_func(poscar_dir):
    """Return the entries directly inside ``poscar_dir`` as a list."""
    return [entry for entry in poscar_dir.iterdir()]
    
def flatten_func(list_2dim):
    """Flatten one level of nesting: an iterable of iterables becomes a flat list."""
    return [item for inner in list_2dim for item in inner]


# Parallelised listing of every directory in poscar_dir_list.
before = time.time()

# Leave one core free, but never ask for fewer than one worker:
# Pool(0) raises ValueError on a single-core machine.
n_workers = max(1, cpu_count() - 1)

# The context manager guarantees the pool is torn down even if listing
# fails, and a failure while creating the pool is no longer masked by
# cleanup code. A dedicated name is used so that `p` is not rebound.
with Pool(n_workers) as pool:
    # iterdir -- chunksize batches the many tiny tasks so that far fewer
    # messages are exchanged with the workers than with the default of 1.
    nnlist_poscar_path_list = list(
        tqdm(
            pool.imap(iterdir_func, poscar_dir_list, chunksize=256),
            total=len(poscar_dir_list),
        )
    )

# flatten the per-directory lists into one list of paths
nnlist_poscar_path_list = flatten_func(nnlist_poscar_path_list)

after = time.time()
print(f"it took {after - before}sec.")

  0%|          | 0/507142 [00:00<?, ?it/s]

it took 959.1611270904541sec.


In [7]:
# Sanity check: report how many entries were collected in total.
print("len(nnlist_poscar_path_list) -> {}".format(len(nnlist_poscar_path_list)))

len(nnlist_poscar_path_list) -> 616651


#### .nnlistのパスだけのリストを抽出し，.npyで保存

In [97]:
def nnlist_filter(nnlist_poscar_path):
    """
    Tell whether a path points at a ``.nnlist`` file.

    param: nnlist_poscar_path: path (``pathlib.Path`` or ``str``) to test.
    return: ``True`` when the path ends with ``.nnlist``, ``False`` otherwise.
    """
    # Test the end of the path only: a plain substring test would also
    # accept every entry located below a directory whose name happens to
    # contain ".nnlist".
    return str(nnlist_poscar_path).endswith('.nnlist')

In [98]:
# Keep only the entries accepted by nnlist_filter.
nnlist_path_list = list(filter(nnlist_filter, nnlist_poscar_path_list))

In [99]:
# Display the first filtered entry as a quick look at the path format.
nnlist_path_list[0]

PosixPath('../cif/1/00/00/1000000/POSCAR.nnlist')

In [109]:
# Sanity check: report the number of filtered paths and the distinct
# 7-character endings found among them.
print("len(nnlist_path_list) -> {}".format(len(nnlist_path_list)))
print("check nnlist_path_list's last 7 string and unique it -> {}".format({str(path)[-7:] for path in nnlist_path_list}))

len(nnlist_path_list) -> 308325
check nnlist_path_list's last 7 string and unique it -> {'.nnlist'}


In [27]:
# Write nnlist_path_list to 'nnlist_path_list.npy'.
# NOTE(review): the list appears to hold Path objects, so NumPy presumably
# stores it as a pickled object array; the load below passes
# allow_pickle=True accordingly -- confirm before sharing the file.
import numpy as np
np.save(file='nnlist_path_list.npy', arr=nnlist_path_list)

In [29]:
# Read 'nnlist_path_list.npy' back from disk.
# allow_pickle=True lets NumPy unpickle stored objects; only use it on
# files from a trusted source.
nnlist_path_list_loaded = np.load(file='nnlist_path_list.npy', allow_pickle=True)

#### POSCARのパスだけのリストを抽出し，.npyで保存

In [88]:
import re
def poscar_filter(nnlist_poscar_path):
    """
    Search the path text for a final ``/POSCAR`` component.

    Returns the ``re.Match`` object when the string form of the path ends
    with ``/POSCAR`` preceded by at least one character, otherwise ``None``.
    """
    return re.search('.+/POSCAR$', str(nnlist_poscar_path))

In [91]:
# Keep only the entries for which poscar_filter finds a match.
poscar_path_list = list(filter(poscar_filter, nnlist_poscar_path_list))

In [110]:
# Sanity check: report the number of filtered paths and the distinct
# 6-character endings found among them.
print("len(poscar_path_list) -> {}".format(len(poscar_path_list)))
print("check poscar_path_list's last 6 string and unique it -> {}".format({str(path)[-6:] for path in poscar_path_list}))

len(poscar_path_list) -> 308325
check poscar_path_list's last 6 string and unique it -> {'POSCAR'}


In [93]:
# Write poscar_path_list to 'poscar_path_list.npy'.
np.save(file='poscar_path_list.npy', arr=poscar_path_list)

In [94]:
# Read 'poscar_path_list.npy' back from disk and display the result.
# allow_pickle=True lets NumPy unpickle stored objects; only use it on
# files from a trusted source.
poscar_path_list_loaded = np.load(file='poscar_path_list.npy', allow_pickle=True)
poscar_path_list_loaded

array([PosixPath('../cif/1/00/00/1000000/POSCAR'),
       PosixPath('../cif/1/00/00/1000001/POSCAR'),
       PosixPath('../cif/1/00/00/1000002/POSCAR'), ...,
       PosixPath('../cif/9/01/78/9017895/POSCAR'),
       PosixPath('../cif/9/01/79/9017907/POSCAR'),
       PosixPath('../cif/9/01/79/9017909/POSCAR')], dtype=object)

In [None]:
# # 並列化
# try:
#     p = Pool(cpu_count() - 1)
#     poscar_nnlist_path_list = list(tqdm(p.imap(get_subdir_list, poscar_dir_list[0:100]), total=len(poscar_dir_list[])))

# finally:
#     p.close()
#     p.join()