diff --git a/intel_pytorch_extension_py/launch.py b/intel_pytorch_extension_py/launch.py
index c61802152..da10907bf 100644
--- a/intel_pytorch_extension_py/launch.py
+++ b/intel_pytorch_extension_py/launch.py
@@ -8,6 +8,7 @@ import glob
 import numpy as np
 from argparse import ArgumentParser, REMAINDER
 
+THP_LOC = "/sys/kernel/mm/transparent_hugepage/enabled"
 r"""
 This is a script for launching PyTorch training and inference on Intel Xeon CPU with optimal configurations.
 
@@ -91,6 +92,7 @@
 *** Memory allocator ***
 
 "--enable_tcmalloc" and "--enable_jemalloc" can be used to enable different memory allcator.
+"--set_thp" can be used to set the Transparent HugePages mode (always, madvise or never).
 
 """
 
@@ -212,15 +214,21 @@ def add_lib_preload(lib_type=None):
                      "/usr/local/lib64/", "/usr/lib/", "/usr/lib64/"]
     lib_find = False
     for lib_path in library_paths:
-        library_file = lib_path + "lib" + lib_type + ".so"
-        matches = glob.glob(library_file)
-        if len(matches) > 0:
-            if "LD_PRELOAD" in os.environ:
-                os.environ["LD_PRELOAD"] = matches[0] + ":" + os.environ["LD_PRELOAD"]
-            else:
-                os.environ["LD_PRELOAD"] = matches[0]
-            lib_find = True
-            break
+        # lib_type may be a comma-separated list of candidate library names.
+        for each_lib_type in lib_type.split(","):
+            library_file = lib_path + "lib" + each_lib_type + ".so"
+            matches = glob.glob(library_file)
+            if len(matches) > 0:
+                if "LD_PRELOAD" in os.environ:
+                    os.environ["LD_PRELOAD"] = matches[0] + ":" + os.environ["LD_PRELOAD"]
+                else:
+                    os.environ["LD_PRELOAD"] = matches[0]
+                lib_find = True
+                break
+        if lib_find:
+            # The break above only leaves the inner loop; stop scanning the
+            # remaining directories so the library is preloaded only once.
+            break
 
     if not lib_find:
         # Unable to find the TCMalloc library file
@@ -228,6 +236,35 @@ def add_lib_preload(lib_type=None):
                        " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or "
                        "~/.local/lib/ so the LD_PRELOAD environment variable will not be set."
                        .format(lib_type, lib_type, expanduser("~")))
+
+def get_thp():
+    """Return the active Transparent HugePages mode, or None if unavailable.
+
+    The active mode is the bracketed entry in THP_LOC, e.g.
+    "always [madvise] never".
+    """
+    if os.path.exists(THP_LOC):
+        with open(THP_LOC) as f:
+            # Text between the first "[" and the following "]"; empty when
+            # the file has no bracketed entry, which then yields None below.
+            val = f.read().partition("[")[2].partition("]")[0].strip()
+            if val in ['always', 'never', 'madvise']:
+                return val
+    return None
+
+def set_thp(arg):
+    """Write the Transparent HugePages mode `arg` to THP_LOC.
+
+    Does nothing when `arg` is empty or None; prints a warning when THP_LOC
+    does not exist.
+    """
+    if not arg:
+        return
+    if os.path.exists(THP_LOC):
+        os.system("sync;echo 3 > /proc/sys/vm/drop_caches")
+        os.system(f"echo {arg} > {THP_LOC}")
+    else:
+        print("Warning: Unable to enable Transparent HugePages.")
 
 
 def set_multi_thread_and_allcator(args):
@@ -254,7 +291,13 @@ def set_multi_thread_and_allcator(args):
     if args.enable_jemalloc:
         add_lib_preload(lib_type="jemalloc")
     if args.enable_iomp:
-        add_lib_preload(lib_type="iomp")
+        add_lib_preload(lib_type="iomp,iomp5")
+    if args.set_thp:
+        set_thp(args.set_thp)
+        # os.environ only accepts str; get_thp() is None when THP is unavailable.
+        thp_now = get_thp()
+        if thp_now is not None:
+            os.environ["LAUNCH_THP_SET"] = thp_now
 
 def launch(args):
     '''
@@ -288,34 +331,43 @@ def launch(args):
 
     if args.ncore_per_instance == -1:
         args.ncore_per_instance = len(cores) // args.ninstances
+    # Remember the current THP mode so it can be restored after the run.
+    # os.environ only accepts str, so skip the export when THP is unavailable.
+    thp_old = get_thp()
+    if thp_old is not None:
+        os.environ["LAUNCH_THP_OLD"] = thp_old
     set_multi_thread_and_allcator(args)
+    os.environ["LAUNCH_CMD"] = "#"
     for i in range(args.ninstances):
         cmd = []
         cur_process_cores = ""
-        if args.disable_numactl:
-            cmd.append(args.program)
-            cmd.extend(args.program_args)
-        else:
+        if not args.disable_numactl:
             cmd = ["numactl"]
             for core in cores[i * args.ncore_per_instance:(i + 1) * args.ncore_per_instance]:
                 cur_process_cores = cur_process_cores + str(core) + ","
             numa_params = "-C {} ".format(cur_process_cores[:-1])
             cmd.extend(numa_params.split())
-            with_python = not args.no_python
-            if with_python:
-                cmd.append(sys.executable)
-            if args.module:
-                cmd.append("-m")
-            cmd.append(args.program)
-            cmd.extend(args.program_args)
+        with_python = not args.no_python
+        if with_python:
+            cmd.append(sys.executable)
+        if args.module:
+            cmd.append("-m")
+        cmd.append(args.program)
+        cmd.extend(args.program_args)
+        os.environ["LAUNCH_CMD"] += " ".join(cmd) + ",#"
         process = subprocess.Popen(cmd, env=os.environ)
         processes.append(process)
+    os.environ["LAUNCH_CMD"] = os.environ["LAUNCH_CMD"][:-2]
 
     for process in processes:
         process.wait()
         if process.returncode != 0:
             raise subprocess.CalledProcessError(returncode=process.returncode,
                                                 cmd=cmd)
+    if args.set_thp:
+        # reset to existing val
+        # NOTE(review): skipped when the raise above fires, so a failed run keeps the new mode.
+        set_thp(thp_old)
 
 def mpi_dist_launch(args):
     '''
@@ -417,6 +469,8 @@ def add_memory_allocator_params(parser):
                        help="Enable tcmalloc allocator")
     group.add_argument("--enable_jemalloc", action='store_true', default=False,
                        help="Enable jemalloc allocator")
+    group.add_argument("--set_thp", default=None, choices=['always','madvise','never'],
+                       help="Set Transparent HugePages setting")
 
 
 def add_multi_instance_params(parser):
@@ -463,6 +517,10 @@ def parse_args():
     parser.add_argument("--no_python", default=False, action="store_true",
                         help="Do not prepend the --program script with \"python\" - just exec "
                              "it directly. Useful when the script is not a Python script.")
+    parser.add_argument("--print_env", default=False, action="store_true",
+                        help="print modified env configuration")
+    parser.add_argument("--dnnverbose", default=False, action="store_true",
+                        help="enable verbose log for OneDNN and OneMKL")
 
     add_memory_allocator_params(parser)
     add_kmp_iomp_params(parser)
@@ -481,14 +539,25 @@ def parse_args():
 
 
 def main():
+    # Snapshot the variable names so --print_env can list what the launcher added.
+    env_before = set(os.environ.keys())
     if platform.system() == "Windows":
         raise RuntimeError("Windows platform is not supported!!!")
 
     args = parse_args()
+    if args.dnnverbose:
+        os.environ["DNNL_VERBOSE"] = "1"
+        os.environ["MKLDNN_VERBOSE"] = "1"
+        os.environ["MKL_VERBOSE"] = "1"
     if args.distributed:
         mpi_dist_launch(args)
     else:
         launch(args)
+
+    if args.print_env:
+        print('Launch settings: ')
+        for x in sorted(set(os.environ.keys()) - env_before):
+            print(f'{x}={os.environ[x]}')
 
 if __name__ == "__main__":
     main()