From 84eb848da596167b7086daec547458a21c7cb10b Mon Sep 17 00:00:00 2001 From: js1010 Date: Sun, 7 Feb 2021 18:40:18 +0900 Subject: [PATCH 01/26] merge task/implement-io --- cpp/src/culda.cu | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/cpp/src/culda.cu b/cpp/src/culda.cu index 0c12182..edcd403 100644 --- a/cpp/src/culda.cu +++ b/cpp/src/culda.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2020 Jisang Yoon +// Copyright (c) 2021 Jisang Yoon // All rights reserved. // // This source code is licensed under the Apache 2.0 license found in the @@ -13,4 +13,47 @@ CuLDA::CuLDA() { CuLDA::~CuLDA() {} +bool CuLDA::Init(std::string opt_path) { + std::ifstream in(opt_path.c_str()); + if (not in.is_open()) return false; + + std::string str((std::istreambuf_iterator(in)), + std::istreambuf_iterator()); + std::string err_cmt; + auto _opt = json11::Json::parse(str, err_cmt); + if (not err_cmt.empty()) return false; + opt_ = _opt; + CuSimLogger().set_log_level(opt_["c_log_level"].int_value()); + num_topics_ = opt_["num_topics"].int_value(); + block_dim_ = opt_["block_dim"].int_value(); + return true; +} + +void CuLDA::LoadModel(float* alpha, float* beta, int num_words) { + num_words_ = num_words; + DEBUG("copy model({} x {})", num_topics_, num_words_); + dev_alpha_.resize(num_topics_); + dev_beta_.resize(num_topics_ * num_words_); + #ifdef HALF_PRECISION + // conversion to half data and copy + std::vector halpha(num_topics_), hbeta(num_topics_ * num_words_); + for (int i = 0; i < num_topics_; ++i) { + halpha[i] = conversion(alpha[i]); + for (int j = 0; j < num_words_; ++j) { + hbeta[i * num_words + j] = conversion(beta[i * num_words + j]); + } + } + thrust::copy(halpha.begin(), halpha.end(), dev_alapha_.begin()); + thrust::copy(hbeta.begin(), hbeta.end(), dev_beta_.begin()); + #else + thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin()); + thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin()); + #endif + alpha_ = alpha; beta_ = beta; +} + +void CuLDA::FeedData(const int* indices, const int* indptr, int num_indices, int num_indptr) { + +} + } // namespace cusim From ba98fa5081a9074b9c4c138fc853f136a12c1d70 Mon Sep 17 00:00:00 2001 From: js1010 Date: Sun, 7 Feb 2021 18:43:53 +0900 Subject: [PATCH 02/26] restructure directory --- cpp/include/{ => culda}/culda.hpp | 0 cpp/include/{ => utils}/ioutils.hpp | 0 cpp/include/{ => utils}/log.hpp | 0 cpp/include/{ => utils}/types.hpp | 0 cpp/src/{ => culda}/culda.cu | 0 cpp/src/{ => utils}/ioutils.cc | 0 cpp/src/{ => utils}/log.cc | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename cpp/include/{ => culda}/culda.hpp (100%) rename cpp/include/{ => utils}/ioutils.hpp (100%) rename cpp/include/{ => utils}/log.hpp (100%) rename cpp/include/{ => utils}/types.hpp (100%) rename cpp/src/{ => culda}/culda.cu (100%) rename cpp/src/{ => utils}/ioutils.cc (100%) rename cpp/src/{ => utils}/log.cc (100%) diff --git a/cpp/include/culda.hpp b/cpp/include/culda/culda.hpp similarity index 100% rename from cpp/include/culda.hpp rename to cpp/include/culda/culda.hpp diff --git a/cpp/include/ioutils.hpp b/cpp/include/utils/ioutils.hpp similarity index 100% rename from cpp/include/ioutils.hpp rename to cpp/include/utils/ioutils.hpp diff --git a/cpp/include/log.hpp b/cpp/include/utils/log.hpp similarity index 100% rename from cpp/include/log.hpp rename to cpp/include/utils/log.hpp diff --git a/cpp/include/types.hpp b/cpp/include/utils/types.hpp similarity index 100% rename from cpp/include/types.hpp rename to cpp/include/utils/types.hpp diff --git a/cpp/src/culda.cu b/cpp/src/culda/culda.cu similarity index 100% rename from cpp/src/culda.cu rename to cpp/src/culda/culda.cu diff --git a/cpp/src/ioutils.cc b/cpp/src/utils/ioutils.cc similarity index 100% rename from cpp/src/ioutils.cc rename to cpp/src/utils/ioutils.cc diff --git a/cpp/src/log.cc b/cpp/src/utils/log.cc similarity index 100% rename from cpp/src/log.cc rename to cpp/src/utils/log.cc From 525f9f81cee1938b20ab2ac9f27ece01eaa56f34 Mon Sep 17 00:00:00 2001 From: js1010 Date: Sun, 7 Feb 2021 18:45:16 +0900 Subject: [PATCH 03/26] change include path --- cpp/include/culda/culda.hpp | 4 ++-- cpp/include/utils/ioutils.hpp | 4 ++-- cpp/src/culda/culda.cu | 2 +- cpp/src/utils/ioutils.cc | 2 +- cpp/src/utils/log.cc | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index 0c2ba61..b26a96a 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -27,8 +27,8 @@ #include // NOLINT #include "json11.hpp" -#include "log.hpp" -#include "types.hpp" +#include "utils/log.hpp" +#include "utils/types.hpp" namespace cusim { diff --git a/cpp/include/utils/ioutils.hpp b/cpp/include/utils/ioutils.hpp index 3e8304d..b94ec0a 100644 --- a/cpp/include/utils/ioutils.hpp +++ b/cpp/include/utils/ioutils.hpp @@ -21,8 +21,8 @@ #include #include "json11.hpp" -#include "log.hpp" -#include "types.hpp" +#include "utils/log.hpp" +#include "utils/types.hpp" namespace cusim { diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index edcd403..e050c88 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -3,7 +3,7 @@ // // This source code is licensed under the Apache 2.0 license found in the // LICENSE file in the root directory of this source tree. -#include "culda.hpp" +#include "culda/culda.hpp" namespace cusim { diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc index 45d551b..098b1f2 100644 --- a/cpp/src/utils/ioutils.cc +++ b/cpp/src/utils/ioutils.cc @@ -3,7 +3,7 @@ // // This source code is licensed under the Apache 2.0 license found in the // LICENSE file in the root directory of this source tree. -#include "ioutils.hpp" +#include "utils/ioutils.hpp" namespace cusim { diff --git a/cpp/src/utils/log.cc b/cpp/src/utils/log.cc index ef5252b..ddfcb0c 100644 --- a/cpp/src/utils/log.cc +++ b/cpp/src/utils/log.cc @@ -5,7 +5,7 @@ // LICENSE file in the root directory of this source tree. // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc -#include "log.hpp" +#include "utils/log.hpp" namespace cusim { From 3ef51ed3ef465c0834b5326410c696937e9185bd Mon Sep 17 00:00:00 2001 From: js1010 Date: Sun, 7 Feb 2021 18:51:02 +0900 Subject: [PATCH 04/26] add functions to culda.hpp --- cpp/include/culda/culda.hpp | 11 ++++++++++- cusim/ioutils/bindings.cc | 2 +- setup.py | 4 +++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index b26a96a..397aeb5 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -36,9 +36,18 @@ class CuLDA { public: CuLDA(); ~CuLDA(); + bool Init(std::string opt_path); + void LoadModel(float* alpha, float* beta, int num_words); + void FeedData(const int* indices, const int* indptr, + int num_indices, int num_indptr); + private: + json11::Json opt_; std::shared_ptr logger_; - thrust::device_vector device_data_; + thrust::device_vector dev_alpha_, dev_beta_; + const float *alpha_, *beta_; + int block_cnt_, block_dim_; + int num_topics_, num_words_; }; } // namespace cusim diff --git a/cusim/ioutils/bindings.cc b/cusim/ioutils/bindings.cc index 5b2c1dd..ad35ebb 100644 --- a/cusim/ioutils/bindings.cc +++ b/cusim/ioutils/bindings.cc @@ -8,7 +8,7 @@ #include #include -#include "ioutils.hpp" +#include "utils/ioutils.hpp" namespace py = pybind11; diff --git a/setup.py b/setup.py index dacb5f3..5031bcb 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,9 @@ def __init__(self, name): extend_compile_flags = get_extend_compile_flags() extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \ extend_compile_flags -csrcs = glob.glob("cpp/src/*.cu") + glob.glob("cpp/src/*.cc") +csrcs = \ + glob.glob("cpp/src/**/*.cu", recursive=True) + \ + glob.glob("cpp/src/**/*.cc", recursive=True) extensions = [ Extension("cusim.ioutils.ioutils_bind", sources= csrcs + [ \ From d90ff18e935a2b33af869e4cc6e37abf45b8d801 Mon Sep 17 00:00:00 2001 From: js1010 Date: Sun, 7 Feb 2021 19:26:02 +0900 Subject: [PATCH 05/26] add culda binding --- cpp/include/utils/cuda_info.cuh | 96 +++++++++++++++++++++++++++++++++ cusim/__init__.py | 1 + cusim/culda/__init__.py | 6 +++ cusim/culda/bindings.cc | 57 ++++++++++++++++++++ cusim/culda/pyculda.py | 34 ++++++++++++ 5 files changed, 194 insertions(+) create mode 100644 cpp/include/utils/cuda_info.cuh create mode 100644 cusim/culda/__init__.py create mode 100644 cusim/culda/bindings.cc create mode 100644 cusim/culda/pyculda.py diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh new file mode 100644 index 0000000..ab6fefe --- /dev/null +++ b/cpp/include/utils/cuda_info.cuh @@ -0,0 +1,96 @@ +// Copyright (c) 2021 Jisang Yoon +// All rights reserved. +// +// This source code is licensed under the Apache 2.0 license found in the +// LICENSE file in the root directory of this source tree. +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // NOLINT + +#include "json11.hpp" +#include "utils/log.hpp" +#include "utils/types.hpp" + +#define DEBUG1(x) LOGGER->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__); + +namespace cusim { + +std::shared_ptr LOGGER = CuSimLogger().get_logger(); + +struct DeviceInfo { + int devId, mp_cnt, major, minor, cores; +} + +DeviceInfo GetDeviceInfo() { + DeviceInfo ret; + CHECK_CUDA(cudaGetDevice(&ret.devId)); + cudaDeviceProp prop; + CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId)); + ret.mp_cnt = prop.multiProcessorCount; + ret.major = prop.major; + ret.minor = prop.minor; + // reference: https://stackoverflow.com/a/32531982 + switch (ret.major) { + case 2: // Fermi + if (ret.minor == 1) + ret.cores = ret.mp_cnt * 48; + else + ret.cores = ret.mp_cnt * 32; + break; + case 3: // Kepler + ret.cores = ret.mp_cnt * 192; + break; + case 5: // Maxwell + ret.cores = ret.mp_cnt * 128; + break; + case 6: // Pascal + if (ret.minor == 1 or ret.minor == 2) + ret.cores = ret.mp_cnt * 128; + else if (ret.minor == 0) + ret.cores = ret.mp_cnt * 64; + else + DEBUG1("Unknown device type"); + break; + case 7: // Volta and Turing + if (ret.minor == 0 or ret.minor == 5) + ret.cores = ret.mp_cnt * 64; + else + DEBUG1("Unknown device type"); + break; + case 8: // Ampere + if (ret.minor == 0) + ret.cores = ret.mp_cnt * 64; + else if (minor_ == 6) + ret.cores = ret.mp_cnt * 128; + else + DEBUG1("Unknown device type"); + break; + default: + DEBUG1("Unknown device type"); + break; + } + if (ret.cores == -1) ret.cores = ret.mp_cnt * 128; + return ret +} + + +} // namespace cusim diff --git a/cusim/__init__.py b/cusim/__init__.py index 796d7b2..24fe984 100644 --- a/cusim/__init__.py +++ b/cusim/__init__.py @@ -4,3 +4,4 @@ # This source code is licensed under the Apache 2.0 license found in the # LICENSE file in the root directory of this source tree. from cusim.ioutils import IoUtils +from cusim.culda import CuLDA diff --git a/cusim/culda/__init__.py b/cusim/culda/__init__.py new file mode 100644 index 0000000..e27fb6a --- /dev/null +++ b/cusim/culda/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2021 Jisang Yoon +# All rights reserved. +# +# This source code is licensed under the Apache 2.0 license found in the +# LICENSE file in the root directory of this source tree. +from cusim.culda.pyculda import CuLDA diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc new file mode 100644 index 0000000..d3ee2b4 --- /dev/null +++ b/cusim/culda/bindings.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2021 Jisang Yoon +// All rights reserved. +// +// This source code is licensed under the Apache 2.0 license found in the +// LICENSE file in the root directory of this source tree. +#include +#include +#include + +#include +#include "culda/culda.hpp" + +namespace py = pybind11; + +typedef py::array_t float_array; +typedef py::array_t int_array; + +class CuLDABind { + public: + CuLDABind() {} + + bool Init(std::string opt_path) { + return obj_.Init(opt_path); + } + + void LoadModel(py::object& alpha, py::object& beta) { + float_array _alpha(alpha); + float_array _beta(beta); + auto alpha_buffer = _alphpa.request(); + auto beta_buffer = _beta.request(); + if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or + alpha_buffer.shape[0] != beta_buffer.shape[0]) { + throw std::runtime_error("invalid alpha or beta"); + } + int num_words = beta_buffer.shape[1]; + return obj_.LoadModel(_alpha.mutable_data(0), _beta.mutable_data(0), num_words); + } + + private: + cusim::CuLDA obj_; +}; + +PYBIND11_PLUGIN(culda_bind) { + py::module m("CuLDABind"); + + py::class_(m, "CuLDABind") + .def(py::init()) + .def("init", &CuLDABind::Init, py::arg("opt_path")) + .def("load_model", &IoUtilsBind::LoadModel, + py::arg("alpha"), py::arg("beta")) + .def("__repr__", + [](const CuLDABind &a) { + return ""; + } + ); + return m.ptr(); +} diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py new file mode 100644 index 0000000..08fdb89 --- /dev/null +++ b/cusim/culda/pyculda.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021 Jisang Yoon +# All rights reserved. +# +# This source code is licensed under the Apache 2.0 license found in the +# LICENSE file in the root directory of this source tree. + +# pylint: disable=no-name-in-module,too-few-public-methods,no-member +import os +# from os.path import join as pjoin + +import json +import tempfile + +# import h5py +# import numpy as np + +from cusim import aux +from cusim.culda.culda_bind import CuLDABind +from cusim.config_pb2 import CuLDAConfigProto + +class CuLDA: + def __init__(self, opt=None): + self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto) + self.logger = aux.get_logger("culda", level=self.opt.py_log_level) + + tmp = tempfile.NamedTemporaryFile(mode='w', delete=False) + opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2) + tmp.write(opt_content) + tmp.close() + + self.logger.info("opt: %s", opt_content) + self.obj = CuLDABind() + assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}" + os.remove(tmp.name) From 6cebe9103e71befcb4b3a87dff0582c85e5f73db Mon Sep 17 00:00:00 2001 From: js1010 Date: Sun, 7 Feb 2021 22:22:08 +0900 Subject: [PATCH 06/26] bug-fix --- cpp/include/culda/culda.hpp | 2 + cpp/include/utils/cuda_base_kernels.cuh | 63 +++++++++++++++++++++++++ cpp/include/utils/cuda_info.cuh | 23 ++++----- cpp/src/culda/culda.cu | 6 ++- cusim/culda/bindings.cc | 6 +-- cusim/proto/config.proto | 9 ++++ setup.py | 23 ++++++--- 7 files changed, 109 insertions(+), 23 deletions(-) create mode 100644 cpp/include/utils/cuda_base_kernels.cuh diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index 397aeb5..4d85124 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -29,6 +29,7 @@ #include "json11.hpp" #include "utils/log.hpp" #include "utils/types.hpp" +#include "utils/cuda_info.cuh" namespace cusim { @@ -42,6 +43,7 @@ class CuLDA { int num_indices, int num_indptr); private: + DeviceInfo dev_info_; json11::Json opt_; std::shared_ptr logger_; thrust::device_vector dev_alpha_, dev_beta_; diff --git a/cpp/include/utils/cuda_base_kernels.cuh b/cpp/include/utils/cuda_base_kernels.cuh new file mode 100644 index 0000000..3cb2979 --- /dev/null +++ b/cpp/include/utils/cuda_base_kernels.cuh @@ -0,0 +1,63 @@ +// Copyright (c) 2021 Jisang Yoon +// All rights reserved. +// +// This source code is licensed under the Apache 2.0 license found in the +// LICENSE file in the root directory of this source tree. +#pragma once +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "utils/types.hpp" + +namespace cusim { + +// Error Checking utilities, checks status codes from cuda calls +// and throws exceptions on failure (which cython can proxy back to python) +#define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); } +inline void checkCuda(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { + std::stringstream err; + err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")"; + throw std::runtime_error(err.str()); + } +} + +inline const char* cublasGetErrorString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + return "Unknown"; +} + +#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); } +inline void checkCublas(cublasStatus_t code, const char * file, int line) { + if (code != CUBLAS_STATUS_SUCCESS) { + std::stringstream err; + err << "cublas error: " << cublasGetErrorString(code) + << " (" << file << ":" << line << ")"; + throw std::runtime_error(err.str()); + } +} +} // namespace cusim diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh index ab6fefe..0cd90f8 100644 --- a/cpp/include/utils/cuda_info.cuh +++ b/cpp/include/utils/cuda_info.cuh @@ -26,19 +26,16 @@ #include #include // NOLINT -#include "json11.hpp" -#include "utils/log.hpp" -#include "utils/types.hpp" +#include "utils/cuda_base_kernels.cuh" -#define DEBUG1(x) LOGGER->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__); namespace cusim { -std::shared_ptr LOGGER = CuSimLogger().get_logger(); struct DeviceInfo { int devId, mp_cnt, major, minor, cores; -} + bool unknown = false; +}; DeviceInfo GetDeviceInfo() { DeviceInfo ret; @@ -68,29 +65,29 @@ DeviceInfo GetDeviceInfo() { else if (ret.minor == 0) ret.cores = ret.mp_cnt * 64; else - DEBUG1("Unknown device type"); + ret.unknown = true; break; case 7: // Volta and Turing if (ret.minor == 0 or ret.minor == 5) ret.cores = ret.mp_cnt * 64; else - DEBUG1("Unknown device type"); + ret.unknown = true; break; case 8: // Ampere if (ret.minor == 0) ret.cores = ret.mp_cnt * 64; - else if (minor_ == 6) + else if (ret.minor == 6) ret.cores = ret.mp_cnt * 128; else - DEBUG1("Unknown device type"); + ret.unknown = true; break; default: - DEBUG1("Unknown device type"); + ret.unknown = true; break; } if (ret.cores == -1) ret.cores = ret.mp_cnt * 128; - return ret + return ret; } -} // namespace cusim +} // namespace cusim diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index e050c88..d87b2e0 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -9,6 +9,10 @@ namespace cusim { CuLDA::CuLDA() { logger_ = CuSimLogger().get_logger(); + dev_info_ = GetDeviceInfo(); + if (dev_info_.unknown) DEBUG0("Unknown device type"); + INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}", + dev_info_.major, dev_info_.minor, dev_info_.mp_cnt, dev_info_.cores); } CuLDA::~CuLDA() {} @@ -26,6 +30,7 @@ bool CuLDA::Init(std::string opt_path) { CuSimLogger().set_log_level(opt_["c_log_level"].int_value()); num_topics_ = opt_["num_topics"].int_value(); block_dim_ = opt_["block_dim"].int_value(); + block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_); return true; } @@ -53,7 +58,6 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) { } void CuLDA::FeedData(const int* indices, const int* indptr, int num_indices, int num_indptr) { - } } // namespace cusim diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index d3ee2b4..0c6f72f 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -26,7 +26,7 @@ class CuLDABind { void LoadModel(py::object& alpha, py::object& beta) { float_array _alpha(alpha); float_array _beta(beta); - auto alpha_buffer = _alphpa.request(); + auto alpha_buffer = _alpha.request(); auto beta_buffer = _beta.request(); if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or alpha_buffer.shape[0] != beta_buffer.shape[0]) { @@ -43,10 +43,10 @@ class CuLDABind { PYBIND11_PLUGIN(culda_bind) { py::module m("CuLDABind"); - py::class_(m, "CuLDABind") + py::class_(m, "CuLDABind") .def(py::init()) .def("init", &CuLDABind::Init, py::arg("opt_path")) - .def("load_model", &IoUtilsBind::LoadModel, + .def("load_model", &CuLDABind::LoadModel, py::arg("alpha"), py::arg("beta")) .def("__repr__", [](const CuLDABind &a) { diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto index 071184b..b3899aa 100644 --- a/cusim/proto/config.proto +++ b/cusim/proto/config.proto @@ -12,3 +12,12 @@ message IoUtilsConfigProto { optional int32 chunk_lines = 3 [default = 100000]; optional int32 num_threads = 4 [default = 4]; } + +message CuLDAConfigProto { + optional int32 py_log_level = 1 [default = 2]; + optional int32 c_log_level = 2 [default = 2]; + + optional int32 num_topics = 3 [default = 10]; + optional int32 block_dim = 4 [default = 32]; + optional int32 hyper_threads = 5 [default = 10]; +} diff --git a/setup.py b/setup.py index 5031bcb..cadbe5b 100644 --- a/setup.py +++ b/setup.py @@ -68,24 +68,35 @@ def __init__(self, name): extend_compile_flags = get_extend_compile_flags() extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \ extend_compile_flags -csrcs = \ - glob.glob("cpp/src/**/*.cu", recursive=True) + \ - glob.glob("cpp/src/**/*.cc", recursive=True) +util_srcs = glob.glob("cpp/src/utils/*.cc") extensions = [ Extension("cusim.ioutils.ioutils_bind", - sources= csrcs + [ \ + sources= util_srcs + [ \ "cusim/ioutils/bindings.cc", "3rd/json11/json11.cpp"], language="c++", extra_compile_args=extra_compile_args, extra_link_args=["-fopenmp"], + extra_objects=[], + include_dirs=[ \ + "cpp/include/", np.get_include(), pybind11.get_include(), + pybind11.get_include(True), + "3rd/json11", "3rd/spdlog/include"]), + Extension("cusim.culda.culda_bind", + sources= util_srcs + [ \ + "cpp/src/culda/culda.cu", + "cusim/culda/bindings.cc", + "3rd/json11/json11.cpp"], + language="c++", + extra_compile_args=extra_compile_args, + extra_link_args=["-fopenmp"], library_dirs=[CUDA['lib64']], libraries=['cudart', 'cublas', 'curand'], extra_objects=[], include_dirs=[ \ "cpp/include/", np.get_include(), pybind11.get_include(), pybind11.get_include(True), CUDA['include'], - "3rd/json11", "3rd/spdlog/include"]) + "3rd/json11", "3rd/spdlog/include"]), ] @@ -168,7 +179,7 @@ def setup_package(): download_url="https://github.com/js1010/cusim/releases", include_package_data=False, license='Apac2', - packages=['cusim/', "cusim/ioutils/"], + packages=['cusim/', "cusim/ioutils/", "cusim/culda/"], cmdclass=cmdclass, classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f], platforms=['Linux', 'Mac OSX', 'Unix'], From 56c08b6071ea19c911d4128821a97d4ed183b14a Mon Sep 17 00:00:00 2001 From: js1010 Date: Mon, 8 Feb 2021 17:57:20 +0900 Subject: [PATCH 07/26] update --- cpp/include/utils/cuda_base_kernels.cuh | 57 +++++++++++++++++++++++++ cpp/include/utils/cuda_info.cuh | 2 +- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/cpp/include/utils/cuda_base_kernels.cuh b/cpp/include/utils/cuda_base_kernels.cuh index 3cb2979..ea2f5f1 100644 --- a/cpp/include/utils/cuda_base_kernels.cuh +++ b/cpp/include/utils/cuda_base_kernels.cuh @@ -26,6 +26,63 @@ namespace cusim { +struct DeviceInfo { + int devId, mp_cnt, major, minor, cores; + bool unknown = false; +}; + +DeviceInfo GetDeviceInfo2() { + DeviceInfo ret; + CHECK_CUDA(cudaGetDevice(&ret.devId)); + cudaDeviceProp prop; + CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId)); + ret.mp_cnt = prop.multiProcessorCount; + ret.major = prop.major; + ret.minor = prop.minor; + // reference: https://stackoverflow.com/a/32531982 + switch (ret.major) { + case 2: // Fermi + if (ret.minor == 1) + ret.cores = ret.mp_cnt * 48; + else + ret.cores = ret.mp_cnt * 32; + break; + case 3: // Kepler + ret.cores = ret.mp_cnt * 192; + break; + case 5: // Maxwell + ret.cores = ret.mp_cnt * 128; + break; + case 6: // Pascal + if (ret.minor == 1 or ret.minor == 2) + ret.cores = ret.mp_cnt * 128; + else if (ret.minor == 0) + ret.cores = ret.mp_cnt * 64; + else + ret.unknown = true; + break; + case 7: // Volta and Turing + if (ret.minor == 0 or ret.minor == 5) + ret.cores = ret.mp_cnt * 64; + else + ret.unknown = true; + break; + case 8: // Ampere + if (ret.minor == 0) + ret.cores = ret.mp_cnt * 64; + else if (ret.minor == 6) + ret.cores = ret.mp_cnt * 128; + else + ret.unknown = true; + break; + default: + ret.unknown = true; + break; + } + if (ret.cores == -1) ret.cores = ret.mp_cnt * 128; + return ret; +} + // Error Checking utilities, checks status codes from cuda calls // and throws exceptions on failure (which cython can proxy back to python) #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); } diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh index 0cd90f8..7ff30ba 100644 --- a/cpp/include/utils/cuda_info.cuh +++ b/cpp/include/utils/cuda_info.cuh @@ -37,7 +37,7 @@ struct DeviceInfo { bool unknown = false; }; -DeviceInfo GetDeviceInfo() { +DeviceInfo GetDeviceInfo2() { DeviceInfo ret; CHECK_CUDA(cudaGetDevice(&ret.devId)); cudaDeviceProp prop; From a71c178a2a8549073d70bfd08000f26f660004b9 Mon Sep 17 00:00:00 2001 From: js1010 Date: Mon, 8 Feb 2021 18:26:02 +0900 Subject: [PATCH 08/26] build succeed --- cpp/include/culda/culda.hpp | 2 +- cpp/include/utils/cuda_info.cuh | 93 ------------------- ...ase_kernels.cuh => cuda_utils_kernels.cuh} | 75 +++++++-------- cpp/include/utils/ioutils.hpp | 1 - cpp/include/utils/types.hpp | 5 + setup.py | 2 +- 6 files changed, 43 insertions(+), 135 deletions(-) delete mode 100644 cpp/include/utils/cuda_info.cuh rename cpp/include/utils/{cuda_base_kernels.cuh => cuda_utils_kernels.cuh} (96%) diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index 4d85124..bc514a9 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -29,7 +29,7 @@ #include "json11.hpp" #include "utils/log.hpp" #include "utils/types.hpp" -#include "utils/cuda_info.cuh" +#include "utils/cuda_utils_kernels.cuh" namespace cusim { diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh deleted file mode 100644 index 7ff30ba..0000000 --- a/cpp/include/utils/cuda_info.cuh +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2021 Jisang Yoon -// All rights reserved. -// -// This source code is licensed under the Apache 2.0 license found in the -// LICENSE file in the root directory of this source tree. -#pragma once -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // NOLINT - -#include "utils/cuda_base_kernels.cuh" - - -namespace cusim { - - -struct DeviceInfo { - int devId, mp_cnt, major, minor, cores; - bool unknown = false; -}; - -DeviceInfo GetDeviceInfo2() { - DeviceInfo ret; - CHECK_CUDA(cudaGetDevice(&ret.devId)); - cudaDeviceProp prop; - CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId)); - ret.mp_cnt = prop.multiProcessorCount; - ret.major = prop.major; - ret.minor = prop.minor; - // reference: https://stackoverflow.com/a/32531982 - switch (ret.major) { - case 2: // Fermi - if (ret.minor == 1) - ret.cores = ret.mp_cnt * 48; - else - ret.cores = ret.mp_cnt * 32; - break; - case 3: // Kepler - ret.cores = ret.mp_cnt * 192; - break; - case 5: // Maxwell - ret.cores = ret.mp_cnt * 128; - break; - case 6: // Pascal - if (ret.minor == 1 or ret.minor == 2) - ret.cores = ret.mp_cnt * 128; - else if (ret.minor == 0) - ret.cores = ret.mp_cnt * 64; - else - ret.unknown = true; - break; - case 7: // Volta and Turing - if (ret.minor == 0 or ret.minor == 5) - ret.cores = ret.mp_cnt * 64; - else - ret.unknown = true; - break; - case 8: // Ampere - if (ret.minor == 0) - ret.cores = ret.mp_cnt * 64; - else if (ret.minor == 6) - ret.cores = ret.mp_cnt * 128; - else - ret.unknown = true; - break; - default: - ret.unknown = true; - break; - } - if (ret.cores == -1) ret.cores = ret.mp_cnt * 128; - return ret; -} - - -} // namespace cusim diff --git a/cpp/include/utils/cuda_base_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh similarity index 96% rename from cpp/include/utils/cuda_base_kernels.cuh rename to cpp/include/utils/cuda_utils_kernels.cuh index ea2f5f1..69c02e2 100644 --- a/cpp/include/utils/cuda_base_kernels.cuh +++ b/cpp/include/utils/cuda_utils_kernels.cuh @@ -26,12 +26,43 @@ namespace cusim { -struct DeviceInfo { - int devId, mp_cnt, major, minor, cores; - bool unknown = false; -}; -DeviceInfo GetDeviceInfo2() { +// Error Checking utilities, checks status codes from cuda calls +// and throws exceptions on failure (which cython can proxy back to python) +#define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); } +inline void checkCuda(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { + std::stringstream err; + err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")"; + throw std::runtime_error(err.str()); + } +} + +inline const char* cublasGetErrorString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + return "Unknown"; +} + +#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); } +inline void checkCublas(cublasStatus_t code, const char * file, int line) { + if (code != CUBLAS_STATUS_SUCCESS) { + std::stringstream err; + err << "cublas error: " << cublasGetErrorString(code) + << " (" << file << ":" << line << ")"; + throw std::runtime_error(err.str()); + } +} + +inline DeviceInfo GetDeviceInfo() { DeviceInfo ret; CHECK_CUDA(cudaGetDevice(&ret.devId)); cudaDeviceProp prop; @@ -83,38 +114,4 @@ DeviceInfo GetDeviceInfo2() { return ret; } -// Error Checking utilities, checks status codes from cuda calls -// and throws exceptions on failure (which cython can proxy back to python) -#define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); } -inline void checkCuda(cudaError_t code, const char *file, int line) { - if (code != cudaSuccess) { - std::stringstream err; - err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")"; - throw std::runtime_error(err.str()); - } -} - -inline const char* cublasGetErrorString(cublasStatus_t status) { - switch (status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - return "Unknown"; -} - -#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); } -inline void checkCublas(cublasStatus_t code, const char * file, int line) { - if (code != CUBLAS_STATUS_SUCCESS) { - std::stringstream err; - err << "cublas error: " << cublasGetErrorString(code) - << " (" << file << ":" << line << ")"; - throw std::runtime_error(err.str()); - } -} } // namespace cusim diff --git a/cpp/include/utils/ioutils.hpp b/cpp/include/utils/ioutils.hpp index b94ec0a..cc3e09e 100644 --- a/cpp/include/utils/ioutils.hpp +++ b/cpp/include/utils/ioutils.hpp @@ -22,7 +22,6 @@ #include "json11.hpp" #include "utils/log.hpp" -#include "utils/types.hpp" namespace cusim { diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp index 3af46c7..4efefe0 100644 --- a/cpp/include/utils/types.hpp +++ b/cpp/include/utils/types.hpp @@ -39,3 +39,8 @@ #endif #define WARP_SIZE 32 + +struct DeviceInfo { + int devId, mp_cnt, major, minor, cores; + bool unknown = false; +}; diff --git a/setup.py b/setup.py index cadbe5b..db61bb6 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,7 @@ def __init__(self, name): util_srcs = glob.glob("cpp/src/utils/*.cc") extensions = [ Extension("cusim.ioutils.ioutils_bind", - sources= util_srcs + [ \ + sources = util_srcs + [ \ "cusim/ioutils/bindings.cc", "3rd/json11/json11.cpp"], language="c++", From b99c6ffebb5e9bceeeb48b2f66b7a7d7d1fb60bf Mon Sep 17 00:00:00 2001 From: js1010 Date: Mon, 8 Feb 2021 20:36:33 +0900 Subject: [PATCH 09/26] implement lda kernel --- cpp/include/culda/cuda_lda_kernels.cuh | 74 ++++++++++++++++++++++++ cpp/include/utils/cuda_utils_kernels.cuh | 57 ++++++++++++++++++ cpp/src/culda/culda.cu | 10 +++- examples/example1.py | 7 ++- 4 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 cpp/include/culda/cuda_lda_kernels.cuh diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh new file mode 100644 index 0000000..2811421 --- /dev/null +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -0,0 +1,74 @@ +// Copyright (c) 2021 Jisang Yoon +// All rights reserved. +// +// This source code is licensed under the Apache 2.0 license found in the +// LICENSE file in the root directory of this source tree. +#pragma once +#include "utils/cuda_utils_kernels.cuh" + +namespace cusim { + +__inline__ __device__ +cuda_scalar Psi(cuda_scalar x) { + +} + +__global__ void EstepKernel( + const int* indices, const int* indptr, + const int num_indices, const int num_indptr, + const int num_words, const int num_topics, const int num_iters, + cuda_scalar* gamma, cuda_scalar* new_gamma, cuda_scalar* phi, + cuda_scalar* alpha, cuda_scalar* beta, + cuda_scalar* grad_alpha, cuda_scalar* new_beta) { + + // storage for block + cuda_scalar* _gamma = gamma + num_topics * blockIdx.x; + cuda_scalar* _new_gamma = new_gamma + num_topics * blockIdx.x; + cuda_scalar* _phi = phi + num_topics * blockIdx.x; + + for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) { + int beg = indptr[i], end = indptr[i + 1]; + + // initialize gamma + for (int j = threadIdx.x; j < num_topics; j += blockDim.x) + _gamma[j] = alpha[j] + (end - beg) / num_topics; + __syncthreads(); + + // iterate E step + for (int j = 0; j < num_iters; ++j) { + // initialize new gamma + for (int k = threadIdx.x; k < num_topics; k += blockDim.x) + _new_gamma[k] = 0; + __synctheads(); + + // compute phi from gamma + for (int k = beg; k < end; ++k) { + int w = indices[k]; + // compute phi + for (int l = threadIdx.x; l < num_topics; l += blockDim.x) + _phi[l] = beta[w * num_topics + l] * exp(Psi(_gamma[l])); + __syncthreads(); + + // normalize phi and add it to new gamma and new beta + cuda_scalar phi_sum = Sum(_phi, num_topics); + for (int l = threadIdx.x; l < num_topics; l += blockDim.x) { + _phi[l] /= phi_sum; + _new_gamma[l] += _phi[l]; + if (j + 1 == num_iters) new_beta[w * num_topics + l] += phi[l]; + } + __syncthreads(); + } + + // update gamma + for (int k = threadIdx.x; k < num_topics; l += blockDim.x) + _gamma[k] = _new_gamma[k] + alpha[k]; + __syncthreads(); + } + cuda_scalar gamma_sum = Sum(_gamma, num_topics); + for (int j = threadIdx.x; j < num_topics, j += blockDim.x) + grad_alpha[j] += (Psi(_gamma[j]) - Psi(gamma_sum)); + __syncthreaads() + } +} + +} // cusim diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh index 69c02e2..ef71021 100644 --- a/cpp/include/utils/cuda_utils_kernels.cuh +++ b/cpp/include/utils/cuda_utils_kernels.cuh @@ -114,4 +114,61 @@ inline DeviceInfo GetDeviceInfo() { return ret; } +__inline__ __device__ +cuda_scalar warp_reduce_sum(cuda_scalar val) { + #if __CUDACC_VER_MAJOR__ >= 9 + // __shfl_down is deprecated with cuda 9+. use newer variants + unsigned int active = __activemask(); + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + val = add(val, __shfl_down_sync(active, val, offset)); + } + #else + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + val = add(val, __shfl_down(val, offset)); + } + #endif + return val; +} + +__inline__ __device__ +cuda_scalar Sum(const cuda_scalar* vec, const int length) { + + static __shared__ cuda_scalar shared[32]; + + // figure out the warp/ position inside the warp + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + + // paritial sum + cuda_scalar val = 0; + for (int i = threadIdx.x; i < length; i += blockDim.x) + val += vec[i]; + val = warp_reduce_sum(val); + + // write out the partial reduction to shared memory if appropiate + if (lane == 0) { + shared[warp] = val; + } + __syncthreads(); + + // if we we don't have multiple warps, we're done + if (blockDim.x <= WARP_SIZE) { + return shared[0]; + } + + // otherwise reduce again in the first warp + val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: conversion(0.0f); + if (warp == 0) { + val = warp_reduce_sum(val); + // broadcast back to shared memory + if (threadIdx.x == 0) { + shared[0] = val; + } + } + __syncthreads(); + return shared[0]; +} + } // namespace cusim diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index d87b2e0..da5db80 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -31,6 +31,7 @@ bool CuLDA::Init(std::string opt_path) { num_topics_ = opt_["num_topics"].int_value(); block_dim_ = opt_["block_dim"].int_value(); block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_); + INFO("num_topics: {}, block_dim: {}, block_cnt: {}", num_topics_, block_dim_, block_cnt_); return true; } @@ -57,7 +58,14 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) { alpha_ = alpha; beta_ = beta; } -void CuLDA::FeedData(const int* indices, const int* indptr, int num_indices, int num_indptr) { +void CuLDA::FeedData(const int* indices, const int* indptr, + int num_indices, int num_indptr, float* gamma) { + thrust::vector dev_phi(num_indices * num_topics_); + thrust::vector dev_gamma(num_indptr * num_topics_); + + + + } } // namespace cusim diff --git a/examples/example1.py b/examples/example1.py index 6cbdaa9..6d97a84 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -10,7 +10,7 @@ import fire from gensim import downloader as api -from cusim import aux, IoUtils +from cusim import aux, IoUtils, CuLDA LOGGER = aux.get_logger() DOWNLOAD_PATH = "./res" @@ -32,11 +32,14 @@ def download(): LOGGER.info("cmd: %s", cmd) subprocess.call(cmd, shell=True) -def run(): +def run_io(): download() iou = IoUtils(opt={"chunk_lines": 10000, "num_threads": 8}) iou.convert_stream_to_h5(DATA_PATH, 5, DATA_PATH2) +def run_lda(): + CuLDA() + if __name__ == "__main__": fire.Fire() From 083c82eec0f24c2e1272ef2243fec42d49255777 Mon Sep 17 00:00:00 2001 From: js1010 Date: Mon, 8 Feb 2021 21:33:45 +0900 Subject: [PATCH 10/26] not support half --- cpp/include/utils/cuda_utils_kernels.cuh | 14 +++++++------- cpp/include/utils/types.hpp | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh index ef71021..1503f90 100644 --- a/cpp/include/utils/cuda_utils_kernels.cuh +++ b/cpp/include/utils/cuda_utils_kernels.cuh @@ -115,34 +115,34 @@ inline DeviceInfo GetDeviceInfo() { } __inline__ __device__ -cuda_scalar warp_reduce_sum(cuda_scalar val) { +float warp_reduce_sum(float val) { #if __CUDACC_VER_MAJOR__ >= 9 // __shfl_down is deprecated with cuda 9+. use newer variants unsigned int active = __activemask(); #pragma unroll for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { - val = add(val, __shfl_down_sync(active, val, offset)); + val += __shfl_down_sync(active, val, offset); } #else #pragma unroll for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { - val = add(val, __shfl_down(val, offset)); + val += __shfl_down(val, offset); } #endif return val; } __inline__ __device__ -cuda_scalar Sum(const cuda_scalar* vec, const int length) { +float ReduceSum(const float* vec, const int length) { - static __shared__ cuda_scalar shared[32]; + static __shared__ floaat shared[32]; // figure out the warp/ position inside the warp int warp = threadIdx.x / WARP_SIZE; int lane = threadIdx.x % WARP_SIZE; // paritial sum - cuda_scalar val = 0; + float val = 0.0f; for (int i = threadIdx.x; i < length; i += blockDim.x) val += vec[i]; val = warp_reduce_sum(val); @@ -159,7 +159,7 @@ cuda_scalar Sum(const cuda_scalar* vec, const int length) { } // otherwise reduce again in the first warp - val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: conversion(0.0f); + val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: 0.0f; if (warp == 0) { val = warp_reduce_sum(val); // broadcast back to shared memory diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp index 4efefe0..475f50c 100644 --- a/cpp/include/utils/types.hpp +++ b/cpp/include/utils/types.hpp @@ -19,23 +19,27 @@ #define mul(x, y) ( __hmul(x, y) ) #define add(x, y) ( __hadd(x, y) ) #define sub(x, y) ( __hsub(x, y) ) + #define div(x, y) ( __hdiv(x, y) ) #define gt(x, y) ( __hgt(x, y) ) // x > y #define ge(x, y) ( __hge(x, y) ) // x >= y #define lt(x, y) ( __hlt(x, y) ) // x < y #define le(x, y) ( __hle(x, y) ) // x <= y #define out_scalar(x) ( __half2float(x) ) #define conversion(x) ( __float2half(x) ) + #define cuda_exp(x) ( __hexp(x) ) #else typedef float cuda_scalar; #define mul(x, y) ( x * y ) #define add(x, y) ( x + y ) #define sub(x, y) ( x - y ) + #define div(x, y) ( x / y ) #define gt(x, y) ( x > y ) #define ge(x, y) ( x >= y ) #define lt(x, y) ( x < y ) #define le(x, y) ( x <= y ) #define out_scalar(x) ( x ) #define conversion(x) ( x ) + #define cuda_exp(x) ( expf(x) ) #endif #define WARP_SIZE 32 From 7824d9d7bd6adc12d8fbbca702ae791b7bbd1eb6 Mon Sep 17 00:00:00 2001 From: js1010 Date: Mon, 8 Feb 2021 21:42:50 +0900 Subject: [PATCH 11/26] fix typo --- cpp/include/culda/cuda_lda_kernels.cuh | 40 +++++++++++++------ cpp/include/culda/culda.hpp | 5 ++- cpp/include/utils/cuda_utils_kernels.cuh | 9 +++-- cpp/include/utils/types.hpp | 50 ------------------------ cpp/src/culda/culda.cu | 27 +++---------- 5 files changed, 43 insertions(+), 88 deletions(-) delete mode 100644 cpp/include/utils/types.hpp diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh index 2811421..1569209 100644 --- a/cpp/include/culda/cuda_lda_kernels.cuh +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -6,25 +6,35 @@ #pragma once #include "utils/cuda_utils_kernels.cuh" + namespace cusim { __inline__ __device__ -cuda_scalar Psi(cuda_scalar x) { - +float Digamma(float x) { + float result = 0f, xx, xx2, xx4; + for ( ; x < 7.0f; ++x) + result -= 1.0f / x; + x -= 0.5f; + xx = 1.0f / x; + xx2 = xx * xx; + xx4 = xx2 * xx2; + result += logf(x) + 1.0f / 24.0f * xx2 - 7.0f / 960.0f * xx4 + + 31.0f / 8064.0f * xx4 * xx2 - 127.0f / 30720.0f * xx4 * xx4; + return result; } __global__ void EstepKernel( const int* indices, const int* indptr, const int num_indices, const int num_indptr, const int num_words, const int num_topics, const int num_iters, - cuda_scalar* gamma, cuda_scalar* new_gamma, cuda_scalar* phi, - cuda_scalar* alpha, cuda_scalar* beta, - cuda_scalar* grad_alpha, cuda_scalar* new_beta) { + float* gamma, float* new_gamma, float* phi, + float* alpha, float* beta, + float* grad_alpha, float* new_beta) { // storage for block - cuda_scalar* _gamma = gamma + num_topics * blockIdx.x; - cuda_scalar* _new_gamma = new_gamma + num_topics * blockIdx.x; - cuda_scalar* _phi = phi + num_topics * blockIdx.x; + float* _gamma = gamma + num_topics * blockIdx.x; + float* _new_gamma = new_gamma + num_topics * blockIdx.x; + float* _phi = phi + num_topics * blockIdx.x; for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) { int beg = indptr[i], end = indptr[i + 1]; @@ -46,11 +56,11 @@ __global__ void EstepKernel( int w = indices[k]; // compute phi for (int l = threadIdx.x; l < num_topics; l += blockDim.x) - _phi[l] = beta[w * num_topics + l] * exp(Psi(_gamma[l])); + _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l])); __syncthreads(); // normalize phi and add it to new gamma and new beta - cuda_scalar phi_sum = Sum(_phi, num_topics); + float phi_sum = ReduceSum(_phi, num_topics); for (int l = threadIdx.x; l < num_topics; l += blockDim.x) { _phi[l] /= phi_sum; _new_gamma[l] += _phi[l]; @@ -64,11 +74,19 @@ __global__ void EstepKernel( _gamma[k] = _new_gamma[k] + alpha[k]; __syncthreads(); } - cuda_scalar gamma_sum = Sum(_gamma, num_topics); + float gamma_sum = ReduceSum(_gamma, num_topics); for (int j = threadIdx.x; j < num_topics, j += blockDim.x) grad_alpha[j] += (Psi(_gamma[j]) - Psi(gamma_sum)); __syncthreaads() } } +__global__ void MstepKernel( + float* alpha, float* beta, + float* grad_alpha, float* new_beta, + const int num_words, const int num_topic) { + + +} + } // cusim diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index bc514a9..7309b5e 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -28,7 +28,6 @@ #include "json11.hpp" #include "utils/log.hpp" -#include "utils/types.hpp" #include "utils/cuda_utils_kernels.cuh" namespace cusim { @@ -46,7 +45,9 @@ class CuLDA { DeviceInfo dev_info_; json11::Json opt_; std::shared_ptr logger_; - thrust::device_vector dev_alpha_, dev_beta_; + thrust::device_vector dev_alpha_, dev_beta_; + thrust::device_vector dev_grad_alpha_, dev_new_beta_; + thrust::device_vector dev_gamma_, dev_new_gamma_, dev_phi_; const float *alpha_, *beta_; int block_cnt_, block_dim_; int num_topics_, num_words_; diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh index 1503f90..d925549 100644 --- a/cpp/include/utils/cuda_utils_kernels.cuh +++ b/cpp/include/utils/cuda_utils_kernels.cuh @@ -22,11 +22,14 @@ #include #include -#include "utils/types.hpp" - namespace cusim { +struct DeviceInfo { + int devId, mp_cnt, major, minor, cores; + bool unknown = false; +}; +#define WARP_SIZE 32 // Error Checking utilities, checks status codes from cuda calls // and throws exceptions on failure (which cython can proxy back to python) #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); } @@ -135,7 +138,7 @@ float warp_reduce_sum(float val) { __inline__ __device__ float ReduceSum(const float* vec, const int length) { - static __shared__ floaat shared[32]; + static __shared__ float shared[32]; // figure out the warp/ position inside the warp int warp = threadIdx.x / WARP_SIZE; diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp deleted file mode 100644 index 475f50c..0000000 --- a/cpp/include/utils/types.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2020 Jisang Yoon -// All rights reserved. -// -// This source code is licensed under the Apache 2.0 license found in the -// LICENSE file in the root directory of this source tree. -#pragma once -#include - -// experimental codes to use half precision -// not properly working yet.. -// #define HALF_PRECISION 1 - -// #if __CUDA_ARCH__ < 530 -// #undef HALF_PRECISION -// #endif - -#ifdef HALF_PRECISION - typedef half cuda_scalar; - #define mul(x, y) ( __hmul(x, y) ) - #define add(x, y) ( __hadd(x, y) ) - #define sub(x, y) ( __hsub(x, y) ) - #define div(x, y) ( __hdiv(x, y) ) - #define gt(x, y) ( __hgt(x, y) ) // x > y - #define ge(x, y) ( __hge(x, y) ) // x >= y - #define lt(x, y) ( __hlt(x, y) ) // x < y - #define le(x, y) ( __hle(x, y) ) // x <= y - #define out_scalar(x) ( __half2float(x) ) - #define conversion(x) ( __float2half(x) ) - #define cuda_exp(x) ( __hexp(x) ) -#else - typedef float cuda_scalar; - #define mul(x, y) ( x * y ) - #define add(x, y) ( x + y ) - #define sub(x, y) ( x - y ) - #define div(x, y) ( x / y ) - #define gt(x, y) ( x > y ) - #define ge(x, y) ( x >= y ) - #define lt(x, y) ( x < y ) - #define le(x, y) ( x <= y ) - #define out_scalar(x) ( x ) - #define conversion(x) ( x ) - #define cuda_exp(x) ( expf(x) ) -#endif - -#define WARP_SIZE 32 - -struct DeviceInfo { - int devId, mp_cnt, major, minor, cores; - bool unknown = false; -}; diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index da5db80..a51313e 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -40,32 +40,15 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) { DEBUG("copy model({} x {})", num_topics_, num_words_); dev_alpha_.resize(num_topics_); dev_beta_.resize(num_topics_ * num_words_); - #ifdef HALF_PRECISION - // conversion to half data and copy - std::vector halpha(num_topics_), hbeta(num_topics_ * num_words_); - for (int i = 0; i < num_topics_; ++i) { - halpha[i] = conversion(alpha[i]); - for (int j = 0; j < num_words_; ++j) { - hbeta[i * num_words + j] = conversion(beta[i * num_words + j]); - } - } - thrust::copy(halpha.begin(), halpha.end(), dev_alapha_.begin()); - thrust::copy(hbeta.begin(), hbeta.end(), dev_beta_.begin()); - #else - thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin()); - thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin()); - #endif + thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin()); + thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin()); alpha_ = alpha; beta_ = beta; } void CuLDA::FeedData(const int* indices, const int* indptr, - int num_indices, int num_indptr, float* gamma) { - thrust::vector dev_phi(num_indices * num_topics_); - thrust::vector dev_gamma(num_indptr * num_topics_); - - - - + int num_indices, int num_indptr) { + thrust::device_vector dev_phi(num_indices * num_topics_); + thrust::device_vector dev_gamma(num_indptr * num_topics_); } } // namespace cusim From 5e9f0cc7ae8b79b215d4baf236c02586f581e82c Mon Sep 17 00:00:00 2001 From: js1010 Date: Mon, 8 Feb 2021 22:31:00 +0900 Subject: [PATCH 12/26] implement e-step and fix typos --- cpp/include/culda/cuda_lda_kernels.cuh | 33 +++++++--------- cpp/include/culda/culda.hpp | 33 ++++++++++++++-- cpp/src/culda/culda.cu | 55 ++++++++++++++++++++++++-- 3 files changed, 96 insertions(+), 25 deletions(-) diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh index 1569209..b8648a3 100644 --- a/cpp/include/culda/cuda_lda_kernels.cuh +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -9,17 +9,19 @@ namespace cusim { +// reference: http://web.science.mq.edu.au/~mjohnson/code/digamma.c __inline__ __device__ float Digamma(float x) { - float result = 0f, xx, xx2, xx4; + float result = 0.0f, xx, xx2, xx4; for ( ; x < 7.0f; ++x) result -= 1.0f / x; x -= 0.5f; xx = 1.0f / x; xx2 = xx * xx; xx4 = xx2 * xx2; - result += logf(x) + 1.0f / 24.0f * xx2 - 7.0f / 960.0f * xx4 + - 31.0f / 8064.0f * xx4 * xx2 - 127.0f / 30720.0f * xx4 * xx4; + result += logf(x) + 1.0f / 24.0f * xx2 + - 7.0f / 960.0f * xx4 + 31.0f / 8064.0f * xx4 * xx2 + - 127.0f / 30720.0f * xx4 * xx4; return result; } @@ -48,12 +50,12 @@ __global__ void EstepKernel( for (int j = 0; j < num_iters; ++j) { // initialize new gamma for (int k = threadIdx.x; k < num_topics; k += blockDim.x) - _new_gamma[k] = 0; - __synctheads(); + _new_gamma[k] = 0.0f; + __syncthreads(); // compute phi from gamma for (int k = beg; k < end; ++k) { - int w = indices[k]; + const int w = indices[k]; // compute phi for (int l = threadIdx.x; l < num_topics; l += blockDim.x) _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l])); @@ -64,29 +66,22 @@ __global__ void EstepKernel( for (int l = threadIdx.x; l < num_topics; l += blockDim.x) { _phi[l] /= phi_sum; _new_gamma[l] += _phi[l]; - if (j + 1 == num_iters) new_beta[w * num_topics + l] += phi[l]; + if (j + 1 == num_iters) + new_beta[w * num_topics + l] += phi[l]; } __syncthreads(); } // update gamma - for (int k = threadIdx.x; k < num_topics; l += blockDim.x) + for (int k = threadIdx.x; k < num_topics; k += blockDim.x) _gamma[k] = _new_gamma[k] + alpha[k]; __syncthreads(); } float gamma_sum = ReduceSum(_gamma, num_topics); - for (int j = threadIdx.x; j < num_topics, j += blockDim.x) - grad_alpha[j] += (Psi(_gamma[j]) - Psi(gamma_sum)); - __syncthreaads() + for (int j = threadIdx.x; j < num_topics; j += blockDim.x) + grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum)); + __syncthreads(); } } -__global__ void MstepKernel( - float* alpha, float* beta, - float* grad_alpha, float* new_beta, - const int num_words, const int num_topic) { - - -} - } // cusim diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index 7309b5e..fe57cd3 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -32,15 +32,41 @@ namespace cusim { + +// reference: https://people.math.sc.edu/Burkardt/cpp_src/asa121/asa121.cpp +inline float Trigamma(float x) { + const float a = 0.0001f; + const float b = 5.0f; + const float b2 = 0.1666666667f; + const float b4 = -0.03333333333f; + const float b6 = 0.02380952381f; + const float b8 = -0.03333333333f; + float value = 0, y = 0, z = x; + if (x <= a) return 1.0f / x / x; + while (z < b) { + value += 1.0f / z / z; + z++; + } + y = 1.0f / z / z; + value += value + 0.5 * y + (1.0 + + y * (b2 + + y * (b4 + + y * (b6 + + y * b8)))) / z; + return value; +} + + class CuLDA { public: CuLDA(); ~CuLDA(); bool Init(std::string opt_path); - void LoadModel(float* alpha, float* beta, int num_words); + void LoadModel(float* alpha, float* beta, const int num_words); + void InitModel(); void FeedData(const int* indices, const int* indptr, - int num_indices, int num_indptr); - + const int num_indices, const int num_indptr, const int num_iters); + void Mstep(const int num_docs); private: DeviceInfo dev_info_; json11::Json opt_; @@ -49,6 +75,7 @@ class CuLDA { thrust::device_vector dev_grad_alpha_, dev_new_beta_; thrust::device_vector dev_gamma_, dev_new_gamma_, dev_phi_; const float *alpha_, *beta_; + std::vector grad_alpha_, new_beta_; int block_cnt_, block_dim_; int num_topics_, num_words_; }; diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index a51313e..ab2f6a2 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -4,6 +4,7 @@ // This source code is licensed under the Apache 2.0 license found in the // LICENSE file in the root directory of this source tree. #include "culda/culda.hpp" +#include "culda/cuda_lda_kernels.cuh" namespace cusim { @@ -43,12 +44,60 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) { thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin()); thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin()); alpha_ = alpha; beta_ = beta; + InitModel(); +} + +void CuLDA::InitModel() { + // resize device vector + dev_grad_alpha_.resize(num_topics_); + dev_new_beta_.resize(num_topics_ * num_words_); + dev_gamma_.resize(num_topics_ * block_cnt_); + dev_new_gamma_.resize(num_topics_ * block_cnt_); + dev_phi_.resize(num_topics_ * block_cnt_); + + // resize host vector + grad_alpha_.resize(num_topics_); + new_beta_.resize(num_topics_ * num_words_); + + // fill zeros + std::fill(grad_alpha_.begin(), grad_alpha_.end(), 0); + std::fill(new_beta_.begin(), new_beta_.end(), 0); + + // copy to device + thrust::copy(grad_alpha_.begin(), grad_alpha_.end(), dev_grad_alpha_.begin()); + thrust::copy(new_beta_.begin(), new_beta_.end(), dev_new_beta_.begin()); + CHECK_CUDA(cudaDeviceSynchronize()); } void CuLDA::FeedData(const int* indices, const int* indptr, - int num_indices, int num_indptr) { - thrust::device_vector dev_phi(num_indices * num_topics_); - thrust::device_vector dev_gamma(num_indptr * num_topics_); + const int num_indices, const int num_indptr, const int num_iters) { + thrust::device_vector dev_indices(num_indices); + thrust::device_vector dev_indptr(num_indptr + 1); + thrust::copy(indices, indices + num_indices, dev_indices.begin()); + thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin()); + CHECK_CUDA(cudaDeviceSynchronize()); + + EstepKernel<<>>( + thrust::raw_pointer_cast(dev_indices.data()), + thrust::raw_pointer_cast(dev_indptr.data()), + num_indices, num_indptr, num_words_, num_topics_, num_iters, + thrust::raw_pointer_cast(dev_gamma_.data()), + thrust::raw_pointer_cast(dev_new_gamma_.data()), + thrust::raw_pointer_cast(dev_phi_.data()), + thrust::raw_pointer_cast(dev_alpha_.data()), + thrust::raw_pointer_cast(dev_beta_.data()), + thrust::raw_pointer_cast(dev_grad_alpha_.data()), + thrust::raw_pointer_cast(dev_new_beta_.data())); + + CHECK_CUDA(cudaDeviceSynchronize()); +} + +void CuLDA::Mstep(const int num_docs) { + thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_.begin()); + thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_.begin()); + CHECK_CUDA(cudaDeviceSynchronize()); + + } } // namespace cusim From 18451db95bfb992149bb673ad65fad98b4ffe151 Mon Sep 17 00:00:00 2001 From: js1010 Date: Mon, 8 Feb 2021 23:04:38 +0900 Subject: [PATCH 13/26] add pull and push --- cpp/include/culda/culda.hpp | 10 ++++----- cpp/src/culda/culda.cu | 42 ++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index fe57cd3..ae1424a 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -62,11 +62,12 @@ class CuLDA { CuLDA(); ~CuLDA(); bool Init(std::string opt_path); - void LoadModel(float* alpha, float* beta, const int num_words); - void InitModel(); + void LoadModel(float* alpha, float* beta, + float* grad_alpha, float* new_beta, const int num_words); void FeedData(const int* indices, const int* indptr, const int num_indices, const int num_indptr, const int num_iters); - void Mstep(const int num_docs); + void Pull(); + void Push(); private: DeviceInfo dev_info_; json11::Json opt_; @@ -74,8 +75,7 @@ class CuLDA { thrust::device_vector dev_alpha_, dev_beta_; thrust::device_vector dev_grad_alpha_, dev_new_beta_; thrust::device_vector dev_gamma_, dev_new_gamma_, dev_phi_; - const float *alpha_, *beta_; - std::vector grad_alpha_, new_beta_; + float *alpha_, *beta_, *grad_alpha_, *new_beta_; int block_cnt_, block_dim_; int num_topics_, num_words_; }; diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index ab2f6a2..d37c3f5 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -36,7 +36,8 @@ bool CuLDA::Init(std::string opt_path) { return true; } -void CuLDA::LoadModel(float* alpha, float* beta, int num_words) { +void CuLDA::LoadModel(float* alpha, float* beta, + float* grad_alpha, float* new_beta, int num_words) { num_words_ = num_words; DEBUG("copy model({} x {})", num_topics_, num_words_); dev_alpha_.resize(num_topics_); @@ -44,28 +45,19 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) { thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin()); thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin()); alpha_ = alpha; beta_ = beta; - InitModel(); -} - -void CuLDA::InitModel() { + // resize device vector + grad_alpha_ = grad_alpha; + new_beta_ = new_beta; dev_grad_alpha_.resize(num_topics_); dev_new_beta_.resize(num_topics_ * num_words_); + + // copy to device + thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin()); + thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin()); dev_gamma_.resize(num_topics_ * block_cnt_); dev_new_gamma_.resize(num_topics_ * block_cnt_); dev_phi_.resize(num_topics_ * block_cnt_); - - // resize host vector - grad_alpha_.resize(num_topics_); - new_beta_.resize(num_topics_ * num_words_); - - // fill zeros - std::fill(grad_alpha_.begin(), grad_alpha_.end(), 0); - std::fill(new_beta_.begin(), new_beta_.end(), 0); - - // copy to device - thrust::copy(grad_alpha_.begin(), grad_alpha_.end(), dev_grad_alpha_.begin()); - thrust::copy(new_beta_.begin(), new_beta_.end(), dev_new_beta_.begin()); CHECK_CUDA(cudaDeviceSynchronize()); } @@ -92,12 +84,18 @@ void CuLDA::FeedData(const int* indices, const int* indptr, CHECK_CUDA(cudaDeviceSynchronize()); } -void CuLDA::Mstep(const int num_docs) { - thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_.begin()); - thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_.begin()); +void CuLDA::Pull() { + thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_); + thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_); CHECK_CUDA(cudaDeviceSynchronize()); - +} +void CuLDA::Push() { + thrust::copy(alpha_, alpha_ + num_topics_, dev_alpha_.begin()); + thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin()); + thrust::copy(beta_, beta_ + num_words_ * num_topics_, dev_beta_.begin()); + thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin()); + CHECK_CUDA(cudaDeviceSynchronize()); } -} // namespace cusim +} // namespace cusim From 4896f5f2ea883b719aaaa37a225d54418e9231a6 Mon Sep 17 00:00:00 2001 From: js1010 Date: Tue, 9 Feb 2021 23:29:07 +0900 Subject: [PATCH 14/26] bind lda --- cpp/src/utils/ioutils.cc | 4 +-- cusim/culda/bindings.cc | 50 +++++++++++++++++++++++++++++++++++--- cusim/culda/pyculda.py | 21 ++++++++++++++-- cusim/ioutils/pyioutils.py | 2 +- cusim/proto/config.proto | 1 + setup.py | 10 +++++--- 6 files changed, 76 insertions(+), 12 deletions(-) diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc index 098b1f2..f0b49d0 100644 --- a/cpp/src/utils/ioutils.cc +++ b/cpp/src/utils/ioutils.cc @@ -165,11 +165,9 @@ void IoUtils::GetWordVocab(int min_count, std::string keys_path) { // write keys to csv file std::ofstream fout(keys_path.c_str()); INFO("dump keys to {}", keys_path); - std::string header = "index,key\n"; - fout.write(header.c_str(), header.size()); int n = word_list_.size(); for (int i = 0; i < n; ++i) { - std::string line = std::to_string(i) + ",\"" + word_list_[i] + "\"\n"; + std::string line = word_list_[i] + "\n"; fout.write(line.c_str(), line.size()); } fout.close(); diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index 0c6f72f..157b837 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -23,7 +23,9 @@ class CuLDABind { return obj_.Init(opt_path); } - void LoadModel(py::object& alpha, py::object& beta) { + void LoadModel(py::object& alpha, py::object& beta, + py::object& grad_alpha, py::object& new_beta) { + // check shape of alpha and beta float_array _alpha(alpha); float_array _beta(beta); auto alpha_buffer = _alpha.request(); @@ -32,8 +34,45 @@ class CuLDABind { alpha_buffer.shape[0] != beta_buffer.shape[0]) { throw std::runtime_error("invalid alpha or beta"); } + + // check shape of grad alpha and new beta + float_array _grad_alpha(grad_alpha); + float_array _new_beta(new_beta); + auto grad_alpha_buffer = _grad_alpha.request(); + auto new_beta_buffer = _new_beta.request(); + if (grad_alpha_buffer.ndim != 1 or + new_beta_buffer.ndim != 2 or + grad_alpha_buffer.shape[0] != new_beta_buffer.shape[0]) { + throw std::runtime_error("invalid grad_alpha or new_beta"); + } + int num_words = beta_buffer.shape[1]; - return obj_.LoadModel(_alpha.mutable_data(0), _beta.mutable_data(0), num_words); + + return obj_.LoadModel(_alpha.mutable_data(0), + _beta.mutable_data(0), + _grad_alpha.mutable_data(0), + _new_beta.mutable_data(0), num_words); + } + + void FeedData(py::object& indices, py::object indptr, const int num_iters) { + int_array _indices(indices); + int_array _indptr(indptr); + auto indices_buffer = _indices.request(); + auto indptr_buffer = _indptr.request(); + if (indices_buffer.ndim != 1 or indptr_buffer.ndim != 1) { + throw std::runtime_error("invalid indices or indptr"); + } + int num_indices = indices_buffer.shape[0]; + int num_indptr = indptr_buffer.shape[0]; + obj_.FeedData(_indices.data(0), _indptr.data(0), num_indices, num_indptr, num_iters); + } + + void Pull() { + obj_.Pull(); + } + + void Push() { + obj_.Push(); } private: @@ -47,7 +86,12 @@ PYBIND11_PLUGIN(culda_bind) { .def(py::init()) .def("init", &CuLDABind::Init, py::arg("opt_path")) .def("load_model", &CuLDABind::LoadModel, - py::arg("alpha"), py::arg("beta")) + py::arg("alpha"), py::arg("beta"), + py::arg("grad_alpha"), py::arg("new_beta")) + .def("feed_data", &CuLDABind::FeedData, + py::arg("indices"), py::arg("indptr"), py::arg("num_iters")) + .def("pull", &CuLDABind::Pull) + .def("push", &CuLDABind::Push) .def("__repr__", [](const CuLDABind &a) { return ""; diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 08fdb89..1ef893d 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -6,13 +6,13 @@ # pylint: disable=no-name-in-module,too-few-public-methods,no-member import os -# from os.path import join as pjoin +from os.path import join as pjoin import json import tempfile # import h5py -# import numpy as np +import numpy as np from cusim import aux from cusim.culda.culda_bind import CuLDABind @@ -32,3 +32,20 @@ def __init__(self, opt=None): self.obj = CuLDABind() assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}" os.remove(tmp.name) + + self.words, self.num_words = None, None + self.alpha, self.beta, self.grad_alpha, self.new_beta = \ + None, None, None, None + + def init_model(self): + with open(pjoin(self.opt.data_dir, "keys.txt"), "r") as fin: + self.words = [line.strip() for line in fin] + self.num_words = len(self.words) + self.alpha = \ + np.abs(np.uniform(shape=(self.opt.num_topics,))).astype(np.float32) + self.beta = np.abs(np.uniform( \ + shape=(self.num_words, self.opt.num_topics))).astype(np.float32) + self.beta /= np.sum(self.beta, axis=1)[None, :] + self.grad_alpha = np.zeros(shape=self.alpha.shape, dtype=np.float32) + self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32) + self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta) diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py index 1a65f74..49b12a3 100644 --- a/cusim/ioutils/pyioutils.py +++ b/cusim/ioutils/pyioutils.py @@ -51,7 +51,7 @@ def load_stream_vocab(self, filepath, min_count, keys_path): def convert_stream_to_h5(self, filepath, min_count, out_dir, chunk_indices=10000): os.makedirs(out_dir, exist_ok=True) - keys_path = pjoin(out_dir, "keys.csv") + keys_path = pjoin(out_dir, "keys.txt") token_path = pjoin(out_dir, "token.h5") self.logger.info("save key and token to %s, %s", keys_path, token_path) diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto index b3899aa..b82c204 100644 --- a/cusim/proto/config.proto +++ b/cusim/proto/config.proto @@ -20,4 +20,5 @@ message CuLDAConfigProto { optional int32 num_topics = 3 [default = 10]; optional int32 block_dim = 4 [default = 32]; optional int32 hyper_threads = 5 [default = 10]; + required string data_dir = 6; } diff --git a/setup.py b/setup.py index db61bb6..512d262 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 Jisang Yoon +# Copyright (c) 2021 Jisang Yoon # All rights reserved. # # This source code is licensed under the Apache 2.0 license found in the @@ -30,7 +30,8 @@ raise RuntimeError("Python version 3.6 or later required.") assert platform.system() == 'Linux' # TODO: MacOS - +with open("requirements.txt", "r") as fin: + INSTALL_REQUIRES = [line.strip() for line in fin] MAJOR = 0 MINOR = 0 @@ -173,13 +174,16 @@ def setup_package(): name='cusim', maintainer="Jisang Yoon", maintainer_email="vjs10101v@gmail.com", + author="Jisang Yoon", + author_email="vjs10101v@gmail.com", description=DOCLINES[0], long_description="\n".join(DOCLINES[2:]), url="https://github.com/js1010/cusim", download_url="https://github.com/js1010/cusim/releases", include_package_data=False, - license='Apac2', + license='Apache2', packages=['cusim/', "cusim/ioutils/", "cusim/culda/"], + install_requires=INSTALL_REQUIRES, cmdclass=cmdclass, classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f], platforms=['Linux', 'Mac OSX', 'Unix'], From 70dfa2cda71ba933be5d4bc3ef9ca5bcb08b68d1 Mon Sep 17 00:00:00 2001 From: js1010 Date: Tue, 9 Feb 2021 23:53:28 +0900 Subject: [PATCH 15/26] add types.hpp --- cpp/include/culda/culda.hpp | 2 +- cpp/include/utils/cuda_utils_kernels.cuh | 7 +------ cpp/include/utils/types.hpp | 13 +++++++++++++ 3 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 cpp/include/utils/types.hpp diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index ae1424a..127d515 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -28,7 +28,7 @@ #include "json11.hpp" #include "utils/log.hpp" -#include "utils/cuda_utils_kernels.cuh" +#include "utils/types.hpp" namespace cusim { diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh index d925549..026da7f 100644 --- a/cpp/include/utils/cuda_utils_kernels.cuh +++ b/cpp/include/utils/cuda_utils_kernels.cuh @@ -21,15 +21,10 @@ #include #include #include +#include "utils/types.hpp" namespace cusim { -struct DeviceInfo { - int devId, mp_cnt, major, minor, cores; - bool unknown = false; -}; - -#define WARP_SIZE 32 // Error Checking utilities, checks status codes from cuda calls // and throws exceptions on failure (which cython can proxy back to python) #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); } diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp new file mode 100644 index 0000000..82d93cc --- /dev/null +++ b/cpp/include/utils/types.hpp @@ -0,0 +1,13 @@ +// Copyright (c) 2021 Jisang Yoon +// All rights reserved. +// +// This source code is licensed under the Apache 2.0 license found in the +// LICENSE file in the root directory of this source tree. +#pragma once + +struct DeviceInfo { + int devId, mp_cnt, major, minor, cores; + bool unknown = false; +}; + +#define WARP_SIZE 32 From 5456ff3e57af05126ecd056a8fcea6d1afa01d3b Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 00:07:16 +0900 Subject: [PATCH 16/26] bug-fix --- cusim/culda/bindings.cc | 4 ++-- cusim/culda/pyculda.py | 22 +++++++++++++++++----- examples/example1.py | 7 ++++++- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index 157b837..ec959c5 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -31,7 +31,7 @@ class CuLDABind { auto alpha_buffer = _alpha.request(); auto beta_buffer = _beta.request(); if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or - alpha_buffer.shape[0] != beta_buffer.shape[0]) { + alpha_buffer.shape[0] != beta_buffer.shape[1]) { throw std::runtime_error("invalid alpha or beta"); } @@ -42,7 +42,7 @@ class CuLDABind { auto new_beta_buffer = _new_beta.request(); if (grad_alpha_buffer.ndim != 1 or new_beta_buffer.ndim != 2 or - grad_alpha_buffer.shape[0] != new_beta_buffer.shape[0]) { + grad_alpha_buffer.shape[0] != new_beta_buffer.shape[1]) { throw std::runtime_error("invalid grad_alpha or new_beta"); } diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 1ef893d..1f1639a 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -38,14 +38,26 @@ def __init__(self, opt=None): None, None, None, None def init_model(self): - with open(pjoin(self.opt.data_dir, "keys.txt"), "r") as fin: + # load voca + self.logger.info("load key from %s", pjoin(self.opt.data_dir, "keys.txt")) + with open(pjoin(self.opt.data_dir, "keys.txt"), "rb") as fin: self.words = [line.strip() for line in fin] self.num_words = len(self.words) + self.logger.info("number of words: %d", self.num_words) + + # random initialize alpha and beta self.alpha = \ - np.abs(np.uniform(shape=(self.opt.num_topics,))).astype(np.float32) - self.beta = np.abs(np.uniform( \ - shape=(self.num_words, self.opt.num_topics))).astype(np.float32) - self.beta /= np.sum(self.beta, axis=1)[None, :] + np.abs(np.random.uniform( \ + size=(self.opt.num_topics,))).astype(np.float32) + self.beta = np.abs(np.random.uniform( \ + size=(self.num_words, self.opt.num_topics))).astype(np.float32) + self.beta /= np.sum(self.beta, axis=0)[None, :] + self.logger.info("alpha %s, beta %s initialized", + self.alpha.shape, self.beta.shape) + + # zero initialize grad alpha and new beta self.grad_alpha = np.zeros(shape=self.alpha.shape, dtype=np.float32) self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32) + + # push it to gpu self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta) diff --git a/examples/example1.py b/examples/example1.py index 6d97a84..9886733 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -39,7 +39,12 @@ def run_io(): def run_lda(): - CuLDA() + opt = { + "data_dir": DATA_PATH2, + } + lda = CuLDA(opt) + lda.init_model() + if __name__ == "__main__": fire.Fire() From e460142b8203faadfb981e8f52520d7f0f69af24 Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 20:34:59 +0900 Subject: [PATCH 17/26] add rows and indices => cols --- cpp/include/culda/cuda_lda_kernels.cuh | 6 ++-- cpp/include/utils/ioutils.hpp | 4 +-- cpp/src/culda/culda.cu | 12 ++++---- cpp/src/utils/ioutils.cc | 17 +++++------ cusim/culda/bindings.cc | 16 +++++------ cusim/culda/pyculda.py | 39 ++++++++++++++++++++++---- cusim/ioutils/bindings.cc | 7 +++-- cusim/ioutils/pyioutils.py | 24 ++++++++++------ 8 files changed, 81 insertions(+), 44 deletions(-) diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh index b8648a3..5211695 100644 --- a/cpp/include/culda/cuda_lda_kernels.cuh +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -26,8 +26,8 @@ float Digamma(float x) { } __global__ void EstepKernel( - const int* indices, const int* indptr, - const int num_indices, const int num_indptr, + const int* cols, const int* indptr, + const int num_cols, const int num_indptr, const int num_words, const int num_topics, const int num_iters, float* gamma, float* new_gamma, float* phi, float* alpha, float* beta, @@ -55,7 +55,7 @@ __global__ void EstepKernel( // compute phi from gamma for (int k = beg; k < end; ++k) { - const int w = indices[k]; + const int w = cols[k]; // compute phi for (int l = threadIdx.x; l < num_topics; l += blockDim.x) _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l])); diff --git a/cpp/include/utils/ioutils.hpp b/cpp/include/utils/ioutils.hpp index cc3e09e..756b4b2 100644 --- a/cpp/include/utils/ioutils.hpp +++ b/cpp/include/utils/ioutils.hpp @@ -34,12 +34,12 @@ class IoUtils { std::pair ReadStreamForVocab(int num_lines, int num_threads); std::pair TokenizeStream(int num_lines, int num_threads); void GetWordVocab(int min_count, std::string keys_path); - void GetToken(int* indices, int* indptr, int offset); + void GetToken(int* rows, int* cols, int* indptr); private: void ParseLine(std::string line, std::vector& line_vec); void ParseLineImpl(std::string line, std::vector& line_vec); - std::vector> indices_; + std::vector> cols_; std::vector indptr_; std::mutex global_lock_; std::ifstream stream_fin_; diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index d37c3f5..b57db97 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -61,18 +61,18 @@ void CuLDA::LoadModel(float* alpha, float* beta, CHECK_CUDA(cudaDeviceSynchronize()); } -void CuLDA::FeedData(const int* indices, const int* indptr, - const int num_indices, const int num_indptr, const int num_iters) { - thrust::device_vector dev_indices(num_indices); +void CuLDA::FeedData(const int* cols, const int* indptr, + const int num_cols, const int num_indptr, const int num_iters) { + thrust::device_vector dev_cols(num_cols); thrust::device_vector dev_indptr(num_indptr + 1); - thrust::copy(indices, indices + num_indices, dev_indices.begin()); + thrust::copy(cols, cols + num_cols, dev_cols.begin()); thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin()); CHECK_CUDA(cudaDeviceSynchronize()); EstepKernel<<>>( - thrust::raw_pointer_cast(dev_indices.data()), + thrust::raw_pointer_cast(dev_cols.data()), thrust::raw_pointer_cast(dev_indptr.data()), - num_indices, num_indptr, num_words_, num_topics_, num_iters, + num_cols, num_indptr, num_words_, num_topics_, num_iters, thrust::raw_pointer_cast(dev_gamma_.data()), thrust::raw_pointer_cast(dev_new_gamma_.data()), thrust::raw_pointer_cast(dev_phi_.data()), diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc index f0b49d0..0cc6495 100644 --- a/cpp/src/utils/ioutils.cc +++ b/cpp/src/utils/ioutils.cc @@ -69,8 +69,8 @@ std::pair IoUtils::TokenizeStream(int num_lines, int num_threads) { int read_lines = std::min(num_lines, remain_lines_); if (not read_lines) return {0, 0}; remain_lines_ -= read_lines; - indices_.clear(); - indices_.resize(read_lines); + cols_.clear(); + cols_.resize(read_lines); indptr_.resize(read_lines); std::fill(indptr_.begin(), indptr_.end(), 0); #pragma omp parallel num_threads(num_threads) @@ -91,27 +91,28 @@ std::pair IoUtils::TokenizeStream(int num_lines, int num_threads) { // tokenize for (auto& word: line_vec) { if (not word_count_.count(word)) continue; - indices_[i].push_back(word_count_[word]); + cols_[i].push_back(word_count_[word]); } } } int cumsum = 0; for (int i = 0; i < read_lines; ++i) { - cumsum += indices_[i].size(); + cumsum += cols_[i].size(); indptr_[i] = cumsum; } return {read_lines, indptr_[read_lines - 1]}; } -void IoUtils::GetToken(int* indices, int* indptr, int offset) { - int n = indices_.size(); +void IoUtils::GetToken(int* rows, int* cols, int* indptr) { + int n = cols_.size(); for (int i = 0; i < n; ++i) { int beg = i == 0? 0: indptr_[i - 1]; int end = indptr_[i]; for (int j = beg; j < end; ++j) { - indices[j] = indices_[i][j - beg]; + rows[j] = i; + cols[j] = cols_[i][j - beg]; } - indptr[i] = offset + indptr_[i]; + indptr[i] = indptr_[i]; } } diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index ec959c5..8c28639 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -54,17 +54,17 @@ class CuLDABind { _new_beta.mutable_data(0), num_words); } - void FeedData(py::object& indices, py::object indptr, const int num_iters) { - int_array _indices(indices); + void FeedData(py::object& cols, py::object indptr, const int num_iters) { + int_array _cols(cols); int_array _indptr(indptr); - auto indices_buffer = _indices.request(); + auto cols_buffer = _cols.request(); auto indptr_buffer = _indptr.request(); - if (indices_buffer.ndim != 1 or indptr_buffer.ndim != 1) { - throw std::runtime_error("invalid indices or indptr"); + if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) { + throw std::runtime_error("invalid cols or indptr"); } - int num_indices = indices_buffer.shape[0]; + int num_cols = cols_buffer.shape[0]; int num_indptr = indptr_buffer.shape[0]; - obj_.FeedData(_indices.data(0), _indptr.data(0), num_indices, num_indptr, num_iters); + obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr, num_iters); } void Pull() { @@ -89,7 +89,7 @@ PYBIND11_PLUGIN(culda_bind) { py::arg("alpha"), py::arg("beta"), py::arg("grad_alpha"), py::arg("new_beta")) .def("feed_data", &CuLDABind::FeedData, - py::arg("indices"), py::arg("indptr"), py::arg("num_iters")) + py::arg("cols"), py::arg("indptr"), py::arg("num_iters")) .def("pull", &CuLDABind::Pull) .def("push", &CuLDABind::Push) .def("__repr__", diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 1f1639a..8f59b91 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -37,6 +37,14 @@ def __init__(self, opt=None): self.alpha, self.beta, self.grad_alpha, self.new_beta = \ None, None, None, None + def preprocess_data(self): + if self.opt.skip_preprocess: + return + iou = IoUtils() + if not self.opt.data_dir: + self.opt.data_dir = tempfile.TemporaryDirectory().name + iou.convert_stream_to_h5(self.opt.data_path, self.opt.data_dir) + def init_model(self): # load voca self.logger.info("load key from %s", pjoin(self.opt.data_dir, "keys.txt")) @@ -46,11 +54,10 @@ def init_model(self): self.logger.info("number of words: %d", self.num_words) # random initialize alpha and beta - self.alpha = \ - np.abs(np.random.uniform( \ - size=(self.opt.num_topics,))).astype(np.float32) - self.beta = np.abs(np.random.uniform( \ - size=(self.num_words, self.opt.num_topics))).astype(np.float32) + self.alpha = np.random.uniform( \ + size=(self.opt.num_topics,)).astype(np.float32) + self.beta = np.random.uniform( \ + size=(self.num_words, self.opt.num_topics)).astype(np.float32) self.beta /= np.sum(self.beta, axis=0)[None, :] self.logger.info("alpha %s, beta %s initialized", self.alpha.shape, self.beta.shape) @@ -61,3 +68,25 @@ def init_model(self): # push it to gpu self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta) + + def train_model(self): + self.preprocess_data() + self.init_model() + + def _train_Estep(self, h5f): + offset, size = 0, h5f["indptr"].shape[0] - 1 + steps = (size - 1) // batch_size + 1 + pbar = aux.Progbar(size) + for step in range(steps): + next_offset = min(size, offset + batch_size) + indptr = h5f["indptr"][offset:next_offset + 1] + beg, end = indptr[0], indptr[-1] + indptr -= beg + indices = h5f["indices"][beg:end] + offset = next_offset + + self.obj.FeedData(indices, indptr, self.opt.num_iters_in_Estep) + pbar.update(offset) + + def _train_Mstep(self): + pass diff --git a/cusim/ioutils/bindings.cc b/cusim/ioutils/bindings.cc index ad35ebb..28fbbc8 100644 --- a/cusim/ioutils/bindings.cc +++ b/cusim/ioutils/bindings.cc @@ -39,10 +39,11 @@ class IoUtilsBind { obj_.GetWordVocab(min_count, keys_path); } - void GetToken(py::object& indices, py::object& indptr, int offset) { - int_array _indices(indices); + void GetToken(py::object& rows, py::object& cols, py::object& indptr) { + int_array _rows(rows); + int_array _cols(cols); int_array _indptr(indptr); - obj_.GetToken(_indices.mutable_data(0), _indptr.mutable_data(0), offset); + obj_.GetToken(_rows.mutable_data(0), _cols.mutable_data(0), _indptr.mutable_data(0)); } private: diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py index 49b12a3..5f6bc5e 100644 --- a/cusim/ioutils/pyioutils.py +++ b/cusim/ioutils/pyioutils.py @@ -23,7 +23,7 @@ def __init__(self, opt=None): self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto) self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level) - tmp = tempfile.NamedTemporaryFile(mode='w', delete=False) + tmp = tempfile.NamedTemporaryFile(mode='w') opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2) tmp.write(opt_content) tmp.close() @@ -60,9 +60,12 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir, pbar = aux.Progbar(full_num_lines, unit_name="line") processed = 0 h5f = h5py.File(token_path, "w") - indices = h5f.create_dataset("indices", shape=(chunk_indices,), - maxshape=(None,), dtype=np.int32, - chunks=(chunk_indices,)) + rows = h5f.create_dataset("rows", shape=(chunk_indices,), + maxshape=(None,), dtype=np.int32, + chunks=(chunk_indices,)) + cols = h5f.create_dataset("cols", shape=(chunk_indices,), + maxshape=(None,), dtype=np.int32, + chunks=(chunk_indices,)) indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,), dtype=np.int32, chunks=True) processed, offset = 1, 0 @@ -70,12 +73,15 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir, while True: read_lines, data_size = self.obj.tokenize_stream( self.opt.chunk_lines, self.opt.num_threads) - _indices = np.empty(shape=(data_size,), dtype=np.int32) + _rows = np.empty(shape=(data_size,), dtype=np.int32) + _cols = np.empty(shape=(data_size,), dtype=np.int32) _indptr = np.empty(shape=(read_lines,), dtype=np.int32) - self.obj.get_token(_indices, _indptr, offset) - indices.resize((offset + data_size,)) - indices[offset:offset + data_size] = _indices - indptr[processed:processed + read_lines] = _indptr + self.obj.get_token(_rows, _cols, _indptr) + rows.resize((offset + data_size,)) + rows[offset:offset + data_size] = _rows + (processed - 1) + cols.resize((offset + data_size,)) + cols[offset:offset + data_size] = _cols + indptr[processed:processed + read_lines] = _indptr + offset offset += data_size processed += read_lines pbar.update(processed - 1) From 36d92caaa9655f45333267f38b97e98a7cbe9e3d Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 21:57:38 +0900 Subject: [PATCH 18/26] implement M step --- cpp/include/culda/culda.hpp | 1 + cpp/src/culda/culda.cu | 10 +++-- cusim/culda/bindings.cc | 9 ++++- cusim/culda/pyculda.py | 77 ++++++++++++++++++++++++++++--------- cusim/proto/config.proto | 9 ++++- 5 files changed, 82 insertions(+), 24 deletions(-) diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index 127d515..ee649de 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -68,6 +68,7 @@ class CuLDA { const int num_indices, const int num_indptr, const int num_iters); void Pull(); void Push(); + int GetBlockCnt(); private: DeviceInfo dev_info_; json11::Json opt_; diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index b57db97..592e3ed 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -49,11 +49,11 @@ void CuLDA::LoadModel(float* alpha, float* beta, // resize device vector grad_alpha_ = grad_alpha; new_beta_ = new_beta; - dev_grad_alpha_.resize(num_topics_); + dev_grad_alpha_.resize(block_cnt_ * num_topics_); dev_new_beta_.resize(num_topics_ * num_words_); // copy to device - thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin()); + thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin()); thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin()); dev_gamma_.resize(num_topics_ * block_cnt_); dev_new_gamma_.resize(num_topics_ * block_cnt_); @@ -92,10 +92,14 @@ void CuLDA::Pull() { void CuLDA::Push() { thrust::copy(alpha_, alpha_ + num_topics_, dev_alpha_.begin()); - thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin()); + thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin()); thrust::copy(beta_, beta_ + num_words_ * num_topics_, dev_beta_.begin()); thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin()); CHECK_CUDA(cudaDeviceSynchronize()); } +int CuLDA::GetBlockCnt() { + return block_cnt_; +} + } // namespace cusim diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index 8c28639..716624d 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -40,9 +40,9 @@ class CuLDABind { float_array _new_beta(new_beta); auto grad_alpha_buffer = _grad_alpha.request(); auto new_beta_buffer = _new_beta.request(); - if (grad_alpha_buffer.ndim != 1 or + if (grad_alpha_buffer.ndim != 2 or new_beta_buffer.ndim != 2 or - grad_alpha_buffer.shape[0] != new_beta_buffer.shape[1]) { + grad_alpha_buffer.shape[1] != new_beta_buffer.shape[1]) { throw std::runtime_error("invalid grad_alpha or new_beta"); } @@ -75,6 +75,10 @@ class CuLDABind { obj_.Push(); } + int GetBlockCnt() { + return obj_.GetBlockCnt(); + } + private: cusim::CuLDA obj_; }; @@ -92,6 +96,7 @@ PYBIND11_PLUGIN(culda_bind) { py::arg("cols"), py::arg("indptr"), py::arg("num_iters")) .def("pull", &CuLDABind::Pull) .def("push", &CuLDABind::Push) + .def("get_block_cnt", &CuLDABind::GetBlockCnt) .def("__repr__", [](const CuLDABind &a) { return ""; diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 8f59b91..b7b2020 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -11,10 +11,11 @@ import json import tempfile -# import h5py +import h5py import numpy as np +from scipy.special import polygamma as pg -from cusim import aux +from cusim import aux, IoUtils from cusim.culda.culda_bind import CuLDABind from cusim.config_pb2 import CuLDAConfigProto @@ -33,7 +34,7 @@ def __init__(self, opt=None): assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}" os.remove(tmp.name) - self.words, self.num_words = None, None + self.words, self.num_words, self.num_docs = None, None, None self.alpha, self.beta, self.grad_alpha, self.new_beta = \ None, None, None, None @@ -43,7 +44,8 @@ def preprocess_data(self): iou = IoUtils() if not self.opt.data_dir: self.opt.data_dir = tempfile.TemporaryDirectory().name - iou.convert_stream_to_h5(self.opt.data_path, self.opt.data_dir) + iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count, + self.opt.data_dir) def init_model(self): # load voca @@ -51,7 +53,14 @@ def init_model(self): with open(pjoin(self.opt.data_dir, "keys.txt"), "rb") as fin: self.words = [line.strip() for line in fin] self.num_words = len(self.words) - self.logger.info("number of words: %d", self.num_words) + + # count number of docs + h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r") + self.num_docs = h5f["indptr"].shape[0] - 1 + h5f.close() + + self.logger.info("number of words: %d, docs: %d", + self.num_words, self.num_docs) # random initialize alpha and beta self.alpha = np.random.uniform( \ @@ -63,7 +72,9 @@ def init_model(self): self.alpha.shape, self.beta.shape) # zero initialize grad alpha and new beta - self.grad_alpha = np.zeros(shape=self.alpha.shape, dtype=np.float32) + block_cnt = self.obj.get_block_cnt() + self.grad_alpha = np.zeros(shape=(block_cnt, self.num_topics), + dtype=np.float32) self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32) # push it to gpu @@ -72,21 +83,51 @@ def init_model(self): def train_model(self): self.preprocess_data() self.init_model() - - def _train_Estep(self, h5f): - offset, size = 0, h5f["indptr"].shape[0] - 1 - steps = (size - 1) // batch_size + 1 + h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r") + for epoch in range(1, self.opt.epochs + 1): + self.logger.info("Epoch %d / %d", epoch, self.opt.epochs) + self._train_e_step(h5f) + self._train_m_step() + h5f.close() + + def _train_e_step(self, h5f): + offset, size = 0, h5f["cols"].shape[0] pbar = aux.Progbar(size) - for step in range(steps): - next_offset = min(size, offset + batch_size) + while True: + target = h5f["indptr"][offset] + self.opt.batch_size + if target < size: + next_offset = h5f["rows"][target] + else: + next_offset = h5f["indptr"].shape[0] - 1 indptr = h5f["indptr"][offset:next_offset + 1] beg, end = indptr[0], indptr[-1] indptr -= beg - indices = h5f["indices"][beg:end] + cols = h5f["cols"][beg:end] offset = next_offset - self.obj.FeedData(indices, indptr, self.opt.num_iters_in_Estep) - pbar.update(offset) - - def _train_Mstep(self): - pass + self.obj.FeedData(cols, indptr, self.opt.num_iters_in_e_step) + pbar.update(end) + if end == size: + break + + def _train_m_step(self): + self.obj.pull() + + # update beta + self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :] + self.new_beta[:, :] = 0 + + # update alpha + alpha_sum = np.sum(self.alpha) + gvec = np.sum(self.grad_alpha, axis=0) + gvec += self.num_docs * (pg(0, alpha_sum) - pg(0, self.alpha)) + hvec = self.num_docs * pg(1, self.alpha) + z_0 = pg(1, alpha_sum) + c_nume = np.sum(gvec / hvec) + c_deno = 1 / z_0 + np.sum(1 / hvec) + c_0 = c_nume / c_deno + delta = (gvec - c_0) / hvec + self.alpha -= delta + self.grad_alpha[:,:] = 0 + + self.obj.push() diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto index b82c204..0e0ad10 100644 --- a/cusim/proto/config.proto +++ b/cusim/proto/config.proto @@ -14,11 +14,18 @@ message IoUtilsConfigProto { } message CuLDAConfigProto { + required string data_path = 7; + optional int32 py_log_level = 1 [default = 2]; optional int32 c_log_level = 2 [default = 2]; optional int32 num_topics = 3 [default = 10]; optional int32 block_dim = 4 [default = 32]; optional int32 hyper_threads = 5 [default = 10]; - required string data_dir = 6; + optional string data_dir = 6; + optional bool skip_preprocess = 8; + optional int32 word_min_count = 9 [default = 5]; + optional int32 batch_size = 10 [default = 100000]; + optional int32 epochs = 11 [default = 10]; + optional int32 num_iters_in_e_step = 12 [default = 5]; } From bab04b0b36e2711e0f38c8397dd7f3da0474305b Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 22:37:55 +0900 Subject: [PATCH 19/26] train only train samples --- cpp/include/culda/cuda_lda_kernels.cuh | 47 ++++++++++++++++++-------- cpp/include/culda/culda.hpp | 3 +- cpp/include/utils/types.hpp | 1 + cpp/src/culda/culda.cu | 27 +++++++++++++-- cusim/culda/bindings.cc | 13 ++++--- cusim/culda/pyculda.py | 24 +++++++++++-- cusim/ioutils/pyioutils.py | 6 ++++ cusim/proto/config.proto | 1 + 8 files changed, 98 insertions(+), 24 deletions(-) diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh index 5211695..ad92afd 100644 --- a/cpp/include/culda/cuda_lda_kernels.cuh +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -26,17 +26,19 @@ float Digamma(float x) { } __global__ void EstepKernel( - const int* cols, const int* indptr, + const int* cols, const int* indptr, const bool* vali, const int num_cols, const int num_indptr, const int num_words, const int num_topics, const int num_iters, float* gamma, float* new_gamma, float* phi, float* alpha, float* beta, - float* grad_alpha, float* new_beta) { + float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) { // storage for block float* _gamma = gamma + num_topics * blockIdx.x; float* _new_gamma = new_gamma + num_topics * blockIdx.x; float* _phi = phi + num_topics * blockIdx.x; + float* _grad_alpha = grad_alpha + num_topics * blockIdx.x; + for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) { int beg = indptr[i], end = indptr[i + 1]; @@ -56,18 +58,34 @@ __global__ void EstepKernel( // compute phi from gamma for (int k = beg; k < end; ++k) { const int w = cols[k]; + const bool _vali = vali[k]; + // compute phi - for (int l = threadIdx.x; l < num_topics; l += blockDim.x) - _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l])); - __syncthreads(); - - // normalize phi and add it to new gamma and new beta - float phi_sum = ReduceSum(_phi, num_topics); - for (int l = threadIdx.x; l < num_topics; l += blockDim.x) { - _phi[l] /= phi_sum; - _new_gamma[l] += _phi[l]; - if (j + 1 == num_iters) - new_beta[w * num_topics + l] += phi[l]; + if (not _vali or j + 1 == num_iters) { + for (int l = threadIdx.x; l < num_topics; l += blockDim.x) + _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l])); + __syncthreads(); + + // normalize phi and add it to new gamma and new beta + float phi_sum = ReduceSum(_phi, num_topics); + for (int l = threadIdx.x; l < num_topics; l += blockDim.x) { + _phi[l] /= phi_sum; + if (not _vali) _new_gamma[l] += _phi[l]; + if (j + 1 == num_iters) { + if (not _vali) new_beta[w * num_topics + l] += _phi[l]; + _phi[l] *= beta[w * num_topics + l]; + } + } + __syncthreads(); + } + if (j + 1 == num_iters) { + float p = ReduceSum(_phi, num_topics); + if (threadIdx.x == 0) { + if (_vali) + vali_losses[blockIdx.x] += logf(p + EPS); + else + train_losses[blockIdx.x] += logf(p + EPS); + } } __syncthreads(); } @@ -79,7 +97,8 @@ __global__ void EstepKernel( } float gamma_sum = ReduceSum(_gamma, num_topics); for (int j = threadIdx.x; j < num_topics; j += blockDim.x) - grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum)); + _grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum)); + __syncthreads(); } } diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index ee649de..b779e3a 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -64,7 +64,8 @@ class CuLDA { bool Init(std::string opt_path); void LoadModel(float* alpha, float* beta, float* grad_alpha, float* new_beta, const int num_words); - void FeedData(const int* indices, const int* indptr, + std::pair FeedData( + const int* indices, const int* indptr, const bool* vali, const int num_indices, const int num_indptr, const int num_iters); void Pull(); void Push(); diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp index 82d93cc..87e864b 100644 --- a/cpp/include/utils/types.hpp +++ b/cpp/include/utils/types.hpp @@ -11,3 +11,4 @@ struct DeviceInfo { }; #define WARP_SIZE 32 +#define EPS 1e-10f diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index 592e3ed..8f5b5fb 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -61,17 +61,27 @@ void CuLDA::LoadModel(float* alpha, float* beta, CHECK_CUDA(cudaDeviceSynchronize()); } -void CuLDA::FeedData(const int* cols, const int* indptr, +std::pair CuLDA::FeedData( + const int* cols, const int* indptr, const bool* vali, const int num_cols, const int num_indptr, const int num_iters) { + + // copy feed data to GPU memory thrust::device_vector dev_cols(num_cols); thrust::device_vector dev_indptr(num_indptr + 1); + thrust::device_vector dev_vali(num_cols); + thrust::device_vector dev_train_losses(block_cnt_, 0.0f); + thrust::device_vector dev_vali_losses(block_cnt_, 0.0f); thrust::copy(cols, cols + num_cols, dev_cols.begin()); thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin()); + thrust::copy(vali, vali + num_cols, dev_vali.begin()); + CHECK_CUDA(cudaDeviceSynchronize()); + // run E step in GPU EstepKernel<<>>( thrust::raw_pointer_cast(dev_cols.data()), thrust::raw_pointer_cast(dev_indptr.data()), + thrust::raw_pointer_cast(dev_vali.data()), num_cols, num_indptr, num_words_, num_topics_, num_iters, thrust::raw_pointer_cast(dev_gamma_.data()), thrust::raw_pointer_cast(dev_new_gamma_.data()), @@ -79,9 +89,22 @@ void CuLDA::FeedData(const int* cols, const int* indptr, thrust::raw_pointer_cast(dev_alpha_.data()), thrust::raw_pointer_cast(dev_beta_.data()), thrust::raw_pointer_cast(dev_grad_alpha_.data()), - thrust::raw_pointer_cast(dev_new_beta_.data())); + thrust::raw_pointer_cast(dev_new_beta_.data()), + thrust::raw_pointer_cast(dev_train_losses.data()), + thrust::raw_pointer_cast(dev_vali_losses.data())); CHECK_CUDA(cudaDeviceSynchronize()); + + // pull loss + std::vector train_losses(block_cnt_), vali_losses(block_cnt_); + thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin()); + thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin()); + CHECK_CUDA(cudaDeviceSynchronize()); + + // accumulate + float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f); + float vali_loss = std::accumulate(vali_losses.begin(), vali_losses.end(), 0.0f); + return {train_loss, vali_loss}; } void CuLDA::Pull() { diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index 716624d..9e42c65 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -14,6 +14,7 @@ namespace py = pybind11; typedef py::array_t float_array; typedef py::array_t int_array; +typedef py::array_t bool_array; class CuLDABind { public: @@ -54,17 +55,21 @@ class CuLDABind { _new_beta.mutable_data(0), num_words); } - void FeedData(py::object& cols, py::object indptr, const int num_iters) { + std::pair FeedData(py::object& cols, py::object& indptr, py::object& vali, const int num_iters) { int_array _cols(cols); int_array _indptr(indptr); + bool_arrray _vali(vali); auto cols_buffer = _cols.request(); auto indptr_buffer = _indptr.request(); - if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) { + auto vali_buffer = _vali.request(); + if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1 or vali_buffer.ndim != 1 + or cols_buffer.shape[0] != vali_buffer.shape[0]) { throw std::runtime_error("invalid cols or indptr"); } int num_cols = cols_buffer.shape[0]; int num_indptr = indptr_buffer.shape[0]; - obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr, num_iters); + return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0), + num_cols, num_indptr, num_iters); } void Pull() { @@ -93,7 +98,7 @@ PYBIND11_PLUGIN(culda_bind) { py::arg("alpha"), py::arg("beta"), py::arg("grad_alpha"), py::arg("new_beta")) .def("feed_data", &CuLDABind::FeedData, - py::arg("cols"), py::arg("indptr"), py::arg("num_iters")) + py::arg("cols"), py::arg("indptr"), py::arg("vali"), py::arg("num_iters")) .def("pull", &CuLDABind::Pull) .def("push", &CuLDABind::Push) .def("get_block_cnt", &CuLDABind::GetBlockCnt) diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index b7b2020..b790bb0 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -92,7 +92,9 @@ def train_model(self): def _train_e_step(self, h5f): offset, size = 0, h5f["cols"].shape[0] - pbar = aux.Progbar(size) + pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"]) + train_loss_nume, train_loss_deno = 0, 0 + vali_loss_nume, vali_loss_deno = 0, 0 while True: target = h5f["indptr"][offset] + self.opt.batch_size if target < size: @@ -103,10 +105,26 @@ def _train_e_step(self, h5f): beg, end = indptr[0], indptr[-1] indptr -= beg cols = h5f["cols"][beg:end] + vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool) offset = next_offset - self.obj.FeedData(cols, indptr, self.opt.num_iters_in_e_step) - pbar.update(end) + # call cuda kernel + train_loss, vali_loss = \ + self.obj.FeedData(cols, indptr, vali, self.opt.num_iters_in_e_step) + + # accumulate loss + train_loss_nume -= train_loss + vali_loss_nume -= vali_loss + vali_cnt = np.count_nonzero(vali) + train_cnt = len(vali) - vali_cnt + train_loss_nume += train_cnt + vali_loss_nume += train_cnt + train_loss = train_loss_nume / train_loss_deno + vali_loss = vali_loss_nume / vali_loss_deno + + # update progress bar + pbar.update(end, values=[("train_loss", train_loss), + ("vali_loss", vali_loss)]) if end == size: break diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py index 5f6bc5e..be045c8 100644 --- a/cusim/ioutils/pyioutils.py +++ b/cusim/ioutils/pyioutils.py @@ -66,6 +66,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir, cols = h5f.create_dataset("cols", shape=(chunk_indices,), maxshape=(None,), dtype=np.int32, chunks=(chunk_indices,)) + vali = h5f.create_dataset("vali", shape=(chunk_indices,), + maxshape=(None,), dtype=np.float32, + chunks=(chunk_indices,)) indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,), dtype=np.int32, chunks=True) processed, offset = 1, 0 @@ -81,6 +84,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir, rows[offset:offset + data_size] = _rows + (processed - 1) cols.resize((offset + data_size,)) cols[offset:offset + data_size] = _cols + vali.resize((offset + data_size,)) + vali[offset:offset + data_size] = \ + np.uniform(size=(data_size,)).astype(np.float32) indptr[processed:processed + read_lines] = _indptr + offset offset += data_size processed += read_lines diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto index 0e0ad10..14f9004 100644 --- a/cusim/proto/config.proto +++ b/cusim/proto/config.proto @@ -28,4 +28,5 @@ message CuLDAConfigProto { optional int32 batch_size = 10 [default = 100000]; optional int32 epochs = 11 [default = 10]; optional int32 num_iters_in_e_step = 12 [default = 5]; + optional double vali_p = 13 [default = 0.2]; } From a5d945665adef879bee60fb341a02ea7a9ea1f5b Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 22:42:25 +0900 Subject: [PATCH 20/26] fix typo and compile succeed --- cusim/culda/bindings.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index 9e42c65..9a59c6e 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -58,7 +58,7 @@ class CuLDABind { std::pair FeedData(py::object& cols, py::object& indptr, py::object& vali, const int num_iters) { int_array _cols(cols); int_array _indptr(indptr); - bool_arrray _vali(vali); + bool_array _vali(vali); auto cols_buffer = _cols.request(); auto indptr_buffer = _indptr.request(); auto vali_buffer = _vali.request(); From df154946e7aebddf9f28a5da95c4037515743868 Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 22:46:46 +0900 Subject: [PATCH 21/26] bug-fix and set random seed --- cusim/culda/pyculda.py | 1 + cusim/ioutils/pyioutils.py | 7 ++++--- cusim/proto/config.proto | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index b790bb0..35658eb 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -63,6 +63,7 @@ def init_model(self): self.num_words, self.num_docs) # random initialize alpha and beta + np.random.seed(self.opt.seed) self.alpha = np.random.uniform( \ size=(self.opt.num_topics,)).astype(np.float32) self.beta = np.random.uniform( \ diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py index be045c8..5bce9b7 100644 --- a/cusim/ioutils/pyioutils.py +++ b/cusim/ioutils/pyioutils.py @@ -23,7 +23,7 @@ def __init__(self, opt=None): self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto) self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level) - tmp = tempfile.NamedTemporaryFile(mode='w') + tmp = tempfile.NamedTemporaryFile(mode='w', delete=False) opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2) tmp.write(opt_content) tmp.close() @@ -49,7 +49,8 @@ def load_stream_vocab(self, filepath, min_count, keys_path): self.obj.get_word_vocab(min_count, keys_path) def convert_stream_to_h5(self, filepath, min_count, out_dir, - chunk_indices=10000): + chunk_indices=10000, seed=777): + np.random.seed(seed) os.makedirs(out_dir, exist_ok=True) keys_path = pjoin(out_dir, "keys.txt") token_path = pjoin(out_dir, "token.h5") @@ -86,7 +87,7 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir, cols[offset:offset + data_size] = _cols vali.resize((offset + data_size,)) vali[offset:offset + data_size] = \ - np.uniform(size=(data_size,)).astype(np.float32) + np.random.uniform(size=(data_size,)).astype(np.float32) indptr[processed:processed + read_lines] = _indptr + offset offset += data_size processed += read_lines diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto index 14f9004..5c4e0d9 100644 --- a/cusim/proto/config.proto +++ b/cusim/proto/config.proto @@ -29,4 +29,5 @@ message CuLDAConfigProto { optional int32 epochs = 11 [default = 10]; optional int32 num_iters_in_e_step = 12 [default = 5]; optional double vali_p = 13 [default = 0.2]; + optional int32 seed = 14 [default = 777]; } From f094b95aaf6cf84f7ea73c39818f109b9d0ad592 Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 22:52:06 +0900 Subject: [PATCH 22/26] bug-fix in pyculda --- cusim/culda/pyculda.py | 4 ++-- examples/example1.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 35658eb..20a4f87 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -74,7 +74,7 @@ def init_model(self): # zero initialize grad alpha and new beta block_cnt = self.obj.get_block_cnt() - self.grad_alpha = np.zeros(shape=(block_cnt, self.num_topics), + self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics), dtype=np.float32) self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32) @@ -111,7 +111,7 @@ def _train_e_step(self, h5f): # call cuda kernel train_loss, vali_loss = \ - self.obj.FeedData(cols, indptr, vali, self.opt.num_iters_in_e_step) + self.obj.feed_data(cols, indptr, vali, self.opt.num_iters_in_e_step) # accumulate loss train_loss_nume -= train_loss diff --git a/examples/example1.py b/examples/example1.py index 9886733..4297768 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -40,10 +40,11 @@ def run_io(): def run_lda(): opt = { + "data_path": DATA_PATH, "data_dir": DATA_PATH2, } lda = CuLDA(opt) - lda.init_model() + lda.train_model() if __name__ == "__main__": From b97c1489721f1910be23c77fb731aa305587115d Mon Sep 17 00:00:00 2001 From: js1010 Date: Wed, 10 Feb 2021 23:33:09 +0900 Subject: [PATCH 23/26] bug-fix in tokenization --- cpp/include/culda/cuda_lda_kernels.cuh | 5 +---- cpp/src/culda/culda.cu | 11 ++++++----- cpp/src/utils/ioutils.cc | 5 +++-- cusim/culda/bindings.cc | 4 ++-- cusim/culda/pyculda.py | 12 ++++++++---- examples/example1.py | 2 ++ 6 files changed, 22 insertions(+), 17 deletions(-) diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh index ad92afd..e6885f8 100644 --- a/cpp/include/culda/cuda_lda_kernels.cuh +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -28,7 +28,7 @@ float Digamma(float x) { __global__ void EstepKernel( const int* cols, const int* indptr, const bool* vali, const int num_cols, const int num_indptr, - const int num_words, const int num_topics, const int num_iters, + const int num_topics, const int num_iters, float* gamma, float* new_gamma, float* phi, float* alpha, float* beta, float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) { @@ -39,10 +39,8 @@ __global__ void EstepKernel( float* _phi = phi + num_topics * blockIdx.x; float* _grad_alpha = grad_alpha + num_topics * blockIdx.x; - for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) { int beg = indptr[i], end = indptr[i + 1]; - // initialize gamma for (int j = threadIdx.x; j < num_topics; j += blockDim.x) _gamma[j] = alpha[j] + (end - beg) / num_topics; @@ -59,7 +57,6 @@ __global__ void EstepKernel( for (int k = beg; k < end; ++k) { const int w = cols[k]; const bool _vali = vali[k]; - // compute phi if (not _vali or j + 1 == num_iters) { for (int l = threadIdx.x; l < num_topics; l += blockDim.x) diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index 8f5b5fb..ac7cd8f 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -39,7 +39,7 @@ bool CuLDA::Init(std::string opt_path) { void CuLDA::LoadModel(float* alpha, float* beta, float* grad_alpha, float* new_beta, int num_words) { num_words_ = num_words; - DEBUG("copy model({} x {})", num_topics_, num_words_); + DEBUG("copy model({} x {})", num_words_, num_topics_); dev_alpha_.resize(num_topics_); dev_beta_.resize(num_topics_ * num_words_); thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin()); @@ -49,7 +49,7 @@ void CuLDA::LoadModel(float* alpha, float* beta, // resize device vector grad_alpha_ = grad_alpha; new_beta_ = new_beta; - dev_grad_alpha_.resize(block_cnt_ * num_topics_); + dev_grad_alpha_.resize(num_topics_ * block_cnt_); dev_new_beta_.resize(num_topics_ * num_words_); // copy to device @@ -74,15 +74,15 @@ std::pair CuLDA::FeedData( thrust::copy(cols, cols + num_cols, dev_cols.begin()); thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin()); thrust::copy(vali, vali + num_cols, dev_vali.begin()); - CHECK_CUDA(cudaDeviceSynchronize()); + DEBUG0("copy feed data to GPU memory"); // run E step in GPU EstepKernel<<>>( thrust::raw_pointer_cast(dev_cols.data()), thrust::raw_pointer_cast(dev_indptr.data()), thrust::raw_pointer_cast(dev_vali.data()), - num_cols, num_indptr, num_words_, num_topics_, num_iters, + num_cols, num_indptr, num_topics_, num_iters, thrust::raw_pointer_cast(dev_gamma_.data()), thrust::raw_pointer_cast(dev_new_gamma_.data()), thrust::raw_pointer_cast(dev_phi_.data()), @@ -92,14 +92,15 @@ std::pair CuLDA::FeedData( thrust::raw_pointer_cast(dev_new_beta_.data()), thrust::raw_pointer_cast(dev_train_losses.data()), thrust::raw_pointer_cast(dev_vali_losses.data())); - CHECK_CUDA(cudaDeviceSynchronize()); + DEBUG0("run E step in GPU"); // pull loss std::vector train_losses(block_cnt_), vali_losses(block_cnt_); thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin()); thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin()); CHECK_CUDA(cudaDeviceSynchronize()); + DEBUG0("pull loss values"); // accumulate float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f); diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc index 0cc6495..aa325c9 100644 --- a/cpp/src/utils/ioutils.cc +++ b/cpp/src/utils/ioutils.cc @@ -90,8 +90,8 @@ std::pair IoUtils::TokenizeStream(int num_lines, int num_threads) { // tokenize for (auto& word: line_vec) { - if (not word_count_.count(word)) continue; - cols_[i].push_back(word_count_[word]); + if (not word_idmap_.count(word)) continue; + cols_[i].push_back(word_idmap_[word]); } } } @@ -155,6 +155,7 @@ std::pair IoUtils::ReadStreamForVocab(int num_lines, int num_threads) void IoUtils::GetWordVocab(int min_count, std::string keys_path) { INFO("number of raw words: {}", word_count_.size()); + word_idmap_.clear(); word_list_.clear(); for (auto& it: word_count_) { if (it.second >= min_count) { word_idmap_[it.first] = word_idmap_.size(); diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc index 9a59c6e..f85b2d5 100644 --- a/cusim/culda/bindings.cc +++ b/cusim/culda/bindings.cc @@ -47,7 +47,7 @@ class CuLDABind { throw std::runtime_error("invalid grad_alpha or new_beta"); } - int num_words = beta_buffer.shape[1]; + int num_words = beta_buffer.shape[0]; return obj_.LoadModel(_alpha.mutable_data(0), _beta.mutable_data(0), @@ -67,7 +67,7 @@ class CuLDABind { throw std::runtime_error("invalid cols or indptr"); } int num_cols = cols_buffer.shape[0]; - int num_indptr = indptr_buffer.shape[0]; + int num_indptr = indptr_buffer.shape[0] - 1; return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0), num_cols, num_indptr, num_iters); } diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 20a4f87..6282ec3 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -19,6 +19,8 @@ from cusim.culda.culda_bind import CuLDABind from cusim.config_pb2 import CuLDAConfigProto +EPS = 1e-10 + class CuLDA: def __init__(self, opt=None): self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto) @@ -77,6 +79,8 @@ def init_model(self): self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics), dtype=np.float32) self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32) + self.logger.info("grad alpha %s, new beta %s initialized", + self.grad_alpha.shape, self.new_beta.shape) # push it to gpu self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta) @@ -118,10 +122,10 @@ def _train_e_step(self, h5f): vali_loss_nume -= vali_loss vali_cnt = np.count_nonzero(vali) train_cnt = len(vali) - vali_cnt - train_loss_nume += train_cnt - vali_loss_nume += train_cnt - train_loss = train_loss_nume / train_loss_deno - vali_loss = vali_loss_nume / vali_loss_deno + train_loss_deno += train_cnt + vali_loss_deno += vali_cnt + train_loss = train_loss_nume / (train_loss_deno + EPS) + vali_loss = vali_loss_nume / (vali_loss_deno + EPS) # update progress bar pbar.update(end, values=[("train_loss", train_loss), diff --git a/examples/example1.py b/examples/example1.py index 4297768..5f8f685 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -42,6 +42,8 @@ def run_lda(): opt = { "data_path": DATA_PATH, "data_dir": DATA_PATH2, + # "skip_preprocess": True, + "c_log_level": 3, } lda = CuLDA(opt) lda.train_model() From c7df6e4dd32949343145346af97f163823fc955b Mon Sep 17 00:00:00 2001 From: js1010 Date: Thu, 11 Feb 2021 00:07:37 +0900 Subject: [PATCH 24/26] ensure non-negativitty --- cpp/include/culda/cuda_lda_kernels.cuh | 8 ++++---- cpp/src/utils/ioutils.cc | 4 ++-- cusim/culda/pyculda.py | 10 ++++++++++ examples/example1.py | 18 ++++++++++++++++-- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh index e6885f8..400b251 100644 --- a/cpp/include/culda/cuda_lda_kernels.cuh +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -30,7 +30,7 @@ __global__ void EstepKernel( const int num_cols, const int num_indptr, const int num_topics, const int num_iters, float* gamma, float* new_gamma, float* phi, - float* alpha, float* beta, + const float* alpha, const float* beta, float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) { // storage for block @@ -76,12 +76,12 @@ __global__ void EstepKernel( __syncthreads(); } if (j + 1 == num_iters) { - float p = ReduceSum(_phi, num_topics); + float p = fmaxf(EPS, ReduceSum(_phi, num_topics)); if (threadIdx.x == 0) { if (_vali) - vali_losses[blockIdx.x] += logf(p + EPS); + vali_losses[blockIdx.x] += logf(p); else - train_losses[blockIdx.x] += logf(p + EPS); + train_losses[blockIdx.x] += logf(p); } } __syncthreads(); diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc index aa325c9..14bd94e 100644 --- a/cpp/src/utils/ioutils.cc +++ b/cpp/src/utils/ioutils.cc @@ -37,10 +37,10 @@ void IoUtils::ParseLineImpl(std::string line, std::vector& ret) { int n = line.size(); std::string element; for (int i = 0; i < n; ++i) { - if (line[i] == ' ' or line[i] == ',') { + if (line[i] == ' ') { ret.push_back(element); element.clear(); - } else if (line[i] != '"') { + } else { element += std::tolower(line[i]); } } diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 6282ec3..7bdbf8c 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -137,6 +137,7 @@ def _train_m_step(self): self.obj.pull() # update beta + self.new_beta[:, :] = np.maximum(self.new_beta, EPS) self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :] self.new_beta[:, :] = 0 @@ -151,6 +152,15 @@ def _train_m_step(self): c_0 = c_nume / c_deno delta = (gvec - c_0) / hvec self.alpha -= delta + self.alpha[:] = np.maximum(self.alpha, EPS) self.grad_alpha[:,:] = 0 self.obj.push() + + def save_model(self, model_path): + self.logger.info("save model path: %s", model_path) + h5f = h5py.File(model_path, "w") + h5f.create_dataset("alpha", data=self.alpha) + h5f.create_dataset("beta", data=self.beta) + h5f.create_dataset("keys", data=np.array(self.words)) + h5f.close() diff --git a/examples/example1.py b/examples/example1.py index 5f8f685..0338dd2 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -9,6 +9,8 @@ import subprocess import fire +import h5py +import numpy as np from gensim import downloader as api from cusim import aux, IoUtils, CuLDA @@ -43,11 +45,23 @@ def run_lda(): "data_path": DATA_PATH, "data_dir": DATA_PATH2, # "skip_preprocess": True, - "c_log_level": 3, + # "c_log_level": 3, } lda = CuLDA(opt) lda.train_model() - + lda.save_model("res/lda.h5") + h5f = h5py.File("res/lda.h5", "r") + beta = h5f["beta"][:] + for i in range(lda.opt.num_topics): + print("=" * 50) + print(f"topic {i + 1}") + words = np.argsort(-beta.T[i])[:10] + print("-" * 50) + for j in range(10): + word = lda.words[words[j]].decode("utf8") + prob = beta[words[j], i] + print(f"rank {j + 1}. word: {word}, prob: {prob}") + h5f.close() if __name__ == "__main__": fire.Fire() From 38c83db71b0947ac1dbba89a01c270a613e39adb Mon Sep 17 00:00:00 2001 From: js1010 Date: Thu, 11 Feb 2021 00:20:11 +0900 Subject: [PATCH 25/26] change proto fieldname --- cusim/culda/pyculda.py | 15 ++++++++------- cusim/proto/config.proto | 2 +- examples/example1.py | 24 +++++++++++++----------- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py index 7bdbf8c..e0fa3d9 100644 --- a/cusim/culda/pyculda.py +++ b/cusim/culda/pyculda.py @@ -44,20 +44,21 @@ def preprocess_data(self): if self.opt.skip_preprocess: return iou = IoUtils() - if not self.opt.data_dir: - self.opt.data_dir = tempfile.TemporaryDirectory().name + if not self.opt.processed_data_dir: + self.opt.processed_data_dir = tempfile.TemporaryDirectory().name iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count, - self.opt.data_dir) + self.opt.processed_data_dir) def init_model(self): # load voca - self.logger.info("load key from %s", pjoin(self.opt.data_dir, "keys.txt")) - with open(pjoin(self.opt.data_dir, "keys.txt"), "rb") as fin: + data_dir = self.opt.processed_data_dir + self.logger.info("load key from %s", pjoin(data_dir, "keys.txt")) + with open(pjoin(data_dir, "keys.txt"), "rb") as fin: self.words = [line.strip() for line in fin] self.num_words = len(self.words) # count number of docs - h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r") + h5f = h5py.File(pjoin(data_dir, "token.h5"), "r") self.num_docs = h5f["indptr"].shape[0] - 1 h5f.close() @@ -88,7 +89,7 @@ def init_model(self): def train_model(self): self.preprocess_data() self.init_model() - h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r") + h5f = h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r") for epoch in range(1, self.opt.epochs + 1): self.logger.info("Epoch %d / %d", epoch, self.opt.epochs) self._train_e_step(h5f) diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto index 5c4e0d9..10b3820 100644 --- a/cusim/proto/config.proto +++ b/cusim/proto/config.proto @@ -22,7 +22,7 @@ message CuLDAConfigProto { optional int32 num_topics = 3 [default = 10]; optional int32 block_dim = 4 [default = 32]; optional int32 hyper_threads = 5 [default = 10]; - optional string data_dir = 6; + optional string processed_data_dir = 6; optional bool skip_preprocess = 8; optional int32 word_min_count = 9 [default = 5]; optional int32 batch_size = 10 [default = 100000]; diff --git a/examples/example1.py b/examples/example1.py index 0338dd2..32c8dd5 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -17,10 +17,12 @@ LOGGER = aux.get_logger() DOWNLOAD_PATH = "./res" # DATASET = "wiki-english-20171001" -DATASET = "fake-news" +DATASET = "quora-duplicate-questions" DATA_PATH = f"./res/{DATASET}.stream.txt" -DATA_PATH2 = f"./res/{DATASET}-converted" +LDA_PATH = f"./res/{DATASET}-lda.h5" +PROCESSED_DATA_DIR = f"./res/{DATASET}-converted" MIN_COUNT = 5 +TOPK = 10 def download(): if os.path.exists(DATA_PATH): @@ -37,28 +39,28 @@ def download(): def run_io(): download() iou = IoUtils(opt={"chunk_lines": 10000, "num_threads": 8}) - iou.convert_stream_to_h5(DATA_PATH, 5, DATA_PATH2) + iou.convert_stream_to_h5(DATA_PATH, 5, PROCESSED_DATA_DIR) def run_lda(): opt = { "data_path": DATA_PATH, - "data_dir": DATA_PATH2, - # "skip_preprocess": True, - # "c_log_level": 3, + "processed_data_dir": PROCESSED_DATA_DIR, } lda = CuLDA(opt) lda.train_model() - lda.save_model("res/lda.h5") - h5f = h5py.File("res/lda.h5", "r") + lda.save_model(LDA_PATH) + h5f = h5py.File(LDA_PATH, "r") beta = h5f["beta"][:] - for i in range(lda.opt.num_topics): + word_list = h5f["keys"][:] + num_topics = h5f["alpha"].shape[0] + for i in range(num_topics): print("=" * 50) print(f"topic {i + 1}") words = np.argsort(-beta.T[i])[:10] print("-" * 50) - for j in range(10): - word = lda.words[words[j]].decode("utf8") + for j in range(TOPK): + word = word_list[words[j]].decode("utf8") prob = beta[words[j], i] print(f"rank {j + 1}. word: {word}, prob: {prob}") h5f.close() From c064f69f4e63352a38d027fb4c73809e96b217e7 Mon Sep 17 00:00:00 2001 From: js1010 Date: Thu, 11 Feb 2021 14:22:36 +0900 Subject: [PATCH 26/26] use mutex --- cpp/include/culda/cuda_lda_kernels.cuh | 24 +++++++++++++++++++++--- cpp/include/culda/culda.hpp | 3 +++ cpp/src/culda/culda.cu | 10 ++++++++-- examples/example1.py | 1 + 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh index 400b251..02dbb37 100644 --- a/cpp/include/culda/cuda_lda_kernels.cuh +++ b/cpp/include/culda/cuda_lda_kernels.cuh @@ -31,7 +31,8 @@ __global__ void EstepKernel( const int num_topics, const int num_iters, float* gamma, float* new_gamma, float* phi, const float* alpha, const float* beta, - float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) { + float* grad_alpha, float* new_beta, + float* train_losses, float* vali_losses, int* mutex) { // storage for block float* _gamma = gamma + num_topics * blockIdx.x; @@ -57,6 +58,7 @@ __global__ void EstepKernel( for (int k = beg; k < end; ++k) { const int w = cols[k]; const bool _vali = vali[k]; + // compute phi if (not _vali or j + 1 == num_iters) { for (int l = threadIdx.x; l < num_topics; l += blockDim.x) @@ -65,17 +67,33 @@ __global__ void EstepKernel( // normalize phi and add it to new gamma and new beta float phi_sum = ReduceSum(_phi, num_topics); + for (int l = threadIdx.x; l < num_topics; l += blockDim.x) { _phi[l] /= phi_sum; if (not _vali) _new_gamma[l] += _phi[l]; + } + __syncthreads(); + } + + if (j + 1 == num_iters) { + // write access of w th vector of new_beta + if (threadIdx.x == 0) { + while (atomicCAS(&mutex[w], 0, 1)) {} + } + + __syncthreads(); + for (int l = threadIdx.x; l < num_topics; l += blockDim.x) { if (j + 1 == num_iters) { if (not _vali) new_beta[w * num_topics + l] += _phi[l]; _phi[l] *= beta[w * num_topics + l]; } } __syncthreads(); - } - if (j + 1 == num_iters) { + + // release lock + if (threadIdx.x == 0) mutex[w] = 0; + __syncthreads(); + float p = fmaxf(EPS, ReduceSum(_phi, num_topics)); if (threadIdx.x == 0) { if (_vali) diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp index b779e3a..3a126cc 100644 --- a/cpp/include/culda/culda.hpp +++ b/cpp/include/culda/culda.hpp @@ -70,6 +70,7 @@ class CuLDA { void Pull(); void Push(); int GetBlockCnt(); + private: DeviceInfo dev_info_; json11::Json opt_; @@ -77,6 +78,8 @@ class CuLDA { thrust::device_vector dev_alpha_, dev_beta_; thrust::device_vector dev_grad_alpha_, dev_new_beta_; thrust::device_vector dev_gamma_, dev_new_gamma_, dev_phi_; + thrust::device_vector dev_mutex_; + float *alpha_, *beta_, *grad_alpha_, *new_beta_; int block_cnt_, block_dim_; int num_topics_, num_words_; diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu index ac7cd8f..9217902 100644 --- a/cpp/src/culda/culda.cu +++ b/cpp/src/culda/culda.cu @@ -51,13 +51,18 @@ void CuLDA::LoadModel(float* alpha, float* beta, new_beta_ = new_beta; dev_grad_alpha_.resize(num_topics_ * block_cnt_); dev_new_beta_.resize(num_topics_ * num_words_); - // copy to device thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin()); thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin()); dev_gamma_.resize(num_topics_ * block_cnt_); dev_new_gamma_.resize(num_topics_ * block_cnt_); dev_phi_.resize(num_topics_ * block_cnt_); + + // set mutex + dev_mutex_.resize(num_words_); + std::vector host_mutex(num_words_, 0); + thrust::copy(host_mutex.begin(), host_mutex.end(), dev_mutex_.begin()); + CHECK_CUDA(cudaDeviceSynchronize()); } @@ -91,7 +96,8 @@ std::pair CuLDA::FeedData( thrust::raw_pointer_cast(dev_grad_alpha_.data()), thrust::raw_pointer_cast(dev_new_beta_.data()), thrust::raw_pointer_cast(dev_train_losses.data()), - thrust::raw_pointer_cast(dev_vali_losses.data())); + thrust::raw_pointer_cast(dev_vali_losses.data()), + thrust::raw_pointer_cast(dev_mutex_.data())); CHECK_CUDA(cudaDeviceSynchronize()); DEBUG0("run E step in GPU"); diff --git a/examples/example1.py b/examples/example1.py index 32c8dd5..f9362cb 100644 --- a/examples/example1.py +++ b/examples/example1.py @@ -46,6 +46,7 @@ def run_lda(): opt = { "data_path": DATA_PATH, "processed_data_dir": PROCESSED_DATA_DIR, + "skip_preprocess":True, } lda = CuLDA(opt) lda.train_model()