From 84eb848da596167b7086daec547458a21c7cb10b Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Sun, 7 Feb 2021 18:40:18 +0900
Subject: [PATCH 01/26] merge task/implement-io

---
 cpp/src/culda.cu | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)
diff --git a/cpp/src/culda.cu b/cpp/src/culda.cu
index 0c12182..edcd403 100644
--- a/cpp/src/culda.cu
+++ b/cpp/src/culda.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 Jisang Yoon
+// Copyright (c) 2021 Jisang Yoon
 // All rights reserved.
 //
 // This source code is licensed under the Apache 2.0 license found in the
@@ -13,4 +13,47 @@ CuLDA::CuLDA() {
 
 CuLDA::~CuLDA() {}
 
+bool CuLDA::Init(std::string opt_path) {
+  std::ifstream in(opt_path.c_str());
+  if (not in.is_open()) return false;
+
+  std::string str((std::istreambuf_iterator<char>(in)),
+      std::istreambuf_iterator<char>());
+  std::string err_cmt;
+  auto _opt = json11::Json::parse(str, err_cmt);
+  if (not err_cmt.empty()) return false;
+  opt_ = _opt;
+  CuSimLogger().set_log_level(opt_["c_log_level"].int_value());
+  num_topics_ = opt_["num_topics"].int_value();
+  block_dim_ = opt_["block_dim"].int_value();
+  return true;
+}
+
+void CuLDA::LoadModel(float* alpha, float* beta, int num_words) {
+  num_words_ = num_words;
+  DEBUG("copy model({} x {})", num_topics_, num_words_);
+  dev_alpha_.resize(num_topics_);
+  dev_beta_.resize(num_topics_ * num_words_);
+  #ifdef HALF_PRECISION
+    // conversion to half data and copy
+    std::vector<cuda_scalar> halpha(num_topics_), hbeta(num_topics_ * num_words_);
+    for (int i = 0; i < num_topics_; ++i) {
+      halpha[i] = conversion(alpha[i]);
+      for (int j = 0; j < num_words_; ++j) {
+        hbeta[i * num_words + j] = conversion(beta[i * num_words + j]);
+      }
+    }
+    thrust::copy(halpha.begin(), halpha.end(), dev_alapha_.begin());
+    thrust::copy(hbeta.begin(), hbeta.end(), dev_beta_.begin());
+  #else
+    thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
+    thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin());
+  #endif
+  alpha_ = alpha; beta_ = beta;
+}
+
+void CuLDA::FeedData(const int* indices, const int* indptr, int num_indices, int num_indptr) {
+  
+}
+
 } // namespace cusim

From ba98fa5081a9074b9c4c138fc853f136a12c1d70 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Sun, 7 Feb 2021 18:43:53 +0900
Subject: [PATCH 02/26] restructure directory

---
 cpp/include/{ => culda}/culda.hpp   | 0
 cpp/include/{ => utils}/ioutils.hpp | 0
 cpp/include/{ => utils}/log.hpp     | 0
 cpp/include/{ => utils}/types.hpp   | 0
 cpp/src/{ => culda}/culda.cu        | 0
 cpp/src/{ => utils}/ioutils.cc      | 0
 cpp/src/{ => utils}/log.cc          | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 rename cpp/include/{ => culda}/culda.hpp (100%)
 rename cpp/include/{ => utils}/ioutils.hpp (100%)
 rename cpp/include/{ => utils}/log.hpp (100%)
 rename cpp/include/{ => utils}/types.hpp (100%)
 rename cpp/src/{ => culda}/culda.cu (100%)
 rename cpp/src/{ => utils}/ioutils.cc (100%)
 rename cpp/src/{ => utils}/log.cc (100%)

diff --git a/cpp/include/culda.hpp b/cpp/include/culda/culda.hpp
similarity index 100%
rename from cpp/include/culda.hpp
rename to cpp/include/culda/culda.hpp
diff --git a/cpp/include/ioutils.hpp b/cpp/include/utils/ioutils.hpp
similarity index 100%
rename from cpp/include/ioutils.hpp
rename to cpp/include/utils/ioutils.hpp
diff --git a/cpp/include/log.hpp b/cpp/include/utils/log.hpp
similarity index 100%
rename from cpp/include/log.hpp
rename to cpp/include/utils/log.hpp
diff --git a/cpp/include/types.hpp b/cpp/include/utils/types.hpp
similarity index 100%
rename from cpp/include/types.hpp
rename to cpp/include/utils/types.hpp
diff --git a/cpp/src/culda.cu b/cpp/src/culda/culda.cu
similarity index 100%
rename from cpp/src/culda.cu
rename to cpp/src/culda/culda.cu
diff --git a/cpp/src/ioutils.cc b/cpp/src/utils/ioutils.cc
similarity index 100%
rename from cpp/src/ioutils.cc
rename to cpp/src/utils/ioutils.cc
diff --git a/cpp/src/log.cc b/cpp/src/utils/log.cc
similarity index 100%
rename from cpp/src/log.cc
rename to cpp/src/utils/log.cc

From 525f9f81cee1938b20ab2ac9f27ece01eaa56f34 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Sun, 7 Feb 2021 18:45:16 +0900
Subject: [PATCH 03/26] change include path

---
 cpp/include/culda/culda.hpp   | 4 ++--
 cpp/include/utils/ioutils.hpp | 4 ++--
 cpp/src/culda/culda.cu        | 2 +-
 cpp/src/utils/ioutils.cc      | 2 +-
 cpp/src/utils/log.cc          | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index 0c2ba61..b26a96a 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -27,8 +27,8 @@
 #include <chrono> // NOLINT
 
 #include "json11.hpp"
-#include "log.hpp"
-#include "types.hpp"
+#include "utils/log.hpp"
+#include "utils/types.hpp"
 
 namespace cusim {
 
diff --git a/cpp/include/utils/ioutils.hpp b/cpp/include/utils/ioutils.hpp
index 3e8304d..b94ec0a 100644
--- a/cpp/include/utils/ioutils.hpp
+++ b/cpp/include/utils/ioutils.hpp
@@ -21,8 +21,8 @@
 #include <unordered_map>
 
 #include "json11.hpp"
-#include "log.hpp"
-#include "types.hpp"
+#include "utils/log.hpp"
+#include "utils/types.hpp"
 
 namespace cusim {
 
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index edcd403..e050c88 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -3,7 +3,7 @@
 //
 // This source code is licensed under the Apache 2.0 license found in the
 // LICENSE file in the root directory of this source tree.
-#include "culda.hpp"
+#include "culda/culda.hpp"
 
 namespace cusim {
 
diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc
index 45d551b..098b1f2 100644
--- a/cpp/src/utils/ioutils.cc
+++ b/cpp/src/utils/ioutils.cc
@@ -3,7 +3,7 @@
 //
 // This source code is licensed under the Apache 2.0 license found in the
 // LICENSE file in the root directory of this source tree.
-#include "ioutils.hpp"
+#include "utils/ioutils.hpp"
 
 namespace cusim {
 
diff --git a/cpp/src/utils/log.cc b/cpp/src/utils/log.cc
index ef5252b..ddfcb0c 100644
--- a/cpp/src/utils/log.cc
+++ b/cpp/src/utils/log.cc
@@ -5,7 +5,7 @@
 // LICENSE file in the root directory of this source tree.
 
 // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc
-#include "log.hpp"
+#include "utils/log.hpp"
 
 
 namespace cusim {

From 3ef51ed3ef465c0834b5326410c696937e9185bd Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Sun, 7 Feb 2021 18:51:02 +0900
Subject: [PATCH 04/26] add functions to culda.hpp

---
 cpp/include/culda/culda.hpp | 11 ++++++++++-
 cusim/ioutils/bindings.cc   |  2 +-
 setup.py                    |  4 +++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index b26a96a..397aeb5 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -36,9 +36,18 @@ class CuLDA {
  public:
   CuLDA();
   ~CuLDA();
+  bool Init(std::string opt_path);
+  void LoadModel(float* alpha, float* beta, int num_words);
+  void FeedData(const int* indices, const int* indptr,
+      int num_indices, int num_indptr);
+
  private:
+  json11::Json opt_;
   std::shared_ptr<spdlog::logger> logger_;
-  thrust::device_vector<cuda_scalar> device_data_;
+  thrust::device_vector<cuda_scalar> dev_alpha_, dev_beta_;
+  const float *alpha_, *beta_;
+  int block_cnt_, block_dim_;
+  int num_topics_, num_words_;
 };
 
 } // namespace cusim
diff --git a/cusim/ioutils/bindings.cc b/cusim/ioutils/bindings.cc
index 5b2c1dd..ad35ebb 100644
--- a/cusim/ioutils/bindings.cc
+++ b/cusim/ioutils/bindings.cc
@@ -8,7 +8,7 @@
 #include <pybind11/stl.h>
 
 #include <iostream>
-#include "ioutils.hpp"
+#include "utils/ioutils.hpp"
 
 namespace py = pybind11;
 
diff --git a/setup.py b/setup.py
index dacb5f3..5031bcb 100644
--- a/setup.py
+++ b/setup.py
@@ -68,7 +68,9 @@ def __init__(self, name):
 extend_compile_flags = get_extend_compile_flags()
 extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \
   extend_compile_flags
-csrcs = glob.glob("cpp/src/*.cu") + glob.glob("cpp/src/*.cc")
+csrcs = \
+  glob.glob("cpp/src/**/*.cu", recursive=True) + \
+  glob.glob("cpp/src/**/*.cc", recursive=True)
 extensions = [
   Extension("cusim.ioutils.ioutils_bind",
             sources= csrcs + [ \

From d90ff18e935a2b33af869e4cc6e37abf45b8d801 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Sun, 7 Feb 2021 19:26:02 +0900
Subject: [PATCH 05/26] add culda binding

---
 cpp/include/utils/cuda_info.cuh | 96 +++++++++++++++++++++++++++++++++
 cusim/__init__.py               |  1 +
 cusim/culda/__init__.py         |  6 +++
 cusim/culda/bindings.cc         | 57 ++++++++++++++++++++
 cusim/culda/pyculda.py          | 34 ++++++++++++
 5 files changed, 194 insertions(+)
 create mode 100644 cpp/include/utils/cuda_info.cuh
 create mode 100644 cusim/culda/__init__.py
 create mode 100644 cusim/culda/bindings.cc
 create mode 100644 cusim/culda/pyculda.py

diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh
new file mode 100644
index 0000000..ab6fefe
--- /dev/null
+++ b/cpp/include/utils/cuda_info.cuh
@@ -0,0 +1,96 @@
+// Copyright (c) 2021 Jisang Yoon
+// All rights reserved.
+//
+// This source code is licensed under the Apache 2.0 license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <thrust/copy.h>
+#include <thrust/fill.h>
+#include <thrust/random.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+
+#include <omp.h>
+#include <set>
+#include <random>
+#include <memory>
+#include <string>
+#include <fstream>
+#include <utility>
+#include <queue>
+#include <deque>
+#include <functional>
+#include <vector>
+#include <cmath>
+#include <chrono> // NOLINT
+
+#include "json11.hpp"
+#include "utils/log.hpp"
+#include "utils/types.hpp"
+
+#define DEBUG1(x) LOGGER->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__);
+
+namespace cusim {
+
+std::shared_ptr<spdlog::logger> LOGGER = CuSimLogger().get_logger();
+
+struct DeviceInfo {
+  int devId, mp_cnt, major, minor, cores;
+}
+
+DeviceInfo GetDeviceInfo() {
+  DeviceInfo ret;
+  CHECK_CUDA(cudaGetDevice(&ret.devId));
+  cudaDeviceProp prop;
+  CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId));
+  ret.mp_cnt = prop.multiProcessorCount;
+  ret.major = prop.major;
+  ret.minor = prop.minor;
+  // reference: https://stackoverflow.com/a/32531982
+  switch (ret.major) {
+    case 2: // Fermi
+      if (ret.minor == 1)
+        ret.cores = ret.mp_cnt * 48;
+      else
+        ret.cores = ret.mp_cnt * 32;
+      break;
+    case 3: // Kepler
+      ret.cores = ret.mp_cnt * 192;
+      break;
+    case 5: // Maxwell
+      ret.cores = ret.mp_cnt * 128;
+      break;
+    case 6: // Pascal
+      if (ret.minor == 1 or ret.minor == 2)
+        ret.cores = ret.mp_cnt * 128;
+      else if (ret.minor == 0)
+        ret.cores = ret.mp_cnt * 64;
+      else
+        DEBUG1("Unknown device type");
+      break;
+    case 7: // Volta and Turing
+      if (ret.minor == 0 or ret.minor == 5)
+        ret.cores = ret.mp_cnt * 64;
+      else
+        DEBUG1("Unknown device type");
+      break;
+    case 8: // Ampere
+      if (ret.minor == 0)
+        ret.cores = ret.mp_cnt * 64;
+      else if (minor_ == 6)
+        ret.cores = ret.mp_cnt * 128;
+      else
+        DEBUG1("Unknown device type");
+      break;
+    default:
+      DEBUG1("Unknown device type");
+      break;
+  }
+  if (ret.cores == -1) ret.cores = ret.mp_cnt * 128;
+  return ret
+}
+
+
+} // namespace cusim
diff --git a/cusim/__init__.py b/cusim/__init__.py
index 796d7b2..24fe984 100644
--- a/cusim/__init__.py
+++ b/cusim/__init__.py
@@ -4,3 +4,4 @@
 # This source code is licensed under the Apache 2.0 license found in the
 # LICENSE file in the root directory of this source tree.
 from cusim.ioutils import IoUtils
+from cusim.culda import CuLDA
diff --git a/cusim/culda/__init__.py b/cusim/culda/__init__.py
new file mode 100644
index 0000000..e27fb6a
--- /dev/null
+++ b/cusim/culda/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2021 Jisang Yoon
+# All rights reserved.
+#
+# This source code is licensed under the Apache 2.0 license found in the
+# LICENSE file in the root directory of this source tree.
+from cusim.culda.pyculda import CuLDA
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
new file mode 100644
index 0000000..d3ee2b4
--- /dev/null
+++ b/cusim/culda/bindings.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2021 Jisang Yoon
+//  All rights reserved.
+//
+//  This source code is licensed under the Apache 2.0 license found in the
+//  LICENSE file in the root directory of this source tree.
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+
+#include <iostream>
+#include "culda/culda.hpp"
+
+namespace py = pybind11;
+
+typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
+typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
+
+class CuLDABind {
+ public:
+  CuLDABind() {}
+
+  bool Init(std::string opt_path) {
+    return obj_.Init(opt_path);
+  }
+
+  void LoadModel(py::object& alpha, py::object& beta) {
+    float_array _alpha(alpha);
+    float_array _beta(beta);
+    auto alpha_buffer = _alphpa.request();
+    auto beta_buffer = _beta.request();
+    if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or
+        alpha_buffer.shape[0] != beta_buffer.shape[0]) {
+      throw std::runtime_error("invalid alpha or beta");
+    }
+    int num_words = beta_buffer.shape[1];
+    return obj_.LoadModel(_alpha.mutable_data(0), _beta.mutable_data(0), num_words);
+  }
+
+ private:
+  cusim::CuLDA obj_;
+};
+
+PYBIND11_PLUGIN(culda_bind) {
+  py::module m("CuLDABind");
+
+  py::class_<IoUtilsBind>(m, "CuLDABind")
+  .def(py::init())
+  .def("init", &CuLDABind::Init, py::arg("opt_path"))
+  .def("load_model", &IoUtilsBind::LoadModel,
+      py::arg("alpha"), py::arg("beta"))
+  .def("__repr__",
+  [](const CuLDABind &a) {
+    return "<CuLDABind>";
+  }
+  );
+  return m.ptr();
+}
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
new file mode 100644
index 0000000..08fdb89
--- /dev/null
+++ b/cusim/culda/pyculda.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2021 Jisang Yoon
+# All rights reserved.
+#
+# This source code is licensed under the Apache 2.0 license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pylint: disable=no-name-in-module,too-few-public-methods,no-member
+import os
+# from os.path import join as pjoin
+
+import json
+import tempfile
+
+# import h5py
+# import numpy as np
+
+from cusim import aux
+from cusim.culda.culda_bind import CuLDABind
+from cusim.config_pb2 import CuLDAConfigProto
+
+class CuLDA:
+  def __init__(self, opt=None):
+    self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
+    self.logger = aux.get_logger("culda", level=self.opt.py_log_level)
+
+    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+    opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
+    tmp.write(opt_content)
+    tmp.close()
+
+    self.logger.info("opt: %s", opt_content)
+    self.obj = CuLDABind()
+    assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
+    os.remove(tmp.name)

From 6cebe9103e71befcb4b3a87dff0582c85e5f73db Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Sun, 7 Feb 2021 22:22:08 +0900
Subject: [PATCH 06/26] bug-fix

---
 cpp/include/culda/culda.hpp             |  2 +
 cpp/include/utils/cuda_base_kernels.cuh | 63 +++++++++++++++++++++++++
 cpp/include/utils/cuda_info.cuh         | 23 ++++-----
 cpp/src/culda/culda.cu                  |  6 ++-
 cusim/culda/bindings.cc                 |  6 +--
 cusim/proto/config.proto                |  9 ++++
 setup.py                                | 23 ++++++---
 7 files changed, 109 insertions(+), 23 deletions(-)
 create mode 100644 cpp/include/utils/cuda_base_kernels.cuh

diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index 397aeb5..4d85124 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -29,6 +29,7 @@
 #include "json11.hpp"
 #include "utils/log.hpp"
 #include "utils/types.hpp"
+#include "utils/cuda_info.cuh"
 
 namespace cusim {
 
@@ -42,6 +43,7 @@ class CuLDA {
       int num_indices, int num_indptr);
 
  private:
+  DeviceInfo dev_info_;
   json11::Json opt_;
   std::shared_ptr<spdlog::logger> logger_;
   thrust::device_vector<cuda_scalar> dev_alpha_, dev_beta_;
diff --git a/cpp/include/utils/cuda_base_kernels.cuh b/cpp/include/utils/cuda_base_kernels.cuh
new file mode 100644
index 0000000..3cb2979
--- /dev/null
+++ b/cpp/include/utils/cuda_base_kernels.cuh
@@ -0,0 +1,63 @@
+// Copyright (c) 2021 Jisang Yoon
+// All rights reserved.
+//
+// This source code is licensed under the Apache 2.0 license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <unistd.h>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#include <thrust/copy.h>
+#include <thrust/fill.h>
+#include <thrust/random.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+#include <cooperative_groups.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <ctime>
+#include <utility>
+
+#include "utils/types.hpp"
+
+namespace cusim {
+
+// Error Checking utilities, checks status codes from cuda calls
+// and throws exceptions on failure (which cython can proxy back to python)
+#define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
+inline void checkCuda(cudaError_t code, const char *file, int line) {
+  if (code != cudaSuccess) {
+    std::stringstream err;
+    err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")";
+    throw std::runtime_error(err.str());
+  }
+}
+
+inline const char* cublasGetErrorString(cublasStatus_t status) {
+  switch (status) {
+    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+  }
+  return "Unknown";
+}
+
+#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
+inline void checkCublas(cublasStatus_t code, const char * file, int line) {
+  if (code != CUBLAS_STATUS_SUCCESS) {
+    std::stringstream err;
+    err << "cublas error: " << cublasGetErrorString(code)
+        << " (" << file << ":" << line << ")";
+    throw std::runtime_error(err.str());
+  }
+}
+} // namespace cusim
diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh
index ab6fefe..0cd90f8 100644
--- a/cpp/include/utils/cuda_info.cuh
+++ b/cpp/include/utils/cuda_info.cuh
@@ -26,19 +26,16 @@
 #include <cmath>
 #include <chrono> // NOLINT
 
-#include "json11.hpp"
-#include "utils/log.hpp"
-#include "utils/types.hpp"
+#include "utils/cuda_base_kernels.cuh"
 
-#define DEBUG1(x) LOGGER->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__);
 
 namespace cusim {
 
-std::shared_ptr<spdlog::logger> LOGGER = CuSimLogger().get_logger();
 
 struct DeviceInfo {
   int devId, mp_cnt, major, minor, cores;
-}
+  bool unknown = false;
+};
 
 DeviceInfo GetDeviceInfo() {
   DeviceInfo ret;
@@ -68,29 +65,29 @@ DeviceInfo GetDeviceInfo() {
       else if (ret.minor == 0)
         ret.cores = ret.mp_cnt * 64;
       else
-        DEBUG1("Unknown device type");
+        ret.unknown = true;
       break;
     case 7: // Volta and Turing
       if (ret.minor == 0 or ret.minor == 5)
         ret.cores = ret.mp_cnt * 64;
       else
-        DEBUG1("Unknown device type");
+        ret.unknown = true;
       break;
     case 8: // Ampere
       if (ret.minor == 0)
         ret.cores = ret.mp_cnt * 64;
-      else if (minor_ == 6)
+      else if (ret.minor == 6)
         ret.cores = ret.mp_cnt * 128;
       else
-        DEBUG1("Unknown device type");
+        ret.unknown = true;
       break;
     default:
-      DEBUG1("Unknown device type");
+        ret.unknown = true;
       break;
   }
   if (ret.cores == -1) ret.cores = ret.mp_cnt * 128;
-  return ret
+  return ret;
 }
 
 
-} // namespace cusim
+}  // namespace cusim
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index e050c88..d87b2e0 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -9,6 +9,10 @@ namespace cusim {
 
 CuLDA::CuLDA() {
   logger_ = CuSimLogger().get_logger();
+  dev_info_ = GetDeviceInfo();
+  if (dev_info_.unknown) DEBUG0("Unknown device type");
+  INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}",
+       dev_info_.major, dev_info_.minor, dev_info_.mp_cnt, dev_info_.cores);
 }
 
 CuLDA::~CuLDA() {}
@@ -26,6 +30,7 @@ bool CuLDA::Init(std::string opt_path) {
   CuSimLogger().set_log_level(opt_["c_log_level"].int_value());
   num_topics_ = opt_["num_topics"].int_value();
   block_dim_ = opt_["block_dim"].int_value();
+  block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_);
   return true;
 }
 
@@ -53,7 +58,6 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) {
 }
 
 void CuLDA::FeedData(const int* indices, const int* indptr, int num_indices, int num_indptr) {
-  
 }
 
 } // namespace cusim
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index d3ee2b4..0c6f72f 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -26,7 +26,7 @@ class CuLDABind {
   void LoadModel(py::object& alpha, py::object& beta) {
     float_array _alpha(alpha);
     float_array _beta(beta);
-    auto alpha_buffer = _alphpa.request();
+    auto alpha_buffer = _alpha.request();
     auto beta_buffer = _beta.request();
     if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or
         alpha_buffer.shape[0] != beta_buffer.shape[0]) {
@@ -43,10 +43,10 @@ class CuLDABind {
 PYBIND11_PLUGIN(culda_bind) {
   py::module m("CuLDABind");
 
-  py::class_<IoUtilsBind>(m, "CuLDABind")
+  py::class_<CuLDABind>(m, "CuLDABind")
   .def(py::init())
   .def("init", &CuLDABind::Init, py::arg("opt_path"))
-  .def("load_model", &IoUtilsBind::LoadModel,
+  .def("load_model", &CuLDABind::LoadModel,
       py::arg("alpha"), py::arg("beta"))
   .def("__repr__",
   [](const CuLDABind &a) {
diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto
index 071184b..b3899aa 100644
--- a/cusim/proto/config.proto
+++ b/cusim/proto/config.proto
@@ -12,3 +12,12 @@ message IoUtilsConfigProto {
   optional int32 chunk_lines = 3 [default = 100000];
   optional int32 num_threads = 4 [default = 4];
 }
+
+message CuLDAConfigProto {
+  optional int32 py_log_level = 1 [default = 2];
+  optional int32 c_log_level = 2 [default = 2];
+
+  optional int32 num_topics = 3 [default = 10];
+  optional int32 block_dim = 4 [default = 32];
+  optional int32 hyper_threads = 5 [default = 10];
+}
diff --git a/setup.py b/setup.py
index 5031bcb..cadbe5b 100644
--- a/setup.py
+++ b/setup.py
@@ -68,24 +68,35 @@ def __init__(self, name):
 extend_compile_flags = get_extend_compile_flags()
 extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \
   extend_compile_flags
-csrcs = \
-  glob.glob("cpp/src/**/*.cu", recursive=True) + \
-  glob.glob("cpp/src/**/*.cc", recursive=True)
+util_srcs = glob.glob("cpp/src/utils/*.cc")
 extensions = [
   Extension("cusim.ioutils.ioutils_bind",
-            sources= csrcs + [ \
+            sources= util_srcs + [ \
               "cusim/ioutils/bindings.cc",
               "3rd/json11/json11.cpp"],
             language="c++",
             extra_compile_args=extra_compile_args,
             extra_link_args=["-fopenmp"],
+            extra_objects=[],
+            include_dirs=[ \
+              "cpp/include/", np.get_include(), pybind11.get_include(),
+              pybind11.get_include(True),
+              "3rd/json11", "3rd/spdlog/include"]),
+  Extension("cusim.culda.culda_bind",
+            sources= util_srcs + [ \
+              "cpp/src/culda/culda.cu",
+              "cusim/culda/bindings.cc",
+              "3rd/json11/json11.cpp"],
+            language="c++",
+            extra_compile_args=extra_compile_args,
+            extra_link_args=["-fopenmp"],
             library_dirs=[CUDA['lib64']],
             libraries=['cudart', 'cublas', 'curand'],
             extra_objects=[],
             include_dirs=[ \
               "cpp/include/", np.get_include(), pybind11.get_include(),
               pybind11.get_include(True), CUDA['include'],
-              "3rd/json11", "3rd/spdlog/include"])
+              "3rd/json11", "3rd/spdlog/include"]),
 ]
 
 
@@ -168,7 +179,7 @@ def setup_package():
     download_url="https://github.com/js1010/cusim/releases",
     include_package_data=False,
     license='Apac2',
-    packages=['cusim/', "cusim/ioutils/"],
+    packages=['cusim/', "cusim/ioutils/", "cusim/culda/"],
     cmdclass=cmdclass,
     classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f],
     platforms=['Linux', 'Mac OSX', 'Unix'],

From 56c08b6071ea19c911d4128821a97d4ed183b14a Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Mon, 8 Feb 2021 17:57:20 +0900
Subject: [PATCH 07/26] update

---
 cpp/include/utils/cuda_base_kernels.cuh | 57 +++++++++++++++++++++++++
 cpp/include/utils/cuda_info.cuh         |  2 +-
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/cpp/include/utils/cuda_base_kernels.cuh b/cpp/include/utils/cuda_base_kernels.cuh
index 3cb2979..ea2f5f1 100644
--- a/cpp/include/utils/cuda_base_kernels.cuh
+++ b/cpp/include/utils/cuda_base_kernels.cuh
@@ -26,6 +26,63 @@
 
 namespace cusim {
 
+struct DeviceInfo {
+  int devId, mp_cnt, major, minor, cores;
+  bool unknown = false;
+};
+
+DeviceInfo GetDeviceInfo2() {
+  DeviceInfo ret;
+  CHECK_CUDA(cudaGetDevice(&ret.devId));
+  cudaDeviceProp prop;
+  CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId));
+  ret.mp_cnt = prop.multiProcessorCount;
+  ret.major = prop.major;
+  ret.minor = prop.minor;
+  // reference: https://stackoverflow.com/a/32531982
+  switch (ret.major) {
+    case 2: // Fermi
+      if (ret.minor == 1)
+        ret.cores = ret.mp_cnt * 48;
+      else
+        ret.cores = ret.mp_cnt * 32;
+      break;
+    case 3: // Kepler
+      ret.cores = ret.mp_cnt * 192;
+      break;
+    case 5: // Maxwell
+      ret.cores = ret.mp_cnt * 128;
+      break;
+    case 6: // Pascal
+      if (ret.minor == 1 or ret.minor == 2)
+        ret.cores = ret.mp_cnt * 128;
+      else if (ret.minor == 0)
+        ret.cores = ret.mp_cnt * 64;
+      else
+        ret.unknown = true;
+      break;
+    case 7: // Volta and Turing
+      if (ret.minor == 0 or ret.minor == 5)
+        ret.cores = ret.mp_cnt * 64;
+      else
+        ret.unknown = true;
+      break;
+    case 8: // Ampere
+      if (ret.minor == 0)
+        ret.cores = ret.mp_cnt * 64;
+      else if (ret.minor == 6)
+        ret.cores = ret.mp_cnt * 128;
+      else
+        ret.unknown = true;
+      break;
+    default:
+        ret.unknown = true;
+      break;
+  }
+  if (ret.cores == -1) ret.cores = ret.mp_cnt * 128;
+  return ret;
+}
+
 // Error Checking utilities, checks status codes from cuda calls
 // and throws exceptions on failure (which cython can proxy back to python)
 #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh
index 0cd90f8..7ff30ba 100644
--- a/cpp/include/utils/cuda_info.cuh
+++ b/cpp/include/utils/cuda_info.cuh
@@ -37,7 +37,7 @@ struct DeviceInfo {
   bool unknown = false;
 };
 
-DeviceInfo GetDeviceInfo() {
+DeviceInfo GetDeviceInfo2() {
   DeviceInfo ret;
   CHECK_CUDA(cudaGetDevice(&ret.devId));
   cudaDeviceProp prop;

From a71c178a2a8549073d70bfd08000f26f660004b9 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Mon, 8 Feb 2021 18:26:02 +0900
Subject: [PATCH 08/26] build succeed

---
 cpp/include/culda/culda.hpp                   |  2 +-
 cpp/include/utils/cuda_info.cuh               | 93 -------------------
 ...ase_kernels.cuh => cuda_utils_kernels.cuh} | 75 +++++++--------
 cpp/include/utils/ioutils.hpp                 |  1 -
 cpp/include/utils/types.hpp                   |  5 +
 setup.py                                      |  2 +-
 6 files changed, 43 insertions(+), 135 deletions(-)
 delete mode 100644 cpp/include/utils/cuda_info.cuh
 rename cpp/include/utils/{cuda_base_kernels.cuh => cuda_utils_kernels.cuh} (96%)

diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index 4d85124..bc514a9 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -29,7 +29,7 @@
 #include "json11.hpp"
 #include "utils/log.hpp"
 #include "utils/types.hpp"
-#include "utils/cuda_info.cuh"
+#include "utils/cuda_utils_kernels.cuh"
 
 namespace cusim {
 
diff --git a/cpp/include/utils/cuda_info.cuh b/cpp/include/utils/cuda_info.cuh
deleted file mode 100644
index 7ff30ba..0000000
--- a/cpp/include/utils/cuda_info.cuh
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2021 Jisang Yoon
-// All rights reserved.
-//
-// This source code is licensed under the Apache 2.0 license found in the
-// LICENSE file in the root directory of this source tree.
-#pragma once
-#include <thrust/copy.h>
-#include <thrust/fill.h>
-#include <thrust/random.h>
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-
-#include <omp.h>
-#include <set>
-#include <random>
-#include <memory>
-#include <string>
-#include <fstream>
-#include <utility>
-#include <queue>
-#include <deque>
-#include <functional>
-#include <vector>
-#include <cmath>
-#include <chrono> // NOLINT
-
-#include "utils/cuda_base_kernels.cuh"
-
-
-namespace cusim {
-
-
-struct DeviceInfo {
-  int devId, mp_cnt, major, minor, cores;
-  bool unknown = false;
-};
-
-DeviceInfo GetDeviceInfo2() {
-  DeviceInfo ret;
-  CHECK_CUDA(cudaGetDevice(&ret.devId));
-  cudaDeviceProp prop;
-  CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId));
-  ret.mp_cnt = prop.multiProcessorCount;
-  ret.major = prop.major;
-  ret.minor = prop.minor;
-  // reference: https://stackoverflow.com/a/32531982
-  switch (ret.major) {
-    case 2: // Fermi
-      if (ret.minor == 1)
-        ret.cores = ret.mp_cnt * 48;
-      else
-        ret.cores = ret.mp_cnt * 32;
-      break;
-    case 3: // Kepler
-      ret.cores = ret.mp_cnt * 192;
-      break;
-    case 5: // Maxwell
-      ret.cores = ret.mp_cnt * 128;
-      break;
-    case 6: // Pascal
-      if (ret.minor == 1 or ret.minor == 2)
-        ret.cores = ret.mp_cnt * 128;
-      else if (ret.minor == 0)
-        ret.cores = ret.mp_cnt * 64;
-      else
-        ret.unknown = true;
-      break;
-    case 7: // Volta and Turing
-      if (ret.minor == 0 or ret.minor == 5)
-        ret.cores = ret.mp_cnt * 64;
-      else
-        ret.unknown = true;
-      break;
-    case 8: // Ampere
-      if (ret.minor == 0)
-        ret.cores = ret.mp_cnt * 64;
-      else if (ret.minor == 6)
-        ret.cores = ret.mp_cnt * 128;
-      else
-        ret.unknown = true;
-      break;
-    default:
-        ret.unknown = true;
-      break;
-  }
-  if (ret.cores == -1) ret.cores = ret.mp_cnt * 128;
-  return ret;
-}
-
-
-}  // namespace cusim
diff --git a/cpp/include/utils/cuda_base_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh
similarity index 96%
rename from cpp/include/utils/cuda_base_kernels.cuh
rename to cpp/include/utils/cuda_utils_kernels.cuh
index ea2f5f1..69c02e2 100644
--- a/cpp/include/utils/cuda_base_kernels.cuh
+++ b/cpp/include/utils/cuda_utils_kernels.cuh
@@ -26,12 +26,43 @@
 
 namespace cusim {
 
-struct DeviceInfo {
-  int devId, mp_cnt, major, minor, cores;
-  bool unknown = false;
-};
 
-DeviceInfo GetDeviceInfo2() {
+// Error Checking utilities, checks status codes from cuda calls
+// and throws exceptions on failure (which cython can proxy back to python)
+#define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
+inline void checkCuda(cudaError_t code, const char *file, int line) {
+  if (code != cudaSuccess) {
+    std::stringstream err;
+    err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")";
+    throw std::runtime_error(err.str());
+  }
+}
+
+inline const char* cublasGetErrorString(cublasStatus_t status) {
+  switch (status) {
+    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+  }
+  return "Unknown";
+}
+
+#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
+inline void checkCublas(cublasStatus_t code, const char * file, int line) {
+  if (code != CUBLAS_STATUS_SUCCESS) {
+    std::stringstream err;
+    err << "cublas error: " << cublasGetErrorString(code)
+        << " (" << file << ":" << line << ")";
+    throw std::runtime_error(err.str());
+  }
+}
+
+inline DeviceInfo GetDeviceInfo() {
   DeviceInfo ret;
   CHECK_CUDA(cudaGetDevice(&ret.devId));
   cudaDeviceProp prop;
@@ -83,38 +114,4 @@ DeviceInfo GetDeviceInfo2() {
   return ret;
 }
 
-// Error Checking utilities, checks status codes from cuda calls
-// and throws exceptions on failure (which cython can proxy back to python)
-#define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
-inline void checkCuda(cudaError_t code, const char *file, int line) {
-  if (code != cudaSuccess) {
-    std::stringstream err;
-    err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")";
-    throw std::runtime_error(err.str());
-  }
-}
-
-inline const char* cublasGetErrorString(cublasStatus_t status) {
-  switch (status) {
-    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-  }
-  return "Unknown";
-}
-
-#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
-inline void checkCublas(cublasStatus_t code, const char * file, int line) {
-  if (code != CUBLAS_STATUS_SUCCESS) {
-    std::stringstream err;
-    err << "cublas error: " << cublasGetErrorString(code)
-        << " (" << file << ":" << line << ")";
-    throw std::runtime_error(err.str());
-  }
-}
 } // namespace cusim
diff --git a/cpp/include/utils/ioutils.hpp b/cpp/include/utils/ioutils.hpp
index b94ec0a..cc3e09e 100644
--- a/cpp/include/utils/ioutils.hpp
+++ b/cpp/include/utils/ioutils.hpp
@@ -22,7 +22,6 @@
 
 #include "json11.hpp"
 #include "utils/log.hpp"
-#include "utils/types.hpp"
 
 namespace cusim {
 
diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp
index 3af46c7..4efefe0 100644
--- a/cpp/include/utils/types.hpp
+++ b/cpp/include/utils/types.hpp
@@ -39,3 +39,8 @@
 #endif
 
 #define WARP_SIZE 32
+
+struct DeviceInfo {
+  int devId, mp_cnt, major, minor, cores;
+  bool unknown = false;
+};
diff --git a/setup.py b/setup.py
index cadbe5b..db61bb6 100644
--- a/setup.py
+++ b/setup.py
@@ -71,7 +71,7 @@ def __init__(self, name):
 util_srcs = glob.glob("cpp/src/utils/*.cc")
 extensions = [
   Extension("cusim.ioutils.ioutils_bind",
-            sources= util_srcs + [ \
+            sources = util_srcs + [ \
               "cusim/ioutils/bindings.cc",
               "3rd/json11/json11.cpp"],
             language="c++",

From b99c6ffebb5e9bceeeb48b2f66b7a7d7d1fb60bf Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Mon, 8 Feb 2021 20:36:33 +0900
Subject: [PATCH 09/26] implement lda kernel

---
 cpp/include/culda/cuda_lda_kernels.cuh   | 74 ++++++++++++++++++++++++
 cpp/include/utils/cuda_utils_kernels.cuh | 57 ++++++++++++++++++
 cpp/src/culda/culda.cu                   | 10 +++-
 examples/example1.py                     |  7 ++-
 4 files changed, 145 insertions(+), 3 deletions(-)
 create mode 100644 cpp/include/culda/cuda_lda_kernels.cuh

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
new file mode 100644
index 0000000..2811421
--- /dev/null
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -0,0 +1,74 @@
+// Copyright (c) 2021 Jisang Yoon
+// All rights reserved.
+//
+// This source code is licensed under the Apache 2.0 license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include "utils/cuda_utils_kernels.cuh"
+
+namespace cusim {
+
+__inline__ __device__
+cuda_scalar Psi(cuda_scalar x) {
+  
+}
+
+__global__ void EstepKernel(
+  const int* indices, const int* indptr, 
+  const int num_indices, const int num_indptr,
+  const int num_words, const int num_topics, const int num_iters,
+  cuda_scalar* gamma, cuda_scalar* new_gamma, cuda_scalar* phi,
+  cuda_scalar* alpha, cuda_scalar* beta,
+  cuda_scalar* grad_alpha, cuda_scalar* new_beta) {
+  
+  // storage for block
+  cuda_scalar* _gamma = gamma + num_topics * blockIdx.x;
+  cuda_scalar* _new_gamma = new_gamma + num_topics * blockIdx.x;
+  cuda_scalar* _phi = phi + num_topics * blockIdx.x;
+
+  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
+    int beg = indptr[i], end = indptr[i + 1];
+    
+    // initialize gamma
+    for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
+      _gamma[j] = alpha[j] + (end - beg) / num_topics;
+    __syncthreads();
+
+    // iterate E step
+    for (int j = 0; j < num_iters; ++j) {
+      // initialize new gamma
+      for (int k = threadIdx.x; k < num_topics; k += blockDim.x)
+        _new_gamma[k] = 0;
+      __synctheads();
+
+      // compute phi from gamma
+      for (int k = beg; k < end; ++k) {
+        int w = indices[k];
+        // compute phi
+        for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
+          _phi[l] = beta[w * num_topics + l] * exp(Psi(_gamma[l]));
+        __syncthreads();
+        
+        // normalize phi and add it to new gamma and new beta
+        cuda_scalar phi_sum = Sum(_phi, num_topics);
+        for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
+          _phi[l] /= phi_sum;
+          _new_gamma[l] += _phi[l];
+          if (j + 1 == num_iters) new_beta[w * num_topics + l] += phi[l];
+        }
+        __syncthreads();
+      }
+
+      // update gamma
+      for (int k = threadIdx.x; k < num_topics; l += blockDim.x)
+        _gamma[k] = _new_gamma[k] + alpha[k];
+      __syncthreads();
+    }
+    cuda_scalar gamma_sum = Sum(_gamma, num_topics);
+    for (int j = threadIdx.x; j < num_topics, j += blockDim.x)
+      grad_alpha[j] += (Psi(_gamma[j]) - Psi(gamma_sum));
+    __syncthreaads()
+  } 
+}
+
+}  // cusim
diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh
index 69c02e2..ef71021 100644
--- a/cpp/include/utils/cuda_utils_kernels.cuh
+++ b/cpp/include/utils/cuda_utils_kernels.cuh
@@ -114,4 +114,61 @@ inline DeviceInfo GetDeviceInfo() {
   return ret;
 }
 
+__inline__ __device__
+cuda_scalar warp_reduce_sum(cuda_scalar val) {
+  #if __CUDACC_VER_MAJOR__ >= 9
+  // __shfl_down is deprecated with cuda 9+. use newer variants
+  unsigned int active = __activemask();
+  #pragma unroll
+  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+      val = add(val, __shfl_down_sync(active, val, offset));
+  }
+  #else
+  #pragma unroll
+  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+      val = add(val, __shfl_down(val, offset));
+  }
+  #endif
+  return val;
+}
+
+__inline__ __device__
+cuda_scalar Sum(const cuda_scalar* vec, const int length) {
+  
+  static __shared__ cuda_scalar shared[32];
+
+  // figure out the warp/ position inside the warp
+  int warp =  threadIdx.x / WARP_SIZE;
+  int lane = threadIdx.x % WARP_SIZE;
+  
+  // paritial sum
+  cuda_scalar val = 0;
+  for (int i = threadIdx.x; i < length; i += blockDim.x) 
+    val += vec[i];
+  val = warp_reduce_sum(val);
+  
+  // write out the partial reduction to shared memory if appropiate
+  if (lane == 0) {
+    shared[warp] = val;
+  }
+  __syncthreads();
+  
+  // if we we don't have multiple warps, we're done
+  if (blockDim.x <= WARP_SIZE) {
+    return shared[0];
+  }
+
+  // otherwise reduce again in the first warp
+  val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: conversion(0.0f);
+  if (warp == 0) {
+    val = warp_reduce_sum(val);
+    // broadcast back to shared memory
+    if (threadIdx.x == 0) {
+        shared[0] = val;
+    }
+  }
+  __syncthreads();
+  return shared[0];
+}
+
 } // namespace cusim
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index d87b2e0..da5db80 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -31,6 +31,7 @@ bool CuLDA::Init(std::string opt_path) {
   num_topics_ = opt_["num_topics"].int_value();
   block_dim_ = opt_["block_dim"].int_value();
   block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_);
+  INFO("num_topics: {}, block_dim: {}, block_cnt: {}", num_topics_, block_dim_, block_cnt_);
   return true;
 }
 
@@ -57,7 +58,14 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) {
   alpha_ = alpha; beta_ = beta;
 }
 
-void CuLDA::FeedData(const int* indices, const int* indptr, int num_indices, int num_indptr) {
+void CuLDA::FeedData(const int* indices, const int* indptr, 
+    int num_indices, int num_indptr, float* gamma) {
+  thrust::vector<int> dev_phi(num_indices * num_topics_);
+  thrust::vector<int> dev_gamma(num_indptr * num_topics_);
+
+
+
+
 }
 
 } // namespace cusim
diff --git a/examples/example1.py b/examples/example1.py
index 6cbdaa9..6d97a84 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -10,7 +10,7 @@
 import fire
 
 from gensim import downloader as api
-from cusim import aux, IoUtils
+from cusim import aux, IoUtils, CuLDA
 
 LOGGER = aux.get_logger()
 DOWNLOAD_PATH = "./res"
@@ -32,11 +32,14 @@ def download():
   LOGGER.info("cmd: %s", cmd)
   subprocess.call(cmd, shell=True)
 
-def run():
+def run_io():
   download()
   iou = IoUtils(opt={"chunk_lines": 10000, "num_threads": 8})
   iou.convert_stream_to_h5(DATA_PATH, 5, DATA_PATH2)
 
 
+def run_lda():
+  CuLDA()
+
 if __name__ == "__main__":
   fire.Fire()

From 083c82eec0f24c2e1272ef2243fec42d49255777 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Mon, 8 Feb 2021 21:33:45 +0900
Subject: [PATCH 10/26] not support half

---
 cpp/include/utils/cuda_utils_kernels.cuh | 14 +++++++-------
 cpp/include/utils/types.hpp              |  4 ++++
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh
index ef71021..1503f90 100644
--- a/cpp/include/utils/cuda_utils_kernels.cuh
+++ b/cpp/include/utils/cuda_utils_kernels.cuh
@@ -115,34 +115,34 @@ inline DeviceInfo GetDeviceInfo() {
 }
 
 __inline__ __device__
-cuda_scalar warp_reduce_sum(cuda_scalar val) {
+float warp_reduce_sum(float val) {
   #if __CUDACC_VER_MAJOR__ >= 9
   // __shfl_down is deprecated with cuda 9+. use newer variants
   unsigned int active = __activemask();
   #pragma unroll
   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
-      val = add(val, __shfl_down_sync(active, val, offset));
+      val += __shfl_down_sync(active, val, offset);
   }
   #else
   #pragma unroll
   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
-      val = add(val, __shfl_down(val, offset));
+      val += __shfl_down(val, offset);
   }
   #endif
   return val;
 }
 
 __inline__ __device__
-cuda_scalar Sum(const cuda_scalar* vec, const int length) {
+float ReduceSum(const float* vec, const int length) {
   
-  static __shared__ cuda_scalar shared[32];
+  static __shared__ floaat shared[32];
 
   // figure out the warp/ position inside the warp
   int warp =  threadIdx.x / WARP_SIZE;
   int lane = threadIdx.x % WARP_SIZE;
   
   // paritial sum
-  cuda_scalar val = 0;
+  float val = 0.0f;
   for (int i = threadIdx.x; i < length; i += blockDim.x) 
     val += vec[i];
   val = warp_reduce_sum(val);
@@ -159,7 +159,7 @@ cuda_scalar Sum(const cuda_scalar* vec, const int length) {
   }
 
   // otherwise reduce again in the first warp
-  val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: conversion(0.0f);
+  val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: 0.0f;
   if (warp == 0) {
     val = warp_reduce_sum(val);
     // broadcast back to shared memory
diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp
index 4efefe0..475f50c 100644
--- a/cpp/include/utils/types.hpp
+++ b/cpp/include/utils/types.hpp
@@ -19,23 +19,27 @@
   #define mul(x, y) ( __hmul(x, y) )
   #define add(x, y) ( __hadd(x, y) )
   #define sub(x, y) ( __hsub(x, y) )
+  #define div(x, y) ( __hdiv(x, y) )
   #define gt(x, y) ( __hgt(x, y) )  // x > y
   #define ge(x, y) ( __hge(x, y) )  // x >= y
   #define lt(x, y) ( __hlt(x, y) )  // x < y
   #define le(x, y) ( __hle(x, y) )  // x <= y
   #define out_scalar(x) ( __half2float(x) )
   #define conversion(x) ( __float2half(x) )
+  #define cuda_exp(x) ( __hexp(x) )
 #else
   typedef float cuda_scalar;
   #define mul(x, y) ( x * y )
   #define add(x, y) ( x + y )
   #define sub(x, y) ( x - y )
+  #define div(x, y) ( x / y )
   #define gt(x, y) ( x > y )
   #define ge(x, y) ( x >= y )
   #define lt(x, y) ( x < y )
   #define le(x, y) ( x <= y )
   #define out_scalar(x) ( x )
   #define conversion(x) ( x )
+  #define cuda_exp(x) ( expf(x) )
 #endif
 
 #define WARP_SIZE 32

From 7824d9d7bd6adc12d8fbbca702ae791b7bbd1eb6 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Mon, 8 Feb 2021 21:42:50 +0900
Subject: [PATCH 11/26] fix typo

---
 cpp/include/culda/cuda_lda_kernels.cuh   | 40 +++++++++++++------
 cpp/include/culda/culda.hpp              |  5 ++-
 cpp/include/utils/cuda_utils_kernels.cuh |  9 +++--
 cpp/include/utils/types.hpp              | 50 ------------------------
 cpp/src/culda/culda.cu                   | 27 +++----------
 5 files changed, 43 insertions(+), 88 deletions(-)
 delete mode 100644 cpp/include/utils/types.hpp

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
index 2811421..1569209 100644
--- a/cpp/include/culda/cuda_lda_kernels.cuh
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -6,25 +6,35 @@
 #pragma once
 #include "utils/cuda_utils_kernels.cuh"
 
+
 namespace cusim {
 
 __inline__ __device__
-cuda_scalar Psi(cuda_scalar x) {
-  
+float Digamma(float x) {
+  float result = 0f, xx, xx2, xx4;
+  for ( ; x < 7.0f; ++x)
+    result -= 1.0f / x;
+  x -= 0.5f;
+  xx = 1.0f / x;
+  xx2 = xx * xx;
+  xx4 = xx2 * xx2;
+  result += logf(x) + 1.0f / 24.0f * xx2 - 7.0f / 960.0f * xx4 + 
+    31.0f / 8064.0f * xx4 * xx2 - 127.0f / 30720.0f * xx4 * xx4;
+  return result;
 }
 
 __global__ void EstepKernel(
   const int* indices, const int* indptr, 
   const int num_indices, const int num_indptr,
   const int num_words, const int num_topics, const int num_iters,
-  cuda_scalar* gamma, cuda_scalar* new_gamma, cuda_scalar* phi,
-  cuda_scalar* alpha, cuda_scalar* beta,
-  cuda_scalar* grad_alpha, cuda_scalar* new_beta) {
+  float* gamma, float* new_gamma, float* phi,
+  float* alpha, float* beta,
+  float* grad_alpha, float* new_beta) {
   
   // storage for block
-  cuda_scalar* _gamma = gamma + num_topics * blockIdx.x;
-  cuda_scalar* _new_gamma = new_gamma + num_topics * blockIdx.x;
-  cuda_scalar* _phi = phi + num_topics * blockIdx.x;
+  float* _gamma = gamma + num_topics * blockIdx.x;
+  float* _new_gamma = new_gamma + num_topics * blockIdx.x;
+  float* _phi = phi + num_topics * blockIdx.x;
 
   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
     int beg = indptr[i], end = indptr[i + 1];
@@ -46,11 +56,11 @@ __global__ void EstepKernel(
         int w = indices[k];
         // compute phi
         for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
-          _phi[l] = beta[w * num_topics + l] * exp(Psi(_gamma[l]));
+          _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
         __syncthreads();
         
         // normalize phi and add it to new gamma and new beta
-        cuda_scalar phi_sum = Sum(_phi, num_topics);
+        float phi_sum = ReduceSum(_phi, num_topics);
         for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
           _phi[l] /= phi_sum;
           _new_gamma[l] += _phi[l];
@@ -64,11 +74,19 @@ __global__ void EstepKernel(
         _gamma[k] = _new_gamma[k] + alpha[k];
       __syncthreads();
     }
-    cuda_scalar gamma_sum = Sum(_gamma, num_topics);
+    float gamma_sum = ReduceSum(_gamma, num_topics);
     for (int j = threadIdx.x; j < num_topics, j += blockDim.x)
       grad_alpha[j] += (Psi(_gamma[j]) - Psi(gamma_sum));
     __syncthreaads()
   } 
 }
 
+__global__ void MstepKernel(
+  float* alpha, float* beta,
+  float* grad_alpha, float* new_beta,
+  const int num_words, const int num_topic) {
+  
+
+}
+
 }  // cusim
diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index bc514a9..7309b5e 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -28,7 +28,6 @@
 
 #include "json11.hpp"
 #include "utils/log.hpp"
-#include "utils/types.hpp"
 #include "utils/cuda_utils_kernels.cuh"
 
 namespace cusim {
@@ -46,7 +45,9 @@ class CuLDA {
   DeviceInfo dev_info_;
   json11::Json opt_;
   std::shared_ptr<spdlog::logger> logger_;
-  thrust::device_vector<cuda_scalar> dev_alpha_, dev_beta_;
+  thrust::device_vector<float> dev_alpha_, dev_beta_;
+  thrust::device_vector<float> dev_grad_alpha_, dev_new_beta_;
+  thrust::device_vector<float> dev_gamma_, dev_new_gamma_, dev_phi_;
   const float *alpha_, *beta_;
   int block_cnt_, block_dim_;
   int num_topics_, num_words_;
diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh
index 1503f90..d925549 100644
--- a/cpp/include/utils/cuda_utils_kernels.cuh
+++ b/cpp/include/utils/cuda_utils_kernels.cuh
@@ -22,11 +22,14 @@
 #include <ctime>
 #include <utility>
 
-#include "utils/types.hpp"
-
 namespace cusim {
 
+struct DeviceInfo {
+  int devId, mp_cnt, major, minor, cores;
+  bool unknown = false;
+};
 
+#define WARP_SIZE 32
 // Error Checking utilities, checks status codes from cuda calls
 // and throws exceptions on failure (which cython can proxy back to python)
 #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
@@ -135,7 +138,7 @@ float warp_reduce_sum(float val) {
 __inline__ __device__
 float ReduceSum(const float* vec, const int length) {
   
-  static __shared__ floaat shared[32];
+  static __shared__ float shared[32];
 
   // figure out the warp/ position inside the warp
   int warp =  threadIdx.x / WARP_SIZE;
diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp
deleted file mode 100644
index 475f50c..0000000
--- a/cpp/include/utils/types.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2020 Jisang Yoon
-// All rights reserved.
-//
-// This source code is licensed under the Apache 2.0 license found in the
-// LICENSE file in the root directory of this source tree.
-#pragma once
-#include <cuda_fp16.h>
-
-// experimental codes to use half precision
-// not properly working yet..
-// #define HALF_PRECISION 1
-
-// #if __CUDA_ARCH__ < 530
-//   #undef HALF_PRECISION
-// #endif
-
-#ifdef HALF_PRECISION
-  typedef half cuda_scalar;
-  #define mul(x, y) ( __hmul(x, y) )
-  #define add(x, y) ( __hadd(x, y) )
-  #define sub(x, y) ( __hsub(x, y) )
-  #define div(x, y) ( __hdiv(x, y) )
-  #define gt(x, y) ( __hgt(x, y) )  // x > y
-  #define ge(x, y) ( __hge(x, y) )  // x >= y
-  #define lt(x, y) ( __hlt(x, y) )  // x < y
-  #define le(x, y) ( __hle(x, y) )  // x <= y
-  #define out_scalar(x) ( __half2float(x) )
-  #define conversion(x) ( __float2half(x) )
-  #define cuda_exp(x) ( __hexp(x) )
-#else
-  typedef float cuda_scalar;
-  #define mul(x, y) ( x * y )
-  #define add(x, y) ( x + y )
-  #define sub(x, y) ( x - y )
-  #define div(x, y) ( x / y )
-  #define gt(x, y) ( x > y )
-  #define ge(x, y) ( x >= y )
-  #define lt(x, y) ( x < y )
-  #define le(x, y) ( x <= y )
-  #define out_scalar(x) ( x )
-  #define conversion(x) ( x )
-  #define cuda_exp(x) ( expf(x) )
-#endif
-
-#define WARP_SIZE 32
-
-struct DeviceInfo {
-  int devId, mp_cnt, major, minor, cores;
-  bool unknown = false;
-};
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index da5db80..a51313e 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -40,32 +40,15 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) {
   DEBUG("copy model({} x {})", num_topics_, num_words_);
   dev_alpha_.resize(num_topics_);
   dev_beta_.resize(num_topics_ * num_words_);
-  #ifdef HALF_PRECISION
-    // conversion to half data and copy
-    std::vector<cuda_scalar> halpha(num_topics_), hbeta(num_topics_ * num_words_);
-    for (int i = 0; i < num_topics_; ++i) {
-      halpha[i] = conversion(alpha[i]);
-      for (int j = 0; j < num_words_; ++j) {
-        hbeta[i * num_words + j] = conversion(beta[i * num_words + j]);
-      }
-    }
-    thrust::copy(halpha.begin(), halpha.end(), dev_alapha_.begin());
-    thrust::copy(hbeta.begin(), hbeta.end(), dev_beta_.begin());
-  #else
-    thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
-    thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin());
-  #endif
+  thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
+  thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin());
   alpha_ = alpha; beta_ = beta;
 }
 
 void CuLDA::FeedData(const int* indices, const int* indptr, 
-    int num_indices, int num_indptr, float* gamma) {
-  thrust::vector<int> dev_phi(num_indices * num_topics_);
-  thrust::vector<int> dev_gamma(num_indptr * num_topics_);
-
-
-
-
+    int num_indices, int num_indptr) {
+  thrust::device_vector<int> dev_phi(num_indices * num_topics_);
+  thrust::device_vector<int> dev_gamma(num_indptr * num_topics_);
 }
 
 } // namespace cusim

From 5e9f0cc7ae8b79b215d4baf236c02586f581e82c Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Mon, 8 Feb 2021 22:31:00 +0900
Subject: [PATCH 12/26] implement e-step and fix typos

---
 cpp/include/culda/cuda_lda_kernels.cuh | 33 +++++++---------
 cpp/include/culda/culda.hpp            | 33 ++++++++++++++--
 cpp/src/culda/culda.cu                 | 55 ++++++++++++++++++++++++--
 3 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
index 1569209..b8648a3 100644
--- a/cpp/include/culda/cuda_lda_kernels.cuh
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -9,17 +9,19 @@
 
 namespace cusim {
 
+// reference: http://web.science.mq.edu.au/~mjohnson/code/digamma.c
 __inline__ __device__
 float Digamma(float x) {
-  float result = 0f, xx, xx2, xx4;
+  float result = 0.0f, xx, xx2, xx4;
   for ( ; x < 7.0f; ++x)
     result -= 1.0f / x;
   x -= 0.5f;
   xx = 1.0f / x;
   xx2 = xx * xx;
   xx4 = xx2 * xx2;
-  result += logf(x) + 1.0f / 24.0f * xx2 - 7.0f / 960.0f * xx4 + 
-    31.0f / 8064.0f * xx4 * xx2 - 127.0f / 30720.0f * xx4 * xx4;
+  result += logf(x) + 1.0f / 24.0f * xx2 
+    - 7.0f / 960.0f * xx4 + 31.0f / 8064.0f * xx4 * xx2 
+    - 127.0f / 30720.0f * xx4 * xx4;
   return result;
 }
 
@@ -48,12 +50,12 @@ __global__ void EstepKernel(
     for (int j = 0; j < num_iters; ++j) {
       // initialize new gamma
       for (int k = threadIdx.x; k < num_topics; k += blockDim.x)
-        _new_gamma[k] = 0;
-      __synctheads();
+        _new_gamma[k] = 0.0f;
+      __syncthreads();
 
       // compute phi from gamma
       for (int k = beg; k < end; ++k) {
-        int w = indices[k];
+        const int w = indices[k];
         // compute phi
         for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
           _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
@@ -64,29 +66,22 @@ __global__ void EstepKernel(
         for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
           _phi[l] /= phi_sum;
           _new_gamma[l] += _phi[l];
-          if (j + 1 == num_iters) new_beta[w * num_topics + l] += phi[l];
+          if (j + 1 == num_iters) 
+            new_beta[w * num_topics + l] += phi[l];
         }
         __syncthreads();
       }
 
       // update gamma
-      for (int k = threadIdx.x; k < num_topics; l += blockDim.x)
+      for (int k = threadIdx.x; k < num_topics; k += blockDim.x)
         _gamma[k] = _new_gamma[k] + alpha[k];
       __syncthreads();
     }
     float gamma_sum = ReduceSum(_gamma, num_topics);
-    for (int j = threadIdx.x; j < num_topics, j += blockDim.x)
-      grad_alpha[j] += (Psi(_gamma[j]) - Psi(gamma_sum));
-    __syncthreaads()
+    for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
+      grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum));
+    __syncthreads();
   } 
 }
 
-__global__ void MstepKernel(
-  float* alpha, float* beta,
-  float* grad_alpha, float* new_beta,
-  const int num_words, const int num_topic) {
-  
-
-}
-
 }  // cusim
diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index 7309b5e..fe57cd3 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -32,15 +32,41 @@
 
 namespace cusim {
 
+
+// reference: https://people.math.sc.edu/Burkardt/cpp_src/asa121/asa121.cpp
+inline float Trigamma(float x) {
+  const float a = 0.0001f;
+  const float b = 5.0f;
+  const float b2 =  0.1666666667f;
+  const float b4 = -0.03333333333f;
+  const float b6 =  0.02380952381f;
+  const float b8 = -0.03333333333f;
+  float value = 0, y = 0, z = x;
+  if (x <= a) return 1.0f / x / x;
+  while (z < b) {
+    value += 1.0f / z / z;
+    z++;
+  }
+  y = 1.0f / z / z;
+  value += value + 0.5 * y + (1.0
+    + y * (b2
+    + y * (b4
+    + y * (b6
+    + y * b8)))) / z;
+  return value;
+}
+
+
 class CuLDA {
  public:
   CuLDA();
   ~CuLDA();
   bool Init(std::string opt_path);
-  void LoadModel(float* alpha, float* beta, int num_words);
+  void LoadModel(float* alpha, float* beta, const int num_words);
+  void InitModel();
   void FeedData(const int* indices, const int* indptr,
-      int num_indices, int num_indptr);
-
+      const int num_indices, const int num_indptr, const int num_iters);
+  void Mstep(const int num_docs);
  private:
   DeviceInfo dev_info_;
   json11::Json opt_;
@@ -49,6 +75,7 @@ class CuLDA {
   thrust::device_vector<float> dev_grad_alpha_, dev_new_beta_;
   thrust::device_vector<float> dev_gamma_, dev_new_gamma_, dev_phi_;
   const float *alpha_, *beta_;
+  std::vector<float> grad_alpha_, new_beta_;
   int block_cnt_, block_dim_;
   int num_topics_, num_words_;
 };
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index a51313e..ab2f6a2 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -4,6 +4,7 @@
 // This source code is licensed under the Apache 2.0 license found in the
 // LICENSE file in the root directory of this source tree.
 #include "culda/culda.hpp"
+#include "culda/cuda_lda_kernels.cuh"
 
 namespace cusim {
 
@@ -43,12 +44,60 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) {
   thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
   thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin());
   alpha_ = alpha; beta_ = beta;
+  InitModel();
+}
+
+void CuLDA::InitModel() {
+  // resize device vector
+  dev_grad_alpha_.resize(num_topics_);
+  dev_new_beta_.resize(num_topics_ * num_words_);
+  dev_gamma_.resize(num_topics_ * block_cnt_);
+  dev_new_gamma_.resize(num_topics_ * block_cnt_);
+  dev_phi_.resize(num_topics_ * block_cnt_);
+  
+  // resize host vector
+  grad_alpha_.resize(num_topics_);
+  new_beta_.resize(num_topics_ * num_words_);
+
+  // fill zeros
+  std::fill(grad_alpha_.begin(), grad_alpha_.end(), 0);
+  std::fill(new_beta_.begin(), new_beta_.end(), 0);
+
+  // copy to device
+  thrust::copy(grad_alpha_.begin(), grad_alpha_.end(), dev_grad_alpha_.begin());
+  thrust::copy(new_beta_.begin(), new_beta_.end(), dev_new_beta_.begin());
+  CHECK_CUDA(cudaDeviceSynchronize());
 }
 
 void CuLDA::FeedData(const int* indices, const int* indptr, 
-    int num_indices, int num_indptr) {
-  thrust::device_vector<int> dev_phi(num_indices * num_topics_);
-  thrust::device_vector<int> dev_gamma(num_indptr * num_topics_);
+    const int num_indices, const int num_indptr, const int num_iters) {
+  thrust::device_vector<int> dev_indices(num_indices);
+  thrust::device_vector<int> dev_indptr(num_indptr + 1);
+  thrust::copy(indices, indices + num_indices, dev_indices.begin());
+  thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  EstepKernel<<<block_cnt_, block_dim_>>>(
+    thrust::raw_pointer_cast(dev_indices.data()),
+    thrust::raw_pointer_cast(dev_indptr.data()),
+    num_indices, num_indptr, num_words_, num_topics_, num_iters,
+    thrust::raw_pointer_cast(dev_gamma_.data()),
+    thrust::raw_pointer_cast(dev_new_gamma_.data()),
+    thrust::raw_pointer_cast(dev_phi_.data()),
+    thrust::raw_pointer_cast(dev_alpha_.data()),
+    thrust::raw_pointer_cast(dev_beta_.data()),
+    thrust::raw_pointer_cast(dev_grad_alpha_.data()),
+    thrust::raw_pointer_cast(dev_new_beta_.data()));
+  
+  CHECK_CUDA(cudaDeviceSynchronize());
+}
+
+void CuLDA::Mstep(const int num_docs) {
+  thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_.begin());
+  thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_.begin());
+  CHECK_CUDA(cudaDeviceSynchronize());
+  
+
 }
 
 } // namespace cusim

From 18451db95bfb992149bb673ad65fad98b4ffe151 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Mon, 8 Feb 2021 23:04:38 +0900
Subject: [PATCH 13/26] add pull and push

---
 cpp/include/culda/culda.hpp | 10 ++++-----
 cpp/src/culda/culda.cu      | 42 ++++++++++++++++++-------------------
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index fe57cd3..ae1424a 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -62,11 +62,12 @@ class CuLDA {
   CuLDA();
   ~CuLDA();
   bool Init(std::string opt_path);
-  void LoadModel(float* alpha, float* beta, const int num_words);
-  void InitModel();
+  void LoadModel(float* alpha, float* beta,
+      float* grad_alpha, float* new_beta, const int num_words);
   void FeedData(const int* indices, const int* indptr,
       const int num_indices, const int num_indptr, const int num_iters);
-  void Mstep(const int num_docs);
+  void Pull();
+  void Push();
  private:
   DeviceInfo dev_info_;
   json11::Json opt_;
@@ -74,8 +75,7 @@ class CuLDA {
   thrust::device_vector<float> dev_alpha_, dev_beta_;
   thrust::device_vector<float> dev_grad_alpha_, dev_new_beta_;
   thrust::device_vector<float> dev_gamma_, dev_new_gamma_, dev_phi_;
-  const float *alpha_, *beta_;
-  std::vector<float> grad_alpha_, new_beta_;
+  float *alpha_, *beta_, *grad_alpha_, *new_beta_;
   int block_cnt_, block_dim_;
   int num_topics_, num_words_;
 };
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index ab2f6a2..d37c3f5 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -36,7 +36,8 @@ bool CuLDA::Init(std::string opt_path) {
   return true;
 }
 
-void CuLDA::LoadModel(float* alpha, float* beta, int num_words) {
+void CuLDA::LoadModel(float* alpha, float* beta, 
+    float* grad_alpha, float* new_beta, int num_words) {
   num_words_ = num_words;
   DEBUG("copy model({} x {})", num_topics_, num_words_);
   dev_alpha_.resize(num_topics_);
@@ -44,28 +45,19 @@ void CuLDA::LoadModel(float* alpha, float* beta, int num_words) {
   thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
   thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin());
   alpha_ = alpha; beta_ = beta;
-  InitModel();
-}
-
-void CuLDA::InitModel() {
+  
   // resize device vector
+  grad_alpha_ = grad_alpha;
+  new_beta_ = new_beta;
   dev_grad_alpha_.resize(num_topics_);
   dev_new_beta_.resize(num_topics_ * num_words_);
+
+  // copy to device
+  thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin());
+  thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
   dev_gamma_.resize(num_topics_ * block_cnt_);
   dev_new_gamma_.resize(num_topics_ * block_cnt_);
   dev_phi_.resize(num_topics_ * block_cnt_);
-  
-  // resize host vector
-  grad_alpha_.resize(num_topics_);
-  new_beta_.resize(num_topics_ * num_words_);
-
-  // fill zeros
-  std::fill(grad_alpha_.begin(), grad_alpha_.end(), 0);
-  std::fill(new_beta_.begin(), new_beta_.end(), 0);
-
-  // copy to device
-  thrust::copy(grad_alpha_.begin(), grad_alpha_.end(), dev_grad_alpha_.begin());
-  thrust::copy(new_beta_.begin(), new_beta_.end(), dev_new_beta_.begin());
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
@@ -92,12 +84,18 @@ void CuLDA::FeedData(const int* indices, const int* indptr,
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
-void CuLDA::Mstep(const int num_docs) {
-  thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_.begin());
-  thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_.begin());
+void CuLDA::Pull() {
+  thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_);
+  thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_);
   CHECK_CUDA(cudaDeviceSynchronize());
-  
+}
 
+void CuLDA::Push() {
+  thrust::copy(alpha_, alpha_ + num_topics_, dev_alpha_.begin());
+  thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin());
+  thrust::copy(beta_, beta_ + num_words_ * num_topics_, dev_beta_.begin());
+  thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
+  CHECK_CUDA(cudaDeviceSynchronize());
 }
 
-} // namespace cusim
+}  // namespace cusim

From 4896f5f2ea883b719aaaa37a225d54418e9231a6 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Tue, 9 Feb 2021 23:29:07 +0900
Subject: [PATCH 14/26] bind lda

---
 cpp/src/utils/ioutils.cc   |  4 +--
 cusim/culda/bindings.cc    | 50 +++++++++++++++++++++++++++++++++++---
 cusim/culda/pyculda.py     | 21 ++++++++++++++--
 cusim/ioutils/pyioutils.py |  2 +-
 cusim/proto/config.proto   |  1 +
 setup.py                   | 10 +++++---
 6 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc
index 098b1f2..f0b49d0 100644
--- a/cpp/src/utils/ioutils.cc
+++ b/cpp/src/utils/ioutils.cc
@@ -165,11 +165,9 @@ void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
   // write keys to csv file
   std::ofstream fout(keys_path.c_str());
   INFO("dump keys to {}", keys_path);
-  std::string header = "index,key\n";
-  fout.write(header.c_str(), header.size());
   int n = word_list_.size();
   for (int i = 0; i < n; ++i) {
-    std::string line = std::to_string(i) + ",\"" + word_list_[i] + "\"\n";
+    std::string line = word_list_[i] + "\n";
     fout.write(line.c_str(), line.size());
   }
   fout.close();
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index 0c6f72f..157b837 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -23,7 +23,9 @@ class CuLDABind {
     return obj_.Init(opt_path);
   }
 
-  void LoadModel(py::object& alpha, py::object& beta) {
+  void LoadModel(py::object& alpha, py::object& beta,
+      py::object& grad_alpha, py::object& new_beta) {
+    // check shape of alpha and beta
     float_array _alpha(alpha);
     float_array _beta(beta);
     auto alpha_buffer = _alpha.request();
@@ -32,8 +34,45 @@ class CuLDABind {
         alpha_buffer.shape[0] != beta_buffer.shape[0]) {
       throw std::runtime_error("invalid alpha or beta");
     }
+
+    // check shape of grad alpha and new beta
+    float_array _grad_alpha(grad_alpha);
+    float_array _new_beta(new_beta);
+    auto grad_alpha_buffer = _grad_alpha.request();
+    auto new_beta_buffer = _new_beta.request();
+    if (grad_alpha_buffer.ndim != 1 or
+        new_beta_buffer.ndim != 2 or
+        grad_alpha_buffer.shape[0] != new_beta_buffer.shape[0]) {
+      throw std::runtime_error("invalid grad_alpha or new_beta");
+    }
+
     int num_words = beta_buffer.shape[1];
-    return obj_.LoadModel(_alpha.mutable_data(0), _beta.mutable_data(0), num_words);
+
+    return obj_.LoadModel(_alpha.mutable_data(0),
+        _beta.mutable_data(0),
+        _grad_alpha.mutable_data(0),
+        _new_beta.mutable_data(0), num_words);
+  }
+
+  void FeedData(py::object& indices, py::object indptr, const int num_iters) {
+    int_array _indices(indices);
+    int_array _indptr(indptr);
+    auto indices_buffer = _indices.request();
+    auto indptr_buffer = _indptr.request();
+    if (indices_buffer.ndim != 1 or indptr_buffer.ndim != 1) {
+      throw std::runtime_error("invalid indices or indptr");
+    }
+    int num_indices = indices_buffer.shape[0];
+    int num_indptr = indptr_buffer.shape[0];
+    obj_.FeedData(_indices.data(0), _indptr.data(0), num_indices, num_indptr, num_iters);
+  }
+
+  void Pull() {
+    obj_.Pull();
+  }
+
+  void Push() {
+    obj_.Push();
   }
 
  private:
@@ -47,7 +86,12 @@ PYBIND11_PLUGIN(culda_bind) {
   .def(py::init())
   .def("init", &CuLDABind::Init, py::arg("opt_path"))
   .def("load_model", &CuLDABind::LoadModel,
-      py::arg("alpha"), py::arg("beta"))
+      py::arg("alpha"), py::arg("beta"),
+      py::arg("grad_alpha"), py::arg("new_beta"))
+  .def("feed_data", &CuLDABind::FeedData,
+      py::arg("indices"), py::arg("indptr"), py::arg("num_iters"))
+  .def("pull", &CuLDABind::Pull)
+  .def("push", &CuLDABind::Push)
   .def("__repr__",
   [](const CuLDABind &a) {
     return "<CuLDABind>";
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 08fdb89..1ef893d 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -6,13 +6,13 @@
 
 # pylint: disable=no-name-in-module,too-few-public-methods,no-member
 import os
-# from os.path import join as pjoin
+from os.path import join as pjoin
 
 import json
 import tempfile
 
 # import h5py
-# import numpy as np
+import numpy as np
 
 from cusim import aux
 from cusim.culda.culda_bind import CuLDABind
@@ -32,3 +32,20 @@ def __init__(self, opt=None):
     self.obj = CuLDABind()
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)
+
+    self.words, self.num_words = None, None
+    self.alpha, self.beta, self.grad_alpha, self.new_beta = \
+      None, None, None, None
+
+  def init_model(self):
+    with open(pjoin(self.opt.data_dir, "keys.txt"), "r") as fin:
+      self.words = [line.strip() for line in fin]
+    self.num_words = len(self.words)
+    self.alpha = \
+      np.abs(np.uniform(shape=(self.opt.num_topics,))).astype(np.float32)
+    self.beta = np.abs(np.uniform( \
+      shape=(self.num_words, self.opt.num_topics))).astype(np.float32)
+    self.beta /= np.sum(self.beta, axis=1)[None, :]
+    self.grad_alpha = np.zeros(shape=self.alpha.shape, dtype=np.float32)
+    self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
+    self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py
index 1a65f74..49b12a3 100644
--- a/cusim/ioutils/pyioutils.py
+++ b/cusim/ioutils/pyioutils.py
@@ -51,7 +51,7 @@ def load_stream_vocab(self, filepath, min_count, keys_path):
   def convert_stream_to_h5(self, filepath, min_count, out_dir,
                            chunk_indices=10000):
     os.makedirs(out_dir, exist_ok=True)
-    keys_path = pjoin(out_dir, "keys.csv")
+    keys_path = pjoin(out_dir, "keys.txt")
     token_path = pjoin(out_dir, "token.h5")
     self.logger.info("save key and token to %s, %s",
                      keys_path, token_path)
diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto
index b3899aa..b82c204 100644
--- a/cusim/proto/config.proto
+++ b/cusim/proto/config.proto
@@ -20,4 +20,5 @@ message CuLDAConfigProto {
   optional int32 num_topics = 3 [default = 10];
   optional int32 block_dim = 4 [default = 32];
   optional int32 hyper_threads = 5 [default = 10];
+  required string data_dir = 6;
 }
diff --git a/setup.py b/setup.py
index db61bb6..512d262 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 Jisang Yoon
+# Copyright (c) 2021 Jisang Yoon
 # All rights reserved.
 #
 # This source code is licensed under the Apache 2.0 license found in the
@@ -30,7 +30,8 @@
   raise RuntimeError("Python version 3.6 or later required.")
 
 assert platform.system() == 'Linux'  # TODO: MacOS
-
+with open("requirements.txt", "r") as fin:
+  INSTALL_REQUIRES = [line.strip() for line in fin]
 
 MAJOR = 0
 MINOR = 0
@@ -173,13 +174,16 @@ def setup_package():
     name='cusim',
     maintainer="Jisang Yoon",
     maintainer_email="vjs10101v@gmail.com",
+    author="Jisang Yoon",
+    author_email="vjs10101v@gmail.com",
     description=DOCLINES[0],
     long_description="\n".join(DOCLINES[2:]),
     url="https://github.com/js1010/cusim",
     download_url="https://github.com/js1010/cusim/releases",
     include_package_data=False,
-    license='Apac2',
+    license='Apache2',
     packages=['cusim/', "cusim/ioutils/", "cusim/culda/"],
+    install_requires=INSTALL_REQUIRES,
     cmdclass=cmdclass,
     classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f],
     platforms=['Linux', 'Mac OSX', 'Unix'],

From 70dfa2cda71ba933be5d4bc3ef9ca5bcb08b68d1 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Tue, 9 Feb 2021 23:53:28 +0900
Subject: [PATCH 15/26] add types.hpp

---
 cpp/include/culda/culda.hpp              |  2 +-
 cpp/include/utils/cuda_utils_kernels.cuh |  7 +------
 cpp/include/utils/types.hpp              | 13 +++++++++++++
 3 files changed, 15 insertions(+), 7 deletions(-)
 create mode 100644 cpp/include/utils/types.hpp

diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index ae1424a..127d515 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -28,7 +28,7 @@
 
 #include "json11.hpp"
 #include "utils/log.hpp"
-#include "utils/cuda_utils_kernels.cuh"
+#include "utils/types.hpp"
 
 namespace cusim {
 
diff --git a/cpp/include/utils/cuda_utils_kernels.cuh b/cpp/include/utils/cuda_utils_kernels.cuh
index d925549..026da7f 100644
--- a/cpp/include/utils/cuda_utils_kernels.cuh
+++ b/cpp/include/utils/cuda_utils_kernels.cuh
@@ -21,15 +21,10 @@
 #include <sstream>
 #include <ctime>
 #include <utility>
+#include "utils/types.hpp"
 
 namespace cusim {
 
-struct DeviceInfo {
-  int devId, mp_cnt, major, minor, cores;
-  bool unknown = false;
-};
-
-#define WARP_SIZE 32
 // Error Checking utilities, checks status codes from cuda calls
 // and throws exceptions on failure (which cython can proxy back to python)
 #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp
new file mode 100644
index 0000000..82d93cc
--- /dev/null
+++ b/cpp/include/utils/types.hpp
@@ -0,0 +1,13 @@
+// Copyright (c) 2021 Jisang Yoon
+// All rights reserved.
+//
+// This source code is licensed under the Apache 2.0 license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+
+struct DeviceInfo {
+  int devId, mp_cnt, major, minor, cores;
+  bool unknown = false;
+};
+
+#define WARP_SIZE 32

From 5456ff3e57af05126ecd056a8fcea6d1afa01d3b Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 00:07:16 +0900
Subject: [PATCH 16/26] bug-fix

---
 cusim/culda/bindings.cc |  4 ++--
 cusim/culda/pyculda.py  | 22 +++++++++++++++++-----
 examples/example1.py    |  7 ++++++-
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index 157b837..ec959c5 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -31,7 +31,7 @@ class CuLDABind {
     auto alpha_buffer = _alpha.request();
     auto beta_buffer = _beta.request();
     if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or
-        alpha_buffer.shape[0] != beta_buffer.shape[0]) {
+        alpha_buffer.shape[0] != beta_buffer.shape[1]) {
       throw std::runtime_error("invalid alpha or beta");
     }
 
@@ -42,7 +42,7 @@ class CuLDABind {
     auto new_beta_buffer = _new_beta.request();
     if (grad_alpha_buffer.ndim != 1 or
         new_beta_buffer.ndim != 2 or
-        grad_alpha_buffer.shape[0] != new_beta_buffer.shape[0]) {
+        grad_alpha_buffer.shape[0] != new_beta_buffer.shape[1]) {
       throw std::runtime_error("invalid grad_alpha or new_beta");
     }
 
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 1ef893d..1f1639a 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -38,14 +38,26 @@ def __init__(self, opt=None):
       None, None, None, None
 
   def init_model(self):
-    with open(pjoin(self.opt.data_dir, "keys.txt"), "r") as fin:
+    # load voca
+    self.logger.info("load key from %s", pjoin(self.opt.data_dir, "keys.txt"))
+    with open(pjoin(self.opt.data_dir, "keys.txt"), "rb") as fin:
       self.words = [line.strip() for line in fin]
     self.num_words = len(self.words)
+    self.logger.info("number of words: %d", self.num_words)
+
+    # random initialize alpha and beta
     self.alpha = \
-      np.abs(np.uniform(shape=(self.opt.num_topics,))).astype(np.float32)
-    self.beta = np.abs(np.uniform( \
-      shape=(self.num_words, self.opt.num_topics))).astype(np.float32)
-    self.beta /= np.sum(self.beta, axis=1)[None, :]
+      np.abs(np.random.uniform( \
+        size=(self.opt.num_topics,))).astype(np.float32)
+    self.beta = np.abs(np.random.uniform( \
+      size=(self.num_words, self.opt.num_topics))).astype(np.float32)
+    self.beta /= np.sum(self.beta, axis=0)[None, :]
+    self.logger.info("alpha %s, beta %s initialized",
+                     self.alpha.shape, self.beta.shape)
+
+    # zero initialize grad alpha and new beta
     self.grad_alpha = np.zeros(shape=self.alpha.shape, dtype=np.float32)
     self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
+
+    # push it to gpu
     self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
diff --git a/examples/example1.py b/examples/example1.py
index 6d97a84..9886733 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -39,7 +39,12 @@ def run_io():
 
 
 def run_lda():
-  CuLDA()
+  opt = {
+    "data_dir": DATA_PATH2,
+  }
+  lda = CuLDA(opt)
+  lda.init_model()
+
 
 if __name__ == "__main__":
   fire.Fire()

From e460142b8203faadfb981e8f52520d7f0f69af24 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 20:34:59 +0900
Subject: [PATCH 17/26] add rows and indices => cols

---
 cpp/include/culda/cuda_lda_kernels.cuh |  6 ++--
 cpp/include/utils/ioutils.hpp          |  4 +--
 cpp/src/culda/culda.cu                 | 12 ++++----
 cpp/src/utils/ioutils.cc               | 17 +++++------
 cusim/culda/bindings.cc                | 16 +++++------
 cusim/culda/pyculda.py                 | 39 ++++++++++++++++++++++----
 cusim/ioutils/bindings.cc              |  7 +++--
 cusim/ioutils/pyioutils.py             | 24 ++++++++++------
 8 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
index b8648a3..5211695 100644
--- a/cpp/include/culda/cuda_lda_kernels.cuh
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -26,8 +26,8 @@ float Digamma(float x) {
 }
 
 __global__ void EstepKernel(
-  const int* indices, const int* indptr, 
-  const int num_indices, const int num_indptr,
+  const int* cols, const int* indptr, 
+  const int num_cols, const int num_indptr,
   const int num_words, const int num_topics, const int num_iters,
   float* gamma, float* new_gamma, float* phi,
   float* alpha, float* beta,
@@ -55,7 +55,7 @@ __global__ void EstepKernel(
 
       // compute phi from gamma
       for (int k = beg; k < end; ++k) {
-        const int w = indices[k];
+        const int w = cols[k];
         // compute phi
         for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
           _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
diff --git a/cpp/include/utils/ioutils.hpp b/cpp/include/utils/ioutils.hpp
index cc3e09e..756b4b2 100644
--- a/cpp/include/utils/ioutils.hpp
+++ b/cpp/include/utils/ioutils.hpp
@@ -34,12 +34,12 @@ class IoUtils {
   std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads);
   std::pair<int, int> TokenizeStream(int num_lines, int num_threads);
   void GetWordVocab(int min_count, std::string keys_path);
-  void GetToken(int* indices, int* indptr, int offset);
+  void GetToken(int* rows, int* cols, int* indptr);
  private:
   void ParseLine(std::string line, std::vector<std::string>& line_vec);
   void ParseLineImpl(std::string line, std::vector<std::string>& line_vec);
 
-  std::vector<std::vector<int>> indices_;
+  std::vector<std::vector<int>> cols_;
   std::vector<int> indptr_;
   std::mutex global_lock_;
   std::ifstream stream_fin_;
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index d37c3f5..b57db97 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -61,18 +61,18 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
-void CuLDA::FeedData(const int* indices, const int* indptr, 
-    const int num_indices, const int num_indptr, const int num_iters) {
-  thrust::device_vector<int> dev_indices(num_indices);
+void CuLDA::FeedData(const int* cols, const int* indptr, 
+    const int num_cols, const int num_indptr, const int num_iters) {
+  thrust::device_vector<int> dev_cols(num_cols);
   thrust::device_vector<int> dev_indptr(num_indptr + 1);
-  thrust::copy(indices, indices + num_indices, dev_indices.begin());
+  thrust::copy(cols, cols + num_cols, dev_cols.begin());
   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
   CHECK_CUDA(cudaDeviceSynchronize());
 
   EstepKernel<<<block_cnt_, block_dim_>>>(
-    thrust::raw_pointer_cast(dev_indices.data()),
+    thrust::raw_pointer_cast(dev_cols.data()),
     thrust::raw_pointer_cast(dev_indptr.data()),
-    num_indices, num_indptr, num_words_, num_topics_, num_iters,
+    num_cols, num_indptr, num_words_, num_topics_, num_iters,
     thrust::raw_pointer_cast(dev_gamma_.data()),
     thrust::raw_pointer_cast(dev_new_gamma_.data()),
     thrust::raw_pointer_cast(dev_phi_.data()),
diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc
index f0b49d0..0cc6495 100644
--- a/cpp/src/utils/ioutils.cc
+++ b/cpp/src/utils/ioutils.cc
@@ -69,8 +69,8 @@ std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
   int read_lines = std::min(num_lines, remain_lines_);
   if (not read_lines) return {0, 0};
   remain_lines_ -= read_lines;
-  indices_.clear();
-  indices_.resize(read_lines);
+  cols_.clear();
+  cols_.resize(read_lines);
   indptr_.resize(read_lines);
   std::fill(indptr_.begin(), indptr_.end(), 0);
   #pragma omp parallel num_threads(num_threads)
@@ -91,27 +91,28 @@ std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
       // tokenize
       for (auto& word: line_vec) {
         if (not word_count_.count(word)) continue;
-        indices_[i].push_back(word_count_[word]);
+        cols_[i].push_back(word_count_[word]);
       }
     }
   }
   int cumsum = 0;
   for (int i = 0; i < read_lines; ++i) {
-    cumsum += indices_[i].size();
+    cumsum += cols_[i].size();
     indptr_[i] = cumsum;
   }
   return {read_lines, indptr_[read_lines - 1]};
 }
 
-void IoUtils::GetToken(int* indices, int* indptr, int offset) {
-  int n = indices_.size();
+void IoUtils::GetToken(int* rows, int* cols, int* indptr) {
+  int n = cols_.size();
   for (int i = 0; i < n; ++i) {
     int beg = i == 0? 0: indptr_[i - 1];
     int end = indptr_[i];
     for (int j = beg; j < end; ++j) {
-      indices[j] = indices_[i][j - beg];
+      rows[j] = i;
+      cols[j] = cols_[i][j - beg];
     }
-    indptr[i] = offset + indptr_[i];
+    indptr[i] = indptr_[i];
   }
 }
 
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index ec959c5..8c28639 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -54,17 +54,17 @@ class CuLDABind {
         _new_beta.mutable_data(0), num_words);
   }
 
-  void FeedData(py::object& indices, py::object indptr, const int num_iters) {
-    int_array _indices(indices);
+  void FeedData(py::object& cols, py::object indptr, const int num_iters) {
+    int_array _cols(cols);
     int_array _indptr(indptr);
-    auto indices_buffer = _indices.request();
+    auto cols_buffer = _cols.request();
     auto indptr_buffer = _indptr.request();
-    if (indices_buffer.ndim != 1 or indptr_buffer.ndim != 1) {
-      throw std::runtime_error("invalid indices or indptr");
+    if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) {
+      throw std::runtime_error("invalid cols or indptr");
     }
-    int num_indices = indices_buffer.shape[0];
+    int num_cols = cols_buffer.shape[0];
     int num_indptr = indptr_buffer.shape[0];
-    obj_.FeedData(_indices.data(0), _indptr.data(0), num_indices, num_indptr, num_iters);
+    obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr, num_iters);
   }
 
   void Pull() {
@@ -89,7 +89,7 @@ PYBIND11_PLUGIN(culda_bind) {
       py::arg("alpha"), py::arg("beta"),
       py::arg("grad_alpha"), py::arg("new_beta"))
   .def("feed_data", &CuLDABind::FeedData,
-      py::arg("indices"), py::arg("indptr"), py::arg("num_iters"))
+      py::arg("cols"), py::arg("indptr"), py::arg("num_iters"))
   .def("pull", &CuLDABind::Pull)
   .def("push", &CuLDABind::Push)
   .def("__repr__",
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 1f1639a..8f59b91 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -37,6 +37,14 @@ def __init__(self, opt=None):
     self.alpha, self.beta, self.grad_alpha, self.new_beta = \
       None, None, None, None
 
+  def preprocess_data(self):
+    if self.opt.skip_preprocess:
+      return
+    iou = IoUtils()
+    if not self.opt.data_dir:
+      self.opt.data_dir = tempfile.TemporaryDirectory().name
+    iou.convert_stream_to_h5(self.opt.data_path, self.opt.data_dir)
+
   def init_model(self):
     # load voca
     self.logger.info("load key from %s", pjoin(self.opt.data_dir, "keys.txt"))
@@ -46,11 +54,10 @@ def init_model(self):
     self.logger.info("number of words: %d", self.num_words)
 
     # random initialize alpha and beta
-    self.alpha = \
-      np.abs(np.random.uniform( \
-        size=(self.opt.num_topics,))).astype(np.float32)
-    self.beta = np.abs(np.random.uniform( \
-      size=(self.num_words, self.opt.num_topics))).astype(np.float32)
+    self.alpha = np.random.uniform( \
+      size=(self.opt.num_topics,)).astype(np.float32)
+    self.beta = np.random.uniform( \
+      size=(self.num_words, self.opt.num_topics)).astype(np.float32)
     self.beta /= np.sum(self.beta, axis=0)[None, :]
     self.logger.info("alpha %s, beta %s initialized",
                      self.alpha.shape, self.beta.shape)
@@ -61,3 +68,25 @@ def init_model(self):
 
     # push it to gpu
     self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
+
+  def train_model(self):
+    self.preprocess_data()
+    self.init_model()
+
+  def _train_Estep(self, h5f):
+    offset, size = 0, h5f["indptr"].shape[0] - 1
+    steps = (size - 1) // batch_size + 1
+    pbar = aux.Progbar(size)
+    for step in range(steps):
+      next_offset = min(size, offset + batch_size)
+      indptr = h5f["indptr"][offset:next_offset + 1]
+      beg, end = indptr[0], indptr[-1]
+      indptr -= beg
+      indices = h5f["indices"][beg:end]
+      offset = next_offset
+
+      self.obj.FeedData(indices, indptr, self.opt.num_iters_in_Estep)
+      pbar.update(offset)
+
+  def _train_Mstep(self):
+    pass
diff --git a/cusim/ioutils/bindings.cc b/cusim/ioutils/bindings.cc
index ad35ebb..28fbbc8 100644
--- a/cusim/ioutils/bindings.cc
+++ b/cusim/ioutils/bindings.cc
@@ -39,10 +39,11 @@ class IoUtilsBind {
     obj_.GetWordVocab(min_count, keys_path);
   }
 
-  void GetToken(py::object& indices, py::object& indptr, int offset) {
-    int_array _indices(indices);
+  void GetToken(py::object& rows, py::object& cols, py::object& indptr) {
+    int_array _rows(rows);
+    int_array _cols(cols);
     int_array _indptr(indptr);
-    obj_.GetToken(_indices.mutable_data(0), _indptr.mutable_data(0), offset);
+    obj_.GetToken(_rows.mutable_data(0), _cols.mutable_data(0), _indptr.mutable_data(0));
   }
 
  private:
diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py
index 49b12a3..5f6bc5e 100644
--- a/cusim/ioutils/pyioutils.py
+++ b/cusim/ioutils/pyioutils.py
@@ -23,7 +23,7 @@ def __init__(self, opt=None):
     self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto)
     self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level)
 
-    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+    tmp = tempfile.NamedTemporaryFile(mode='w')
     opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
     tmp.write(opt_content)
     tmp.close()
@@ -60,9 +60,12 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
     pbar = aux.Progbar(full_num_lines, unit_name="line")
     processed = 0
     h5f = h5py.File(token_path, "w")
-    indices = h5f.create_dataset("indices", shape=(chunk_indices,),
-                                 maxshape=(None,), dtype=np.int32,
-                                 chunks=(chunk_indices,))
+    rows = h5f.create_dataset("rows", shape=(chunk_indices,),
+                              maxshape=(None,), dtype=np.int32,
+                              chunks=(chunk_indices,))
+    cols = h5f.create_dataset("cols", shape=(chunk_indices,),
+                              maxshape=(None,), dtype=np.int32,
+                              chunks=(chunk_indices,))
     indptr =  h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
                                  dtype=np.int32, chunks=True)
     processed, offset = 1, 0
@@ -70,12 +73,15 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
     while True:
       read_lines, data_size = self.obj.tokenize_stream(
         self.opt.chunk_lines, self.opt.num_threads)
-      _indices = np.empty(shape=(data_size,), dtype=np.int32)
+      _rows = np.empty(shape=(data_size,), dtype=np.int32)
+      _cols = np.empty(shape=(data_size,), dtype=np.int32)
       _indptr = np.empty(shape=(read_lines,), dtype=np.int32)
-      self.obj.get_token(_indices, _indptr, offset)
-      indices.resize((offset + data_size,))
-      indices[offset:offset + data_size] = _indices
-      indptr[processed:processed + read_lines] = _indptr
+      self.obj.get_token(_rows, _cols, _indptr)
+      rows.resize((offset + data_size,))
+      rows[offset:offset + data_size] = _rows + (processed - 1)
+      cols.resize((offset + data_size,))
+      cols[offset:offset + data_size] = _cols
+      indptr[processed:processed + read_lines] = _indptr + offset
       offset += data_size
       processed += read_lines
       pbar.update(processed - 1)

From 36d92caaa9655f45333267f38b97e98a7cbe9e3d Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 21:57:38 +0900
Subject: [PATCH 18/26] implement M step

---
 cpp/include/culda/culda.hpp |  1 +
 cpp/src/culda/culda.cu      | 10 +++--
 cusim/culda/bindings.cc     |  9 ++++-
 cusim/culda/pyculda.py      | 77 ++++++++++++++++++++++++++++---------
 cusim/proto/config.proto    |  9 ++++-
 5 files changed, 82 insertions(+), 24 deletions(-)

diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index 127d515..ee649de 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -68,6 +68,7 @@ class CuLDA {
       const int num_indices, const int num_indptr, const int num_iters);
   void Pull();
   void Push();
+  int GetBlockCnt();
  private:
   DeviceInfo dev_info_;
   json11::Json opt_;
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index b57db97..592e3ed 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -49,11 +49,11 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   // resize device vector
   grad_alpha_ = grad_alpha;
   new_beta_ = new_beta;
-  dev_grad_alpha_.resize(num_topics_);
+  dev_grad_alpha_.resize(block_cnt_ * num_topics_);
   dev_new_beta_.resize(num_topics_ * num_words_);
 
   // copy to device
-  thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin());
+  thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
   thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
   dev_gamma_.resize(num_topics_ * block_cnt_);
   dev_new_gamma_.resize(num_topics_ * block_cnt_);
@@ -92,10 +92,14 @@ void CuLDA::Pull() {
 
 void CuLDA::Push() {
   thrust::copy(alpha_, alpha_ + num_topics_, dev_alpha_.begin());
-  thrust::copy(grad_alpha_, grad_alpha_ + num_topics_, dev_grad_alpha_.begin());
+  thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
   thrust::copy(beta_, beta_ + num_words_ * num_topics_, dev_beta_.begin());
   thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
+int CuLDA::GetBlockCnt() {
+  return block_cnt_;
+}
+
 }  // namespace cusim
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index 8c28639..716624d 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -40,9 +40,9 @@ class CuLDABind {
     float_array _new_beta(new_beta);
     auto grad_alpha_buffer = _grad_alpha.request();
     auto new_beta_buffer = _new_beta.request();
-    if (grad_alpha_buffer.ndim != 1 or
+    if (grad_alpha_buffer.ndim != 2 or
         new_beta_buffer.ndim != 2 or
-        grad_alpha_buffer.shape[0] != new_beta_buffer.shape[1]) {
+        grad_alpha_buffer.shape[1] != new_beta_buffer.shape[1]) {
       throw std::runtime_error("invalid grad_alpha or new_beta");
     }
 
@@ -75,6 +75,10 @@ class CuLDABind {
     obj_.Push();
   }
 
+  int GetBlockCnt() {
+    return obj_.GetBlockCnt();
+  }
+
  private:
   cusim::CuLDA obj_;
 };
@@ -92,6 +96,7 @@ PYBIND11_PLUGIN(culda_bind) {
       py::arg("cols"), py::arg("indptr"), py::arg("num_iters"))
   .def("pull", &CuLDABind::Pull)
   .def("push", &CuLDABind::Push)
+  .def("get_block_cnt", &CuLDABind::GetBlockCnt)
   .def("__repr__",
   [](const CuLDABind &a) {
     return "<CuLDABind>";
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 8f59b91..b7b2020 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -11,10 +11,11 @@
 import json
 import tempfile
 
-# import h5py
+import h5py
 import numpy as np
+from scipy.special import polygamma as pg
 
-from cusim import aux
+from cusim import aux, IoUtils
 from cusim.culda.culda_bind import CuLDABind
 from cusim.config_pb2 import CuLDAConfigProto
 
@@ -33,7 +34,7 @@ def __init__(self, opt=None):
     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
     os.remove(tmp.name)
 
-    self.words, self.num_words = None, None
+    self.words, self.num_words, self.num_docs = None, None, None
     self.alpha, self.beta, self.grad_alpha, self.new_beta = \
       None, None, None, None
 
@@ -43,7 +44,8 @@ def preprocess_data(self):
     iou = IoUtils()
     if not self.opt.data_dir:
       self.opt.data_dir = tempfile.TemporaryDirectory().name
-    iou.convert_stream_to_h5(self.opt.data_path, self.opt.data_dir)
+    iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count,
+                             self.opt.data_dir)
 
   def init_model(self):
     # load voca
@@ -51,7 +53,14 @@ def init_model(self):
     with open(pjoin(self.opt.data_dir, "keys.txt"), "rb") as fin:
       self.words = [line.strip() for line in fin]
     self.num_words = len(self.words)
-    self.logger.info("number of words: %d", self.num_words)
+
+    # count number of docs
+    h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r")
+    self.num_docs = h5f["indptr"].shape[0] - 1
+    h5f.close()
+
+    self.logger.info("number of words: %d, docs: %d",
+                     self.num_words, self.num_docs)
 
     # random initialize alpha and beta
     self.alpha = np.random.uniform( \
@@ -63,7 +72,9 @@ def init_model(self):
                      self.alpha.shape, self.beta.shape)
 
     # zero initialize grad alpha and new beta
-    self.grad_alpha = np.zeros(shape=self.alpha.shape, dtype=np.float32)
+    block_cnt = self.obj.get_block_cnt()
+    self.grad_alpha = np.zeros(shape=(block_cnt, self.num_topics),
+                               dtype=np.float32)
     self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
 
     # push it to gpu
@@ -72,21 +83,51 @@ def init_model(self):
   def train_model(self):
     self.preprocess_data()
     self.init_model()
-
-  def _train_Estep(self, h5f):
-    offset, size = 0, h5f["indptr"].shape[0] - 1
-    steps = (size - 1) // batch_size + 1
+    h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r")
+    for epoch in range(1, self.opt.epochs + 1):
+      self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
+      self._train_e_step(h5f)
+      self._train_m_step()
+    h5f.close()
+
+  def _train_e_step(self, h5f):
+    offset, size = 0, h5f["cols"].shape[0]
     pbar = aux.Progbar(size)
-    for step in range(steps):
-      next_offset = min(size, offset + batch_size)
+    while True:
+      target = h5f["indptr"][offset] + self.opt.batch_size
+      if target < size:
+        next_offset = h5f["rows"][target]
+      else:
+        next_offset = h5f["indptr"].shape[0] - 1
       indptr = h5f["indptr"][offset:next_offset + 1]
       beg, end = indptr[0], indptr[-1]
       indptr -= beg
-      indices = h5f["indices"][beg:end]
+      cols = h5f["cols"][beg:end]
       offset = next_offset
 
-      self.obj.FeedData(indices, indptr, self.opt.num_iters_in_Estep)
-      pbar.update(offset)
-
-  def _train_Mstep(self):
-    pass
+      self.obj.FeedData(cols, indptr, self.opt.num_iters_in_e_step)
+      pbar.update(end)
+      if end == size:
+        break
+
+  def _train_m_step(self):
+    self.obj.pull()
+
+    # update beta
+    self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :]
+    self.new_beta[:, :] = 0
+
+    # update alpha
+    alpha_sum = np.sum(self.alpha)
+    gvec = np.sum(self.grad_alpha, axis=0)
+    gvec += self.num_docs * (pg(0, alpha_sum) - pg(0, self.alpha))
+    hvec = self.num_docs * pg(1, self.alpha)
+    z_0 = pg(1, alpha_sum)
+    c_nume = np.sum(gvec / hvec)
+    c_deno = 1 / z_0 + np.sum(1 / hvec)
+    c_0 = c_nume / c_deno
+    delta = (gvec - c_0) / hvec
+    self.alpha -= delta
+    self.grad_alpha[:,:] = 0
+
+    self.obj.push()
diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto
index b82c204..0e0ad10 100644
--- a/cusim/proto/config.proto
+++ b/cusim/proto/config.proto
@@ -14,11 +14,18 @@ message IoUtilsConfigProto {
 }
 
 message CuLDAConfigProto {
+  required string data_path = 7;
+
   optional int32 py_log_level = 1 [default = 2];
   optional int32 c_log_level = 2 [default = 2];
 
   optional int32 num_topics = 3 [default = 10];
   optional int32 block_dim = 4 [default = 32];
   optional int32 hyper_threads = 5 [default = 10];
-  required string data_dir = 6;
+  optional string data_dir = 6;
+  optional bool skip_preprocess = 8;
+  optional int32 word_min_count = 9 [default = 5];
+  optional int32 batch_size = 10 [default = 100000];
+  optional int32 epochs = 11 [default = 10];
+  optional int32 num_iters_in_e_step = 12 [default = 5];
 }

From bab04b0b36e2711e0f38c8397dd7f3da0474305b Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 22:37:55 +0900
Subject: [PATCH 19/26] train only train samples

---
 cpp/include/culda/cuda_lda_kernels.cuh | 47 ++++++++++++++++++--------
 cpp/include/culda/culda.hpp            |  3 +-
 cpp/include/utils/types.hpp            |  1 +
 cpp/src/culda/culda.cu                 | 27 +++++++++++++--
 cusim/culda/bindings.cc                | 13 ++++---
 cusim/culda/pyculda.py                 | 24 +++++++++++--
 cusim/ioutils/pyioutils.py             |  6 ++++
 cusim/proto/config.proto               |  1 +
 8 files changed, 98 insertions(+), 24 deletions(-)

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
index 5211695..ad92afd 100644
--- a/cpp/include/culda/cuda_lda_kernels.cuh
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -26,17 +26,19 @@ float Digamma(float x) {
 }
 
 __global__ void EstepKernel(
-  const int* cols, const int* indptr, 
+  const int* cols, const int* indptr, const bool* vali,
   const int num_cols, const int num_indptr,
   const int num_words, const int num_topics, const int num_iters,
   float* gamma, float* new_gamma, float* phi,
   float* alpha, float* beta,
-  float* grad_alpha, float* new_beta) {
+  float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
   
   // storage for block
   float* _gamma = gamma + num_topics * blockIdx.x;
   float* _new_gamma = new_gamma + num_topics * blockIdx.x;
   float* _phi = phi + num_topics * blockIdx.x;
+  float* _grad_alpha = grad_alpha + num_topics * blockIdx.x;
+
 
   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
     int beg = indptr[i], end = indptr[i + 1];
@@ -56,18 +58,34 @@ __global__ void EstepKernel(
       // compute phi from gamma
       for (int k = beg; k < end; ++k) {
         const int w = cols[k];
+        const bool _vali = vali[k];
+
         // compute phi
-        for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
-          _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
-        __syncthreads();
-        
-        // normalize phi and add it to new gamma and new beta
-        float phi_sum = ReduceSum(_phi, num_topics);
-        for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
-          _phi[l] /= phi_sum;
-          _new_gamma[l] += _phi[l];
-          if (j + 1 == num_iters) 
-            new_beta[w * num_topics + l] += phi[l];
+        if (not _vali or j + 1 == num_iters) {
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
+            _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
+          __syncthreads();
+          
+          // normalize phi and add it to new gamma and new beta
+          float phi_sum = ReduceSum(_phi, num_topics);
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
+            _phi[l] /= phi_sum;
+            if (not _vali) _new_gamma[l] += _phi[l];
+            if (j + 1 == num_iters) { 
+              if (not _vali) new_beta[w * num_topics + l] += _phi[l];
+              _phi[l] *= beta[w * num_topics + l];
+            }
+          }
+          __syncthreads();
+        }
+        if (j + 1 == num_iters) {
+          float p = ReduceSum(_phi, num_topics);
+          if (threadIdx.x == 0) {
+            if (_vali)
+              vali_losses[blockIdx.x] += logf(p + EPS);
+            else
+              train_losses[blockIdx.x] += logf(p + EPS);
+          } 
         }
         __syncthreads();
       }
@@ -79,7 +97,8 @@ __global__ void EstepKernel(
     }
     float gamma_sum = ReduceSum(_gamma, num_topics);
     for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
-      grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum));
+      _grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum));
+
     __syncthreads();
   } 
 }
diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index ee649de..b779e3a 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -64,7 +64,8 @@ class CuLDA {
   bool Init(std::string opt_path);
   void LoadModel(float* alpha, float* beta,
       float* grad_alpha, float* new_beta, const int num_words);
-  void FeedData(const int* indices, const int* indptr,
+  std::pair<float, float> FeedData(
+      const int* indices, const int* indptr, const bool* vali,
       const int num_indices, const int num_indptr, const int num_iters);
   void Pull();
   void Push();
diff --git a/cpp/include/utils/types.hpp b/cpp/include/utils/types.hpp
index 82d93cc..87e864b 100644
--- a/cpp/include/utils/types.hpp
+++ b/cpp/include/utils/types.hpp
@@ -11,3 +11,4 @@ struct DeviceInfo {
 };
 
 #define WARP_SIZE 32
+#define EPS 1e-10f
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index 592e3ed..8f5b5fb 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -61,17 +61,27 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
-void CuLDA::FeedData(const int* cols, const int* indptr, 
+std::pair<float, float> CuLDA::FeedData(
+    const int* cols, const int* indptr, const bool* vali,
     const int num_cols, const int num_indptr, const int num_iters) {
+  
+  // copy feed data to GPU memory
   thrust::device_vector<int> dev_cols(num_cols);
   thrust::device_vector<int> dev_indptr(num_indptr + 1);
+  thrust::device_vector<bool> dev_vali(num_cols);
+  thrust::device_vector<float> dev_train_losses(block_cnt_, 0.0f);
+  thrust::device_vector<float> dev_vali_losses(block_cnt_, 0.0f);
   thrust::copy(cols, cols + num_cols, dev_cols.begin());
   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
+  thrust::copy(vali, vali + num_cols, dev_vali.begin());
+  
   CHECK_CUDA(cudaDeviceSynchronize());
 
+  // run E step in GPU
   EstepKernel<<<block_cnt_, block_dim_>>>(
     thrust::raw_pointer_cast(dev_cols.data()),
     thrust::raw_pointer_cast(dev_indptr.data()),
+    thrust::raw_pointer_cast(dev_vali.data()),
     num_cols, num_indptr, num_words_, num_topics_, num_iters,
     thrust::raw_pointer_cast(dev_gamma_.data()),
     thrust::raw_pointer_cast(dev_new_gamma_.data()),
@@ -79,9 +89,22 @@ void CuLDA::FeedData(const int* cols, const int* indptr,
     thrust::raw_pointer_cast(dev_alpha_.data()),
     thrust::raw_pointer_cast(dev_beta_.data()),
     thrust::raw_pointer_cast(dev_grad_alpha_.data()),
-    thrust::raw_pointer_cast(dev_new_beta_.data()));
+    thrust::raw_pointer_cast(dev_new_beta_.data()),
+    thrust::raw_pointer_cast(dev_train_losses.data()),
+    thrust::raw_pointer_cast(dev_vali_losses.data()));
   
   CHECK_CUDA(cudaDeviceSynchronize());
+
+  // pull loss
+  std::vector<float> train_losses(block_cnt_), vali_losses(block_cnt_);
+  thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin());
+  thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin());
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  // accumulate
+  float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f);
+  float vali_loss = std::accumulate(vali_losses.begin(), vali_losses.end(), 0.0f);
+  return {train_loss, vali_loss};
 }
 
 void CuLDA::Pull() {
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index 716624d..9e42c65 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -14,6 +14,7 @@ namespace py = pybind11;
 
 typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
 typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
+typedef py::array_t<bool, py::array::c_style | py::array::forcecast> bool_array;
 
 class CuLDABind {
  public:
@@ -54,17 +55,21 @@ class CuLDABind {
         _new_beta.mutable_data(0), num_words);
   }
 
-  void FeedData(py::object& cols, py::object indptr, const int num_iters) {
+  std::pair<float, float> FeedData(py::object& cols, py::object& indptr, py::object& vali, const int num_iters) {
     int_array _cols(cols);
     int_array _indptr(indptr);
+    bool_arrray _vali(vali);
     auto cols_buffer = _cols.request();
     auto indptr_buffer = _indptr.request();
-    if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) {
+    auto vali_buffer = _vali.request();
+    if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1 or vali_buffer.ndim != 1
+        or cols_buffer.shape[0] != vali_buffer.shape[0]) {
       throw std::runtime_error("invalid cols or indptr");
     }
     int num_cols = cols_buffer.shape[0];
     int num_indptr = indptr_buffer.shape[0];
-    obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr, num_iters);
+    return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0),
+        num_cols, num_indptr, num_iters);
   }
 
   void Pull() {
@@ -93,7 +98,7 @@ PYBIND11_PLUGIN(culda_bind) {
       py::arg("alpha"), py::arg("beta"),
       py::arg("grad_alpha"), py::arg("new_beta"))
   .def("feed_data", &CuLDABind::FeedData,
-      py::arg("cols"), py::arg("indptr"), py::arg("num_iters"))
+      py::arg("cols"), py::arg("indptr"), py::arg("vali"), py::arg("num_iters"))
   .def("pull", &CuLDABind::Pull)
   .def("push", &CuLDABind::Push)
   .def("get_block_cnt", &CuLDABind::GetBlockCnt)
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index b7b2020..b790bb0 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -92,7 +92,9 @@ def train_model(self):
 
   def _train_e_step(self, h5f):
     offset, size = 0, h5f["cols"].shape[0]
-    pbar = aux.Progbar(size)
+    pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
+    train_loss_nume, train_loss_deno = 0, 0
+    vali_loss_nume, vali_loss_deno = 0, 0
     while True:
       target = h5f["indptr"][offset] + self.opt.batch_size
       if target < size:
@@ -103,10 +105,26 @@ def _train_e_step(self, h5f):
       beg, end = indptr[0], indptr[-1]
       indptr -= beg
       cols = h5f["cols"][beg:end]
+      vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool)
       offset = next_offset
 
-      self.obj.FeedData(cols, indptr, self.opt.num_iters_in_e_step)
-      pbar.update(end)
+      # call cuda kernel
+      train_loss, vali_loss = \
+        self.obj.FeedData(cols, indptr, vali, self.opt.num_iters_in_e_step)
+
+      # accumulate loss
+      train_loss_nume -= train_loss
+      vali_loss_nume -= vali_loss
+      vali_cnt = np.count_nonzero(vali)
+      train_cnt = len(vali) - vali_cnt
+      train_loss_nume += train_cnt
+      vali_loss_nume += train_cnt
+      train_loss = train_loss_nume / train_loss_deno
+      vali_loss = vali_loss_nume / vali_loss_deno
+
+      # update progress bar
+      pbar.update(end, values=[("train_loss", train_loss),
+                               ("vali_loss", vali_loss)])
       if end == size:
         break
 
diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py
index 5f6bc5e..be045c8 100644
--- a/cusim/ioutils/pyioutils.py
+++ b/cusim/ioutils/pyioutils.py
@@ -66,6 +66,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
     cols = h5f.create_dataset("cols", shape=(chunk_indices,),
                               maxshape=(None,), dtype=np.int32,
                               chunks=(chunk_indices,))
+    vali = h5f.create_dataset("vali", shape=(chunk_indices,),
+                              maxshape=(None,), dtype=np.float32,
+                              chunks=(chunk_indices,))
     indptr =  h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
                                  dtype=np.int32, chunks=True)
     processed, offset = 1, 0
@@ -81,6 +84,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
       rows[offset:offset + data_size] = _rows + (processed - 1)
       cols.resize((offset + data_size,))
       cols[offset:offset + data_size] = _cols
+      vali.resize((offset + data_size,))
+      vali[offset:offset + data_size] = \
+        np.uniform(size=(data_size,)).astype(np.float32)
       indptr[processed:processed + read_lines] = _indptr + offset
       offset += data_size
       processed += read_lines
diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto
index 0e0ad10..14f9004 100644
--- a/cusim/proto/config.proto
+++ b/cusim/proto/config.proto
@@ -28,4 +28,5 @@ message CuLDAConfigProto {
   optional int32 batch_size = 10 [default = 100000];
   optional int32 epochs = 11 [default = 10];
   optional int32 num_iters_in_e_step = 12 [default = 5];
+  optional double vali_p = 13 [default = 0.2];
 }

From a5d945665adef879bee60fb341a02ea7a9ea1f5b Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 22:42:25 +0900
Subject: [PATCH 20/26] fix typo and compile succeed

---
 cusim/culda/bindings.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index 9e42c65..9a59c6e 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -58,7 +58,7 @@ class CuLDABind {
   std::pair<float, float> FeedData(py::object& cols, py::object& indptr, py::object& vali, const int num_iters) {
     int_array _cols(cols);
     int_array _indptr(indptr);
-    bool_arrray _vali(vali);
+    bool_array _vali(vali);
     auto cols_buffer = _cols.request();
     auto indptr_buffer = _indptr.request();
     auto vali_buffer = _vali.request();

From df154946e7aebddf9f28a5da95c4037515743868 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 22:46:46 +0900
Subject: [PATCH 21/26] bug-fix and set random seed

---
 cusim/culda/pyculda.py     | 1 +
 cusim/ioutils/pyioutils.py | 7 ++++---
 cusim/proto/config.proto   | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index b790bb0..35658eb 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -63,6 +63,7 @@ def init_model(self):
                      self.num_words, self.num_docs)
 
     # random initialize alpha and beta
+    np.random.seed(self.opt.seed)
     self.alpha = np.random.uniform( \
       size=(self.opt.num_topics,)).astype(np.float32)
     self.beta = np.random.uniform( \
diff --git a/cusim/ioutils/pyioutils.py b/cusim/ioutils/pyioutils.py
index be045c8..5bce9b7 100644
--- a/cusim/ioutils/pyioutils.py
+++ b/cusim/ioutils/pyioutils.py
@@ -23,7 +23,7 @@ def __init__(self, opt=None):
     self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto)
     self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level)
 
-    tmp = tempfile.NamedTemporaryFile(mode='w')
+    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
     opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
     tmp.write(opt_content)
     tmp.close()
@@ -49,7 +49,8 @@ def load_stream_vocab(self, filepath, min_count, keys_path):
     self.obj.get_word_vocab(min_count, keys_path)
 
   def convert_stream_to_h5(self, filepath, min_count, out_dir,
-                           chunk_indices=10000):
+                           chunk_indices=10000, seed=777):
+    np.random.seed(seed)
     os.makedirs(out_dir, exist_ok=True)
     keys_path = pjoin(out_dir, "keys.txt")
     token_path = pjoin(out_dir, "token.h5")
@@ -86,7 +87,7 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
       cols[offset:offset + data_size] = _cols
       vali.resize((offset + data_size,))
       vali[offset:offset + data_size] = \
-        np.uniform(size=(data_size,)).astype(np.float32)
+        np.random.uniform(size=(data_size,)).astype(np.float32)
       indptr[processed:processed + read_lines] = _indptr + offset
       offset += data_size
       processed += read_lines
diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto
index 14f9004..5c4e0d9 100644
--- a/cusim/proto/config.proto
+++ b/cusim/proto/config.proto
@@ -29,4 +29,5 @@ message CuLDAConfigProto {
   optional int32 epochs = 11 [default = 10];
   optional int32 num_iters_in_e_step = 12 [default = 5];
   optional double vali_p = 13 [default = 0.2];
+  optional int32 seed = 14 [default = 777];
 }

From f094b95aaf6cf84f7ea73c39818f109b9d0ad592 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 22:52:06 +0900
Subject: [PATCH 22/26] bug-fix in pyculda

---
 cusim/culda/pyculda.py | 4 ++--
 examples/example1.py   | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 35658eb..20a4f87 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -74,7 +74,7 @@ def init_model(self):
 
     # zero initialize grad alpha and new beta
     block_cnt = self.obj.get_block_cnt()
-    self.grad_alpha = np.zeros(shape=(block_cnt, self.num_topics),
+    self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
                                dtype=np.float32)
     self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
 
@@ -111,7 +111,7 @@ def _train_e_step(self, h5f):
 
       # call cuda kernel
       train_loss, vali_loss = \
-        self.obj.FeedData(cols, indptr, vali, self.opt.num_iters_in_e_step)
+        self.obj.feed_data(cols, indptr, vali, self.opt.num_iters_in_e_step)
 
       # accumulate loss
       train_loss_nume -= train_loss
diff --git a/examples/example1.py b/examples/example1.py
index 9886733..4297768 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -40,10 +40,11 @@ def run_io():
 
 def run_lda():
   opt = {
+    "data_path": DATA_PATH,
     "data_dir": DATA_PATH2,
   }
   lda = CuLDA(opt)
-  lda.init_model()
+  lda.train_model()
 
 
 if __name__ == "__main__":

From b97c1489721f1910be23c77fb731aa305587115d Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Wed, 10 Feb 2021 23:33:09 +0900
Subject: [PATCH 23/26] bug-fix in tokenization

---
 cpp/include/culda/cuda_lda_kernels.cuh |  5 +----
 cpp/src/culda/culda.cu                 | 11 ++++++-----
 cpp/src/utils/ioutils.cc               |  5 +++--
 cusim/culda/bindings.cc                |  4 ++--
 cusim/culda/pyculda.py                 | 12 ++++++++----
 examples/example1.py                   |  2 ++
 6 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
index ad92afd..e6885f8 100644
--- a/cpp/include/culda/cuda_lda_kernels.cuh
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -28,7 +28,7 @@ float Digamma(float x) {
 __global__ void EstepKernel(
   const int* cols, const int* indptr, const bool* vali,
   const int num_cols, const int num_indptr,
-  const int num_words, const int num_topics, const int num_iters,
+  const int num_topics, const int num_iters,
   float* gamma, float* new_gamma, float* phi,
   float* alpha, float* beta,
   float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
@@ -39,10 +39,8 @@ __global__ void EstepKernel(
   float* _phi = phi + num_topics * blockIdx.x;
   float* _grad_alpha = grad_alpha + num_topics * blockIdx.x;
 
-
   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
     int beg = indptr[i], end = indptr[i + 1];
-    
     // initialize gamma
     for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
       _gamma[j] = alpha[j] + (end - beg) / num_topics;
@@ -59,7 +57,6 @@ __global__ void EstepKernel(
       for (int k = beg; k < end; ++k) {
         const int w = cols[k];
         const bool _vali = vali[k];
-
         // compute phi
         if (not _vali or j + 1 == num_iters) {
           for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index 8f5b5fb..ac7cd8f 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -39,7 +39,7 @@ bool CuLDA::Init(std::string opt_path) {
 void CuLDA::LoadModel(float* alpha, float* beta, 
     float* grad_alpha, float* new_beta, int num_words) {
   num_words_ = num_words;
-  DEBUG("copy model({} x {})", num_topics_, num_words_);
+  DEBUG("copy model({} x {})", num_words_, num_topics_);
   dev_alpha_.resize(num_topics_);
   dev_beta_.resize(num_topics_ * num_words_);
   thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
@@ -49,7 +49,7 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   // resize device vector
   grad_alpha_ = grad_alpha;
   new_beta_ = new_beta;
-  dev_grad_alpha_.resize(block_cnt_ * num_topics_);
+  dev_grad_alpha_.resize(num_topics_ * block_cnt_);
   dev_new_beta_.resize(num_topics_ * num_words_);
 
   // copy to device
@@ -74,15 +74,15 @@ std::pair<float, float> CuLDA::FeedData(
   thrust::copy(cols, cols + num_cols, dev_cols.begin());
   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
   thrust::copy(vali, vali + num_cols, dev_vali.begin());
-  
   CHECK_CUDA(cudaDeviceSynchronize());
+  DEBUG0("copy feed data to GPU memory");
 
   // run E step in GPU
   EstepKernel<<<block_cnt_, block_dim_>>>(
     thrust::raw_pointer_cast(dev_cols.data()),
     thrust::raw_pointer_cast(dev_indptr.data()),
     thrust::raw_pointer_cast(dev_vali.data()),
-    num_cols, num_indptr, num_words_, num_topics_, num_iters,
+    num_cols, num_indptr, num_topics_, num_iters,
     thrust::raw_pointer_cast(dev_gamma_.data()),
     thrust::raw_pointer_cast(dev_new_gamma_.data()),
     thrust::raw_pointer_cast(dev_phi_.data()),
@@ -92,14 +92,15 @@ std::pair<float, float> CuLDA::FeedData(
     thrust::raw_pointer_cast(dev_new_beta_.data()),
     thrust::raw_pointer_cast(dev_train_losses.data()),
     thrust::raw_pointer_cast(dev_vali_losses.data()));
-  
   CHECK_CUDA(cudaDeviceSynchronize());
+  DEBUG0("run E step in GPU");
 
   // pull loss
   std::vector<float> train_losses(block_cnt_), vali_losses(block_cnt_);
   thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin());
   thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin());
   CHECK_CUDA(cudaDeviceSynchronize());
+  DEBUG0("pull loss values");
 
   // accumulate
   float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f);
diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc
index 0cc6495..aa325c9 100644
--- a/cpp/src/utils/ioutils.cc
+++ b/cpp/src/utils/ioutils.cc
@@ -90,8 +90,8 @@ std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
 
       // tokenize
       for (auto& word: line_vec) {
-        if (not word_count_.count(word)) continue;
-        cols_[i].push_back(word_count_[word]);
+        if (not word_idmap_.count(word)) continue;
+        cols_[i].push_back(word_idmap_[word]);
       }
     }
   }
@@ -155,6 +155,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
 
 void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
   INFO("number of raw words: {}", word_count_.size());
+  word_idmap_.clear(); word_list_.clear();
   for (auto& it: word_count_) {
     if (it.second >= min_count) {
       word_idmap_[it.first] = word_idmap_.size();
diff --git a/cusim/culda/bindings.cc b/cusim/culda/bindings.cc
index 9a59c6e..f85b2d5 100644
--- a/cusim/culda/bindings.cc
+++ b/cusim/culda/bindings.cc
@@ -47,7 +47,7 @@ class CuLDABind {
       throw std::runtime_error("invalid grad_alpha or new_beta");
     }
 
-    int num_words = beta_buffer.shape[1];
+    int num_words = beta_buffer.shape[0];
 
     return obj_.LoadModel(_alpha.mutable_data(0),
         _beta.mutable_data(0),
@@ -67,7 +67,7 @@ class CuLDABind {
       throw std::runtime_error("invalid cols or indptr");
     }
     int num_cols = cols_buffer.shape[0];
-    int num_indptr = indptr_buffer.shape[0];
+    int num_indptr = indptr_buffer.shape[0] - 1;
     return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0),
         num_cols, num_indptr, num_iters);
   }
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 20a4f87..6282ec3 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -19,6 +19,8 @@
 from cusim.culda.culda_bind import CuLDABind
 from cusim.config_pb2 import CuLDAConfigProto
 
+EPS = 1e-10
+
 class CuLDA:
   def __init__(self, opt=None):
     self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
@@ -77,6 +79,8 @@ def init_model(self):
     self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
                                dtype=np.float32)
     self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
+    self.logger.info("grad alpha %s, new beta %s initialized",
+                     self.grad_alpha.shape, self.new_beta.shape)
 
     # push it to gpu
     self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
@@ -118,10 +122,10 @@ def _train_e_step(self, h5f):
       vali_loss_nume -= vali_loss
       vali_cnt = np.count_nonzero(vali)
       train_cnt = len(vali) - vali_cnt
-      train_loss_nume += train_cnt
-      vali_loss_nume += train_cnt
-      train_loss = train_loss_nume / train_loss_deno
-      vali_loss = vali_loss_nume / vali_loss_deno
+      train_loss_deno += train_cnt
+      vali_loss_deno += vali_cnt
+      train_loss = train_loss_nume / (train_loss_deno + EPS)
+      vali_loss = vali_loss_nume / (vali_loss_deno + EPS)
 
       # update progress bar
       pbar.update(end, values=[("train_loss", train_loss),
diff --git a/examples/example1.py b/examples/example1.py
index 4297768..5f8f685 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -42,6 +42,8 @@ def run_lda():
   opt = {
     "data_path": DATA_PATH,
     "data_dir": DATA_PATH2,
+    # "skip_preprocess": True,
+    "c_log_level": 3,
   }
   lda = CuLDA(opt)
   lda.train_model()

From c7df6e4dd32949343145346af97f163823fc955b Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Thu, 11 Feb 2021 00:07:37 +0900
Subject: [PATCH 24/26] ensure non-negativitty

---
 cpp/include/culda/cuda_lda_kernels.cuh |  8 ++++----
 cpp/src/utils/ioutils.cc               |  4 ++--
 cusim/culda/pyculda.py                 | 10 ++++++++++
 examples/example1.py                   | 18 ++++++++++++++++--
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
index e6885f8..400b251 100644
--- a/cpp/include/culda/cuda_lda_kernels.cuh
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -30,7 +30,7 @@ __global__ void EstepKernel(
   const int num_cols, const int num_indptr,
   const int num_topics, const int num_iters,
   float* gamma, float* new_gamma, float* phi,
-  float* alpha, float* beta,
+  const float* alpha, const float* beta,
   float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
   
   // storage for block
@@ -76,12 +76,12 @@ __global__ void EstepKernel(
           __syncthreads();
         }
         if (j + 1 == num_iters) {
-          float p = ReduceSum(_phi, num_topics);
+          float p = fmaxf(EPS, ReduceSum(_phi, num_topics));
           if (threadIdx.x == 0) {
             if (_vali)
-              vali_losses[blockIdx.x] += logf(p + EPS);
+              vali_losses[blockIdx.x] += logf(p);
             else
-              train_losses[blockIdx.x] += logf(p + EPS);
+              train_losses[blockIdx.x] += logf(p);
           } 
         }
         __syncthreads();
diff --git a/cpp/src/utils/ioutils.cc b/cpp/src/utils/ioutils.cc
index aa325c9..14bd94e 100644
--- a/cpp/src/utils/ioutils.cc
+++ b/cpp/src/utils/ioutils.cc
@@ -37,10 +37,10 @@ void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
   int n = line.size();
   std::string element;
   for (int i = 0; i < n; ++i) {
-    if (line[i] == ' ' or line[i] == ',') {
+    if (line[i] == ' ') {
       ret.push_back(element);
       element.clear();
-    } else if (line[i] != '"') {
+    } else {
       element += std::tolower(line[i]);
     }
   }
diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 6282ec3..7bdbf8c 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -137,6 +137,7 @@ def _train_m_step(self):
     self.obj.pull()
 
     # update beta
+    self.new_beta[:, :] = np.maximum(self.new_beta, EPS)
     self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :]
     self.new_beta[:, :] = 0
 
@@ -151,6 +152,15 @@ def _train_m_step(self):
     c_0 = c_nume / c_deno
     delta = (gvec - c_0) / hvec
     self.alpha -= delta
+    self.alpha[:] = np.maximum(self.alpha, EPS)
     self.grad_alpha[:,:] = 0
 
     self.obj.push()
+
+  def save_model(self, model_path):
+    self.logger.info("save model path: %s", model_path)
+    h5f = h5py.File(model_path, "w")
+    h5f.create_dataset("alpha", data=self.alpha)
+    h5f.create_dataset("beta", data=self.beta)
+    h5f.create_dataset("keys", data=np.array(self.words))
+    h5f.close()
diff --git a/examples/example1.py b/examples/example1.py
index 5f8f685..0338dd2 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -9,6 +9,8 @@
 import subprocess
 import fire
 
+import h5py
+import numpy as np
 from gensim import downloader as api
 from cusim import aux, IoUtils, CuLDA
 
@@ -43,11 +45,23 @@ def run_lda():
     "data_path": DATA_PATH,
     "data_dir": DATA_PATH2,
     # "skip_preprocess": True,
-    "c_log_level": 3,
+    # "c_log_level": 3,
   }
   lda = CuLDA(opt)
   lda.train_model()
-
+  lda.save_model("res/lda.h5")
+  h5f = h5py.File("res/lda.h5", "r")
+  beta = h5f["beta"][:]
+  for i in range(lda.opt.num_topics):
+    print("=" * 50)
+    print(f"topic {i + 1}")
+    words = np.argsort(-beta.T[i])[:10]
+    print("-" * 50)
+    for j in range(10):
+      word = lda.words[words[j]].decode("utf8")
+      prob = beta[words[j], i]
+      print(f"rank {j + 1}. word: {word}, prob: {prob}")
+  h5f.close()
 
 if __name__ == "__main__":
   fire.Fire()

From 38c83db71b0947ac1dbba89a01c270a613e39adb Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Thu, 11 Feb 2021 00:20:11 +0900
Subject: [PATCH 25/26] change proto fieldname

---
 cusim/culda/pyculda.py   | 15 ++++++++-------
 cusim/proto/config.proto |  2 +-
 examples/example1.py     | 24 +++++++++++++-----------
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/cusim/culda/pyculda.py b/cusim/culda/pyculda.py
index 7bdbf8c..e0fa3d9 100644
--- a/cusim/culda/pyculda.py
+++ b/cusim/culda/pyculda.py
@@ -44,20 +44,21 @@ def preprocess_data(self):
     if self.opt.skip_preprocess:
       return
     iou = IoUtils()
-    if not self.opt.data_dir:
-      self.opt.data_dir = tempfile.TemporaryDirectory().name
+    if not self.opt.processed_data_dir:
+      self.opt.processed_data_dir = tempfile.TemporaryDirectory().name
     iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count,
-                             self.opt.data_dir)
+                             self.opt.processed_data_dir)
 
   def init_model(self):
     # load voca
-    self.logger.info("load key from %s", pjoin(self.opt.data_dir, "keys.txt"))
-    with open(pjoin(self.opt.data_dir, "keys.txt"), "rb") as fin:
+    data_dir = self.opt.processed_data_dir
+    self.logger.info("load key from %s", pjoin(data_dir, "keys.txt"))
+    with open(pjoin(data_dir, "keys.txt"), "rb") as fin:
       self.words = [line.strip() for line in fin]
     self.num_words = len(self.words)
 
     # count number of docs
-    h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r")
+    h5f = h5py.File(pjoin(data_dir, "token.h5"), "r")
     self.num_docs = h5f["indptr"].shape[0] - 1
     h5f.close()
 
@@ -88,7 +89,7 @@ def init_model(self):
   def train_model(self):
     self.preprocess_data()
     self.init_model()
-    h5f = h5py.File(pjoin(self.opt.data_dir, "token.h5"), "r")
+    h5f = h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r")
     for epoch in range(1, self.opt.epochs + 1):
       self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
       self._train_e_step(h5f)
diff --git a/cusim/proto/config.proto b/cusim/proto/config.proto
index 5c4e0d9..10b3820 100644
--- a/cusim/proto/config.proto
+++ b/cusim/proto/config.proto
@@ -22,7 +22,7 @@ message CuLDAConfigProto {
   optional int32 num_topics = 3 [default = 10];
   optional int32 block_dim = 4 [default = 32];
   optional int32 hyper_threads = 5 [default = 10];
-  optional string data_dir = 6;
+  optional string processed_data_dir = 6;
   optional bool skip_preprocess = 8;
   optional int32 word_min_count = 9 [default = 5];
   optional int32 batch_size = 10 [default = 100000];
diff --git a/examples/example1.py b/examples/example1.py
index 0338dd2..32c8dd5 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -17,10 +17,12 @@
 LOGGER = aux.get_logger()
 DOWNLOAD_PATH = "./res"
 # DATASET = "wiki-english-20171001"
-DATASET = "fake-news"
+DATASET = "quora-duplicate-questions"
 DATA_PATH = f"./res/{DATASET}.stream.txt"
-DATA_PATH2 = f"./res/{DATASET}-converted"
+LDA_PATH = f"./res/{DATASET}-lda.h5"
+PROCESSED_DATA_DIR = f"./res/{DATASET}-converted"
 MIN_COUNT = 5
+TOPK = 10
 
 def download():
   if os.path.exists(DATA_PATH):
@@ -37,28 +39,28 @@ def download():
 def run_io():
   download()
   iou = IoUtils(opt={"chunk_lines": 10000, "num_threads": 8})
-  iou.convert_stream_to_h5(DATA_PATH, 5, DATA_PATH2)
+  iou.convert_stream_to_h5(DATA_PATH, 5, PROCESSED_DATA_DIR)
 
 
 def run_lda():
   opt = {
     "data_path": DATA_PATH,
-    "data_dir": DATA_PATH2,
-    # "skip_preprocess": True,
-    # "c_log_level": 3,
+    "processed_data_dir": PROCESSED_DATA_DIR,
   }
   lda = CuLDA(opt)
   lda.train_model()
-  lda.save_model("res/lda.h5")
-  h5f = h5py.File("res/lda.h5", "r")
+  lda.save_model(LDA_PATH)
+  h5f = h5py.File(LDA_PATH, "r")
   beta = h5f["beta"][:]
-  for i in range(lda.opt.num_topics):
+  word_list = h5f["keys"][:]
+  num_topics = h5f["alpha"].shape[0]
+  for i in range(num_topics):
     print("=" * 50)
     print(f"topic {i + 1}")
     words = np.argsort(-beta.T[i])[:10]
     print("-" * 50)
-    for j in range(10):
-      word = lda.words[words[j]].decode("utf8")
+    for j in range(TOPK):
+      word = word_list[words[j]].decode("utf8")
       prob = beta[words[j], i]
       print(f"rank {j + 1}. word: {word}, prob: {prob}")
   h5f.close()

From c064f69f4e63352a38d027fb4c73809e96b217e7 Mon Sep 17 00:00:00 2001
From: js1010 <vjs1010v@gmail.com>
Date: Thu, 11 Feb 2021 14:22:36 +0900
Subject: [PATCH 26/26] use mutex

---
 cpp/include/culda/cuda_lda_kernels.cuh | 24 +++++++++++++++++++++---
 cpp/include/culda/culda.hpp            |  3 +++
 cpp/src/culda/culda.cu                 | 10 ++++++++--
 examples/example1.py                   |  1 +
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/cpp/include/culda/cuda_lda_kernels.cuh b/cpp/include/culda/cuda_lda_kernels.cuh
index 400b251..02dbb37 100644
--- a/cpp/include/culda/cuda_lda_kernels.cuh
+++ b/cpp/include/culda/cuda_lda_kernels.cuh
@@ -31,7 +31,8 @@ __global__ void EstepKernel(
   const int num_topics, const int num_iters,
   float* gamma, float* new_gamma, float* phi,
   const float* alpha, const float* beta,
-  float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
+  float* grad_alpha, float* new_beta, 
+  float* train_losses, float* vali_losses, int* mutex) {
   
   // storage for block
   float* _gamma = gamma + num_topics * blockIdx.x;
@@ -57,6 +58,7 @@ __global__ void EstepKernel(
       for (int k = beg; k < end; ++k) {
         const int w = cols[k];
         const bool _vali = vali[k];
+        
         // compute phi
         if (not _vali or j + 1 == num_iters) {
           for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
@@ -65,17 +67,33 @@ __global__ void EstepKernel(
           
           // normalize phi and add it to new gamma and new beta
           float phi_sum = ReduceSum(_phi, num_topics);
+
           for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
             _phi[l] /= phi_sum;
             if (not _vali) _new_gamma[l] += _phi[l];
+          }
+          __syncthreads();
+        }
+        
+        if (j + 1 == num_iters) {
+          // write access of w th vector of new_beta 
+          if (threadIdx.x == 0) {
+            while (atomicCAS(&mutex[w], 0, 1)) {}
+          } 
+
+          __syncthreads();
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
             if (j + 1 == num_iters) { 
               if (not _vali) new_beta[w * num_topics + l] += _phi[l];
               _phi[l] *= beta[w * num_topics + l];
             }
           }
           __syncthreads();
-        }
-        if (j + 1 == num_iters) {
+
+          // release lock
+          if (threadIdx.x == 0) mutex[w] = 0;
+          __syncthreads();
+
           float p = fmaxf(EPS, ReduceSum(_phi, num_topics));
           if (threadIdx.x == 0) {
             if (_vali)
diff --git a/cpp/include/culda/culda.hpp b/cpp/include/culda/culda.hpp
index b779e3a..3a126cc 100644
--- a/cpp/include/culda/culda.hpp
+++ b/cpp/include/culda/culda.hpp
@@ -70,6 +70,7 @@ class CuLDA {
   void Pull();
   void Push();
   int GetBlockCnt();
+
  private:
   DeviceInfo dev_info_;
   json11::Json opt_;
@@ -77,6 +78,8 @@ class CuLDA {
   thrust::device_vector<float> dev_alpha_, dev_beta_;
   thrust::device_vector<float> dev_grad_alpha_, dev_new_beta_;
   thrust::device_vector<float> dev_gamma_, dev_new_gamma_, dev_phi_;
+  thrust::device_vector<int> dev_mutex_;
+
   float *alpha_, *beta_, *grad_alpha_, *new_beta_;
   int block_cnt_, block_dim_;
   int num_topics_, num_words_;
diff --git a/cpp/src/culda/culda.cu b/cpp/src/culda/culda.cu
index ac7cd8f..9217902 100644
--- a/cpp/src/culda/culda.cu
+++ b/cpp/src/culda/culda.cu
@@ -51,13 +51,18 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   new_beta_ = new_beta;
   dev_grad_alpha_.resize(num_topics_ * block_cnt_);
   dev_new_beta_.resize(num_topics_ * num_words_);
-
   // copy to device
   thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
   thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
   dev_gamma_.resize(num_topics_ * block_cnt_);
   dev_new_gamma_.resize(num_topics_ * block_cnt_);
   dev_phi_.resize(num_topics_ * block_cnt_);
+  
+  // set mutex
+  dev_mutex_.resize(num_words_);
+  std::vector<int> host_mutex(num_words_, 0);
+  thrust::copy(host_mutex.begin(), host_mutex.end(), dev_mutex_.begin());
+  
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
@@ -91,7 +96,8 @@ std::pair<float, float> CuLDA::FeedData(
     thrust::raw_pointer_cast(dev_grad_alpha_.data()),
     thrust::raw_pointer_cast(dev_new_beta_.data()),
     thrust::raw_pointer_cast(dev_train_losses.data()),
-    thrust::raw_pointer_cast(dev_vali_losses.data()));
+    thrust::raw_pointer_cast(dev_vali_losses.data()),
+    thrust::raw_pointer_cast(dev_mutex_.data()));
   CHECK_CUDA(cudaDeviceSynchronize());
   DEBUG0("run E step in GPU");
 
diff --git a/examples/example1.py b/examples/example1.py
index 32c8dd5..f9362cb 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -46,6 +46,7 @@ def run_lda():
   opt = {
     "data_path": DATA_PATH,
     "processed_data_dir": PROCESSED_DATA_DIR,
+    "skip_preprocess":True,
   }
   lda = CuLDA(opt)
   lda.train_model()