js1010 · js1010 · Feb 7, 2021 · Feb 6, 2021 · Feb 6, 2021 · Feb 6, 2021
diff --git a/README.md b/README.md
@@ -1,2 +1,16 @@
-# cusim
-cuda implementaion of w2v and lda
+### How to install
+
+
+```shell
+# clone repo and submodules
+git clone git@github.com:js1010/cusim.git && cd cusim && git submodule update --init
+
+# install requirements
+pip install -r requirements.txt
+
+# generate proto
+python -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto
+
+# install
+python setup.py install
+```
diff --git a/cpp/include/culda.hpp b/cpp/include/culda.hpp
@@ -33,7 +33,7 @@
 namespace cusim {
 
 class CuLDA {
- public: 
+ public:
   CuLDA();
   ~CuLDA();
  private:

diff --git a/cpp/include/ioutils.hpp b/cpp/include/ioutils.hpp
@@ -20,17 +20,35 @@
 #include <iostream>
 #include <unordered_map>
 
+#include "json11.hpp"
+#include "log.hpp"
+#include "types.hpp"
+
 namespace cusim {
 
 class IoUtils {
  public:
   IoUtils();
   ~IoUtils();
-  void LoadGensimVocab(std::string filepath, int min_count);
+  bool Init(std::string opt_path);
+  int LoadStreamFile(std::string filepath);
+  std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads);
+  std::pair<int, int> TokenizeStream(int num_lines, int num_threads);
+  void GetWordVocab(int min_count, std::string keys_path);
+  void GetToken(int* indices, int* indptr, int offset);
  private:
-  std::vector<std::string> parse_line(std::string line);
-  std::unordered_map<std::string, int> word_idmap_;
+  void ParseLine(std::string line, std::vector<std::string>& line_vec);
+  void ParseLineImpl(std::string line, std::vector<std::string>& line_vec);
+
+  std::vector<std::vector<int>> indices_;
+  std::vector<int> indptr_;
+  std::mutex global_lock_;
+  std::ifstream stream_fin_;
+  json11::Json opt_;
+  std::shared_ptr<spdlog::logger> logger_;
+  std::unordered_map<std::string, int> word_idmap_, word_count_;
   std::vector<std::string> word_list_;
+  int num_lines_, remain_lines_;
 };  // class IoUtils
 
 } // namespace cusim
diff --git a/cpp/include/log.hpp b/cpp/include/log.hpp
@@ -39,6 +39,6 @@ class CuSimLogger {
  private:
   static int global_logging_level_;
   std::shared_ptr<spdlog::logger> logger_;
-};  // class CuHNSWLogger
+};  // class CuSimLogger
 
 }  // namespace cusim
diff --git a/cpp/src/culda.cu b/cpp/src/culda.cu
@@ -3,7 +3,7 @@
 //
 // This source code is licensed under the Apache 2.0 license found in the
 // LICENSE file in the root directory of this source tree.
-#include "culda.cuh"
+#include "culda.hpp"
 
 namespace cusim {
 

diff --git a/cpp/src/ioutils.cc b/cpp/src/ioutils.cc
@@ -13,47 +13,166 @@ IoUtils::IoUtils() {
 
 IoUtils::~IoUtils() {}
 
-std::vector<std::string> IoUtils::parse_line(std::string line) {
+bool IoUtils::Init(std::string opt_path) {
+  std::ifstream in(opt_path.c_str());
+  if (not in.is_open()) return false;
+
+  std::string str((std::istreambuf_iterator<char>(in)),
+      std::istreambuf_iterator<char>());
+  std::string err_cmt;
+  auto _opt = json11::Json::parse(str, err_cmt);
+  if (not err_cmt.empty()) return false;
+  opt_ = _opt;
+  CuSimLogger().set_log_level(opt_["c_log_level"].int_value());
+  return true;
+}
+
+void IoUtils::ParseLine(std::string line, std::vector<std::string>& ret) {
+  ParseLineImpl(line, ret);
+}
+
+
+void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
+  ret.clear();
   int n = line.size();
-  std::vector<std::string> ret;
   std::string element;
   for (int i = 0; i < n; ++i) {
-    if (line[i] == ' ') {
+    if (line[i] == ' ' or line[i] == ',') {
       ret.push_back(element);
       element.clear();
-    } else {
-      element += line[i];
+    } else if (line[i] != '"') {
+      element += std::tolower(line[i]);
     }
   }
   if (element.size() > 0) {
     ret.push_back(element);
   }
-  return ret;
 }
 
-void IoUtils::LoadGensimVocab(std::string filepath, int min_count) {
-  INFO("read gensim file to generate vocabulary: {}, min_count: {}", filepath, min_count);
-  std::ifstream fin(filepath.c_str());
-  std::unordered_map<std::string, int> word_count;
-  while (not fin.eof()) {
+int IoUtils::LoadStreamFile(std::string filepath) {
+  INFO("read gensim file to generate vocabulary: {}", filepath);
+  if (stream_fin_.is_open()) stream_fin_.close();
+  stream_fin_.open(filepath.c_str());
+  int count = 0;
+  std::string line;
+  while (getline(stream_fin_, line))
+    count++;
+  stream_fin_.close();
+  stream_fin_.open(filepath.c_str());
+  num_lines_ = count;
+  remain_lines_ = num_lines_;
+  INFO("number of lines: {}", num_lines_);
+  return count;
+}
+
+std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
+  int read_lines = std::min(num_lines, remain_lines_);
+  if (not read_lines) return {0, 0};
+  remain_lines_ -= read_lines;
+  indices_.clear();
+  indices_.resize(read_lines);
+  indptr_.resize(read_lines);
+  std::fill(indptr_.begin(), indptr_.end(), 0);
+  #pragma omp parallel num_threads(num_threads)
+  {
+    std::string line;
+    std::vector<std::string> line_vec;
+    #pragma omp for schedule(dynamic, 4)
+    for (int i = 0; i < read_lines; ++i) {
+      // get line thread-safely
+      {
+        std::unique_lock<std::mutex> lock(global_lock_);
+        getline(stream_fin_, line);
+      }
+
+      // seems to be bottle-neck
+      ParseLine(line, line_vec);
+
+      // tokenize
+      for (auto& word: line_vec) {
+        if (not word_count_.count(word)) continue;
+        indices_[i].push_back(word_count_[word]);
+      }
+    }
+  }
+  int cumsum = 0;
+  for (int i = 0; i < read_lines; ++i) {
+    cumsum += indices_[i].size();
+    indptr_[i] = cumsum;
+  }
+  return {read_lines, indptr_[read_lines - 1]};
+}
+
+void IoUtils::GetToken(int* indices, int* indptr, int offset) {
+  int n = indices_.size();
+  for (int i = 0; i < n; ++i) {
+    int beg = i == 0? 0: indptr_[i - 1];
+    int end = indptr_[i];
+    for (int j = beg; j < end; ++j) {
+      indices[j] = indices_[i][j - beg];
+    }
+    indptr[i] = offset + indptr_[i];
+  }
+}
+
+std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads) {
+  int read_lines = std::min(num_lines, remain_lines_);
+  remain_lines_ -= read_lines;
+  #pragma omp parallel num_threads(num_threads)
+  {
     std::string line;
-    getline(fin, line);
-    std::vector<std::string> line_vec = parse_line(line);
-    for (auto& word: line_vec) {
-      if (not word_count.count(word)) word_count[word] = 0;
-      word_count[word]++;
+    std::vector<std::string> line_vec;
+    std::unordered_map<std::string, int> word_count;
+    #pragma omp for schedule(dynamic, 4)
+    for (int i = 0; i < read_lines; ++i) {
+      // get line thread-safely
+      {
+        std::unique_lock<std::mutex> lock(global_lock_);
+        getline(stream_fin_, line);
+      }
+
+      // seems to be bottle-neck
+      ParseLine(line, line_vec);
+
+      // update private word count
+      for (auto& word: line_vec) {
+        word_count[word]++;
+      }
+    }
+
+    // update word count to class variable
+    {
+      std::unique_lock<std::mutex> lock(global_lock_);
+      for (auto& it: word_count) {
+        word_count_[it.first] += it.second;
+      }
     }
   }
-  INFO("number of raw words: {}", word_count.size());
-  word_idmap_.clear();
-  word_list_.clear();
-  for (auto& it: word_count) {
+  if (not remain_lines_) stream_fin_.close();
+  return {read_lines, word_count_.size()};
+}
+
+void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
+  INFO("number of raw words: {}", word_count_.size());
+  for (auto& it: word_count_) {
     if (it.second >= min_count) {
-      word_idmap_[it.first] = vocab_.size();
+      word_idmap_[it.first] = word_idmap_.size();
       word_list_.push_back(it.first);
     }
   }
   INFO("number of words after filtering: {}", word_list_.size());
+
+  // write keys to csv file
+  std::ofstream fout(keys_path.c_str());
+  INFO("dump keys to {}", keys_path);
+  std::string header = "index,key\n";
+  fout.write(header.c_str(), header.size());
+  int n = word_list_.size();
+  for (int i = 0; i < n; ++i) {
+    std::string line = std::to_string(i) + ",\"" + word_list_[i] + "\"\n";
+    fout.write(line.c_str(), line.size());
+  }
+  fout.close();
 }
 
 }  // namespace cusim
diff --git a/cpp/src/log.cc b/cpp/src/log.cc
@@ -11,12 +11,12 @@
 namespace cusim {
 int CuSimLogger::global_logging_level_ = 2;
 
-CuSimLogger::CuHNSWLogger() {
+CuSimLogger::CuSimLogger() {
   spdlog::set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v");
   logger_ = spdlog::default_logger();
 }
 
-std::shared_ptr<spdlog::logger>& CuHNSWLogger::get_logger() {
+std::shared_ptr<spdlog::logger>& CuSimLogger::get_logger() {
   return logger_;
 }