Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,16 @@
# cusim
cuda implementaion of w2v and lda
### How to install


```shell
# clone repo and submodules
git clone git@github.com:js1010/cusim.git && cd cusim && git submodule update --init

# install requirements
pip install -r requirements.txt

# generate proto
python -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto

# install
python setup.py install
```
2 changes: 1 addition & 1 deletion cpp/include/culda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
namespace cusim {

class CuLDA {
public:
public:
CuLDA();
~CuLDA();
private:
Expand Down
24 changes: 21 additions & 3 deletions cpp/include/ioutils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,35 @@
#include <iostream>
#include <unordered_map>

#include "json11.hpp"
#include "log.hpp"
#include "types.hpp"

namespace cusim {

class IoUtils {
public:
IoUtils();
~IoUtils();
void LoadGensimVocab(std::string filepath, int min_count);
bool Init(std::string opt_path);
int LoadStreamFile(std::string filepath);
std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads);
std::pair<int, int> TokenizeStream(int num_lines, int num_threads);
void GetWordVocab(int min_count, std::string keys_path);
void GetToken(int* indices, int* indptr, int offset);
private:
std::vector<std::string> parse_line(std::string line);
std::unordered_map<std::string, int> word_idmap_;
void ParseLine(std::string line, std::vector<std::string>& line_vec);
void ParseLineImpl(std::string line, std::vector<std::string>& line_vec);

std::vector<std::vector<int>> indices_;
std::vector<int> indptr_;
std::mutex global_lock_;
std::ifstream stream_fin_;
json11::Json opt_;
std::shared_ptr<spdlog::logger> logger_;
std::unordered_map<std::string, int> word_idmap_, word_count_;
std::vector<std::string> word_list_;
int num_lines_, remain_lines_;
}; // class IoUtils

} // namespace cusim
2 changes: 1 addition & 1 deletion cpp/include/log.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,6 @@ class CuSimLogger {
private:
static int global_logging_level_;
std::shared_ptr<spdlog::logger> logger_;
}; // class CuHNSWLogger
}; // class CuSimLogger

} // namespace cusim
2 changes: 1 addition & 1 deletion cpp/src/culda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#include "culda.cuh"
#include "culda.hpp"

namespace cusim {

Expand Down
161 changes: 140 additions & 21 deletions cpp/src/ioutils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,166 @@ IoUtils::IoUtils() {

IoUtils::~IoUtils() {}

std::vector<std::string> IoUtils::parse_line(std::string line) {
bool IoUtils::Init(std::string opt_path) {
std::ifstream in(opt_path.c_str());
if (not in.is_open()) return false;

std::string str((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
std::string err_cmt;
auto _opt = json11::Json::parse(str, err_cmt);
if (not err_cmt.empty()) return false;
opt_ = _opt;
CuSimLogger().set_log_level(opt_["c_log_level"].int_value());
return true;
}

void IoUtils::ParseLine(std::string line, std::vector<std::string>& ret) {
ParseLineImpl(line, ret);
}


void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
ret.clear();
int n = line.size();
std::vector<std::string> ret;
std::string element;
for (int i = 0; i < n; ++i) {
if (line[i] == ' ') {
if (line[i] == ' ' or line[i] == ',') {
ret.push_back(element);
element.clear();
} else {
element += line[i];
} else if (line[i] != '"') {
element += std::tolower(line[i]);
}
}
if (element.size() > 0) {
ret.push_back(element);
}
return ret;
}

void IoUtils::LoadGensimVocab(std::string filepath, int min_count) {
INFO("read gensim file to generate vocabulary: {}, min_count: {}", filepath, min_count);
std::ifstream fin(filepath.c_str());
std::unordered_map<std::string, int> word_count;
while (not fin.eof()) {
int IoUtils::LoadStreamFile(std::string filepath) {
INFO("read gensim file to generate vocabulary: {}", filepath);
if (stream_fin_.is_open()) stream_fin_.close();
stream_fin_.open(filepath.c_str());
int count = 0;
std::string line;
while (getline(stream_fin_, line))
count++;
stream_fin_.close();
stream_fin_.open(filepath.c_str());
num_lines_ = count;
remain_lines_ = num_lines_;
INFO("number of lines: {}", num_lines_);
return count;
}

std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
int read_lines = std::min(num_lines, remain_lines_);
if (not read_lines) return {0, 0};
remain_lines_ -= read_lines;
indices_.clear();
indices_.resize(read_lines);
indptr_.resize(read_lines);
std::fill(indptr_.begin(), indptr_.end(), 0);
#pragma omp parallel num_threads(num_threads)
{
std::string line;
std::vector<std::string> line_vec;
#pragma omp for schedule(dynamic, 4)
for (int i = 0; i < read_lines; ++i) {
// get line thread-safely
{
std::unique_lock<std::mutex> lock(global_lock_);
getline(stream_fin_, line);
}

// seems to be bottle-neck
ParseLine(line, line_vec);

// tokenize
for (auto& word: line_vec) {
if (not word_count_.count(word)) continue;
indices_[i].push_back(word_count_[word]);
}
}
}
int cumsum = 0;
for (int i = 0; i < read_lines; ++i) {
cumsum += indices_[i].size();
indptr_[i] = cumsum;
}
return {read_lines, indptr_[read_lines - 1]};
}

void IoUtils::GetToken(int* indices, int* indptr, int offset) {
int n = indices_.size();
for (int i = 0; i < n; ++i) {
int beg = i == 0? 0: indptr_[i - 1];
int end = indptr_[i];
for (int j = beg; j < end; ++j) {
indices[j] = indices_[i][j - beg];
}
indptr[i] = offset + indptr_[i];
}
}

std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads) {
int read_lines = std::min(num_lines, remain_lines_);
remain_lines_ -= read_lines;
#pragma omp parallel num_threads(num_threads)
{
std::string line;
getline(fin, line);
std::vector<std::string> line_vec = parse_line(line);
for (auto& word: line_vec) {
if (not word_count.count(word)) word_count[word] = 0;
word_count[word]++;
std::vector<std::string> line_vec;
std::unordered_map<std::string, int> word_count;
#pragma omp for schedule(dynamic, 4)
for (int i = 0; i < read_lines; ++i) {
// get line thread-safely
{
std::unique_lock<std::mutex> lock(global_lock_);
getline(stream_fin_, line);
}

// seems to be bottle-neck
ParseLine(line, line_vec);

// update private word count
for (auto& word: line_vec) {
word_count[word]++;
}
}

// update word count to class variable
{
std::unique_lock<std::mutex> lock(global_lock_);
for (auto& it: word_count) {
word_count_[it.first] += it.second;
}
}
}
INFO("number of raw words: {}", word_count.size());
word_idmap_.clear();
word_list_.clear();
for (auto& it: word_count) {
if (not remain_lines_) stream_fin_.close();
return {read_lines, word_count_.size()};
}

void IoUtils::GetWordVocab(int min_count, std::string keys_path) {
INFO("number of raw words: {}", word_count_.size());
for (auto& it: word_count_) {
if (it.second >= min_count) {
word_idmap_[it.first] = vocab_.size();
word_idmap_[it.first] = word_idmap_.size();
word_list_.push_back(it.first);
}
}
INFO("number of words after filtering: {}", word_list_.size());

// write keys to csv file
std::ofstream fout(keys_path.c_str());
INFO("dump keys to {}", keys_path);
std::string header = "index,key\n";
fout.write(header.c_str(), header.size());
int n = word_list_.size();
for (int i = 0; i < n; ++i) {
std::string line = std::to_string(i) + ",\"" + word_list_[i] + "\"\n";
fout.write(line.c_str(), line.size());
}
fout.close();
}

} // namespace cusim
4 changes: 2 additions & 2 deletions cpp/src/log.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
namespace cusim {
int CuSimLogger::global_logging_level_ = 2;

CuSimLogger::CuHNSWLogger() {
CuSimLogger::CuSimLogger() {
spdlog::set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v");
logger_ = spdlog::default_logger();
}

std::shared_ptr<spdlog::logger>& CuHNSWLogger::get_logger() {
std::shared_ptr<spdlog::logger>& CuSimLogger::get_logger() {
return logger_;
}

Expand Down
Loading