Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
language: cpp

sudo: required

compiler:
- gcc

matrix:
include:
- name: CUDA 10
env:
- CUDA=10.1.105-1
- CUDA_SHORT=10.1
- UBUNTU_VERSION=ubuntu1804
dist: bionic

before_install:
- sudo apt update
- sudo apt install -y software-properties-common
- sudo add-apt-repository -y ppa:deadsnakes/ppa
- sudo apt update
- sudo apt install -y python3-pip python3.6 g++
- pip3 install -U pip
- pip3 install setuptools
- pip3 install -r requirements.txt
- INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb
- wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}
- sudo dpkg -i ${INSTALLER}
- wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub
- sudo apt-key add 7fa2af80.pub
- sudo apt update -qq
- sudo apt install -y cuda-core-${CUDA_SHORT/./-} cuda-cudart-dev-${CUDA_SHORT/./-} cuda-curand-dev-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-}
- sudo apt clean
- export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
- export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
- export PATH=${CUDA_HOME}/bin:${PATH}
- python3.6 -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto

script:
- sudo python3.6 setup.py install
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# CUSIM

[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

### Introduction

This project is to speed up various ML models (e.g. topic modeling, word embedding, etc) by CUDA. It would be nice to think of it as [gensim](https://github.com/RaRe-Technologies/gensim)'s GPU version project. As a starting step, I implemented the most widely used word embedding model, the [word2vec](https://arxiv.org/pdf/1301.3781.pdf) model, and the most representative topic model, the [LDA (Latent Dirichlet Allocation)](https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf) model.
Expand Down
48 changes: 24 additions & 24 deletions cpp/include/utils/cuda_utils_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// LICENSE file in the root directory of this source tree.
#pragma once
#include <unistd.h>
#include <cublas_v2.h>
// #include <cublas_v2.h>
#include <cuda_runtime.h>

#include <thrust/copy.h>
Expand Down Expand Up @@ -39,29 +39,29 @@ inline void checkCuda(cudaError_t code, const char *file, int line) {
}
}

inline const char* cublasGetErrorString(cublasStatus_t status) {
switch (status) {
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "Unknown";
}

#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
inline void checkCublas(cublasStatus_t code, const char * file, int line) {
if (code != CUBLAS_STATUS_SUCCESS) {
std::stringstream err;
err << "cublas error: " << cublasGetErrorString(code)
<< " (" << file << ":" << line << ")";
throw std::runtime_error(err.str());
}
}
// inline const char* cublasGetErrorString(cublasStatus_t status) {
// switch (status) {
// case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
// case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
// case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
// case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
// case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
// case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
// case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
// case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
// }
// return "Unknown";
// }
//
// #define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
// inline void checkCublas(cublasStatus_t code, const char * file, int line) {
// if (code != CUBLAS_STATUS_SUCCESS) {
// std::stringstream err;
// err << "cublas error: " << cublasGetErrorString(code)
// << " (" << file << ":" << line << ")";
// throw std::runtime_error(err.str());
// }
// }

inline DeviceInfo GetDeviceInfo() {
DeviceInfo ret;
Expand Down
85 changes: 70 additions & 15 deletions cuda_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,60 @@ def find_in_path(name, path):
return None


def get_cuda_sm_list(cuda_ver):
    """Return the SM (streaming multiprocessor) codes to generate cubins for.

    The ``CUDA_SM_LIST`` environment variable (comma-separated, e.g.
    ``"61,70,75"``) overrides the computed list verbatim.  Otherwise the
    full candidate list is narrowed to the architectures that the given
    toolkit version actually supports.

    Args:
        cuda_ver: toolkit version encoded as ``10 * major + minor``
            (e.g. CUDA 10.1 -> 101, CUDA 11.0 -> 110).

    Returns:
        list of SM version strings, e.g. ``["52", "60", ...]``.
    """
    override = os.environ.get("CUDA_SM_LIST")
    if override is not None:
        # explicit user override wins, no filtering applied
        return override.split(",")
    candidates = ["30", "52", "60", "61", "70", "75", "80", "86"]
    if cuda_ver >= 110:
        # CUDA 11 dropped sm_30; sm_86 only arrived with CUDA 11.1
        excluded = {"30"}
        if cuda_ver == 110:
            excluded.add("86")
    else:
        # pre-11 toolkits cannot target Ampere at all
        excluded = {"80", "86"}
        if cuda_ver < 100:
            excluded.add("75")   # Turing needs CUDA 10+
        if cuda_ver < 90:
            excluded.add("70")   # Volta needs CUDA 9+
        if cuda_ver < 80:
            excluded.update(("60", "61"))  # Pascal needs CUDA 8+
    return [sm for sm in candidates if sm not in excluded]


def get_cuda_compute(cuda_ver):
    """Return the PTX compute capability to embed for forward compatibility.

    The ``CUDA_COMPUTE`` environment variable overrides the computed value.
    Otherwise the highest compute capability supported by the toolkit is
    chosen.

    Bug fixed: the original chain of independent ``if`` statements left
    ``compute`` unassigned for any version outside 70..111 (e.g. CUDA
    11.2+), raising ``UnboundLocalError``.  Versions newer than 11.1 now
    fall through to ``"86"``; results for 7.0-11.1 are unchanged.

    Args:
        cuda_ver: toolkit version encoded as ``10 * major + minor``.

    Returns:
        compute capability string, e.g. ``"75"``.
    """
    if "CUDA_COMPUTE" in os.environ:
        return os.environ["CUDA_COMPUTE"]
    if cuda_ver < 80:    # CUDA 7.x
        return "52"
    if cuda_ver < 90:    # CUDA 8.x
        return "61"
    if cuda_ver < 100:   # CUDA 9.x
        return "70"
    if cuda_ver < 110:   # CUDA 10.x
        return "75"
    if cuda_ver == 110:  # CUDA 11.0
        return "80"
    return "86"          # CUDA 11.1 and newer


def get_cuda_arch(cuda_ver):
    """Return the minimum virtual architecture for nvcc's ``-arch=sm_XX``.

    The ``CUDA_ARCH`` environment variable overrides the computed value.

    Bug fixed: the original chain of independent ``if`` statements left
    ``arch`` unassigned for any version outside 70..111 (e.g. CUDA 11.2+),
    raising ``UnboundLocalError``.  Versions newer than 11.1 now fall
    through to ``"80"``; results for 7.0-11.1 are unchanged.

    Args:
        cuda_ver: toolkit version encoded as ``10 * major + minor``.

    Returns:
        SM version string, e.g. ``"50"``.
    """
    if "CUDA_ARCH" in os.environ:
        return os.environ["CUDA_ARCH"]
    if cuda_ver < 92:    # CUDA 7.0 - 9.1
        return "30"
    if cuda_ver < 110:   # CUDA 9.2 - 10.x
        return "50"
    if cuda_ver == 110:  # CUDA 11.0
        return "52"
    return "80"          # CUDA 11.1 and newer

def locate_cuda():
"""Locate the CUDA environment on the system
If a valid cuda installation is found
Expand Down Expand Up @@ -60,22 +114,23 @@ def locate_cuda():
'your path, or set $CUDA_HOME to enable CUDA extensions')
return None
home = os.path.dirname(os.path.dirname(nvcc))

cudaconfig = {'home': home,
'nvcc': nvcc,
'include': os.path.join(home, 'include'),
'lib64': os.path.join(home, 'lib64')}
post_args = [
"-arch=sm_52",
"-gencode=arch=compute_52,code=sm_52",
"-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_70,code=sm_70",
"-gencode=arch=compute_75,code=sm_75",
"-gencode=arch=compute_80,code=sm_80",
"-gencode=arch=compute_86,code=sm_86",
"-gencode=arch=compute_86,code=compute_86",
'--ptxas-options=-v', '-O2']
'nvcc': nvcc,
'include': os.path.join(home, 'include'),
'lib64': os.path.join(home, 'lib64')}
cuda_ver = os.path.basename(os.path.realpath(home)).split("-")[1].split(".")
major, minor = int(cuda_ver[0]), int(cuda_ver[1])
cuda_ver = 10 * major + minor
assert cuda_ver >= 70, f"too low cuda ver {major}.{minor}"
print(f"cuda_ver: {major}.{minor}")
arch = get_cuda_arch(cuda_ver)
sm_list = get_cuda_sm_list(cuda_ver)
compute = get_cuda_compute(cuda_ver)
post_args = [f"-arch=sm_{arch}"] + \
[f"-gencode=arch=compute_{sm},code=sm_{sm}" for sm in sm_list] + \
[f"-gencode=arch=compute_{compute},code=compute_{compute}",
"--ptxas-options=-v", "-O2"]
print(f"nvcc post args: {post_args}")
if HALF_PRECISION:
post_args = [flag for flag in post_args if "52" not in flag]

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def __init__(self, name):
extra_compile_args=extra_compile_args,
extra_link_args=["-fopenmp"],
library_dirs=[CUDA['lib64']],
libraries=['cudart', 'cublas', 'curand'],
libraries=['cudart', 'curand'],
extra_objects=[],
include_dirs=[ \
"cpp/include/", np.get_include(), pybind11.get_include(),
Expand All @@ -107,7 +107,7 @@ def __init__(self, name):
extra_compile_args=extra_compile_args,
extra_link_args=["-fopenmp"],
library_dirs=[CUDA['lib64']],
libraries=['cudart', 'cublas', 'curand'],
libraries=['cudart', 'curand'],
extra_objects=[],
include_dirs=[ \
"cpp/include/", np.get_include(), pybind11.get_include(),
Expand Down