Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
language: cpp

sudo: required

compiler:
- gcc

matrix:
include:
- name: CUDA 10
env:
- CUDA=10.1.105-1
- CUDA_SHORT=10.1
- UBUNTU_VERSION=ubuntu1804
dist: bionic

before_install:
- sudo apt update
- sudo apt install -y software-properties-common
- sudo add-apt-repository -y ppa:deadsnakes/ppa
- sudo apt update
- sudo apt install -y python3-pip python3.6 g++
- pip3 install -U pip
- pip3 install setuptools
- pip3 install -r requirements.txt
- INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb
- wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}
- sudo dpkg -i ${INSTALLER}
- wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub
- sudo apt-key add 7fa2af80.pub
- sudo apt update -qq
- sudo apt install -y cuda-core-${CUDA_SHORT/./-} cuda-cudart-dev-${CUDA_SHORT/./-} cuda-curand-dev-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-}
- sudo apt clean
- export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
- export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
- export PATH=${CUDA_HOME}/bin:${PATH}
- python3.6 -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto

script:
- sudo python3.6 setup.py install
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# CUSIM

[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

### Introduction

This project is to speed up various ML models (e.g. topic modeling, word embedding, etc) by CUDA. It would be nice to think of it as [gensim](https://github.com/RaRe-Technologies/gensim)'s GPU version project. As a starting step, I implemented the most widely used word embedding model, the [word2vec](https://arxiv.org/pdf/1301.3781.pdf) model, and the most representative topic model, the [LDA (Latent Dirichlet Allocation)](https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf) model.
Expand Down
48 changes: 24 additions & 24 deletions cpp/include/utils/cuda_utils_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// LICENSE file in the root directory of this source tree.
#pragma once
#include <unistd.h>
#include <cublas_v2.h>
// #include <cublas_v2.h>
#include <cuda_runtime.h>

#include <thrust/copy.h>
Expand Down Expand Up @@ -39,29 +39,29 @@ inline void checkCuda(cudaError_t code, const char *file, int line) {
}
}

inline const char* cublasGetErrorString(cublasStatus_t status) {
switch (status) {
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "Unknown";
}

#define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
inline void checkCublas(cublasStatus_t code, const char * file, int line) {
if (code != CUBLAS_STATUS_SUCCESS) {
std::stringstream err;
err << "cublas error: " << cublasGetErrorString(code)
<< " (" << file << ":" << line << ")";
throw std::runtime_error(err.str());
}
}
// inline const char* cublasGetErrorString(cublasStatus_t status) {
// switch (status) {
// case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
// case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
// case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
// case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
// case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
// case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
// case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
// case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
// }
// return "Unknown";
// }
//
// #define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
// inline void checkCublas(cublasStatus_t code, const char * file, int line) {
// if (code != CUBLAS_STATUS_SUCCESS) {
// std::stringstream err;
// err << "cublas error: " << cublasGetErrorString(code)
// << " (" << file << ":" << line << ")";
// throw std::runtime_error(err.str());
// }
// }

inline DeviceInfo GetDeviceInfo() {
DeviceInfo ret;
Expand Down
85 changes: 70 additions & 15 deletions cuda_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,60 @@ def find_in_path(name, path):
return None


def get_cuda_sm_list(cuda_ver):
    """Return the SM (streaming multiprocessor) codes to generate cubins for.

    The ``CUDA_SM_LIST`` environment variable (comma-separated, e.g.
    ``"61,70,75"``) overrides the computed list verbatim.  Otherwise the
    full candidate list is narrowed to the architectures that the given
    toolkit version actually supports.

    Args:
        cuda_ver: toolkit version encoded as ``10 * major + minor``
            (e.g. CUDA 10.1 -> 101, CUDA 11.0 -> 110).

    Returns:
        list of SM version strings, e.g. ``["52", "60", ...]``.
    """
    override = os.environ.get("CUDA_SM_LIST")
    if override is not None:
        # explicit user override wins, no filtering applied
        return override.split(",")
    candidates = ["30", "52", "60", "61", "70", "75", "80", "86"]
    if cuda_ver >= 110:
        # CUDA 11 dropped sm_30; sm_86 only arrived with CUDA 11.1
        excluded = {"30"}
        if cuda_ver == 110:
            excluded.add("86")
    else:
        # pre-11 toolkits cannot target Ampere at all
        excluded = {"80", "86"}
        if cuda_ver < 100:
            excluded.add("75")   # Turing needs CUDA 10+
        if cuda_ver < 90:
            excluded.add("70")   # Volta needs CUDA 9+
        if cuda_ver < 80:
            excluded.update(("60", "61"))  # Pascal needs CUDA 8+
    return [sm for sm in candidates if sm not in excluded]


def get_cuda_compute(cuda_ver):
    """Return the PTX compute capability to embed for forward compatibility.

    The ``CUDA_COMPUTE`` environment variable overrides the computed value.
    Otherwise the highest compute capability supported by the toolkit is
    chosen.

    Bug fixed: the original chain of independent ``if`` statements left
    ``compute`` unassigned for any version outside 70..111 (e.g. CUDA
    11.2+), raising ``UnboundLocalError``.  Versions newer than 11.1 now
    fall through to ``"86"``; results for 7.0-11.1 are unchanged.

    Args:
        cuda_ver: toolkit version encoded as ``10 * major + minor``.

    Returns:
        compute capability string, e.g. ``"75"``.
    """
    if "CUDA_COMPUTE" in os.environ:
        return os.environ["CUDA_COMPUTE"]
    if cuda_ver < 80:    # CUDA 7.x
        return "52"
    if cuda_ver < 90:    # CUDA 8.x
        return "61"
    if cuda_ver < 100:   # CUDA 9.x
        return "70"
    if cuda_ver < 110:   # CUDA 10.x
        return "75"
    if cuda_ver == 110:  # CUDA 11.0
        return "80"
    return "86"          # CUDA 11.1 and newer


def get_cuda_arch(cuda_ver):
    """Return the minimum virtual architecture for nvcc's ``-arch=sm_XX``.

    The ``CUDA_ARCH`` environment variable overrides the computed value.

    Bug fixed: the original chain of independent ``if`` statements left
    ``arch`` unassigned for any version outside 70..111 (e.g. CUDA 11.2+),
    raising ``UnboundLocalError``.  Versions newer than 11.1 now fall
    through to ``"80"``; results for 7.0-11.1 are unchanged.

    Args:
        cuda_ver: toolkit version encoded as ``10 * major + minor``.

    Returns:
        SM version string, e.g. ``"50"``.
    """
    if "CUDA_ARCH" in os.environ:
        return os.environ["CUDA_ARCH"]
    if cuda_ver < 92:    # CUDA 7.0 - 9.1
        return "30"
    if cuda_ver < 110:   # CUDA 9.2 - 10.x
        return "50"
    if cuda_ver == 110:  # CUDA 11.0
        return "52"
    return "80"          # CUDA 11.1 and newer

def locate_cuda():
"""Locate the CUDA environment on the system
If a valid cuda installation is found
Expand Down Expand Up @@ -60,22 +114,23 @@ def locate_cuda():
'your path, or set $CUDA_HOME to enable CUDA extensions')
return None
home = os.path.dirname(os.path.dirname(nvcc))

cudaconfig = {'home': home,
'nvcc': nvcc,
'include': os.path.join(home, 'include'),
'lib64': os.path.join(home, 'lib64')}
post_args = [
"-arch=sm_52",
"-gencode=arch=compute_52,code=sm_52",
"-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_70,code=sm_70",
"-gencode=arch=compute_75,code=sm_75",
"-gencode=arch=compute_80,code=sm_80",
"-gencode=arch=compute_86,code=sm_86",
"-gencode=arch=compute_86,code=compute_86",
'--ptxas-options=-v', '-O2']
'nvcc': nvcc,
'include': os.path.join(home, 'include'),
'lib64': os.path.join(home, 'lib64')}
cuda_ver = os.path.basename(os.path.realpath(home)).split("-")[1].split(".")
major, minor = int(cuda_ver[0]), int(cuda_ver[1])
cuda_ver = 10 * major + minor
assert cuda_ver >= 70, f"too low cuda ver {major}.{minor}"
print(f"cuda_ver: {major}.{minor}")
arch = get_cuda_arch(cuda_ver)
sm_list = get_cuda_sm_list(cuda_ver)
compute = get_cuda_compute(cuda_ver)
post_args = [f"-arch=sm_{arch}"] + \
[f"-gencode=arch=compute_{sm},code=sm_{sm}" for sm in sm_list] + \
[f"-gencode=arch=compute_{compute},code=compute_{compute}",
"--ptxas-options=-v", "-O2"]
print(f"nvcc post args: {post_args}")
if HALF_PRECISION:
post_args = [flag for flag in post_args if "52" not in flag]

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def __init__(self, name):
extra_compile_args=extra_compile_args,
extra_link_args=["-fopenmp"],
library_dirs=[CUDA['lib64']],
libraries=['cudart', 'cublas', 'curand'],
libraries=['cudart', 'curand'],
extra_objects=[],
include_dirs=[ \
"cpp/include/", np.get_include(), pybind11.get_include(),
Expand All @@ -107,7 +107,7 @@ def __init__(self, name):
extra_compile_args=extra_compile_args,
extra_link_args=["-fopenmp"],
library_dirs=[CUDA['lib64']],
libraries=['cudart', 'cublas', 'curand'],
libraries=['cudart', 'curand'],
extra_objects=[],
include_dirs=[ \
"cpp/include/", np.get_include(), pybind11.get_include(),
Expand Down