Skip to content

Commit

Permalink
Generate binary datasets in CI instead of storing them in the repo
Browse files Browse the repository at this point in the history
  • Loading branch information
benjaminwinger committed May 24, 2024
1 parent 52f99f5 commit 08ced36
Show file tree
Hide file tree
Showing 31 changed files with 138 additions and 20 deletions.
102 changes: 91 additions & 11 deletions .github/workflows/ci-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,33 @@ concurrency:
cancel-in-progress: true

jobs:
generate-binary-demo:
  # Builds the release shell once and uses it to serialize the demo dataset
  # into dataset/binary-demo, then publishes that directory as the
  # `binary-demo` artifact for other jobs to download.
  name: Generate Binary Dataset
  needs: [clang-format, sanity-checks, python-lint-check]
  runs-on: kuzu-self-hosted-testing
  env:
    NUM_THREADS: 32
    GEN: ninja
    CC: gcc
    CXX: g++
  steps:
    - uses: actions/checkout@v3

    # generate_binary_demo.sh calls benchmark/serializer.py without
    # --kuzu-shell, so the shell must exist at the default release build path.
    - name: Build
      run: make release

    - name: Generate Datasets
      run: bash scripts/generate_binary_demo.sh

    - name: Upload binary-demo
      uses: actions/upload-artifact@v4
      with:
        name: binary-demo
        path: dataset/binary-demo
        # The database directory contains dot-files (.lock, .shadow, .wal);
        # upload-artifact v4.4+ skips hidden files unless told otherwise.
        include-hidden-files: true
        # Fail here rather than in every downstream download if the
        # generation step produced nothing.
        if-no-files-found: error

gcc-build-test:
name: gcc build & test
needs: [clang-format, sanity-checks, python-lint-check]
needs: [clang-format, sanity-checks, python-lint-check, generate-binary-demo]
runs-on: kuzu-self-hosted-testing
env:
NUM_THREADS: 32
Expand All @@ -47,9 +71,15 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Download binary-demo
uses: actions/download-artifact@v4
with:
name: binary-demo
path: dataset/binary-demo

- name: Ensure Python dependencies
run: |
pip install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip install --break-system-package --user -r tools/python_api/requirements_dev.txt -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
- name: Ensure Node.js dependencies
Expand Down Expand Up @@ -82,13 +112,19 @@ jobs:
code-coverage:
name: code coverage
runs-on: ubuntu-22.04
needs: [clang-format, sanity-checks, python-lint-check]
needs: [clang-format, sanity-checks, python-lint-check, generate-binary-demo]
env:
TEST_JOBS: 10
WERROR: 0
steps:
- uses: actions/checkout@v3

- name: Download binary-demo
uses: actions/download-artifact@v4
with:
name: binary-demo
path: dataset/binary-demo

- name: Install lcov
run: sudo apt-get install -y lcov

Expand All @@ -108,12 +144,18 @@ jobs:

gcc-build-test-x86:
name: gcc build & test 32-bit
needs: [clang-format, sanity-checks]
needs: [clang-format, sanity-checks, generate-binary-demo]
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Download binary-demo
uses: actions/download-artifact@v4
with:
name: binary-demo
path: dataset/binary-demo

- name: Start Docker container
run: |
docker run -d --name kuzu-x86 \
Expand Down Expand Up @@ -155,7 +197,7 @@ jobs:

gcc-build-test-with-asan:
name: gcc build & test with asan
needs: [gcc-build-test]
needs: [gcc-build-test, generate-binary-demo]
runs-on: kuzu-self-hosted-testing
env:
NUM_THREADS: 32
Expand All @@ -167,9 +209,15 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Download binary-demo
uses: actions/download-artifact@v4
with:
name: binary-demo
path: dataset/binary-demo

- name: Ensure Python dependencies
run: |
pip install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip install --break-system-package --user -r tools/python_api/requirements_dev.txt -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
- name: Ensure Node.js dependencies
Expand All @@ -183,7 +231,7 @@ jobs:

clang-build-test:
name: clang build and test
needs: [clang-format, sanity-checks, python-lint-check]
needs: [clang-format, sanity-checks, python-lint-check, generate-binary-demo]
runs-on: kuzu-self-hosted-testing
env:
NUM_THREADS: 32
Expand All @@ -201,9 +249,15 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Download binary-demo
uses: actions/download-artifact@v4
with:
name: binary-demo
path: dataset/binary-demo

- name: Ensure Python dependencies
run: |
pip install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip install --break-system-package --user -r tools/python_api/requirements_dev.txt -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
- name: Ensure Node.js dependencies
Expand Down Expand Up @@ -235,7 +289,7 @@ jobs:

msvc-build-test:
name: msvc build & test
needs: [clang-format, sanity-checks, python-lint-check]
needs: [clang-format, sanity-checks, python-lint-check, generate-binary-demo]
runs-on: self-hosted-windows
env:
# Shorten build path as much as possible
Expand All @@ -256,6 +310,12 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Download binary-demo
uses: actions/download-artifact@v4
with:
name: binary-demo
path: dataset/binary-demo

- name: Ensure Python dependencies
run: |
pip install torch~=2.0.0 --extra-index-url https://download.pytorch.org/whl/cpu
Expand Down Expand Up @@ -435,7 +495,7 @@ jobs:

macos-build-test:
name: apple clang build & test
needs: [clang-format, sanity-checks, rustfmt-check, python-lint-check]
needs: [clang-format, sanity-checks, rustfmt-check, python-lint-check, generate-binary-demo]
runs-on: self-hosted-mac-x64
env:
NUM_THREADS: 32
Expand All @@ -453,9 +513,15 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Download binary-demo
uses: actions/download-artifact@v4
with:
name: binary-demo
path: dataset/binary-demo

- name: Ensure Python dependencies
run: |
pip3 install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip3 install torch~=2.2.0 --break-system-package --extra-index-url https://download.pytorch.org/whl/cpu
pip3 install --break-system-package --user -r tools/python_api/requirements_dev.txt -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
- name: Ensure Node.js dependencies
Expand Down Expand Up @@ -563,6 +629,10 @@ jobs:
- name: Install dependencies
run: pip install --break-system-package rangehttpserver

# The shell must be built first: scripts/http-server.py runs it to generate the dataset that the server provides
- name: Extension test build
run: make extension-test-build

- name: Extension test
run: |
cd scripts/ && python3 http-server.py &
Expand Down Expand Up @@ -600,6 +670,10 @@ jobs:
- name: Install dependencies
run: pip3 install --break-system-package rangehttpserver

# The shell must be built first: scripts/http-server.py runs it to generate the dataset that the server provides
- name: Extension test build
run: make extension-test-build

- name: Extension test
run: |
cd scripts/ && python3 http-server.py &
Expand Down Expand Up @@ -640,6 +714,12 @@ jobs:
node -e 'fs=require("fs");fs.readFile(process.env.FNAME,"utf8",(err,data)=>{if(err!=null)throw err;fs.writeFile(process.env.FNAME,data.replaceAll(process.env.FIND,process.env.PG_HOST),"utf8",e=>{if(e!=null)throw e;});});'
cat postgres.test
- name: Extension test build
shell: cmd
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"
make extension-test-build
- name: Extension test
shell: cmd
run: |
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,7 @@ scripts/generate-cpp-docs/c/kuzu.h
scripts/generate-cpp-docs/cpp/headers
scripts/generate-cpp-docs/cpp/docs
scripts/generate-cpp-docs/c/docs

# Generated datasets
dataset/binary-demo
dataset/databases/tinysnb
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,16 @@ benchmark:
example:
$(call run-cmake-release, -DBUILD_EXAMPLES=TRUE)

extension-test:

# Configure and build the extension test tree through the
# run-cmake-relwithdebinfo macro (defined elsewhere in this Makefile), with the
# httpfs, duckdb and postgres extensions, their tests, AddressSanitizer and
# backtraces enabled. Kept as its own target so the build can be run without
# running the tests.
extension-test-build:
$(call run-cmake-relwithdebinfo, \
-DBUILD_EXTENSIONS="httpfs;duckdb;postgres" \
-DBUILD_EXTENSION_TESTS=TRUE \
-DENABLE_ADDRESS_SANITIZER=TRUE \
-DENABLE_BACKTRACES=TRUE \
)

# Build (via extension-test-build), run the extension tests with ctest using
# ${TEST_JOBS} parallel jobs, then remove this run's objects under
# s3://kuzu-dataset-us/${RUN_ID}/.
# NOTE(review): make normally stops at the first failing recipe line, so the S3
# cleanup looks like it is skipped when ctest fails -- confirm this is intended.
extension-test: extension-test-build
ctest --test-dir build/relwithdebinfo/extension --output-on-failure -j ${TEST_JOBS}
aws s3 rm s3://kuzu-dataset-us/${RUN_ID}/ --recursive

Expand Down
18 changes: 13 additions & 5 deletions benchmark/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import argparse

base_dir = os.path.dirname(os.path.realpath(__file__))
kuzu_exec_path = os.path.join(
base_dir, '..', 'build', 'release', 'tools', 'shell', 'kuzu')


def _get_kuzu_version():
Expand All @@ -19,7 +17,7 @@ def _get_kuzu_version():
return line.split(' ')[2].strip()


def serialize(dataset_name, dataset_path, serialized_graph_path, benchmark_copy_log_dir, single_thread: bool = False):
def serialize(kuzu_exec_path, dataset_name, dataset_path, serialized_graph_path, benchmark_copy_log_dir, single_thread: bool = False):
bin_version = _get_kuzu_version()

if not os.path.exists(serialized_graph_path):
Expand Down Expand Up @@ -65,8 +63,9 @@ def serialize(dataset_name, dataset_path, serialized_graph_path, benchmark_copy_
copy_match = re.match(r'copy\s+(.+?)\s+from', s, re.IGNORECASE)
# Run kuzu shell one query at a time. This ensures a new process is
# created for each query to avoid memory leaks.
stdout = sys.stdout if create_match or not benchmark_copy_log_dir else subprocess.PIPE
process = subprocess.Popen([kuzu_exec_path, serialized_graph_path],
stdin=subprocess.PIPE, stdout=sys.stdout if create_match else subprocess.PIPE, encoding="utf-8")
stdin=subprocess.PIPE, stdout=stdout, encoding="utf-8")
process.stdin.write(s + ";\n")
process.stdin.close()
if create_match:
Expand Down Expand Up @@ -100,9 +99,18 @@ def serialize(dataset_name, dataset_path, serialized_graph_path, benchmark_copy_
parser.add_argument("serialized_graph_path", help="Output path of the database. Will be created if it does not exist already")
parser.add_argument("benchmark_copy_log_dir", help="Optional directory to store copy logs", nargs="?")
parser.add_argument("--single-thread", help="If true, copy single threaded, which makes the results more reproduceable", action="store_true")
if sys.platform == "win32":
default_kuzu_exec_path = os.path.join(
base_dir, '..', 'build', 'release', 'tools', 'shell', 'kuzu_shell')
else:
default_kuzu_exec_path = os.path.join(
base_dir, '..', 'build', 'release', 'tools', 'shell', 'kuzu')
parser.add_argument("--kuzu-shell", help="Path of the kuzu shell executable. Defaults to the path as built in the default release build directory", default=default_kuzu_exec_path)
args = parser.parse_args()
args = parser.parse_args()

try:
serialize(args.dataset_name, args.dataset_path, args.serialized_graph_path, args.benchmark_copy_log_dir, args.single_thread)
serialize(args.kuzu_shell, args.dataset_name, args.dataset_path, args.serialized_graph_path, args.benchmark_copy_log_dir, args.single_thread)
except Exception as e:
logging.error(f'Error serializing dataset {args.dataset_name}')
raise e
Expand Down
Empty file removed dataset/binary-demo/.lock
Empty file.
Empty file removed dataset/binary-demo/.shadow
Empty file.
Empty file removed dataset/binary-demo/.wal
Empty file.
Binary file removed dataset/binary-demo/catalog.kz
Binary file not shown.
Binary file removed dataset/binary-demo/data.kz
Binary file not shown.
Binary file removed dataset/binary-demo/metadata.kz
Binary file not shown.
Binary file removed dataset/binary-demo/n-0.hindex
Binary file not shown.
Binary file removed dataset/binary-demo/n-0.hindex.ovf
Binary file not shown.
Binary file removed dataset/binary-demo/n-1.hindex
Binary file not shown.
Binary file removed dataset/binary-demo/n-1.hindex.ovf
Binary file not shown.
Binary file removed dataset/binary-demo/nodes.statistics_and_deleted.ids
Binary file not shown.
Binary file removed dataset/binary-demo/rels.statistics
Binary file not shown.
Empty file removed dataset/databases/tinysnb/.lock
Empty file.
Empty file removed dataset/databases/tinysnb/.shadow
Empty file.
Empty file removed dataset/databases/tinysnb/.wal
Empty file.
Binary file removed dataset/databases/tinysnb/catalog.kz
Binary file not shown.
Binary file removed dataset/databases/tinysnb/data.kz
Binary file not shown.
Binary file removed dataset/databases/tinysnb/metadata.kz
Binary file not shown.
Binary file removed dataset/databases/tinysnb/n-0.hindex
Binary file not shown.
Binary file removed dataset/databases/tinysnb/n-1.hindex
Binary file not shown.
Binary file removed dataset/databases/tinysnb/n-2.hindex
Binary file not shown.
Binary file removed dataset/databases/tinysnb/n-2.hindex.ovf
Binary file not shown.
Binary file not shown.
Binary file removed dataset/databases/tinysnb/rels.statistics
Binary file not shown.
1 change: 0 additions & 1 deletion dataset/databases/tinysnb/version.txt

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@
CD=`dirname "$0"`
DATASET_DIR=$CD/../dataset
python3 $CD/../benchmark/serializer.py DemoDB $DATASET_DIR/demo-db/parquet $DATASET_DIR/binary-demo --single-thread
python3 $CD/../benchmark/serializer.py TinySNB $DATASET_DIR/tinysnb $DATASET_DIR/databases/tinysnb --single-thread
27 changes: 26 additions & 1 deletion scripts/http-server.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,30 @@
import os
import shutil
import subprocess
import sys

# Regenerates the TinySNB database with the relwithdebinfo kuzu shell, then
# serves the dataset directory over HTTP (with range-request support) on port 80.

KUZU_ROOT = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
TINYSNB_DB_DIR = f"{KUZU_ROOT}/dataset/databases/tinysnb"

# Datasets can only be copied from the root since copy.schema contains relative paths
os.chdir(KUZU_ROOT)

# Start from a clean directory so a database left over from an earlier run is
# never reused.
if os.path.exists(TINYSNB_DB_DIR):
    shutil.rmtree(TINYSNB_DB_DIR)

# The shell executable is named differently on Windows.
if sys.platform == "win32":
    kuzu_shell_path = f"{KUZU_ROOT}/build/relwithdebinfo/tools/shell/kuzu_shell"
else:
    kuzu_shell_path = f"{KUZU_ROOT}/build/relwithdebinfo/tools/shell/kuzu"

# Use the interpreter running this script instead of a hard-coded "python3",
# which is not guaranteed to be on PATH (notably on Windows).
subprocess.check_call(
    [
        sys.executable,
        f"{KUZU_ROOT}/benchmark/serializer.py",
        "TinySNB",
        f"{KUZU_ROOT}/dataset/tinysnb",
        TINYSNB_DB_DIR,
        "--single-thread",
        "--kuzu-shell",
        kuzu_shell_path,
    ]
)

# Serve files relative to the dataset directory. As with the previous
# os.system call, the server's exit status is not checked.
os.chdir(f"{KUZU_ROOT}/dataset")
subprocess.call([sys.executable, "-m", "RangeHTTPServer", "80"])

0 comments on commit 08ced36

Please sign in to comment.