diff --git a/benchmarks/tpch/README.md b/benchmarks/tpch/README.md
new file mode 100644
index 00000000000..7894ba4fd27
--- /dev/null
+++ b/benchmarks/tpch/README.md
@@ -0,0 +1,10 @@
+***Compare Lance vs Parquet for TPC-H Q1 and Q6 using the SF1 dataset***
+
+**Steps to run the benchmark:**
+
+1. `cd lance/benchmarks/tpch`
+2. `mkdir dataset && cd dataset`
+3. Download the `lineitem` Parquet file from <https://github.com/cwida/duckdb-data/releases/download/v1.0/lineitemsf1.snappy.parquet>, then rename it to `lineitem_sf1.parquet`.
+4. Generate the Lance dataset from the Parquet file in the same directory and name it `lineitem.lance` (the path `benchmark.py` reads).
+5. `cd ..`
+6. `python3 benchmark.py q1` (or `python3 benchmark.py q6`)
diff --git a/benchmarks/tpch/benchmark.py b/benchmarks/tpch/benchmark.py
new file mode 100644
index 00000000000..602d029f21f
--- /dev/null
+++ b/benchmarks/tpch/benchmark.py
@@ -0,0 +1,92 @@
+"""Benchmark Lance vs Parquet latency on TPC-H Q1 and Q6 (SF1 ``lineitem``).
+
+Usage: python3 benchmark.py {q1|q6}
+
+Expects ``./dataset/lineitem.lance`` and ``./dataset/lineitem_sf1.parquet``
+relative to the current working directory. Each reported latency covers
+opening the dataset (or creating the view), running the query through DuckDB,
+and materializing the result with ``.df()``.
+"""
+import lance
+import pandas as pd
+import pyarrow as pa
+import duckdb
+
+import sys
+import time
+
+Q1 = """
+SELECT
+    l_returnflag,
+    l_linestatus,
+    sum(l_quantity) as sum_qty,
+    sum(l_extendedprice) as sum_base_price,
+    sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+    avg(l_quantity) as avg_qty,
+    avg(l_extendedprice) as avg_price,
+    avg(l_discount) as avg_disc,
+    count(*) as count_order
+FROM
+    lineitem
+WHERE
+    l_shipdate <= date '1998-12-01' - interval '90' day
+GROUP BY
+    l_returnflag,
+    l_linestatus
+ORDER BY
+    l_returnflag,
+    l_linestatus;
+"""
+
+Q6 = """
+SELECT
+    sum(l_extendedprice * l_discount) as revenue
+FROM
+    lineitem
+WHERE
+    l_shipdate >= date '1994-01-01'
+    AND l_shipdate < date '1994-01-01' + interval '1' year
+    AND l_discount between 0.06 - 0.01 AND 0.06 + 0.01
+    AND l_quantity < 24;
+"""
+
+# Supported command-line query names mapped to their SQL text.
+QUERIES = {'q1': Q1, 'q6': Q6}
+
+# Validate the command line explicitly: `assert` is stripped under `python -O`.
+if len(sys.argv) != 2:
+    sys.exit("Usage: python3 benchmark.py {q1|q6}")
+
+query = QUERIES.get(sys.argv[1])
+if query is None:
+    sys.exit("We only support Q1 and Q6 for now")
+
+print("------------------BENCHMARK TPCH " + sys.argv[1] + "-------------------\n")
+
+# time.perf_counter() is the monotonic, high-resolution clock meant for
+# measuring elapsed intervals; time.time() can jump if the wall clock changes.
+
+##### Lance #####
+start1 = time.perf_counter()
+# Keep the variable name `lineitem`: the query's FROM clause refers to it and
+# no table is registered here, so DuckDB presumably picks up this Python
+# object by name (replacement scan) -- confirm against the DuckDB docs.
+lineitem = lance.dataset("./dataset/lineitem.lance")
+res1 = duckdb.sql(query).df()
+end1 = time.perf_counter()
+
+print("Lance Latency: ", str(round(end1 - start1, 3)) + 's')
+print(res1)
+
+##### Parquet #####
+# Presumably cleared so `lineitem` can only resolve to the view created below.
+lineitem = None
+start2 = time.perf_counter()
+# read from parquet and create a view instead of table from it
+duckdb.sql("CREATE VIEW lineitem AS SELECT * FROM read_parquet('./dataset/lineitem_sf1.parquet');")
+res2 = duckdb.sql(query).df()
+end2 = time.perf_counter()
+
+print("Parquet Latency: ", str(round(end2 - start2, 3)) + 's')
+print(res2)