In [None]:
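# Compare magic_duckdb against the baseline (plain DuckDB) and other implementations.
#
# This cell only imports the dependencies and loads the IPython extensions; the
# semi-large test dataframes are created in the benchmark loop further down.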

import sys

import duckdb
import numpy as np
import pandas as pd
from pandas import DataFrame

import magic_duckdb

# Uncomment to install the extra packages used by the jupysql comparison:
# %pip install jupysql duckdb_engine

# Load the IPython extensions whose line magics the benchmarks below rely on:
# `sql` for %sql (jupysql test) and `magic_duckdb` (presumably the provider of %dql).
%load_ext sql
%load_ext magic_duckdb


In [None]:

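# Note on test_magicddb_pandas below: its `%dql -t df` line is not needed on the first
# run, since Pandas is the default output type. It is kept because test_magicddb_arrow
# switches the type to arrow (presumably a persistent setting), so later loop
# iterations need it to switch back.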

def print_versions():
    """Print the versions of Python, DuckDB, magic_duckdb and Pandas in use.

    One line is printed per component, as ``<label> <value>``. The DuckDB value
    is the dataframe returned by the ``pragma version`` query, so that entry
    may span several lines.
    """
    # Getters are lazy so each value is looked up right before its line is
    # printed, one component at a time.
    version_sources = (
        ("Python: ", lambda: sys.version),
        ("DuckDB: ", lambda: duckdb.execute("pragma version").df()),
        ("magic_duckdb: ", lambda: magic_duckdb.__version__),
        ("Pandas :", lambda: pd.__version__),
    )
    for label, read_version in version_sources:
        print(label, read_version())

def test_duckdb_execute_df():
    display("DuckDB: Execute DF")
    with duckdb.connect() as con:
        timing = %timeit -o con.execute("select * from simpledf").df()
        timings.append(("test_duckdb_execute_df", n, timing))

def test_duckdb_execute_arrow():
    display("DuckDB: Execute Arrow")
    with duckdb.connect() as con:
        timing = %timeit -o con.execute("select * from simpledf").arrow()
        timings.append(("test_duckdb_execute_arrow", n, timing))

def test_duckdb_sql_df():
    display("DuckDB: SQL DF")
    import duckdb
    with duckdb.connect() as con:
        timing = %timeit -o con.sql("select * from simpledf").df()
        timings.append(("test_duckdb_sql_df", n, timing))

def test_magicddb_pandas():
    display("magic_duckdb: pandas")
    %dql -t df
    timing = %timeit -o %dql select * from simpledf
    timings.append(("test_magicddb_pandas", n, timing))

def test_magicddb_arrow():
    display("magic_duckdb: arrow")
    %dql -t arrow
    timing = %timeit -o %dql select * from simpledf
    timings.append(("test_magicddb_arrow", n, timing))

def test_jupysql():
    display("jupysql, duckdb_engine, sql_alchemy")
    %sql duckdb:///:memory:
    %config SqlMagic.autopandas = True
    timing = %timeit -o %sql select * from simpledf
    timings.append(("test_jupysql", n, timing))



In [None]:
print_versions()

# Shared state read by the test_* functions: every benchmark appends a
# (test name, n, timeit result) tuple to `timings`.
timings = []
numcols = 20

# Benchmarks in the order they are run for each dataframe size.
benchmarks = (
    test_duckdb_execute_df,
    test_duckdb_execute_arrow,
    test_duckdb_sql_df,
    test_magicddb_pandas,
    test_magicddb_arrow,
    test_jupysql,
)

# `n` (row count) and `simpledf` must stay notebook-level names: the test
# functions read `n` directly and name `simpledf` inside their SQL text.
for n in (1, 1_000, 1_000_000):
    print(f"n={n}")
    simpledf = DataFrame(np.random.randn(n, numcols))
    for run_benchmark in benchmarks:
        run_benchmark()


In [None]:

# Persist the collected timings, one row per measurement, in the form
# "<test name> , <n> , <timeit summary>".
# The encoding is pinned to UTF-8 because the timeit summary contains non-ASCII
# characters ("±", and "µ" for microseconds) and the file is read back by DuckDB.
with open('timings2.csv', 'w', encoding='utf-8') as file:
    for test_name, row_count, result in timings:
        # Commas inside the summary (e.g. "1,000 loops") are turned into "."
        # so they are not mistaken for column separators.
        summary = str(result).replace(",", ".")
        # Write through print's `file` argument rather than swapping sys.stdout,
        # so an error while formatting a row cannot leave stdout redirected.
        print(test_name, ',', row_count, ',', summary, file=file)


In [None]:
# Pivot the recorded timings into one row per test (column0) and one column per
# dataframe size (column1); every cell is the mean run time in milliseconds.
#
# column2 holds the %timeit summary, e.g. " 1.23 ms ± 45.6 µs per loop (...)".
# %timeit picks the unit per measurement (ns, µs/us, ms or s), so the leading
# "<mean> <unit>" pair is parsed and normalised to ms instead of assuming "ms".
# Summaries that do not match (e.g. the minutes format) yield NULL.
duckdb.execute(r"""
    with parsed as (
        select
            column0,
            column1,
            regexp_extract(column2, '^\s*([0-9.eE+-]+)\s*(ns|us|µs|ms|s)\s', 1) as mean_value,
            regexp_extract(column2, '^\s*([0-9.eE+-]+)\s*(ns|us|µs|ms|s)\s', 2) as mean_unit
        from 'timings2.csv'
    ),
    data as (
        select
            column0,
            column1,
            (
                try_cast(mean_value as double)
                * case mean_unit
                    when 'ns' then 1e-6
                    when 'us' then 1e-3
                    when 'µs' then 1e-3
                    when 'ms' then 1.0
                    when 's' then 1e3
                end
            )::float as t
        from parsed
        order by column1, t asc
    )
    pivot data on column1 using last(t) group by column0
""").df()