In [27]:
# Utility functions

import os
import time
from pgvector.psycopg2 import register_vector
import psycopg2
import plotly.graph_objects as go

def create_test_table(n, column):
    with psycopg2.connect(db_connection_string) as conn:
        with conn.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS test_table")
            cursor.execute(f'''
                CREATE TABLE test_table AS
                SELECT * FROM tsv_data
                WHERE {column}1 IS NOT NULL
                ORDER BY RANDOM()
                LIMIT {n}
            ''')
            cursor.execute('CREATE INDEX ON test_table USING ivfflat (image_url_ai2 vector_l2_ops) WITH (lists = 100)')
            cursor.execute('CREATE INDEX ON test_table USING ivfflat (context_page_description_ai2 vector_l2_ops) WITH (lists = 100)')
            cursor.execute('CREATE INDEX ON test_table (original_width)')

def drop_test_table():
    with psycopg2.connect(db_connection_string) as conn:
        with conn.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS test_table")

def get_table_count(column):
    with psycopg2.connect(db_connection_string) as conn:
        with conn.cursor() as cursor:
            cursor.execute(f"SELECT COUNT(*) FROM tsv_data WHERE {column}1 IS NOT NULL")
            count = cursor.fetchone()[0]
            return count

def generate_plots(title, variable, results, dimensions):
    # Create traces for each query type
    data = []
    for query_type, query_results in results.items():
        sizes, throughputs = zip(*query_results)  # Unpack the table sizes and throughput values
        trace = go.Scatter(
            x=sizes,
            y=throughputs,
            mode='lines+markers',
            name=query_type
        )
        data.append(trace)
    
    # Create layout
    layout = go.Layout(
        title=title,
        xaxis=dict(title=variable),
        yaxis=dict(title='Throughput (ops / s)'),
        showlegend=True
    )
    
    # Create figure
    fig = go.Figure(data=data, layout=layout)
    
    # Display the figure
    fig.show()

def print_throughput_results(variable, results, dimensions):
    print_double_line = lambda: print("=" * 40)
    print_line = lambda: print("-" * 40)
    for label, data in results.items():

        print("")
        print_double_line()
        print(f"{label} ({dimensions})")
        print_line()

        print(f"{variable:<15}{'Throughput (ops/s)':<20}")
        print_line()

        for sublabel, throughput in data:
            print(f"{sublabel:<15}{throughput:<20.3f}")

        print_double_line()
        print("")

WARMUP_ITERATIONS = 10
NUM_ITERATIONS = 100

def measure_throughput(query, generate_args):
    db_connection_string = os.environ.get('DATABASE_URL')
    with psycopg2.connect(db_connection_string) as conn:
        register_vector(conn)
        with conn.cursor() as cursor:
            # Create table

            # Helper for executing query
            def execute_query():
                if generate_args is not None:
                    vector = generate_args()
                    cursor.execute(query, (vector,))
                else:
                    cursor.execute(query)
            
            # Warm-up
            for _ in range(WARMUP_ITERATIONS):
                execute_query()

            # Measured
            start_time = time.time()
            for _ in range(NUM_ITERATIONS):
                execute_query()
            end_time = time.time()

    elapsed_time = end_time - start_time
    throughput = num_iterations / elapsed_time
    return throughput

In [21]:
# Generate, plot, and print experiment data for (throughput vs. table size) / query

import os
import time
import numpy as np
from pgvector.psycopg2 import register_vector
import psycopg2

N_INTERVAL = 1000

def column_experiments(column, dimensions):
    results = {}
    
    max_count = get_table_count(column)
    start = N_INTERVAL
    end = max_count - (max_count % N_INTERVAL) + 1
    for count in range(start, end, N_INTERVAL):
        create_test_table(count, column)

        experiment_inputs = [
            {
                'label': 'select 1',
                'query': 'SELECT 1'
            },
            {
                'label': 'select id',
                'query': f"SELECT 1 FROM test_table WHERE id < 100"
            },
            {
                'label': 'select int',
                'query': f"SELECT 1 FROM test_table WHERE original_height < 100"
            },
            {
                'label': 'select int (indexed)',
                'query': "SELECT 1 FROM test_table WHERE original_width < 100",
            },
            {
                'label': 'select vector',
                'query': f"SELECT 1 FROM test_table ORDER BY {column}1 <-> %s LIMIT 10",
                'generate_args': lambda: np.random.rand(dimensions)
            },
            {
                'label': 'select vector (indexed)',
                'query': f"SELECT * FROM test_table ORDER BY {column}2 <-> %s LIMIT 10",
                'generate_args': lambda: np.random.rand(dimensions)
            },
        ]

        for input in experiment_inputs:
            label = input['label']
            query = input['query']
            generate_args = input['generate_args'] if 'generate_args' in input else None
            throughput = measure_throughput(query, generate_args)
            if label not in results:
                results[label] = []
            results[label].append((count, throughput))

        drop_test_table()

    return results

def run_experiment(column, dimension):
    results = column_experiments(column, dimension)
    generate_plots(
        f"Throughput vs. Table Size for Each Query Type (Dimension = {dimension})",
        'Table Size',
        results,
        dimension
    )
    print_throughput_results('Table Size', results, dimension)

run_experiment('image_url_ai', 512)
run_experiment('context_page_description_ai', 768)


select 1 (512)
----------------------------------------
Table Size     Throughput (ops/s)  
----------------------------------------
1000           22929.718           
2000           21121.483           
3000           17273.305           
4000           16013.072           
5000           13575.996           
6000           36685.944           
7000           38272.689           
8000           22284.051           
9000           27639.565           
10000          36333.195           
11000          23477.772           
12000          16614.395           
13000          41177.145           


select id (512)
----------------------------------------
Table Size     Throughput (ops/s)  
----------------------------------------
1000           6396.779            
2000           4588.151            
3000           3281.337            
4000           2794.321            
5000           2522.237            
6000           1180.194            
7000           1928.212            
8000      


select 1 (768)
----------------------------------------
Table Size     Throughput (ops/s)  
----------------------------------------
1000           27156.387           
2000           34920.523           
3000           41659.754           
4000           21191.916           
5000           41302.846           
6000           30073.163           
7000           37933.472           
8000           40411.446           
9000           37052.155           
10000          21371.161           
11000          28064.931           
12000          22132.362           
13000          18358.226           
14000          35845.688           
15000          39934.343           
16000          16991.995           
17000          27947.122           
18000          9095.708            
19000          35705.321           
20000          6663.443            
21000          38434.014           
22000          39565.173           
23000          27114.254           
24000          40233.132           
25

In [31]:
# Generate, plot, and print experiment data for (throughput vs. k) / dimension

import os
import time
import numpy as np
from pgvector.psycopg2 import register_vector
import psycopg2
import plotly.graph_objects as go

N_INTERVAL = 1000

def column_experiments(count, column, dimensions):
    create_test_table(count, column)

    results = []
    for k in range(25, 201, 25):
        query = f"SELECT * FROM test_table ORDER BY {column}2 <-> %s LIMIT {k}"
        generate_args = input['generate_args'] if 'generate_args' in input else None
        throughput = measure_throughput(query, lambda: np.random.rand(dimensions))
        results.append((k, throughput))

    drop_test_table()

    return results

def run_experiment(inputs):
    shared_max_count = min(get_table_count(column) for column, _ in inputs)
    results = {}
    for column, dimension in inputs:
        results[dimension] = column_experiments(shared_max_count, column, dimension)
    generate_plots(
        f"Throughput vs. K for Each Dimension (N = {shared_max_count})",
        'K',
        results,
        dimension
    )
    print_throughput_results('K', results, dimension)

inputs = [
    ('image_url_ai', 512),
    ('context_page_description_ai', 768)
]
run_experiment(inputs)


512 (768)
----------------------------------------
K              Throughput (ops/s)  
----------------------------------------
25             334.325             
50             178.293             
75             118.783             
100            96.251              
125            67.735              
150            67.587              
175            65.809              
200            68.132              


768 (768)
----------------------------------------
K              Throughput (ops/s)  
----------------------------------------
25             274.297             
50             175.157             
75             130.754             
100            104.627             
125            86.099              
150            76.031              
175            64.502              
200            58.009              

