# Generate data

In [None]:
import os
import subprocess
import psycopg2
from tempfile import NamedTemporaryFile
from converters import extract_connection_params

# LIMIT sizes (K) swept by the benchmark: the powers of two from 4 to 64.
K_values = [2 ** exponent for exponent in range(2, 7)]

# Table-size labels (N); each one is used as the suffix of a sift_base<N> table.
N_values = [
    "10k",
    "100k",
    "200k",
    "400k",
    "600k",
    "800k",
    "1m",
]

def run_command(command):
    """Run *command* through the shell and return its decoded output.

    Returns a ``(stdout, stderr)`` tuple of strings. The exit status is not
    inspected, so a failing command does not raise.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.stdout.decode(), completed.stderr.decode()

def _execute_sql_file(conn, cur, path):
    """Execute the SQL script stored at *path* on *cur* and commit it on *conn*."""
    with open(path, 'r') as sql_file:
        cur.execute(sql_file.read())
    # psycopg2 runs statements inside an implicit transaction. pgbench uses
    # its own sessions, so without a commit it would never see this DDL (and
    # closing the connection would roll it back).
    conn.commit()


def benchmark(should_index):
    """Run the pgbench nearest-neighbour benchmark for every (N, K) pair.

    Existing indices are dropped first; when *should_index* is truthy the
    index scripts are applied before the runs and dropped again afterwards.
    For each N in ``N_values`` and K in ``K_values`` a pgbench script is
    written to a temporary file, executed, and its report is stored in
    ``outputs/<true|false>_<N>_K<K>.txt`` and echoed to stdout.

    The database is taken from the ``DATABASE_URL`` environment variable.
    """
    import shlex  # local import: only needed to quote the shell command

    db_connection_string = os.environ.get('DATABASE_URL')
    conn = psycopg2.connect(db_connection_string)
    try:
        cur = conn.cursor()
        try:
            _execute_sql_file(conn, cur, './delete_indices.sql')

            if should_index:
                _execute_sql_file(conn, cur, './create_indices.sql')
                _execute_sql_file(conn, cur, './create_indices_derived.sql')

            # Connection parameters do not change between runs: parse and
            # shell-quote them once so special characters cannot break (or
            # inject into) the command line.
            host, port, user, password, database = [
                shlex.quote(str(param))
                for param in extract_connection_params(db_connection_string)
            ]

            # The shell redirect below fails if the directory is missing.
            os.makedirs("outputs", exist_ok=True)
            # Lower-case flag so the file stem matches the 'true'/'false'
            # keys that get_key() builds when the results are plotted.
            indexed_label = 'true' if should_index else 'false'

            for N in N_values:
                for K in K_values:
                    output_file = f"outputs/{indexed_label}_{N}_K{K}.txt"

                    # "\\set" keeps the literal backslash pgbench expects
                    # without relying on an invalid "\s" escape sequence.
                    query = f"""
                \\set id random(1, 10000)

                SELECT *
                FROM sift_base{N}
                ORDER BY v <-> (
                    SELECT v
                    FROM sift_base{N}
                    WHERE id = :id
                )
                LIMIT {K};
            """
                    with NamedTemporaryFile(mode="w", delete=False) as tmp_file:
                        tmp_file.write(query)
                        tmp_file_path = tmp_file.name

                    try:
                        # The database name is passed positionally: in the
                        # pgbench 15 used here "-d" means --debug, which only
                        # adds debug output and overhead to the measured run.
                        command = (
                            f'PGPASSWORD={password} pgbench'
                            f' -U {user} -h {host} -p {port}'
                            f' -f {shlex.quote(tmp_file_path)}'
                            f' -c 5 -j 5 -t 15 -r {database}'
                            f' > {shlex.quote(output_file)} 2>/dev/null'
                        )
                        run_command(command)
                    finally:
                        # delete=False leaves the script on disk; remove it
                        # once pgbench has finished with it.
                        os.unlink(tmp_file_path)

                    with open(output_file, "r") as file:
                        print(file.read())

                    print(f"Finished pgbench with N={N}, indexed={should_index}, K={K}")

            if should_index:
                _execute_sql_file(conn, cur, './delete_indices.sql')
        finally:
            cur.close()
    finally:
        conn.close()

# Run the benchmark without indices; the indexed run below is currently disabled.
benchmark(should_index=False)
# benchmark(True)

pgbench (15.3 (Debian 15.3-0+deb12u1))
transaction type: /tmp/tmp0j1xe6ce
scaling factor: 1
query mode: simple
number of clients: 5
number of threads: 5
maximum number of tries: 1
number of transactions per client: 15
number of transactions actually processed: 75/75
number of failed transactions: 0 (0.000%)
latency average = 4.366 ms
initial connection time = 34.015 ms
tps = 1145.143067 (without initial connection time)
statement latencies in milliseconds and failures:
         0.014           0  \set id random(1, 10000)
         4.170           0  SELECT *

Finished pgbench with N=10k, indexed=False, K=4
pgbench (15.3 (Debian 15.3-0+deb12u1))
transaction type: /tmp/tmpz8pnt8d2
scaling factor: 1
query mode: simple
number of clients: 5
number of threads: 5
maximum number of tries: 1
number of transactions per client: 15
number of transactions actually processed: 75/75
number of failed transactions: 0 (0.000%)
latency average = 2.717 ms
initial connection time = 31.626 ms
tps = 1840.5359

# Parse data

In [12]:
import sys
sys.path.append('../util')
import os
import re
from converters import convert_number_to_string, convert_string_to_number

# Distinct N and K labels found in the output file names.
N_values, K_values = set(), set()

# Metrics parsed from the pgbench reports, keyed by the output file stem.
latency_values = dict()
tps_values = dict()


def get_key(indexed, N, K):
    """Build the ``<true|false>_<N>_K<K>`` key identifying one benchmark run.

    *N* is rendered through ``convert_number_to_string`` before being
    placed in the key.
    """
    indexed_label = 'true' if indexed else 'false'
    return f"{indexed_label}_{convert_number_to_string(N)}_K{K}"

# NOTE: `dir` shadows the builtin; the name is kept because other cells may
# still refer to it.
dir = 'outputs'
file_names = os.listdir(dir)

# Compiled once instead of on every file.
latency_pattern = re.compile(r'latency average = (\d+\.\d+) ms')
tps_pattern = re.compile(r'tps = (\d+\.\d+)')

for file_name in file_names:
    parts = file_name.split('.')[0].split('_')
    if len(parts) != 3:
        continue
    # Skip files whose last part is not "K<number>"; they would otherwise
    # crash the int() conversion after the loop.
    if not parts[2][1:].isdecimal():
        continue

    # File names may carry the flag as "True"/"False" (Python's bool repr)
    # while get_key() builds "true"/"false"; normalise so lookups match.
    indexed = parts[0].lower()
    key = f"{indexed}_{parts[1]}_{parts[2]}"
    N_values.add(parts[1])
    K_values.add(parts[2][1:])

    # Read the whole report, then release the file before parsing it.
    with open(f"{dir}/{file_name}", 'r') as file:
        file_content = file.read()

    # Extract the average latency (ms) reported by pgbench.
    latency_average_match = latency_pattern.search(file_content)
    if latency_average_match:
        latency_values[key] = float(latency_average_match.group(1))

    # Extract the TPS (transactions per second) figure.
    tps_match = tps_pattern.search(file_content)
    if tps_match:
        tps_values[key] = float(tps_match.group(1))

# Replace the label sets with sorted numeric lists.
N_values = sorted(convert_string_to_number(N) for N in N_values)
K_values = sorted(int(K) for K in K_values)

# Plot data

In [13]:
from colors import red_shades, green_shades
import plotly.graph_objects as go

# Axis titles for each parameter that can appear on the x-axis.
full_strings = dict(
    N='Number of rows (N)',
    K='Number of similar vectors (K)',
)

# Values available for each parameter, as collected while parsing the outputs.
param_values = dict(N=N_values, K=K_values)

def generate_plot(x, y, key_to_values_dict, fixed, fixed_value):
    """Plot one metric against a swept parameter, one trace per index state.

    x: name of the parameter on the x-axis ('N' or 'K'); its values are
        taken from ``param_values[x]``.
    y: y-axis title, also used in the figure title.
    key_to_values_dict: mapping from run key (as built by ``get_key``) to
        the metric value to plot.
    fixed: name of the parameter held constant.
    fixed_value: the value of that constant parameter.

    Runs whose key is missing from *key_to_values_dict* are skipped, and a
    trace is only drawn when at least one point exists for it.
    """
    sweep = param_values[x]
    series_specs = (
        (True, 'indexed', green_shades),
        (False, 'unindexed', red_shades),
    )

    # Collect the points of each series.
    traces = []
    for indexed, label, shades in series_specs:
        xs, ys = [], []
        for param_x in sweep:
            params = {x: param_x, fixed: fixed_value}
            key = get_key(indexed, params['N'], params['K'])
            if key not in key_to_values_dict:
                continue
            xs.append(param_x)
            ys.append(key_to_values_dict[key])
        if xs:
            # Each series appears at most once, so it always takes the
            # first shade of its palette.
            traces.append((label, xs, ys, shades[0]))

    # Draw the figure.
    fig = go.Figure()
    for label, xs, ys, color in traces:
        scatter = go.Scatter(
            x=xs,
            y=ys,
            marker=dict(color=color),
            mode='lines+markers',
            name=label,
        )
        fig.add_trace(scatter)
    fig.update_layout(
        title=f"{y} vs. {x}",
        xaxis_title=full_strings[x],
        yaxis_title=y,
    )
    fig.show()


# For each metric, plot it against N (with K fixed at 4) and then against K
# (with N fixed at 100000).
for _y_label, _metric_values in (
    ('Latency (ms)', latency_values),
    ('Transactions / second', tps_values),
):
    generate_plot(x='N', y=_y_label, key_to_values_dict=_metric_values, fixed='K', fixed_value=4)
    generate_plot(x='K', y=_y_label, key_to_values_dict=_metric_values, fixed='N', fixed_value=100000)