In [1]:
file_name = 'outputs/latency_create/experiment.txt'

# Generate data

In [1]:
import subprocess
from scripts.delete_index import delete_index

# Set your database connection parameters
port = "5432"
user = "postgres"
password = "postgres"

# You can specify N values as a list
N_values = ["10k", "100k", "200k", "400k", "600k", "800k", "1m"]

# Set your count
count = 10

# Suppress client_min_messages
suppress_command = "SET client_min_messages TO WARNING"

# Call the delete_index function
for N in N_values:
    delete_index('sift', N)

# Output file
file_name = 'outputs/latency_create/experiment.txt'
with open(file_name, 'w') as file:
    # Loop over each N value
    for N in N_values:
        # Loop count times
        for c in range(1, count + 1):
            line = f"Experiment - N: {N}, count: {c}"
            print(line)
            file.write(line + '\n')
            file.flush()

            # Create and run your create index query
            create_index_query = f"CREATE INDEX sift_base{N}_index ON sift_base10k USING ivfflat (v vector_l2_ops) WITH (lists = 10)"
            subprocess.run(["psql", "-p", port, "-U", user, "-c", suppress_command, "-c", "\\timing", "-c", create_index_query], env={"PGPASSWORD": password}, stdout=file)

            # Create and run your drop index query again
            drop_index_query = f"DROP INDEX sift_base{N}_index"
            subprocess.run(["psql", "-p", port, "-U", user, "-c", suppress_command, "-c", drop_index_query], env={"PGPASSWORD": password}, stdout=file)


Experiment - N: 10k, count: 1
Experiment - N: 10k, count: 2
Experiment - N: 10k, count: 3
Experiment - N: 10k, count: 4
Experiment - N: 10k, count: 5
Experiment - N: 10k, count: 6
Experiment - N: 10k, count: 7
Experiment - N: 10k, count: 8
Experiment - N: 10k, count: 9
Experiment - N: 10k, count: 10
Experiment - N: 100k, count: 1
Experiment - N: 100k, count: 2
Experiment - N: 100k, count: 3
Experiment - N: 100k, count: 4
Experiment - N: 100k, count: 5
Experiment - N: 100k, count: 6
Experiment - N: 100k, count: 7
Experiment - N: 100k, count: 8
Experiment - N: 100k, count: 9
Experiment - N: 100k, count: 10
Experiment - N: 200k, count: 1
Experiment - N: 200k, count: 2
Experiment - N: 200k, count: 3
Experiment - N: 200k, count: 4
Experiment - N: 200k, count: 5
Experiment - N: 200k, count: 6
Experiment - N: 200k, count: 7
Experiment - N: 200k, count: 8
Experiment - N: 200k, count: 9
Experiment - N: 200k, count: 10
Experiment - N: 400k, count: 1
Experiment - N: 400k, count: 2
Experiment - N:

# Parse data

In [2]:
import statistics

# Open the file
with open(file_name, "r") as file:
    lines = file.readlines()

# Initialize variables to hold the current N value, count and the results
current_N_value = None
current_results = []
results = []

def save_results():
    results.append((current_N_value, current_results))
    current_results = []

# Loop over the lines in the file
for line in lines:
    # Strip off the trailing newline and any extra spaces
    line = line.rstrip()
    
    # If this line starts a new experiment, save the N value
    if line.startswith("Experiment - N:"):
        parts = line.split(",")
        N_value = parts[0].split(":")[1].strip()
        if N_value != current_N_value:
            if current_N_value is not None:
                results.append((current_N_value, current_results))
            current_N_value = N_value
            current_results = []
    
    # If this line contains a time value, save it with the current N value
    elif line.startswith("Time:"):
        time = float(line.split(":")[1].strip().split(" ")[0])
        current_results.append(time)
results.append((current_N_value, current_results))
        
# Print the results
for result in results:
    print(result)
print()

# Calculate statistics
plot_results = []
for result in results:
    mean = statistics.mean(result[1])
    stdev = statistics.stdev(result[1])
    plot_result = (result[0], mean, stdev)
    plot_results.append(plot_result)
    print(plot_result)

('10k', [216.191, 187.779, 183.376, 184.314, 180.779, 181.402, 178.888, 176.835, 184.875, 186.937])
('100k', [183.591, 213.423, 202.152, 212.89, 209.776, 207.434, 196.451, 186.439, 183.431, 205.155])
('200k', [192.813, 223.47, 196.015, 195.91, 196.008, 184.842, 181.783, 199.493, 211.495, 215.01])
('400k', [199.202, 215.234, 192.578, 210.444, 197.89, 185.203, 198.001, 187.258, 183.475, 185.888])
('600k', [180.247, 187.719, 179.61, 186.897, 181.504, 188.627, 206.473, 188.12, 195.709, 195.618])
('800k', [189.656, 194.56, 212.176, 206.136, 174.973, 205.163, 188.574, 205.847, 200.316, 194.735])
('1m', [184.393, 213.766, 198.244, 193.942, 197.893, 194.557, 190.236, 212.159, 258.399, 188.631])

('10k', 186.1376, 11.102160772670437)
('100k', 200.0742, 11.872455505637125)
('200k', 199.6839, 13.205029735756836)
('400k', 195.5173, 10.841847065370773)
('600k', 189.0524, 8.313583933672783)
('800k', 197.2136, 11.007196748794243)
('1m', 203.222, 21.56641241993361)


# Plot data

In [23]:
import sys
sys.path.append('../util')
from converters import convert_string_to_number
import plotly.graph_objects as go

x_values, y_values, y_stdev = zip(*plot_results)
x_values = list(map(convert_string_to_number, x_values))

fig = go.Figure(data=go.Scatter(x=x_values, y=y_values, mode='lines', error_y=dict(
            type='data',
            array=y_stdev,
            visible=True
        )))

fig.update_layout(
    title='Create Index Latency over Number of Rows',
    xaxis=dict(title='Number of rows'),
    yaxis=dict(title='Latency (ms)'),
)

fig.show()