In [2]:
# Download initial data

import os
import urllib.request
import gzip
import shutil

url = 'https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-1percent_sample.tsv.gz'
filename = 'data/wit/data.tsv'

if os.path.exists(filename):
    print("The file exists")

else:
    archive_path = filename + '.gz'
    partial_path = filename + '.part'

    # Make sure the target directory is there before anything is written into it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Download the data from the URL, streaming it to disk in chunks rather
    # than holding the whole compressed archive in memory.
    with urllib.request.urlopen(url) as response, open(archive_path, 'wb') as f:
        shutil.copyfileobj(response, f)

    # Extract the data from the compressed file. The output goes to a temporary
    # name and is renamed only after extraction has finished, so the existence
    # check above can never mistake a truncated file for a complete one.
    try:
        with gzip.open(archive_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, filename)
    finally:
        # After a successful rename the temporary file no longer exists;
        # after a failure this removes the incomplete output.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print("The file was downloaded")

The file exists


In [14]:
# Create Postgres table with initial data

import os
import psycopg2

# Fail early with a clear message instead of handing None to psycopg2.
db_connection_string = os.environ.get('DATABASE_URL')
if not db_connection_string:
    raise RuntimeError("DATABASE_URL environment variable is not set")

conn = psycopg2.connect(db_connection_string)
try:
    cursor = conn.cursor()
    try:
        # Run the table-creation script on every execution of this cell.
        # NOTE(review): presumably the script is idempotent (e.g. CREATE TABLE
        # IF NOT EXISTS) -- confirm against data/wit/create_table.sql.
        with open('data/wit/create_table.sql', 'r') as sql_file:
            sql_script = sql_file.read()
        cursor.execute(sql_script)
        print("Create table")

        count_query = "SELECT COUNT(*) FROM tsv_data"
        cursor.execute(count_query)
        row_count = cursor.fetchone()[0]

        # Load the data only into an empty table so re-running the cell does
        # not load it a second time.
        if row_count == 0:
            with open('data/wit/copy_data.sql', 'r') as sql_file:
                sql_script = sql_file.read()
            cursor.execute(sql_script)
            print("Copied data")
        else:
            print("No need to copy data")

        # Rows without an embedding yet. The result is not used further in
        # this cell; it is kept as a notebook-level variable.
        image_urls_query = "SELECT id, image_url FROM tsv_data WHERE image_url_ai IS NULL LIMIT 10"
        cursor.execute(image_urls_query)
        image_urls = cursor.fetchall()

        conn.commit()
    finally:
        cursor.close()
finally:
    # Always release the connection; closing without a commit discards any
    # uncommitted work from a failed run.
    conn.close()

print("Completed")

Create table
Copied data
Completed


In [None]:
import psycopg2
import torch
import clip
import requests
import PIL
import io
from tqdm import tqdm
import os

# Fail early with a clear message instead of handing None to psycopg2.
db_connection_string = os.environ.get('DATABASE_URL')
if not db_connection_string:
    raise RuntimeError("DATABASE_URL environment variable is not set")

# Load the model before connecting: loading is the slow step, and doing it
# first means a failed load does not leave a database connection open.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, preprocess = clip.load('ViT-B/32', device)
model.eval()
model.to(device)

conn = psycopg2.connect(db_connection_string)
cursor = conn.cursor()

# Pick a random sample of English rows that have an image URL.
cursor.execute('''
    SELECT
        id,
        image_url
    FROM
        tsv_data
    WHERE
        language = 'en'
        AND image_url IS NOT NULL
    ORDER BY
        RANDOM()
    LIMIT 1000
''')
rows = cursor.fetchall()

# Set the batch size for database updates
batch_size = 10

# Initialize a buffer for batched updates
update_buffer = []

# Execute batched updates and clear the buffer
def update_from_update_buffer():
    """Write the buffered image embeddings to ``tsv_data`` and empty the buffer.

    Uses the notebook-level ``cursor``, ``conn`` and ``update_buffer``. Each
    buffered entry is an ``(embedding, embedding, id)`` tuple matching the
    placeholders of the UPDATE statement.

    The batch is committed as soon as it is written, so an interrupted run
    keeps the batches that already finished. If the write fails, the
    transaction is rolled back (leaving the connection usable) and the error
    is re-raised. The buffer is emptied in either case so that a failing batch
    is not retried on every later call.

    Raises:
        Exception: whatever ``cursor.executemany`` or ``conn.commit`` raised.
    """
    if not update_buffer:
        return

    update_query = "UPDATE tsv_data SET image_url_ai1 = %s, image_url_ai2 = %s WHERE id = %s"
    try:
        cursor.executemany(update_query, update_buffer)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        update_buffer.clear()

# Process each tuple
# `import PIL` alone does not guarantee that the Image submodule is loaded,
# so import it explicitly before PIL.Image is used below.
import PIL.Image

# Request settings are the same for every row, so build them once.
req_headers = {'User-Agent': 'SelectImages/0.0 (narekg.me; ngalstjan4@gmail.com)'}
request_timeout = 30  # seconds; without a timeout one stalled host blocks the whole run

# (row id, reason) for every row that could not be embedded
failed_rows = []

try:
    for row_id, image_url in tqdm(rows):
        # Only the per-row work (download, decode, embed) is best-effort.
        # Database errors below are deliberately NOT caught here: they affect
        # every following row and must stop the run instead of being skipped.
        try:
            # Download the image from the URL
            response = requests.get(image_url, headers=req_headers, timeout=request_timeout)
            response.raise_for_status()
            image = PIL.Image.open(io.BytesIO(response.content)).convert("RGB")

            # Preprocess the image and generate embeddings
            preprocessed_image = preprocess(image).unsqueeze(0).to(device)
            with torch.no_grad():
                image_embedding = model.encode_image(preprocessed_image).squeeze().tolist()
        except Exception as e:
            failed_rows.append((row_id, f"{type(e).__name__}: {e}"))
            continue

        # Add the updated row to the buffer (the same embedding fills both columns)
        update_buffer.append((image_embedding, image_embedding, row_id))

        # Execute batched updates when the buffer reaches the specified batch size
        if len(update_buffer) >= batch_size:
            update_from_update_buffer()

    # Execute the remaining batched updates in the buffer
    if len(update_buffer) > 0:
        update_from_update_buffer()

    # Commit the changes to the database
    conn.commit()
finally:
    # Close the cursor and database connection, also when the run fails
    cursor.close()
    conn.close()

# Report skipped rows instead of dropping them silently
if failed_rows:
    print(f"Skipped {len(failed_rows)} of {len(rows)} rows; first failures:")
    for failed_id, reason in failed_rows[:5]:
        print(f"  id={failed_id}: {reason}")

 44%|████▍     | 441/1000 [04:30<05:35,  1.67it/s]

In [9]:
import psycopg2
import torch
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import os
import logging
# Silence the warnings transformers logs while loading the model weights.
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Fail early with a clear message instead of handing None to psycopg2.
db_connection_string = os.environ.get('DATABASE_URL')
if not db_connection_string:
    raise RuntimeError("DATABASE_URL environment variable is not set")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained BERT model and tokenizer. This happens before
# connecting: loading is the slow step, and doing it first means a failed load
# does not leave a database connection open.
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
model.eval()
model.to(device)

conn = psycopg2.connect(db_connection_string)
cursor = conn.cursor()

# Pick a random sample of English rows that have a page description.
cursor.execute('''
    SELECT
        id,
        context_page_description
    FROM
        tsv_data
    WHERE
        language = 'en'
        AND context_page_description IS NOT NULL
    ORDER BY
        RANDOM()
    LIMIT 500
''')
rows = cursor.fetchall()

# Set the batch size for database updates
batch_size = 10

# Initialize a buffer for batched updates
update_buffer = []

# Execute batched updates and clear the buffer
def update_from_update_buffer():
    """Write the buffered text embeddings to ``tsv_data`` and empty the buffer.

    Uses the notebook-level ``cursor``, ``conn`` and ``update_buffer``. Each
    buffered entry is an ``(embedding, embedding, id)`` tuple matching the
    placeholders of the UPDATE statement.

    The batch is committed as soon as it is written, so an interrupted run
    keeps the batches that already finished. If the write fails, the
    transaction is rolled back (leaving the connection usable) and the error
    is re-raised. The buffer is emptied in either case so that a failing batch
    is not retried on every later call.

    Raises:
        Exception: whatever ``cursor.executemany`` or ``conn.commit`` raised.
    """
    if not update_buffer:
        return

    update_query = "UPDATE tsv_data SET context_page_description_ai1 = %s, context_page_description_ai2 = %s WHERE id = %s"
    try:
        cursor.executemany(update_query, update_buffer)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        update_buffer.clear()

# Process each tuple
# (row id, reason) for every row that could not be embedded
failed_rows = []

try:
    for row_id, text in tqdm(rows):
        # Only the per-row work (tokenize, embed) is best-effort. Database
        # errors below are deliberately NOT caught here: they affect every
        # following row and must stop the run instead of being skipped.
        try:
            # Tokenize the text and generate embeddings
            inputs = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=128,
                return_tensors='pt'
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                hidden_states = outputs.last_hidden_state

                # padding='max_length' pads every input to 128 positions.
                # Average only over real tokens (attention_mask == 1) so that
                # the [PAD] positions do not dilute the embedding of short
                # texts. Rows embedded with the earlier unmasked mean are not
                # comparable and should be recomputed.
                token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
                summed_states = (hidden_states * token_mask).sum(dim=1)
                token_count = token_mask.sum(dim=1).clamp(min=1)
                text_embedding = (summed_states / token_count).squeeze().tolist()
        except Exception as e:
            failed_rows.append((row_id, f"{type(e).__name__}: {e}"))
            continue

        # Add the updated row to the buffer (the same embedding fills both columns)
        update_buffer.append((text_embedding, text_embedding, row_id))

        # Execute batched updates when the buffer reaches the specified batch size
        if len(update_buffer) >= batch_size:
            update_from_update_buffer()

    # Execute the remaining batched updates in the buffer
    if len(update_buffer) > 0:
        update_from_update_buffer()

    # Commit the changes to the database
    conn.commit()
finally:
    # Close the cursor and database connection, also when the run fails
    cursor.close()
    conn.close()

# Report skipped rows instead of dropping them silently
if failed_rows:
    print(f"Skipped {len(failed_rows)} of {len(rows)} rows; first failures:")
    for failed_id, reason in failed_rows[:5]:
        print(f"  id={failed_id}: {reason}")

100%|██████████| 500/500 [03:28<00:00,  2.40it/s]


In [5]:
# What is pgvector throughput?
import os
from pgvector.psycopg2 import register_vector
import psycopg2
import time
import numpy as np

# Fail early with a clear message instead of handing None to psycopg2.
db_connection_string = os.environ.get('DATABASE_URL')
if not db_connection_string:
    raise RuntimeError("DATABASE_URL environment variable is not set")

conn = psycopg2.connect(db_connection_string)
try:
    # Register pgvector's adapters on this connection so vector values can be
    # passed as query parameters.
    register_vector(conn)
    cursor = conn.cursor()
except Exception:
    # Do not leave the connection open when the setup cannot be completed.
    conn.close()
    raise

# Number of times each benchmark query is executed
num_iterations = 10

def measure_throughput(query, generate_args=None, iterations=None):
    """Run ``query`` repeatedly on the notebook-level ``cursor`` and return executions per second.

    Args:
        query: SQL text passed to ``cursor.execute``.
        generate_args: optional zero-argument callable. When given, it is
            called once per iteration and its result is passed to the query as
            the single parameter.
        iterations: number of executions; defaults to the notebook-level
            ``num_iterations``.

    Returns:
        Executions per second as a float (``inf`` if the run was too fast for
        the clock to register any elapsed time).
    """
    if iterations is None:
        iterations = num_iterations

    # Build the parameters before starting the clock so that only the query
    # executions are measured, not the argument generation.
    if generate_args is not None:
        parameter_sets = [(generate_args(),) for _ in range(iterations)]
    else:
        parameter_sets = None

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can be coarse and can jump.
    start_time = time.perf_counter()

    if parameter_sets is None:
        for _ in range(iterations):
            cursor.execute(query)
    else:
        for parameters in parameter_sets:
            cursor.execute(query, parameters)

    elapsed_time = time.perf_counter() - start_time

    if elapsed_time <= 0:
        return float('inf')
    return iterations / elapsed_time

try:
    # NOTE(review): this cell reads image_url_ai, while the embedding cell in
    # this notebook writes image_url_ai1 / image_url_ai2 -- confirm the column.
    cursor.execute("select count(*) from tsv_data where image_url_ai IS NOT NULL")
    count = cursor.fetchone()[0]

    print("count:", count)

    # (label, query, parameter generator) for each benchmark, from a trivial
    # round trip up to the pgvector distance filter.
    # The query vector has 512 components -- presumably the size of the stored
    # image embeddings; confirm against the column definition.
    benchmarks = [
        ("select 1", 'SELECT 1', None),
        ("select id <", 'SELECT 1 FROM tsv_data WHERE image_url_ai IS NOT NULL AND id < 100', None),
        ("select int <", 'SELECT 1 FROM tsv_data WHERE image_url_ai IS NOT NULL AND original_height < 100', None),
        ("pgvector", 'SELECT * FROM tsv_data WHERE image_url_ai IS NOT NULL AND image_url_ai <-> vector(%s) < 0.5', lambda: np.random.rand(512)),
    ]
    for label, query, generate_args in benchmarks:
        print(f"{label}: {measure_throughput(query, generate_args)} OPS")
finally:
    # Release the cursor and connection even if a benchmark query fails
    cursor.close()
    conn.close()

count: 1417
select 1: 10493.630222667 OPS
select int: 3.302572449754774 OPS
pgvector: 3.278139427281616 OPS
