In [2]:
# Download initial data

import os
import urllib.request
import gzip
import shutil

url = 'https://storage.googleapis.com/gresearch/wit/wit_v1.train.all-1percent_sample.tsv.gz'
filename = 'data/wit/data.tsv'

if os.path.exists(filename):
    # The extracted TSV doubles as the cache marker: nothing to do when it is present.
    print("The file exists")

else:
    archive_path = filename + '.gz'
    partial_path = filename + '.part'

    # A fresh checkout may not have the data directory yet.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Download the data from the URL, streaming it to disk in chunks instead of
    # holding the whole archive in memory.
    with urllib.request.urlopen(url) as response, open(archive_path, 'wb') as f:
        shutil.copyfileobj(response, f)

    # Extract the data from the compressed file. Write to a temporary name and
    # rename at the end so that an interrupted extraction never leaves a truncated
    # `filename` behind that the existence check above would accept.
    with gzip.open(archive_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_path, filename)

    print("The file was downloaded")

The file exists


In [14]:
# Create Postgres table with initial data

import os
import psycopg2

db_connection_string = os.environ.get('DATABASE_URL')

# psycopg2's `with conn:` only scopes a transaction (commit on success, rollback on
# error); it does not close the connection. Close it explicitly so re-running the
# cell does not leave server sessions open.
conn = psycopg2.connect(db_connection_string)
try:
    with conn, conn.cursor() as cursor:
        # Run the schema script stored next to the data.
        with open('data/wit/create_table.sql', 'r') as sql_file:
            sql_script = sql_file.read()
        cursor.execute(sql_script)
        print("Create table")

        count_query = "SELECT COUNT(*) FROM tsv_data"
        cursor.execute(count_query)
        row_count = cursor.fetchone()[0]

        # Only bulk-load when the table is empty, so the cell can be re-run safely.
        if row_count == 0:
            with open('data/wit/copy_data.sql', 'r') as sql_file:
                sql_script = sql_file.read()
            cursor.execute(sql_script)
            print("Copied data")
        else:
            print("No need to copy data")

        # Sample of rows without an image embedding; the result is stored in
        # `image_urls` and is not used further in this cell.
        image_urls_query = "SELECT id, image_url FROM tsv_data WHERE image_url_ai IS NULL LIMIT 10"
        cursor.execute(image_urls_query)
        image_urls = cursor.fetchall()

        conn.commit()
        print("Completed")
finally:
    conn.close()

Create table
Copied data
Completed


In [34]:
import asyncio
import aiohttp
import psycopg2
import torch
import clip
import PIL
import io
import os
from tqdm import tqdm

# Pick the compute device: CUDA when torch reports one, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the CLIP ViT-B/32 model together with its image preprocessing transform,
# then switch it to inference mode and place it on `device`.
model, preprocess = clip.load('ViT-B/32', device)
model = model.eval().to(device)

async def download_and_preprocess(session, item):
    """Download one image and compute its CLIP embedding.

    Args:
        session: HTTP client session whose ``get(url, headers=...)`` returns an
            async context manager yielding the response (an
            ``aiohttp.ClientSession`` in this notebook).
        item: ``(id, image_url)`` tuple as returned by the database query.

    Returns:
        ``(embedding, embedding, id)`` -- the embedding twice, followed by the row
        id, matching the parameter order of the two-column UPDATE statement -- or
        ``None`` when the image could not be downloaded, decoded or embedded.
        Failures are reported on stdout instead of being dropped silently.
    """
    # Unpack the tuple
    row_id, image_url = item

    try:
        # Download the image from the URL
        req_headers = {'User-Agent': 'SelectImages/0.0 (narekg.me; ngalstjan4@gmail.com)'}
        async with session.get(image_url, headers=req_headers) as response:
            # Reject 4xx/5xx answers up front so an HTML error page is reported as
            # an HTTP failure rather than as an undecodable image.
            response.raise_for_status()
            image_bytes = await response.read()

        image = PIL.Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Preprocess the image and generate embeddings
        preprocessed_image = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = model.encode_image(preprocessed_image).squeeze().tolist()

        return (image_embedding, image_embedding, row_id)
    except Exception as e:
        # Best effort: one bad image must not abort the whole run, but say why it
        # was skipped.
        print(f"Skipping image {row_id} ({image_url}): {e!r}")
        return None

db_connection_string = os.environ.get('DATABASE_URL')
with psycopg2.connect(db_connection_string) as conn:
    with conn.cursor() as cursor:
        cursor.execute('''
            SELECT
                id,
                image_url
            FROM
                tsv_data
            WHERE
                language = 'en'
                AND image_url IS NOT NULL
            ORDER BY
                RANDOM()
            LIMIT 1000
        ''')
        rows = cursor.fetchall()



        async with aiohttp.ClientSession() as session:
            tasks = [download_and_preprocess(session, item) for item in rows]
            for future in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
                result = await future
                if result is not None:
                    update_query = "UPDATE tsv_data SET image_url_ai1 = %s, image_url_ai2 = %s WHERE id = %s"
                    cursor.execute(update_query, result)
        
        # Commit the changes to the database
        conn.commit()

100%|██████████| 1000/1000 [03:33<00:00,  4.68it/s]


In [43]:
import os
import logging
import psycopg2
import torch
from tqdm import tqdm
from transformers import DistilBertModel, DistilBertTokenizer

# Only let ERROR-and-above records from the `transformers.modeling_utils` logger
# through (presumably to hide checkpoint-loading warnings from `from_pretrained` --
# confirm).
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

def process_batch(batch, model, tokenizer, device, cursor):
    """Embed a batch of texts and write the embeddings back to ``tsv_data``.

    Args:
        batch: sequence of ``(id, text)`` pairs. An empty batch is a no-op.
        model: callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: object exposing ``batch_encode_plus``.
        device: torch device the input tensors are moved to.
        cursor: DB-API cursor used for the UPDATE statements.

    Each text embedding is the mean of the token hidden states, computed over real
    tokens only. Padding positions are excluded through the attention mask, so the
    embedding of a text does not depend on the longest text in its batch.

    Errors are printed and swallowed so that one bad batch does not stop the run.
    """
    if not batch:
        return

    try:
        # Unpack IDs and texts from the batch
        ids, texts = zip(*batch)

        inputs = tokenizer.batch_encode_plus(
            list(texts),
            add_special_tokens=True,
            padding='longest',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            attention_mask = inputs.get('attention_mask')
            if attention_mask is None:
                # No mask available: fall back to a plain mean over all positions.
                pooled = hidden.mean(dim=1)
            else:
                # Masked mean: zero out padding positions, then divide by the
                # number of real tokens in each text.
                mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1)
                pooled = (hidden * mask).sum(dim=1) / token_counts
            text_embeddings = pooled.tolist()

        # Both embedding columns receive the same vector.
        query = "UPDATE tsv_data SET context_page_description_ai1 = %s, context_page_description_ai2 = %s WHERE id = %s"
        for row_id, text_embedding in zip(ids, text_embeddings):
            cursor.execute(query, (text_embedding, text_embedding, row_id))

    except Exception as e:
        # NOTE(review): if the failure came from the database, later statements in
        # the caller's transaction will presumably fail until it is rolled back --
        # confirm.
        print(e)

# Use the GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the DistilBERT model and matching tokenizer, in inference mode on `device`.
model_name = 'distilbert-base-uncased'
model = DistilBertModel.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model.eval()
model.to(device)

db_connection_string = os.environ.get('DATABASE_URL')

# `with conn:` only scopes the transaction; close the connection explicitly so the
# session is released when the cell finishes or fails.
conn = psycopg2.connect(db_connection_string)
try:
    with conn, conn.cursor() as cursor:
        # Random sample of English rows that have a page description.
        cursor.execute('''
            SELECT
                id,
                context_page_description
            FROM
                tsv_data
            WHERE
                language = 'en'
                AND context_page_description IS NOT NULL
            ORDER BY
                RANDOM()
            LIMIT 2000
        ''')
        rows = cursor.fetchall()

        # Embed and store the descriptions in fixed-size batches.
        batch_size = 32
        for i in tqdm(range(0, len(rows), batch_size)):
            batch = rows[i:i+batch_size]
            process_batch(batch, model, tokenizer, device, cursor)

        conn.commit()
finally:
    conn.close()

100%|██████████| 63/63 [06:23<00:00,  6.09s/it]


In [42]:
import os
import time
import numpy as np
from pgvector.psycopg2 import register_vector
import psycopg2

# Postgres connection string, read from the DATABASE_URL environment variable
# (None when the variable is unset).
db_connection_string = os.environ.get('DATABASE_URL')

num_iterations = 50  # Number of timed executions per query in measure_throughput
warm_up_iterations = 10  # Number of untimed executions run before timing starts

def measure_throughput(label, query, generate_args=None):
    """Time repeated executions of ``query`` and print the rate in ops / s.

    Args:
        label: name printed in the first column of the result line.
        query: SQL text; it must contain exactly one ``%s`` placeholder when
            ``generate_args`` is given.
        generate_args: optional zero-argument callable, invoked before every
            execution to produce the single query parameter (a vector here).

    The query runs ``warm_up_iterations`` times untimed, then ``num_iterations``
    times timed, on a fresh connection that is closed afterwards. Result rows are
    not fetched from the cursor.
    """
    conn = psycopg2.connect(db_connection_string)
    try:
        with conn:
            register_vector(conn)
            with conn.cursor() as cursor:
                def execute_query():
                    if generate_args is not None:
                        vector = generate_args()
                        cursor.execute(query, (vector,))
                    else:
                        cursor.execute(query)

                # Warm-up
                for _ in range(warm_up_iterations):
                    execute_query()

                # Measured. perf_counter is monotonic and high resolution, unlike
                # time.time, which can jump when the wall clock is adjusted.
                start_time = time.perf_counter()
                for _ in range(num_iterations):
                    execute_query()
                elapsed_time = time.perf_counter() - start_time
    finally:
        # `with conn:` ends the transaction but leaves the connection open.
        conn.close()

    # Guard against a zero reading on very fast runs or coarse timers.
    throughput = num_iterations / elapsed_time if elapsed_time > 0 else float('inf')
    print(f"{label:<30} {throughput:.3f} ops / s")

def column_experiments(column, dimensions):
    """Run the throughput benchmarks for one pair of embedding columns.

    Args:
        column: column name prefix; the queries use ``{column}1`` and
            ``{column}2``. It is interpolated into the SQL text directly, so it
            must be a trusted identifier.
        dimensions: length of the random vectors used as nearest-neighbour query
            arguments.

    Prints a header with the number of rows where ``{column}1`` is set, followed
    by one ``measure_throughput`` line per query.
    """
    conn = psycopg2.connect(db_connection_string)
    try:
        with conn, conn.cursor() as cursor:
            cursor.execute(f"select count(*) from tsv_data where {column}1 IS NOT NULL")
            count = cursor.fetchone()[0]
    finally:
        # `with conn:` ends the transaction but leaves the connection open.
        conn.close()

    print("\n" + ("-" * 50))
    print(f"{column} ({count})")
    print("-" * 50)

    print(f"{'query':<30} {'ops / s'}")
    print("-" * 50)

    def random_vector():
        # A fresh random query vector for every execution.
        return np.random.rand(dimensions)

    # (label, query, argument generator), run in this order.
    benchmarks = [
        ('select 1',
         'SELECT 1',
         None),
        ('select id',
         f"SELECT 1 FROM tsv_data WHERE {column}1 IS NOT NULL AND id < 100",
         None),
        ('select int',
         f"SELECT 1 FROM tsv_data WHERE {column}1 IS NOT NULL AND original_height < 100",
         None),
        ('select int (indexed)',
         f"SELECT 1 FROM tsv_data WHERE {column}1 IS NOT NULL AND original_width < 100",
         None),
        ('select vector',
         f"SELECT * FROM tsv_data WHERE {column}1 IS NOT NULL ORDER BY {column}1 <-> %s LIMIT 10",
         random_vector),
        ('select vector (indexed)',
         f"SELECT * FROM tsv_data WHERE {column}2 IS NOT NULL ORDER BY {column}2 <-> %s LIMIT 10",
         random_vector),
    ]
    for label, query, generate_args in benchmarks:
        measure_throughput(label, query, generate_args)
    print(("-" * 50) + "\n")

# Benchmark both embedding column families. The second argument is the length of
# the random query vectors; presumably 512 and 768 match the stored image and
# text embedding sizes -- confirm against create_table.sql.
column_experiments('image_url_ai', 512)
column_experiments('context_page_description_ai', 768)


--------------------------------------------------
image_url_ai (8635)
--------------------------------------------------
query                          ops / s
--------------------------------------------------
select 1                       11908.870 ops / s
select id                      8267.897 ops / s
select int                     3.911 ops / s
select int (indexed)           10451.271 ops / s
select vector                  3.458 ops / s
select vector (indexed)        1026.883 ops / s
--------------------------------------------------


--------------------------------------------------
context_page_description_ai (10051)
--------------------------------------------------
query                          ops / s
--------------------------------------------------
select 1                       12175.048 ops / s
select id                      8099.614 ops / s
select int                     3.806 ops / s
select int (indexed)           10193.710 ops / s
select vector                  