In [1]:
%pip install psycopg2 pandas

Collecting psycopg2
  Using cached psycopg2-2.9.10-cp312-cp312-linux_x86_64.whl
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
Using cached numpy-2.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, psycopg2, numpy, pandas
Successfully installed numpy-2.1.2 pandas-2.2.3 psycopg2-2.9.10 pytz-2024.2

In [10]:
# Compare GIN index usage for substring search on a TEXT column (pg_trgm)
# and for jsonpath predicates on a JSONB column (jsonb_path_ops), using
# EXPLAIN ANALYZE against a Postgres instance.
import json
import os
import random
import string

import pandas as pd
import psycopg2

ROW_COUNT = 10_000      # synthetic rows inserted into large_table
TEXT_LENGTH = 500       # characters per random string
ARRAY_LENGTH = 50       # integers in the "B" array of each JSON document
SEED = 42               # fixed seed so every run generates the same data
ALPHABET = string.ascii_letters + string.digits
EXPLAIN_PREFIX = "EXPLAIN  (ANALYZE ,VERBOSE ,COSTS ,BUFFERS ,TIMING ,SUMMARY,FORMAT TEXT) "


def explain_analyze(cursor, query, params=None):
    """Run ``query`` under EXPLAIN ANALYZE and print the resulting plan.

    Args:
        cursor: open psycopg2 cursor to execute on.
        query: the SELECT statement to explain (without the EXPLAIN prefix).
        params: optional sequence of query parameters. Leave as ``None`` for
            queries containing literal ``%`` characters (e.g. LIKE patterns);
            psycopg2 only interprets ``%`` when parameters are supplied.
    """
    cursor.execute(EXPLAIN_PREFIX + query, params)
    # Each fetched row is a 1-tuple holding one line of the plan.
    print("\n".join(map(str, cursor.fetchall())))


# 1. Connect to Postgres using Jupyter
# Connection settings can be overridden through the standard libpq
# environment variables; the defaults match a stock local installation.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
)

# try/finally guarantees the connection is released even if a statement fails.
try:
    with conn.cursor() as cursor:
        # 2. Create a large table with ROW_COUNT rows
        cursor.execute("""
            CREATE TABLE large_table (
                id SERIAL PRIMARY KEY,
                long_text TEXT,
                json_data JSONB
            )
        """)

        # Generate test data from a private, seeded generator so the global
        # `random` state is untouched and runs are reproducible.
        rng = random.Random(SEED)
        for _ in range(ROW_COUNT):
            long_text = ''.join(rng.choices(ALPHABET, k=TEXT_LENGTH))
            json_data = {
                "A": ''.join(rng.choices(ALPHABET, k=TEXT_LENGTH)),
                "B": [rng.randint(1, 100) for _ in range(ARRAY_LENGTH)],
            }
            cursor.execute(
                "INSERT INTO large_table (long_text, json_data) VALUES (%s, %s)",
                (long_text, json.dumps(json_data)),
            )

        # NOTE: nothing is committed. Everything below runs inside the same
        # open transaction, and closing the connection discards it, so the
        # table, extension and indexes do not persist between runs. Call
        # conn.commit() here if the data should be kept.

        # IF NOT EXISTS keeps the cell working when pg_trgm is already installed.
        cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")

        # 3. Create indexes
        cursor.execute("CREATE INDEX idx_json ON large_table USING GIN (json_data jsonb_path_ops)")
        cursor.execute("CREATE INDEX idx_text ON large_table USING GIN (long_text gin_trgm_ops)")

        # Disable sequential scans for this session so the plans below show
        # how the indexes are used.
        cursor.execute("set enable_seqscan=false")

        # 4. Explain analyze for searching long text
        explain_analyze(
            cursor,
            "SELECT * FROM large_table WHERE long_text iLIKE '%abc%'",
        )

        # 5. Explain analyze for searching JSON
        explain_analyze(
            cursor,
            "SELECT * FROM large_table WHERE json_data  @@ '$.A like_regex \".*abc.*\"' ",
        )

        # Pass the whole jsonpath as a bound parameter instead of relying on
        # a %s placeholder being substituted inside a quoted SQL literal.
        needle = rng.randint(1, 100)
        explain_analyze(
            cursor,
            "SELECT * FROM large_table WHERE json_data  @? %s::jsonpath",
            (f"$.B[*] ? (@ == {needle})",),
        )
finally:
    conn.close()

('Bitmap Heap Scan on public.large_table  (cost=19.18..1079.42 rows=400 width=68) (actual time=0.057..0.865 rows=153 loops=1)',)
('  Output: id, long_text, json_data',)
("  Recheck Cond: (large_table.long_text ~~* '%abc%'::text)",)
('  Heap Blocks: exact=149',)
('  Buffers: shared hit=153',)
('  ->  Bitmap Index Scan on idx_text  (cost=0.00..19.08 rows=400 width=0) (actual time=0.025..0.025 rows=153 loops=1)',)
("        Index Cond: (large_table.long_text ~~* '%abc%'::text)",)
('        Buffers: shared hit=4',)
('Planning:',)
('  Buffers: shared hit=13',)
('Planning Time: 0.171 ms',)
('Execution Time: 0.885 ms',)
('Bitmap Heap Scan on public.large_table  (cost=88257.77..88595.92 rows=100 width=68) (actual time=11.414..47.342 rows=30 loops=1)',)
('  Output: id, long_text, json_data',)
('  Recheck Cond: (large_table.json_data @@ \'($."A" like_regex ".*abc.*")\'::jsonpath)',)
('  Rows Removed by Index Recheck: 9970',)
('  Heap Blocks: exact=2500',)
('  Buffers: shared hit=2671',)
('  ->  