In [1]:
from sqlalchemy import create_engine
import pandas as pd
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
conn = create_engine("postgresql+psycopg://root:root@localhost:5433/ny_taxi", echo=True)
conn

Engine(postgresql+psycopg://root:***@localhost:5433/ny_taxi)

In [3]:
dtypes = {
    "VendorID": "Int64",
    "passenger_count": "Int64",
    "trip_distance": "float64",
    "RatecodeID": "Int64",
    "store_and_fwd_flag": "string",
    "PULocationID": "Int64",
    "DOLocationID": "Int64",
    "payment_type": "Int64",
    "fare_amount": "float64",
    "extra": "float64",
    "mta_tax": "float64",
    "tip_amount": "float64",
    "tolls_amount": "float64",
    "improvement_surcharge": "float64",
    "total_amount": "float64",
    "congestion_surcharge": "float64"
}

parse_dates = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']

In [4]:
green_link = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2021-01.csv.gz"
df_iter = pd.read_csv(
    green_link, 
    chunksize=10000, 
    compression='gzip', 
    iterator=True,
    dtype=dtypes,
    parse_dates=parse_dates)

In [None]:
first = True
for chunk in tqdm(df_iter):
    if first:
        first = False
        chunk.head(0).to_sql(
            name="green_taxi", 
            con=conn, 
            if_exists="replace"
        )
        print("Table green_taxi created")
        
    chunk.to_sql(
        name="green_taxi", 
        con=conn, 
        if_exists="append"
    )
    print(f"Inserted: {len(chunk)} records")
    

0it [00:00, ?it/s]

2026-02-01 09:37:44,363 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2026-02-01 09:37:44,364 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-02-01 09:37:44,365 INFO sqlalchemy.engine.Engine select current_schema()
2026-02-01 09:37:44,366 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-02-01 09:37:44,366 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2026-02-01 09:37:44,367 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-02-01 09:37:44,369 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:44,373 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class

1it [00:00,  1.27it/s]

2026-02-01 09:37:45,155 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:45,158 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:37:45,159 INFO sqlalchemy.engine.Engine [cached since 0.7858s ago] {'table_name': 'green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:37:45,261 INFO sqlalchemy.engine.Engine INSERT INTO green_taxi (index, "VendorID", lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, "

2it [00:01,  1.38it/s]

2026-02-01 09:37:45,816 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:45,818 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:37:45,819 INFO sqlalchemy.engine.Engine [cached since 1.445s ago] {'table_name': 'green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:37:45,946 INFO sqlalchemy.engine.Engine INSERT INTO green_taxi (index, "VendorID", lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, "R

3it [00:02,  1.46it/s]

2026-02-01 09:37:46,460 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:46,462 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:37:46,462 INFO sqlalchemy.engine.Engine [cached since 2.089s ago] {'table_name': 'green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:37:46,539 INFO sqlalchemy.engine.Engine INSERT INTO green_taxi (index, "VendorID", lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, "R

4it [00:02,  1.51it/s]

2026-02-01 09:37:47,074 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:47,077 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:37:47,077 INFO sqlalchemy.engine.Engine [cached since 2.704s ago] {'table_name': 'green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:37:47,199 INFO sqlalchemy.engine.Engine INSERT INTO green_taxi (index, "VendorID", lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, "R

5it [00:03,  1.56it/s]

2026-02-01 09:37:47,679 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:47,682 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:37:47,682 INFO sqlalchemy.engine.Engine [cached since 3.309s ago] {'table_name': 'green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:37:47,765 INFO sqlalchemy.engine.Engine INSERT INTO green_taxi (index, "VendorID", lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, "R

6it [00:03,  1.59it/s]

2026-02-01 09:37:48,287 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:48,288 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:37:48,289 INFO sqlalchemy.engine.Engine [cached since 3.916s ago] {'table_name': 'green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:37:48,371 INFO sqlalchemy.engine.Engine INSERT INTO green_taxi (index, "VendorID", lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, "R

7it [00:04,  1.65it/s]

2026-02-01 09:37:48,852 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:37:48,853 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:37:48,854 INFO sqlalchemy.engine.Engine [cached since 4.481s ago] {'table_name': 'green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:37:48,909 INFO sqlalchemy.engine.Engine INSERT INTO green_taxi (index, "VendorID", lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, "R

8it [00:04,  1.64it/s]


In [7]:
result = pd.read_sql("SELECT count(*) FROM green_taxi", conn)
print(result)

2026-02-01 09:39:09,644 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-02-01 09:39:09,644 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s::VARCHAR AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s::VARCHAR, %(param_2)s::VARCHAR, %(param_3)s::VARCHAR, %(param_4)s::VARCHAR, %(param_5)s::VARCHAR]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s::VARCHAR
2026-02-01 09:39:09,645 INFO sqlalchemy.engine.Engine [cached since 85.27s ago] {'table_name': 'SELECT count(*) FROM green_taxi', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-02-01 09:39:09,646 INFO sqlalchemy.engine.Engine SELECT count(*) FROM green_taxi
2026-02-01 09:39:09,646 INFO sqlalchemy.engine.Engine [ra