**Postgres credentials**

user: root | password: root | host: localhost | port: 5432 | postgres_db: ny_taxi

Notes: install the following packages

    `pip install sqlalchemy pyarrow psycopg2-binary`

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# SQLAlchemy URL layout: dialect://user:password@host:port/database
database_url = "postgresql://root:root@localhost:5432/ny_taxi"
engine = create_engine(database_url)

# Open a connection right away so a bad URL or an unreachable server fails
# in this cell; the returned Connection object is the cell's displayed output.
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f9f757b23d0>

**Query test**

In [2]:
# Trivial statement that touches no table: it only proves that pandas can
# round-trip a query through the engine.
smoke_test_sql = """
    SELECT 1 AS NUM
"""

pd.read_sql(sql=smoke_test_sql, con=engine)

Unnamed: 0,num
0,1


**Load the data and upload to database**

In [3]:
# Load the whole trip-data parquet file into a single DataFrame.
df = pd.read_parquet(path="yellow_tripdata_2021-01.parquet")

# (rows, columns)
df.shape

(1369769, 19)

In [4]:
# Create the target table from the frame's columns alone: a zero-row slice
# still carries every column, so the table is (re)created without any data.
# if_exists='replace' drops a table left over from an earlier run.
header_only = df.head(n=0)
header_only.to_sql(name='yellow_taxi_data', con=engine, if_exists='replace', index=False)

0

In [5]:
# Read the table back to check that it was created; after a header-only
# write the result should list the columns and contain no rows.
select_all_sql = """
    SELECT * FROM yellow_taxi_data
"""

pd.read_sql(sql=select_all_sql, con=engine)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee


In [6]:
import pyarrow.parquet as pq

# create generator for parquet file
parquet_file = pq.ParquetFile("./yellow_tripdata_2021-01.parquet")
parquet_data_generator = parquet_file.iter_batches(batch_size=100000)

# parse each batch to pandas
batch_df = next(parquet_data_generator).to_pandas()

# parse to datetime
batch_df["tpep_pickup_datetime"] = pd.to_datetime(batch_df["tpep_pickup_datetime"])
batch_df["tpep_dropoff_datetime"] = pd.to_datetime(batch_df["tpep_dropoff_datetime"])

# migrate to database
%time batch_df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append', index=False)

CPU times: user 12.4 s, sys: 877 ms, total: 13.3 s
Wall time: 23.3 s


1000

Insert time by batch size (rows written per `to_sql` call, measured with `%time`):

**1000 rows**
* CPU times: user 252 ms, sys: 19 ms, total: 271 ms
* Wall time: 446 ms (**2242** rows per second)

**10000 rows**
* CPU times: user 1.24 s, sys: 40.7 ms, total: 1.28 s
* Wall time: 2.56 s (**3906** rows per second)

**100000 rows**
* CPU times: user 11.6 s, sys: 699 ms, total: 12.3 s
* Wall time: 22 s (**4545** rows per second)


In [7]:
# Read everything back from the table to inspect the rows appended so far.
inserted_rows_sql = """
    SELECT * FROM yellow_taxi_data
"""

pd.read_sql(sql=inserted_rows_sql, con=engine)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.10,1.0,N,142,43,2,8.0,3.0,0.5,0.00,0.0,0.3,11.80,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.20,1.0,N,238,151,2,3.0,0.5,0.5,0.00,0.0,0.3,4.30,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.70,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.60,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,2021-01-04 14:04:31,2021-01-04 14:08:52,3.0,0.70,1.0,N,234,224,2,5.0,2.5,0.5,0.00,0.0,0.3,8.30,2.5,
99996,1,2021-01-04 14:18:46,2021-01-04 14:35:45,2.0,3.30,1.0,N,234,236,1,14.5,2.5,0.5,3.55,0.0,0.3,21.35,2.5,
99997,1,2021-01-04 14:42:41,2021-01-04 14:59:22,2.0,4.70,1.0,N,236,79,1,17.0,2.5,0.5,4.05,0.0,0.3,24.35,2.5,
99998,2,2021-01-04 14:39:02,2021-01-04 15:09:37,2.0,17.95,2.0,N,132,148,1,52.0,0.0,0.5,5.00,0.0,0.3,60.30,2.5,


**Batch loading to database**

In [7]:
from time import time
import pandas as pd
import pyarrow.parquet as pq

# Open the parquet file, read it completely into an Arrow table and convert
# that table to a DataFrame (no batching here; the whole file is loaded).
parquet_file = pq.ParquetFile("./yellow_tripdata_2021-01.parquet")
full_table = parquet_file.read()
full_table.to_pandas()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.10,1.0,N,142,43,2,8.00,3.00,0.5,0.00,0.00,0.3,11.80,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.20,1.0,N,238,151,2,3.00,0.50,0.5,0.00,0.00,0.3,4.30,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.70,1.0,N,132,165,1,42.00,0.50,0.5,8.65,0.00,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.60,1.0,N,138,132,1,29.00,0.50,0.5,6.05,0.00,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.50,0.50,0.5,4.06,0.00,0.3,24.36,2.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369764,2,2021-01-31 23:03:00,2021-01-31 23:33:00,,8.89,,,229,181,0,27.78,0.00,0.5,7.46,0.00,0.3,38.54,,
1369765,2,2021-01-31 23:29:00,2021-01-31 23:51:00,,7.43,,,41,70,0,32.58,0.00,0.5,0.00,6.12,0.3,39.50,,
1369766,2,2021-01-31 23:25:00,2021-01-31 23:38:00,,6.26,,,74,137,0,16.85,0.00,0.5,3.90,0.00,0.3,24.05,,
1369767,6,2021-01-31 23:01:06,2021-02-01 00:02:03,,19.70,,,265,188,0,53.68,0.00,0.5,0.00,0.00,0.3,54.48,,


In [3]:
from time import time
import pandas as pd
import pyarrow.parquet as pq

# initialize the header of database
# Only the columns are used here: writing a zero-row slice with
# if_exists='replace' recreates an empty table, so re-running this cell
# starts from a clean table instead of appending to rows from an earlier run.
df = pd.read_parquet("./yellow_tripdata_2021-01.parquet")
df.head(n=0).to_sql(
    name='yellow_taxi_data',
    con=engine,
    if_exists='replace',
    index=False
)

# create generator for parquet file
# (iterates over the file in record batches rather than loading it at once)
parquet_file = pq.ParquetFile("./yellow_tripdata_2021-01.parquet")
parquet_data_generator = parquet_file.iter_batches()

# upload to database
# A plain for-loop ends when the generator is exhausted. The previous
# while/try/next form also caught any StopIteration raised inside the
# conversion or insert and would have reported it as a normal completion.
t_start = time()
for record_batch in parquet_data_generator:
    batch_df = record_batch.to_pandas()

    # make sure both timestamp columns are pandas datetimes before the insert
    for column in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
        batch_df[column] = pd.to_datetime(batch_df[column])

    batch_df.to_sql(
        name='yellow_taxi_data',
        con=engine,
        if_exists='append',
        index=False
    )

    # the reported time covers fetching, converting and inserting the batch
    t_end = time()
    print('inserted another chunk, took %.3f second' % (t_end - t_start))
    t_start = t_end

print('completed')

inserted another chunk, took 15.220 second
inserted another chunk, took 13.192 second
inserted another chunk, took 13.003 second
inserted another chunk, took 13.002 second
inserted another chunk, took 13.082 second
inserted another chunk, took 12.998 second
inserted another chunk, took 13.172 second
inserted another chunk, took 13.015 second
inserted another chunk, took 13.401 second
inserted another chunk, took 12.858 second
inserted another chunk, took 12.685 second
inserted another chunk, took 12.565 second
inserted another chunk, took 12.773 second
inserted another chunk, took 12.953 second
inserted another chunk, took 13.243 second
inserted another chunk, took 13.639 second
inserted another chunk, took 12.626 second
inserted another chunk, took 12.797 second
inserted another chunk, took 12.788 second
inserted another chunk, took 11.973 second
inserted another chunk, took 10.418 second
completed


**Query**

In [4]:
query = """
    SELECT COUNT(1) FROM yellow_taxi_data 
"""

pd.read_sql(query, con=engine)

Unnamed: 0,count
0,1369769
