In [15]:
import pandas as pd
from sqlalchemy import create_engine
from time import time

In [4]:
fpath_raw = 'yellow_tripdata_2021-01'

df = pd.read_parquet(
    f'{fpath_raw}.parquet',
    engine='pyarrow',
)
df.to_csv(f'{fpath_raw}.csv', index=False)

df = pd.read_csv(
    f'{fpath_raw}.csv',
    nrows=100,
    index_col=0,
)
df.head().T

VendorID,1,1.1,1.2,1.3,2
tpep_pickup_datetime,2021-01-01 00:30:10,2021-01-01 00:51:20,2021-01-01 00:43:30,2021-01-01 00:15:48,2021-01-01 00:31:49
tpep_dropoff_datetime,2021-01-01 00:36:12,2021-01-01 00:52:19,2021-01-01 01:11:06,2021-01-01 00:31:01,2021-01-01 00:48:21
passenger_count,1.0,1.0,1.0,0.0,1.0
trip_distance,2.1,0.2,14.7,10.6,4.94
RatecodeID,1.0,1.0,1.0,1.0,1.0
store_and_fwd_flag,N,N,N,N,N
PULocationID,142,238,132,138,68
DOLocationID,43,151,165,132,33
payment_type,2,2,1,1,1
fare_amount,8.0,3.0,42.0,29.0,16.5


In [10]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [13]:
postgres_url = 'postgresql://root:root@localhost:5432/ny_taxi'
engine = create_engine(postgres_url)
engine.connect()

<sqlalchemy.engine.base.Connection at 0x15a9fbe80>

In [14]:
sql = pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine)
print(sql)


CREATE TABLE yellow_taxi_data (
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [17]:
df_iter = pd.read_csv(
    f'{fpath_raw}.csv',
    index_col=0,
    iterator=True,
    chunksize=100000,
)

while True:

    start_ts = time()

    try:
        df = next(df_iter)
    except:
        break

    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

    df.to_sql(
        name='yellow_taxi_data',
        con=engine,
        if_exists='append',
    )

    end_ts = time()

    print(f'Chunk inserted taking {end_ts - start_ts:.2f} seconds.')

Chunk inserted taking 5.66 seconds.
Chunk inserted taking 5.29 seconds.
Chunk inserted taking 5.32 seconds.
Chunk inserted taking 5.47 seconds.
Chunk inserted taking 5.44 seconds.
Chunk inserted taking 5.39 seconds.
Chunk inserted taking 5.38 seconds.


  df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)


Chunk inserted taking 5.41 seconds.
Chunk inserted taking 5.48 seconds.
Chunk inserted taking 5.52 seconds.
Chunk inserted taking 5.42 seconds.
Chunk inserted taking 5.44 seconds.


  df = next(df_iter)


Chunk inserted taking 5.22 seconds.
Chunk inserted taking 3.45 seconds.
