In [1]:
import pandas as pd
from time import time
from sqlalchemy import create_engine

In [2]:
pd.__version__

'2.0.3'

In [3]:
df = pd.read_csv('yellow_tripdata_2021-01.csv.gz', nrows=100, compression='gzip')

In [4]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [5]:
engine = create_engine('postgresql://postgres:postgres148@localhost:5432/dataengineering-learning')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x12fb96450>

In [6]:
print(pd.io.sql.get_schema(df, name='yellow_taxi_data_2021_01', con=engine))


CREATE TABLE yellow_taxi_data_2021_01 (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [7]:
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv.gz', compression='gzip', iterator=True, chunksize=100000)

In [8]:
df = next(df_iter)

In [9]:
len(df)

100000

In [10]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [11]:
df.head(n=0).to_sql(name='yellow_taxi_data_2021_01', con=engine, if_exists='replace')

0

In [12]:
%time df.to_sql(name='yellow_taxi_data_2021_01', con=engine, if_exists='append')

CPU times: user 2.03 s, sys: 48.5 ms, total: 2.08 s
Wall time: 4.62 s


1000

In [13]:
while True:
    try:
        t_start = time()
        
        df = next(df_iter)
    
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    
        df.to_sql(name='yellow_taxi_data_2021_01', con=engine, if_exists='append')
    
        t_end = time()
        print('insert another chunk..., took %.3f second' % (t_end - t_start))
    except StopIteration:
        break
    

insert another chunk..., took 4.763 second
insert another chunk..., took 4.682 second
insert another chunk..., took 4.751 second
insert another chunk..., took 4.660 second
insert another chunk..., took 4.691 second
insert another chunk..., took 4.538 second
insert another chunk..., took 4.550 second
insert another chunk..., took 4.546 second
insert another chunk..., took 4.556 second
insert another chunk..., took 4.549 second
insert another chunk..., took 4.598 second


  df = next(df_iter)


insert another chunk..., took 4.471 second
insert another chunk..., took 2.841 second


In [14]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2024-01-16 20:38:15--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.40.176, 52.216.61.112, 16.182.38.96, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.40.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: ‘taxi+_zone_lookup.csv’


2024-01-16 20:38:16 (9.34 MB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]



In [16]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [17]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265