In [2]:
import pandas as pd
from sqlalchemy import create_engine
from time import time

In [2]:
pd.__version__

'1.4.0'

In [4]:
# Let's look at the yellow cab data
!ls

Dockerfile			 pipeline.py
ingesting_ny_taxi_dataset.ipynb  test_preparing_postgres_container.yml
ny_taxi_postgres_data		 yellow_tripdata_2021-01.csv


In [41]:
df = pd.read_csv('yellow_tripdata_2021-01.csv', nrows=100)

In [42]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1,2.1,1,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,0.2,1,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1,14.7,1,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0,10.6,1,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,4.94,1,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


In [43]:
df.shape

(100, 18)

In [44]:
# convert date columns into datetime
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [45]:
# Let's make a connection to postgres with pandas
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f7f70fe9fa0>

In [46]:
# In order to ingest it to postgres we need to generate a schema
print(pd.io.sql.get_schema(df,name="yellow_taxi_data",con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [47]:
df.shape

(100, 18)

In [48]:
# First create table by running command with a head(n=0), as it will only create the schema
df.head(n=0).to_sql(name="yellow_taxi_data",con=engine, if_exists='replace')

0

In [49]:
# Let's now grab the dataframe with an iterator to batch ingest it into postgres
df_iter = pd.read_csv("yellow_tripdata_2021-01.csv", iterator=True, chunksize=100000)

In [50]:
# Now we do the same, but actually append the chunks of the dataframe iterator

while True:
    t_start = time()
    # get next chunk of dataframe 
    df = next(df_iter) 

    # convert date columns into datetime
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    
    df.to_sql(name="yellow_taxi_data",con=engine, if_exists='append')
    
    t_end = time()
    
    
    print(f"inserted another chunk..., it took {t_end - t_start :.3f}")

inserted another chunk..., it took 6.764
inserted another chunk..., it took 6.230
inserted another chunk..., it took 6.283
inserted another chunk..., it took 6.198
inserted another chunk..., it took 6.347
inserted another chunk..., it took 6.921
inserted another chunk..., it took 7.123
inserted another chunk..., it took 6.971
inserted another chunk..., it took 7.679
inserted another chunk..., it took 6.836
inserted another chunk..., it took 7.001
inserted another chunk..., it took 7.125


  df = next(df_iter)


inserted another chunk..., it took 7.077
inserted another chunk..., it took 3.986


StopIteration: 

# Ingesting zone lookup table

In [4]:
df = pd.read_csv('taxi+_zone_lookup.csv')

In [5]:
df.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [6]:
df.shape

(265, 4)

In [8]:
# Let's make a connection to postgres with pandas
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f8fce3f9310>

In [11]:
print(engine.table_names())

['yellow_taxi_trips']


  print(engine.table_names())


In [10]:
# In order to ingest it to postgres we need to generate a schema
print(pd.io.sql.get_schema(df,name="ny_taxi_zones",con=engine))


CREATE TABLE ny_taxi_zones (
	"LocationID" BIGINT, 
	"Borough" TEXT, 
	"Zone" TEXT, 
	service_zone TEXT
)




In [12]:
df.to_sql(name="ny_taxi_zones", con=engine, if_exists="replace")

265

In [13]:
print(engine.table_names())

['ny_taxi_zones', 'yellow_taxi_trips']


  print(engine.table_names())
