In [47]:
import pandas as pd
from sqlalchemy import create_engine
from time import time
from datetime import datetime;

Initialize Values - Schema

In [34]:
schema = {
    'VendorID': pd.Series(dtype=pd.Int64Dtype()),
    'tpep_pickup_datetime': pd.Series(dtype='datetime64[ns]'),
    'tpep_dropoff_datetime': pd.Series(dtype='datetime64[ns]'),
    'passenger_count': pd.Series(dtype=pd.Int64Dtype()),
    'trip_distance': pd.Series(dtype='float64'),
    'RatecodeID': pd.Series(dtype=pd.Int64Dtype()),
    'store_and_fwd_flag': pd.Series(dtype='object'),
    'PULocationID': pd.Series(dtype='int64'),
    'DOLocationID': pd.Series(dtype='int64'),
    'payment_type': pd.Series(dtype=pd.Int64Dtype()),
    'fare_amount': pd.Series(dtype='float64'),
    'extra': pd.Series(dtype='float64'),
    'mta_tax': pd.Series(dtype='float64'),
    'tip_amount': pd.Series(dtype='float64'),
    'tolls_amount': pd.Series(dtype='float64'),
    'improvement_surcharge': pd.Series(dtype='float64'),
    'total_amount': pd.Series(dtype='float64'),
    'congestion_surcharge': pd.Series(dtype='float64'),
    'tpep_pickup_datetime_text': pd.Series(dtype='object'),
    'tpep_dropoff_datetime_text': pd.Series(dtype='object')    
}


Initialize Values - Settings

In [35]:
db_user = 'root'
db_password = 'root'
db_host = '127.0.0.1'
db_port = '5432'
db_name = 'ny_taxi'
table_name = 'yellow_taxi_data'
data_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-01.csv.gz'

Create an empty data Frame with the Schema already reviewed 

In [36]:
df_empty = pd.DataFrame(schema)

In [37]:
df_empty.dtypes

VendorID                               Int64
tpep_pickup_datetime          datetime64[ns]
tpep_dropoff_datetime         datetime64[ns]
passenger_count                        Int64
trip_distance                        float64
RatecodeID                             Int64
store_and_fwd_flag                    object
PULocationID                           int64
DOLocationID                           int64
payment_type                           Int64
fare_amount                          float64
extra                                float64
mta_tax                              float64
tip_amount                           float64
tolls_amount                         float64
improvement_surcharge                float64
total_amount                         float64
congestion_surcharge                 float64
tpep_pickup_datetime_text             object
tpep_dropoff_datetime_text            object
dtype: object

Create SQL engine

In [38]:
engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x747c4c7e20d0>

Create the table if doesn't exist already

In [39]:
print(pd.io.sql.get_schema(df_empty, name= table_name))


CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL,
  "tpep_pickup_datetime_text" TEXT,
  "tpep_dropoff_datetime_text" TEXT
)


In [41]:
df_empty.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'tpep_pickup_datetime_text',
       'tpep_dropoff_datetime_text'],
      dtype='object')

In [42]:
df_empty.to_sql(name= table_name, con=engine, if_exists='replace', index=False)

0

Insert Values - test inserting first batch

In [45]:
def table_count(table_name: str):
    return engine.execute(f"select count(1) from {table_name}").scalar()

table_count(table_name)

0

In [50]:
print(datetime.now())
print(datetime.now().time())
print(time())

2024-10-21 12:41:32.173170
12:41:32.173381
1729525292.1734326


In [54]:
d_start = datetime.now()
t_start = time()
iter_count = 0
print(f'Start batch {d_start}')
initial_table_count = table_count(table_name)
print(f'Table {table_name} count {initial_table_count:06} records')      
for chunk in pd.read_csv(data_url, compression='gzip',nrows=100, iterator=True, chunksize=10):
    iter_count += 1
    print(f'Iteration {iter_count:03}')
    b_start = time()
    chunk.to_sql(name= table_name, con=engine, if_exists='append', index=False)
    b_end = time()
    print(f'Appends time taken {b_end-b_start:10.3f} seconds.')
    current_table_count = table_count(table_name)    
    print(f'Table {table_name} count {current_table_count:06} records')
t_end = time()
print(f'File load completed. Total time taken was {t_end-t_start:10.3f} seconds for {iter_count:03} batches. Start - End Table Counts: {initial_table_count} - {current_table_count} ')


Start batch 2024-10-21 12:58:23.220941
Table yellow_taxi_data count 000100 records
Iteration 001
Appends time taken      0.014 seconds.
Table yellow_taxi_data count 000110 records
Iteration 002
Appends time taken      0.011 seconds.
Table yellow_taxi_data count 000120 records
Iteration 003
Appends time taken      0.010 seconds.
Table yellow_taxi_data count 000130 records
Iteration 004
Appends time taken      0.014 seconds.
Table yellow_taxi_data count 000140 records
Iteration 005
Appends time taken      0.010 seconds.
Table yellow_taxi_data count 000150 records
Iteration 006
Appends time taken      0.010 seconds.
Table yellow_taxi_data count 000160 records
Iteration 007
Appends time taken      0.012 seconds.
Table yellow_taxi_data count 000170 records
Iteration 008
Appends time taken      0.011 seconds.
Table yellow_taxi_data count 000180 records
Iteration 009
Appends time taken      0.011 seconds.
Table yellow_taxi_data count 000190 records
Iteration 010
Appends time taken      0.011 