In [None]:
!pip install pandas sqlalchemy psycopg2-binary python-dotenv

In [None]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
# Load variables from a .env file (if one is found) into the process environment;
# DATABASE_URL is read from the environment further down.
load_dotenv()


# Trip data: CSV file name, the name of its gzip archive, and the target table name.
# NOTE(review): taxi_file_name_compressed is not referenced in the visible cells.
taxi_file_name = 'green_tripdata_2019-10.csv'
taxi_file_name_compressed = 'green_tripdata_2019-10.csv.gz'
taxi_table = 'green_tripdata'

# Zone lookup data: CSV file name and the target table name.
taxi_lookup_file_name = 'taxi_zone_lookup.csv'
taxi_lookup_table = 'taxi_zone_lookup'

# Download commands for the input files.
# NOTE(review): the trip data is downloaded as .csv.gz while the code reads the
# .csv name — presumably the archive is decompressed manually first; confirm.
# wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz
# wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv

# Number of rows per chunk when the trip CSV is read iteratively.
chunk_size = 50000
# Columns that are converted with pd.to_datetime before being written to the database.
columns_to_datetime = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']

# Connection string comes from the environment (populated by load_dotenv above).
DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Fail with a clear message instead of passing None to create_engine.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file"
    )

engine = create_engine(DATABASE_URL)
# Connectivity check: open a connection and release it immediately so it is
# returned to the pool instead of being left open for the life of the kernel.
with engine.connect():
    pass

# Preview the table definition pandas would generate for the trip data, based on
# a 100-row sample with the columns in columns_to_datetime converted to datetimes.
df = pd.read_csv(taxi_file_name, nrows=100)
df = df.assign(**{col: pd.to_datetime(df[col]) for col in columns_to_datetime})
print(pd.io.sql.get_schema(df, name=taxi_table, con=engine))

In [None]:
from time import time

# Stream the trip CSV into the database chunk_size rows at a time so the whole
# file never has to be held in memory.
df_iter = pd.read_csv(taxi_file_name, iterator=True, chunksize=chunk_size)

# Iterating the reader directly ends cleanly when the file is exhausted and lets
# real read/insert errors propagate instead of being printed and swallowed.
t_start = time()
for chunk_index, df in enumerate(df_iter):
    for col in columns_to_datetime:
        df[col] = pd.to_datetime(df[col])

    # The first chunk (re)creates the table so a re-run starts from a clean
    # table; every later chunk is appended to it.
    write_mode = 'replace' if chunk_index == 0 else 'append'
    df.to_sql(name=taxi_table, con=engine, if_exists=write_mode)

    t_end = time()
    print('inserted another chunk, took %.3f second' % (t_end - t_start))
    # Restart the clock here so the next measurement also covers reading the chunk.
    t_start = t_end

# Load the zone lookup CSV into its own table. 'replace' recreates the table on
# every run, so re-executing this cell does not append duplicate rows.
df2 = pd.read_csv(taxi_lookup_file_name)
df2.to_sql(name=taxi_lookup_table, con=engine, if_exists='replace')