In [None]:
# Import Libraries.
import pandas as pd
from time import time
from sqlalchemy import create_engine

In [None]:
# Download Data.
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz

In [None]:
# Load the full green-taxi trip file into memory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='./data/green_tripdata_2019-01.csv.gz')
df.head(n=5)

In [None]:
# Parse both trip timestamp columns from strings into pandas datetimes.
for col in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
    df[col] = pd.to_datetime(df[col])

In [None]:
# Build the SQLAlchemy engine used by every database call in this notebook.
# The URL targets the `nyc_taxi` database on localhost:5432.
engine = create_engine(
    'postgresql://db:db@localhost:5432/nyc_taxi'
)

In [None]:
# Show the CREATE TABLE statement pandas derives from the frame's dtypes.
ddl = pd.io.sql.get_schema(df, name='green_taxi_data', con=engine)
print(ddl)

In [None]:
# Total number of rows in the fully loaded frame.
df.shape[0]

In [None]:
# Re-open the CSV as a chunked reader so it can be loaded into the database
# 100000 rows at a time instead of all at once.
df_iter = pd.read_csv(
    './data/green_tripdata_2019-01.csv.gz',
    iterator=True,
    chunksize=100000,
)

In [None]:
# First Chunk.
df = next(df_iter)

# The chunk comes straight from the CSV, so its timestamp columns are still
# strings. Convert them here, exactly as the later chunks are converted, so
# that the table created from this chunk's header gets timestamp columns and
# the first chunk's rows are stored with the same type as the rest.
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

In [None]:
# Write a zero-row slice of the chunk so only the column layout reaches the
# database; if_exists='replace' drops any table left over from an earlier run.
df.iloc[:0].to_sql(name='green_taxi_data', con=engine, if_exists='replace')

In [None]:
# Append the rows of the first chunk to the `green_taxi_data` table.
# %time is an IPython line magic that reports how long the single statement
# after it takes, so the whole call has to stay on one line.
%time df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

In [None]:
# Read in the other part of the data chunk by chunk into postgresql database.
# Iterating the reader directly ends the loop cleanly once the file is
# exhausted, instead of finishing the cell with an uncaught StopIteration.
t_start = time()

for df in df_iter:
    # Convert the timestamp columns so every chunk is stored with the same types.
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk, took %.3f second' % (t_end - t_start))

    # Restart the timer before the reader fetches the next chunk, so each
    # reported duration covers both reading and inserting that chunk.
    t_start = time()

In [None]:
# Download Data.
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

In [None]:
# Load the taxi zone lookup table and preview the first rows.
df_zones = pd.read_csv(filepath_or_buffer='./data/taxi+_zone_lookup.csv')
df_zones.head(n=5)

In [None]:
# Write the whole lookup table to the `zones` table in one call;
# if_exists='replace' recreates the table when it already exists.
df_zones.to_sql(
    name='zones',
    con=engine,
    if_exists='replace',
)