## Initial Exploration

In [1]:
import pandas as pd

In [5]:
green_df = pd.read_csv("green_tripdata_2019-10.csv.gz", nrows=100, compression='gzip')

In [6]:
green_df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1,112,196,1,5.88,18.0,0.5,0.5,0.0,0.0,,0.3,19.3,2,1,0.0
1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1,43,263,1,0.8,5.0,3.25,0.5,0.0,0.0,,0.3,9.05,2,1,0.0
2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1,255,228,2,7.5,21.5,0.5,0.5,0.0,0.0,,0.3,22.8,2,1,0.0
3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1,181,181,1,0.9,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2,1,0.0
4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1,97,188,1,2.52,10.0,0.5,0.5,2.26,0.0,,0.3,13.56,1,1,0.0


In [7]:
green_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   VendorID               100 non-null    int64  
 1   lpep_pickup_datetime   100 non-null    object 
 2   lpep_dropoff_datetime  100 non-null    object 
 3   store_and_fwd_flag     100 non-null    object 
 4   RatecodeID             100 non-null    int64  
 5   PULocationID           100 non-null    int64  
 6   DOLocationID           100 non-null    int64  
 7   passenger_count        100 non-null    int64  
 8   trip_distance          100 non-null    float64
 9   fare_amount            100 non-null    float64
 10  extra                  100 non-null    float64
 11  mta_tax                100 non-null    float64
 12  tip_amount             100 non-null    float64
 13  tolls_amount           100 non-null    float64
 14  ehail_fee              0 non-null      float64
 15  improve

In [10]:
[c for c in green_df.columns if "datetime" in c]

['lpep_pickup_datetime', 'lpep_dropoff_datetime']

In [None]:
green_df.to_sql()

In [12]:
zone_df = pd.read_csv('taxi_zone_lookup.csv')

In [13]:
zone_df.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [14]:
zone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LocationID    265 non-null    int64 
 1   Borough       265 non-null    object
 2   Zone          264 non-null    object
 3   service_zone  263 non-null    object
dtypes: int64(1), object(3)
memory usage: 8.4+ KB


## Testing Ingest Code

In [1]:
import pandas as pd
import numpy as np

from time import time

from sqlalchemy import create_engine

In [7]:
user = 'root'
password = 'root'
host = 'localhost'
port = '5432'
db_name = 'ny_taxi'
taxi_table_name = 'green_taxi_data'
zone_table_name = 'zone_data'
green_taxi_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz"
zone_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv"

green_csv_name = 'green_tripdata_2019-10.csv.gz'
zone_csv_name = 'taxi_zone_lookup.csv'

In [4]:
engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db_name}")
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7fd84a80c980>

In [8]:
# first handling the zone data
t_start = time()

df_zone = pd.read_csv(zone_csv_name)

df_zone.to_sql(name=zone_table_name, con=engine, if_exists="replace")

t_end = time()

print(f"Completed zone table in {t_end-t_start} seconds")

Completed zone table in 0.032900094985961914 seconds


In [9]:
# now reading in the green taxi trip data in 100K row chunks
df_intake = pd.read_csv(
    green_csv_name, 
    iterator=True, 
    chunksize=100000, 
    compression='gzip')

df_chunk = next(df_intake)

datetime_cols = [c for c in df_chunk.columns if "datetime" in c]

for col in datetime_cols:
    df_chunk[col] = pd.to_datetime(df_chunk[col])

df_chunk.head(n=0).to_sql(name=taxi_table_name, con=engine, if_exists="replace")

df_chunk.to_sql(name=taxi_table_name, con=engine, if_exists="append")

# not the cleanest code, but what was provided from course
# loops through iterator chunks to insert data into database

n = 1
while True:
    n += 1
    t_start = time()
    df_chunk = next(df_intake)
    
    for col in datetime_cols:
        df_chunk[col] = pd.to_datetime(df_chunk[col])

    df_chunk.to_sql(name=taxi_table_name, con=engine, if_exists="append")

    t_end = time()
    print(f"Completed chunk {n} in {t_end-t_start} seconds")

Completed chunk 2 in 5.771690368652344 seconds
Completed chunk 3 in 5.918017625808716 seconds


  df_chunk = next(df_intake)


Completed chunk 4 in 6.1756110191345215 seconds
Completed chunk 5 in 3.767322540283203 seconds


StopIteration: 