The green taxi data is a little bit different. Let's find out how:

In [2]:
import pandas as pd

In [5]:
# Peek at the gzipped October 2019 green taxi CSV: read only the first 100 rows.
df = pd.read_csv(
    'green_tripdata_2019-10.csv.gz',
    compression='gzip',
    nrows=100,
)

In [6]:
# (rows, columns) of the sample read above; nrows=100 caps the row count at 100.
df.shape

(100, 20)

In [None]:
# Preview the first rows to compare the column layout with the yellow taxi data.
# It looks like the main difference is that the green taxi data uses
# lpep_pickup_datetime, while the yellow taxi data uses tpep_pickup_datetime,
# so adjusting the ingestion script should be quite simple.
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1,112,196,1,5.88,18.0,0.5,0.5,0.0,0.0,,0.3,19.3,2,1,0.0
1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1,43,263,1,0.8,5.0,3.25,0.5,0.0,0.0,,0.3,9.05,2,1,0.0
2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1,255,228,2,7.5,21.5,0.5,0.5,0.0,0.0,,0.3,22.8,2,1,0.0
3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1,181,181,1,0.9,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2,1,0.0
4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1,97,188,1,2.52,10.0,0.5,0.5,2.26,0.0,,0.3,13.56,1,1,0.0


Ingest Zone data

In [8]:
import pandas as pd
from sqlalchemy import create_engine, inspect

In [11]:
# download the file and rename it
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv -O green_taxi_zone_lookup.csv

--2025-01-12 18:08:46--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250112%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250112T180736Z&X-Amz-Expires=300&X-Amz-Signature=f916afc69c84f381b93ee5d231a2192feaa98d3d0963a7243c0ee09ae76c7a61&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [following]
--2025-01-12 18:08:46--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-

In [13]:
# Read the downloaded zone lookup CSV into a DataFrame.
# Note: this rebinds `df`, replacing the green taxi trip sample loaded earlier.
df = pd.read_csv(filepath_or_buffer='green_taxi_zone_lookup.csv')

# Show (rows, columns) as the cell output.
df.shape

(265, 4)

In [14]:
# Build a SQLAlchemy engine for the local Postgres `ny_taxi` database; pandas uses it below.
# NOTE(review): the credentials are hardcoded in the URL. That is tolerable for a local
# dev database, but read them from environment variables before pointing this elsewhere.
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

# Connectivity check: open a connection, show it, and let the context manager close it.
# A bare `engine.connect()` would leave the connection checked out, because nothing
# ever closes it. `display` is the IPython built-in available in notebook kernels.
with engine.connect() as connection:
    display(connection)

<sqlalchemy.engine.base.Connection at 0x7fc3cb789f70>

In [15]:
# Inspect the database to list the tables that have already been loaded into it.
inspector = inspect(engine)
existing_tables = inspector.get_table_names()
print(existing_tables)

['ny_taxi_zones', 'green_taxi_trips_2019', 'yellow_taxi_trips']


In [17]:
# Preview the CREATE TABLE statement pandas generates for this DataFrame on the
# target database. This only builds and prints the DDL text; the data is loaded
# in a later step.
print(
    pd.io.sql.get_schema(
        df,
        name="green_ny_taxi_zones",
        con=engine,
    )
)


CREATE TABLE green_ny_taxi_zones (
	"LocationID" BIGINT, 
	"Borough" TEXT, 
	"Zone" TEXT, 
	service_zone TEXT
)




In [19]:
# Load the zone lookup rows into Postgres under the same table name used for the
# schema preview (green_ny_taxi_zones). if_exists="replace" drops and recreates
# the table, so re-running this cell does not duplicate rows.
# Note: with the default index=True, the DataFrame index is also stored as an
# extra "index" column.

df.to_sql(name="green_ny_taxi_zones", con=engine, if_exists="replace")

265