# Questions

## Question 1: dlt Version

In [1]:
!pip install dlt[duckdb]



In [2]:
import dlt

In [3]:
# Print the installed dlt version (the answer to Question 1).
print(f"dlt version:{dlt.__version__}")

dlt version:1.6.1


## Question 2: Define & Run the Pipeline (NYC Taxi API)

In [11]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator


# dlt resource that streams the NYC taxi rides API page by page.
@dlt.resource(name="rides")
def ny_taxi():
    """Yield the NYC taxi API response one page at a time.

    The resource is registered under the name ``rides``, which is the table
    name the data ends up under in the destination.

    Pages are requested by page number, starting at 1. ``total_path=None``
    means no total-page count is read from the response.
    NOTE(review): termination therefore presumably relies on the paginator's
    default stop condition (stopping at the first empty page) - confirm
    against the installed dlt version.
    """
    page_numbers = PageNumberPaginator(base_page=1, total_path=None)
    api = RESTClient(
        base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
        paginator=page_numbers,
    )
    # Delegate straight to the client's page iterator; each item is one page.
    yield from api.paginate("data_engineering_zoomcamp_api")
        

# Pipeline that loads into a local DuckDB database.
# The pipeline name is reused for the database file name (a later cell opens
# "<pipeline_name>.duckdb"), and dataset_name is the schema the tables are created in.
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi_pipeline",
    destination="duckdb",
    dataset_name="ny_taxi_data"
)

In [12]:
# Run the `ny_taxi` resource through the pipeline and load it into DuckDB.
# write_disposition="replace" requests that existing table contents be overwritten
# rather than appended to, so re-running this cell should not duplicate rows.
load_info = pipeline.run(ny_taxi, write_disposition="replace")
# Printing load_info gives a human-readable summary of the load.
print(load_info)

Pipeline ny_taxi_pipeline load step completed in 2.50 seconds
1 load package(s) were loaded to destination duckdb and into dataset ny_taxi_data
The duckdb destination used duckdb:////home/joviedo/data-engineering-zoomcamp/hwk/notebooks/ny_taxi_pipeline.duckdb location to store data
Load package 1739645056.0932279 is LOADED and contains no failed jobs


In [6]:
import duckdb

# Connect to the DuckDB database file written by the pipeline ("<pipeline_name>.duckdb",
# resolved relative to the current working directory).
# NOTE(review): no conn.close() is visible in this section, so the connection
# presumably stays open for the rest of the kernel session - confirm that is intended.
conn = duckdb.connect(f"{pipeline.pipeline_name}.duckdb")

# Set search path to the pipeline's dataset (a DuckDB schema) so its tables
# can be referenced without a schema prefix
conn.sql(f"SET search_path = '{pipeline.dataset_name}'")

# Describe the dataset: one row per table, with its column names and column types
conn.sql("DESCRIBE").df()


Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,ny_taxi_pipeline,ny_taxi_data,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,ny_taxi_pipeline,ny_taxi_data,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,ny_taxi_pipeline,ny_taxi_data,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,ny_taxi_pipeline,ny_taxi_data,rides,"[end_lat, end_lon, fare_amt, passenger_count, ...","[DOUBLE, DOUBLE, DOUBLE, BIGINT, VARCHAR, DOUB...",False


## Question 3: Explore the loaded data

In [7]:
# Read the whole `rides` table from the pipeline's dataset into a pandas DataFrame.
df = pipeline.dataset(dataset_type="default").rides.df()
# Preview the first rows to sanity-check column names and values.
df.head()

Unnamed: 0,end_lat,end_lon,fare_amt,passenger_count,payment_type,start_lat,start_lon,tip_amt,tolls_amt,total_amt,trip_distance,trip_dropoff_date_time,trip_pickup_date_time,surcharge,vendor_name,_dlt_load_id,_dlt_id,store_and_forward
0,40.742963,-73.980072,45.0,1,Credit,40.641525,-73.787442,9.0,4.15,58.15,17.52,2009-06-14 23:48:00+00:00,2009-06-14 23:23:00+00:00,0.0,VTS,1739644918.52758,k4q4FQ0wJ3EBmQ,
1,40.740187,-74.005698,6.5,1,Credit,40.722065,-74.009767,1.0,0.0,8.5,1.56,2009-06-18 17:43:00+00:00,2009-06-18 17:35:00+00:00,1.0,VTS,1739644918.52758,Nq1d5rsq61LvVA,
2,40.718043,-74.004745,12.5,5,Credit,40.761945,-73.983038,2.0,0.0,15.5,3.37,2009-06-10 18:27:00+00:00,2009-06-10 18:08:00+00:00,1.0,VTS,1739644918.52758,pb2iBZDi7NVvaA,
3,40.739637,-73.985233,4.9,1,CASH,40.749802,-73.992247,0.0,0.0,5.4,1.11,2009-06-14 23:58:00+00:00,2009-06-14 23:54:00+00:00,0.5,VTS,1739644918.52758,v9DJD+KkI5Ijng,
4,40.730032,-73.852693,25.7,1,CASH,40.776825,-73.949233,0.0,4.15,29.85,11.09,2009-06-13 13:23:00+00:00,2009-06-13 13:01:00+00:00,0.0,VTS,1739644918.52758,CboWfeNjW6SuAQ,


In [8]:
# (rows, columns) of the loaded rides table.
df.shape

(10000, 18)

In [9]:
# Row count of the loaded rides table (same value as df.shape[0]).
len(df)

10000

In [10]:
# Per-column non-null counts and dtypes, to check for missing values and type inference.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   end_lat                 10000 non-null  float64            
 1   end_lon                 10000 non-null  float64            
 2   fare_amt                10000 non-null  float64            
 3   passenger_count         10000 non-null  int64              
 4   payment_type            10000 non-null  object             
 5   start_lat               10000 non-null  float64            
 6   start_lon               10000 non-null  float64            
 7   tip_amt                 10000 non-null  float64            
 8   tolls_amt               10000 non-null  float64            
 9   total_amt               10000 non-null  float64            
 10  trip_distance           10000 non-null  float64            
 11  trip_dropoff_date_time  10000 non-null  da

## Question 4: Trip Duration Analysis

In [13]:
# Average trip duration in minutes, computed inside DuckDB through the pipeline's SQL client.
# NOTE(review): DuckDB documents date_diff as counting part boundaries crossed rather than
# exact elapsed time - presumably fine at minute granularity for this data; confirm.
with pipeline.sql_client() as client:
    res = client.execute_sql(
            """
            SELECT
            AVG(date_diff('minute', trip_pickup_date_time, trip_dropoff_date_time))
            FROM rides;
            """
        )
    # Prints the full result: a list of row tuples (one single-column row for this aggregate)
    print(res)

[(12.3049,)]
