# dlt (data load tool)

dlt is a tool for extracting, normalizing, and loading data from a data source into a storage destination.

**Installing the dlt library**

In [1]:
!pip install dlt[duckdb]

Collecting dlt[duckdb]
  Downloading dlt-1.6.1-py3-none-any.whl.metadata (11 kB)
Collecting giturlparse>=0.10.0 (from dlt[duckdb])
  Downloading giturlparse-0.12.0-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting hexbytes>=0.2.2 (from dlt[duckdb])
  Downloading hexbytes-1.3.0-py3-none-any.whl.metadata (3.3 kB)
Collecting humanize>=4.4.0 (from dlt[duckdb])
  Downloading humanize-4.12.0-py3-none-any.whl.metadata (7.8 kB)
Collecting jsonpath-ng>=1.5.3 (from dlt[duckdb])
  Downloading jsonpath_ng-1.7.0-py3-none-any.whl.metadata (18 kB)
Collecting orjson!=3.10.1,!=3.9.11,!=3.9.12,!=3.9.13,!=3.9.14,<4,>=3.6.7 (from dlt[duckdb])
  Downloading orjson-3.10.15-cp310-cp310-win_amd64.whl.metadata (42 kB)
Collecting pathvalidate>=2.5.2 (from dlt[duckdb])
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting pendulum>=2.1.2 (from dlt[duckdb])
  Downloading pendulum-3.0.0-cp310-none-win_amd64.whl.metadata (7.0 kB)
Collecting pluggy>=1.3.0 (from dlt[duckdb])
  Downloading plug


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


**Question 1: Checking dlt library version**

In [2]:
# Ask the dlt command-line tool to report its installed version.
!dlt --version

dlt 1.6.1


In [None]:
import dlt

# Same check from Python: read the version attribute of the imported package.
print(f"dlt version: {dlt.__version__}")

dlt version: 1.6.1


**Question 2: Define & Run the Pipeline (NYC Taxi API)**

In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator


# your code is here
# Define the API resource for NYC taxi data
@dlt.resource(name="rides")  # the resource name is also used as the table name
def ny_taxi():
    """Yield NYC taxi ride data from the REST API, one page at a time.

    Pages are requested by page number, starting at page 1. No total-page
    path is configured (``total_path=None``), so the paginator is not told
    up front how many pages exist.
    """
    api_base_url = "https://us-central1-dlthub-analytics.cloudfunctions.net"
    page_numbers = PageNumberPaginator(base_page=1, total_path=None)
    rest_client = RESTClient(base_url=api_base_url, paginator=page_numbers)

    # Pass each page on as soon as it arrives rather than collecting all
    # pages in memory first.
    yield from rest_client.paginate("data_engineering_zoomcamp_api")


# Pipeline that loads into the DuckDB destination under the "ny_taxi_data" dataset.
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi_pipeline",
    destination="duckdb",
    dataset_name="ny_taxi_data",
)

In [7]:
# Extract, normalize and load the rides resource.
# write_disposition="replace" keeps this cell idempotent: with the default
# ("append"), every re-run against the existing DuckDB file would add another
# full copy of the rides and inflate the row counts in the cells that follow.
load_info = pipeline.run(ny_taxi, write_disposition="replace")
print(load_info)

Pipeline ny_taxi_pipeline load step completed in 3.33 seconds
1 load package(s) were loaded to destination duckdb and into dataset ny_taxi_data
The duckdb destination used duckdb:///b:\Belajar\data engineering\homework-de-zoomcamp\workshop-dlt\ny_taxi_pipeline.duckdb location to store data
Load package 1739630968.4035795 is LOADED and contains no failed jobs


In [None]:
import duckdb

# The pipeline run created '<pipeline_name>.duckdb' in the working directory,
# so that file can be opened directly.
duckdb_file = f"{pipeline.pipeline_name}.duckdb"
conn = duckdb.connect(duckdb_file)

# Point the search path at the pipeline's dataset so its tables can be
# referenced without a schema prefix.
set_search_path_sql = f"SET search_path = '{pipeline.dataset_name}'"
conn.sql(set_search_path_sql)

# Overview of the tables in the dataset, with their column names and types.
tables_overview = conn.sql("DESCRIBE").df()
tables_overview

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,ny_taxi_pipeline,ny_taxi_data,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,ny_taxi_pipeline,ny_taxi_data,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,ny_taxi_pipeline,ny_taxi_data,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,ny_taxi_pipeline,ny_taxi_data,rides,"[end_lat, end_lon, fare_amt, passenger_count, ...","[DOUBLE, DOUBLE, DOUBLE, BIGINT, VARCHAR, DOUB...",False


**Question 3: Explore the loaded data**

In [10]:
# Read the loaded "rides" table back through the pipeline as a pandas DataFrame.
rides_table = pipeline.dataset(dataset_type="default").rides
df = rides_table.df()

# Summarize row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   end_lat                 10000 non-null  float64            
 1   end_lon                 10000 non-null  float64            
 2   fare_amt                10000 non-null  float64            
 3   passenger_count         10000 non-null  int64              
 4   payment_type            10000 non-null  object             
 5   start_lat               10000 non-null  float64            
 6   start_lon               10000 non-null  float64            
 7   tip_amt                 10000 non-null  float64            
 8   tolls_amt               10000 non-null  float64            
 9   total_amt               10000 non-null  float64            
 10  trip_distance           10000 non-null  float64            
 11  trip_dropoff_date_time  10000 non-null  da

**Question 4: Trip Duration Analysis**

In [None]:
# Average ride duration in minutes, computed by the database itself.
avg_duration_query = """
            SELECT
            AVG(date_diff('minute', trip_pickup_date_time, trip_dropoff_date_time))
            FROM rides;
            """

with pipeline.sql_client() as client:
    res = client.execute_sql(avg_duration_query)
    # res is the full list of result rows; this query produces a single row
    # whose only column is the average.
    print(res)

[(12.3049,)]


In [None]:
# Elapsed time of each ride (drop-off minus pick-up), converted from seconds to minutes.
ride_elapsed = df["trip_dropoff_date_time"] - df["trip_pickup_date_time"]
df["trip_duration"] = ride_elapsed.dt.total_seconds() / 60

# Mean of the per-ride durations.
average_trip_duration = df["trip_duration"].mean()
print(f"Average trip duration: {average_trip_duration} minutes")

Average trip duration: 12.304918333333335 minutes
