In [1]:
import dlt
import requests
import pandas as pd 
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import OffsetPaginator
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
from dlt.sources.helpers.rest_client.auth import APIKeyAuth
from dlt.destinations import filesystem

In [2]:
# Define the API endpoint and your access key
API_URL = 'https://api.marketstack.com/v1/eod'
ACCESS_KEY = 'hoho'


In [7]:
# Define the parameters for the API request
params = {
    'access_key': ACCESS_KEY,
    'symbols': 'AAPL',  # Example symbols
    'date_from': '2025-01-01',
    'date_to': '2025-01-02', 
    'limit': 2
}


In [8]:
response = requests.get(API_URL, params=params)

In [10]:
if response.status_code == 200:
    print(response.json()) 

{'pagination': {'limit': 2, 'offset': 0, 'count': 1, 'total': 1}, 'data': [{'open': 248.93, 'high': 249.1, 'low': 241.82, 'close': 243.85, 'volume': 55558000.0, 'adj_high': 249.1, 'adj_low': 241.8201, 'adj_close': 243.85, 'adj_open': 248.93, 'adj_volume': 55740731.0, 'split_factor': 1.0, 'dividend': 0.0, 'symbol': 'AAPL', 'exchange': 'XNAS', 'date': '2025-01-02T00:00:00+0000'}]}


In [14]:
# Extract the 'data' field and convert to a DataFrame
df = pd.DataFrame(response.json()["data"])

# Convert 'date' to a proper datetime format
df["date"] = pd.to_datetime(df["date"])

df.head() 

Unnamed: 0,open,high,low,close,volume,adj_high,adj_low,adj_close,adj_open,adj_volume,split_factor,dividend,symbol,exchange,date
0,248.93,249.1,241.82,243.85,55558000.0,249.1,241.8201,243.85,248.93,55740731.0,1.0,0.0,AAPL,XNAS,2025-01-02 00:00:00+00:00


In [17]:
%%capture
!pip install dlt[duckdb]

In [3]:
# Extract raw data, auto normalization and Load data into GCP Bucket 
@dlt.resource(name="stock", write_disposition="replace")
def extract_raw():
    client = RESTClient(
        base_url=API_URL,

        paginator=OffsetPaginator(  # Handles pagination using offset
            limit=100, offset=0, total_path=None
        )
    
    )

    for page in client.paginate(
        params = {
            'access_key': ACCESS_KEY,
            'symbols': 'AAPL,MSFT',  # Example symbols
            'date_from': '2025-01-01',
            'date_to': '2025-01-31'
        }
    ):
        yield page 



In [29]:
pipeline = dlt.pipeline(
    pipeline_name='stock_data_pipeline',
    destination='duckdb', # <--- to test pipeline locally
    dataset_name='stock_dataset',
)

In [34]:
# run the pipeline with the new resource
load_info = pipeline.run(extract_raw, write_disposition="replace")
print(load_info)

Pipeline stock_data_pipeline load step completed in 0.16 seconds
1 load package(s) were loaded to destination duckdb and into dataset stock_dataset
The duckdb destination used duckdb:////Users/liuchen/Desktop/Project/DE-Zoomcamp-Project/dlt/stock_data_pipeline.duckdb location to store data
Load package 1742049830.691514 is LOADED and contains no failed jobs


In [35]:
# explore loaded data
pipeline.dataset(dataset_type="default").stock.df()

Unnamed: 0,open,high,low,close,volume,adj_high,adj_low,adj_close,adj_open,adj_volume,split_factor,dividend,symbol,exchange,date,_dlt_load_id,_dlt_id
0,418.98,420.69,414.91,415.06,34161900.0,420.69,414.91,415.06,418.98,34223388.0,1.0,0.0,MSFT,XNAS,2025-01-31 00:00:00+00:00,1742049830.691514,/pNCQVBmtAWdeg
1,247.19,247.19,233.44,236.0,100959800.0,247.19,233.44,236.0,247.19,101075128.0,1.0,0.0,AAPL,XNAS,2025-01-31 00:00:00+00:00,1742049830.691514,WLmQ9c4C0VdWeQ
2,238.665,240.79,237.21,237.59,44801575.0,240.79,237.21,237.59,238.665,55658279.0,1.0,0.0,AAPL,XNAS,2025-01-30 00:00:00+00:00,1742049830.691514,89zjXDp/tsf13g
3,418.48,422.86,413.16,414.99,54218011.0,422.86,413.16,414.99,418.77,54586260.0,1.0,0.0,MSFT,XNAS,2025-01-30 00:00:00+00:00,1742049830.691514,IM99ToN6qA9omA
4,446.69,446.88,440.4,442.33,22563100.0,446.88,440.4,442.33,446.69,23581370.0,1.0,0.0,MSFT,XNAS,2025-01-29 00:00:00+00:00,1742049830.691514,tNja/heRoRgETw
5,234.12,239.86,234.01,239.36,45375500.0,239.855,234.01,239.36,234.12,45486100.0,1.0,0.0,AAPL,XNAS,2025-01-29 00:00:00+00:00,1742049830.691514,pslxW/dQBcPU3Q
6,434.6,448.38,431.38,447.2,23471800.0,448.38,431.38,447.2,434.6,23491703.0,1.0,0.0,MSFT,XNAS,2025-01-28 00:00:00+00:00,1742049830.691514,Zewzno16WeU+wg
7,230.85,240.19,230.81,238.26,75633300.0,240.19,230.81,238.26,230.85,75707569.0,1.0,0.0,AAPL,XNAS,2025-01-28 00:00:00+00:00,1742049830.691514,IQkacsKZPq8qjQ
8,224.03,232.15,224.0,229.86,94132139.0,232.15,223.98,229.86,224.02,94863418.0,1.0,0.0,AAPL,XNAS,2025-01-27 00:00:00+00:00,1742049830.691514,fUJL57SF3LfUGw
9,424.21,435.19,423.5,434.56,35377647.0,435.2,423.5,434.56,424.01,35647805.0,1.0,0.0,MSFT,XNAS,2025-01-27 00:00:00+00:00,1742049830.691514,mbb4scTaeroVTw


In [None]:
from datetime import datetime, timedelta
def main(initial_dt): 
    @dlt.resource(name="stock", write_disposition="append")
    def extract_raw_incremental(cursor_date=dlt.sources.incremental(
            "date",   # <--- field to track, our timestamp
            initial_value=initial_dt  
            )
        ):
        client = RESTClient(
            base_url=API_URL,
    
            paginator=OffsetPaginator(  # Handles pagination using offset
                limit=100, offset=0, total_path=None
            )
        
        )
    
        for page in client.paginate(
            params = {
                'access_key': ACCESS_KEY,
                'symbols': 'AAPL,MSFT',  # Example symbols
                'date_from': cursor_date.last_value,
                # Set date_to to one day before today
                'date_to' = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
            }
        ):
            yield page 


In [4]:
GCP_URL = "gs://de-zoomcamp-project-453801-terra-bucket" 
BUCKET_NAME = "demo-bucket"

In [5]:
gcp_bucket = filesystem(GCP_URL)
gcp_bucket 
pip install "dlt[gs]"

<dlt.destinations.impl.filesystem.factory.filesystem at 0x7f29b9437950>

In [12]:
import json 
GCP_URL = "gs://de-zoomcamp-project-453801-terra-bucket"

# Load credentials from JSON file
with open("/home/chenchen/.gc/my-creds.json") as f:
    credentials = json.load(f)

# Initialize filesystem with credentials
gcp_bucket = filesystem(GCP_URL, credentials=credentials)

In [13]:
pipeline = dlt.pipeline(
    pipeline_name="marketstack_pipeline",
    destination=gcp_bucket,  # Set the destination to GCS
    dataset_name='stock_dataset'
)

In [16]:
# run the pipeline with the new resource
load_info = pipeline.run(extract_raw, write_disposition="replace",loader_file_format="csv" )

print(load_info)

Pipeline marketstack_pipeline load step completed in 1.85 seconds
1 load package(s) were loaded to destination filesystem and into dataset stock_dataset
The filesystem destination used gs://de-zoomcamp-project-453801-terra-bucket location to store data
Load package 1742107479.6691566 is LOADED and contains no failed jobs


In [2]:
# Inspect the data 
from google.cloud import storage

# Initialize the GCS client
client = storage.Client()

# Specify your bucket name
bucket_name = "de-zoomcamp-project-453801-terra-bucket"
bucket = client.bucket(bucket_name)



In [10]:
# List all files in the bucket
blobs = bucket.list_blobs()
for blob in blobs:
    print(blob.name)  # Print the file name


stock_dataset/_dlt_loads/init
stock_dataset/_dlt_loads/marketstack__1742107142.4909291.jsonl
stock_dataset/_dlt_loads/marketstack__1742107479.6691566.jsonl
stock_dataset/_dlt_loads/marketstack__1742110254.6944857.jsonl
stock_dataset/_dlt_loads/marketstack__1742133149.7746913.jsonl
stock_dataset/_dlt_pipeline_state/init
stock_dataset/_dlt_pipeline_state/marketstack_pipeline__1742107142.4909291__99e8f4613409b0bc04290ec677b40fd8ed2129231e3639d061c09a5a2e012b9d.jsonl
stock_dataset/_dlt_pipeline_state/marketstack_pipeline__1742133149.7746913__a0a38bfb8271785d974e25be41990fa32a5b650ee0b8ef7cc71528682c7d1582.jsonl
stock_dataset/_dlt_version/init
stock_dataset/_dlt_version/marketstack__1742107144.035603__99e8f4613409b0bc04290ec677b40fd8ed2129231e3639d061c09a5a2e012b9d.jsonl
stock_dataset/_dlt_version/marketstack__1742110256.4775732__c8815ab31b5d0075efa4b068f1f0d78f10f6e676807c0b7ce90c108167e15c45.jsonl
stock_dataset/_dlt_version/marketstack__1742133152.1333828__a0a38bfb8271785d974e25be41990fa3

In [11]:
csv_blob = bucket.blob("stock_dataset/stock/1742133149.7746913.388efa4c72.csv")  # Replace with the actual path to your CSV file

In [12]:
import gzip
from io import BytesIO
csv_data = csv_blob.download_as_bytes()  # Download as bytes

# Decompress the gzip file and read it into a DataFrame
with gzip.GzipFile(fileobj=BytesIO(csv_data), mode='rb') as gz_file:
    df = pd.read_csv(gz_file)

# Display the first few rows of the DataFrame
print(df.head())

      open      high     low   close      volume  adj_high  adj_low  \
0  211.250  213.9500  209.58  213.49  60060200.0  213.9500   209.58   
1  379.780  390.2300  379.51  388.56  19929300.0  390.2300   379.51   
2  215.940  216.8394  208.42  209.68  60306872.0  216.8394   208.42   
3  383.155  385.3099  377.45  378.77  20280230.0  385.3200   377.45   
4  220.140  221.7500  214.91  216.98  62466400.0  221.7500   214.91   

   adj_close  adj_open  adj_volume  split_factor  dividend symbol exchange  \
0     213.49   211.250  60107582.0           1.0       0.0   AAPL     XNAS   
1     388.56   379.775  19952846.0           1.0       0.0   MSFT     XNAS   
2     209.68   215.950  61368330.0           1.0       0.0   AAPL     XNAS   
3     378.77   383.155  20473017.0           1.0       0.0   MSFT     XNAS   
4     216.98   220.140  62547467.0           1.0       0.0   AAPL     XNAS   

                        date  _dlt_load_id         _dlt_id  
0  2025-03-14 00:00:00+00:00  1.742133e+09 

In [13]:
df

Unnamed: 0,open,high,low,close,volume,adj_high,adj_low,adj_close,adj_open,adj_volume,split_factor,dividend,symbol,exchange,date,_dlt_load_id,_dlt_id
0,211.250,213.9500,209.5800,213.49,60060200.0,213.9500,209.58,213.49,211.250,60107582.0,1.0,0.0,AAPL,XNAS,2025-03-14 00:00:00+00:00,1.742133e+09,1V78ISrFKmCNcA
1,379.780,390.2300,379.5100,388.56,19929300.0,390.2300,379.51,388.56,379.775,19952846.0,1.0,0.0,MSFT,XNAS,2025-03-14 00:00:00+00:00,1.742133e+09,Rqf8w61uVNlwIg
2,215.940,216.8394,208.4200,209.68,60306872.0,216.8394,208.42,209.68,215.950,61368330.0,1.0,0.0,AAPL,XNAS,2025-03-13 00:00:00+00:00,1.742133e+09,4c32rIbcWjxNSQ
3,383.155,385.3099,377.4500,378.77,20280230.0,385.3200,377.45,378.77,383.155,20473017.0,1.0,0.0,MSFT,XNAS,2025-03-13 00:00:00+00:00,1.742133e+09,E3anMCgUkfORzQ
4,220.140,221.7500,214.9100,216.98,62466400.0,221.7500,214.91,216.98,220.140,62547467.0,1.0,0.0,AAPL,XNAS,2025-03-12 00:00:00+00:00,1.742133e+09,hc9OgTl9ytzmCw
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,433.030,439.6700,432.6300,437.42,25983200.0,439.6700,432.63,437.42,433.030,26009429.0,1.0,0.0,MSFT,XNAS,2024-12-04 00:00:00+00:00,1.742133e+09,OG+UpVl/4Ih0+Q
136,239.920,242.7550,238.9003,242.65,37736685.0,242.7600,238.90,242.65,239.810,38861017.0,1.0,0.0,AAPL,XNAS,2024-12-03 00:00:00+00:00,1.742133e+09,wpb7ZW7cEf3+FA
137,429.840,432.4700,427.7400,431.20,18281400.0,432.4700,427.74,431.20,429.840,18301987.0,1.0,0.0,MSFT,XNAS,2024-12-03 00:00:00+00:00,1.742133e+09,5nxhEsKjcog01Q
138,237.270,240.7900,237.1600,239.59,42439900.0,240.7900,237.16,239.59,237.270,48137103.0,1.0,0.0,AAPL,XNAS,2024-12-02 00:00:00+00:00,1.742133e+09,oSEVlK8Grqg7Fg


In [14]:
df['date'].unique()

array(['2025-03-14 00:00:00+00:00', '2025-03-13 00:00:00+00:00',
       '2025-03-12 00:00:00+00:00', '2025-03-11 00:00:00+00:00',
       '2025-03-10 00:00:00+00:00', '2025-03-07 00:00:00+00:00',
       '2025-03-06 00:00:00+00:00', '2025-03-05 00:00:00+00:00',
       '2025-03-04 00:00:00+00:00', '2025-03-03 00:00:00+00:00',
       '2025-02-28 00:00:00+00:00', '2025-02-27 00:00:00+00:00',
       '2025-02-26 00:00:00+00:00', '2025-02-25 00:00:00+00:00',
       '2025-02-24 00:00:00+00:00', '2025-02-21 00:00:00+00:00',
       '2025-02-20 00:00:00+00:00', '2025-02-19 00:00:00+00:00',
       '2025-02-18 00:00:00+00:00', '2025-02-14 00:00:00+00:00',
       '2025-02-13 00:00:00+00:00', '2025-02-12 00:00:00+00:00',
       '2025-02-11 00:00:00+00:00', '2025-02-10 00:00:00+00:00',
       '2025-02-07 00:00:00+00:00', '2025-02-06 00:00:00+00:00',
       '2025-02-05 00:00:00+00:00', '2025-02-04 00:00:00+00:00',
       '2025-02-03 00:00:00+00:00', '2025-01-31 00:00:00+00:00',
       '2025-01-30 00:00: