In [None]:
import dlt
from dlt.sources.rest_api import rest_api_source
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

In [None]:

@dlt.resource(name="courses")  # resource name, also used as the table name
def fetch_courses_pipeline():
    """Yield every page returned by the REST client for the catalogue export.

    A ``RESTClient`` is built for the Caisse des Depots open-data API with a
    page-number paginator that starts at page 1 and has no total-count path.
    Each page produced by ``paginate`` is yielded unchanged.

    NOTE(review): the requested path ends in ``/exports/csv``; whether the
    REST client can page through and parse that response is not visible
    here - confirm before relying on this resource.
    """
    page_numbers = PageNumberPaginator(
        base_page=1,
        total_path=None
    )
    api_client = RESTClient(
        base_url="https://opendata.caissedesdepots.fr/api/explore/v2.1",
        paginator=page_numbers
    )
    export_path = "/catalog/datasets/moncompteformation_catalogueformation/exports/csv"

    # Hand each page straight through to dlt
    yield from api_client.paginate(export_path)

# Pipeline targeting the "filesystem" destination
pipeline = dlt.pipeline(destination="filesystem")

# Load the resource with the "replace" write disposition and show the result
load_info = pipeline.run(
    fetch_courses_pipeline,
    write_disposition="replace",
)
print(load_info)

In [None]:
import dlt
from dlt.sources.rest_api import rest_api_source
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
from dlt.sources.filesystem import filesystem
import pandas as pd

In [None]:
import dlt
from dlt.sources.rest_api import rest_api_source
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
from dlt.sources.filesystem import filesystem
import pandas as pd

url = "https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/moncompteformation_catalogueformation/exports/csv"

@dlt.resource(name="courses")   # The name of the resource (used as the table name)
def fetch_courses_pipeline():
    """Download the course catalogue CSV export and yield it as one DataFrame.

    The response body is streamed in 1 MB chunks into an in-memory buffer and
    then parsed with pandas. Nothing is yielded when the parsed table has no
    rows. Any failure (network, HTTP status, parsing) is printed rather than
    raised, so a failed download leaves the resource empty.

    Yields:
        pandas.DataFrame: the parsed export, when it has at least one row.
    """
    # Imported locally because the imports preceding this code do not include
    # them; previously the resulting NameError was swallowed by the except
    # below and the resource silently produced nothing.
    import io
    import requests

    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            buffer = io.BytesIO()
            for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                buffer.write(chunk)
            buffer.seek(0)
            # The export uses ";" as its field delimiter
            table = pd.read_csv(buffer, sep=";")
            # A pandas DataFrame has no `num_rows` attribute; use len() instead
            record_count = len(table)
            print(f'Got data from {url} with {record_count} records')
            if record_count > 0:
                yield table
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")

# Pipeline targeting the "filesystem" destination
pipeline = dlt.pipeline(destination="filesystem")

# Load a fresh instance of the resource with the "replace" write disposition
courses_resource = fetch_courses_pipeline()
load_info = pipeline.run(courses_resource, write_disposition="replace")
print(load_info)

In [None]:
import dlt
import requests
import io
import pandas as pd
from google.cloud import storage

In [None]:
import dlt
import requests
import io
import pandas as pd
from google.cloud import storage

import os

# CSV export endpoint of the course catalogue dataset
BASE_URL = "https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/moncompteformation_catalogueformation/exports/csv"

# Query parameters sent with the request: one "refine" entry per
# `libelle_nsf_1` value of interest.
# NOTE(review): the labels must match the dataset's facet values exactly;
# spelling/accents here are kept as written - confirm against the dataset.
params = dict(
    refine=[
        "libelle_nsf_1:Informatique, traitement de l'information, réseaux de transmission",
        "libelle_nsf_1:Enseignement, formation",
        "libelle_nsf_1:Commerce, vente",
        "libelle_nsf_1:Comptabilite, gestion",
        "libelle_nsf_1:Spécialités pluri-scientifiques",
        "libelle_nsf_1:Spécialites plurivalentes de la communication et de l'information",
    ]
)

# Target bucket and object-name prefix for uploads
GCS_BUCKET_NAME = "jugnu-france-course-enrollments"
GCS_PREFIX = "french_courses/"

# Path to the service-account JSON key (make sure the file exists at this path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./gcs.json"

@dlt.resource(name="french_courses")
def fetch_courses():
    """Fetch the refined course catalogue export and yield it as a DataFrame.

    Sends one GET request to ``BASE_URL`` with the module-level ``params``,
    parses the CSV body with pandas and yields the resulting frame when it is
    not empty. Any failure (network, HTTP status, parsing) is printed rather
    than raised, so a failed download leaves the resource empty.

    Yields:
        pandas.DataFrame: the parsed export, when it contains data.
    """
    try:
        # The whole body is consumed right away, so streaming is not requested.
        # timeout is (connect, read) seconds so a stalled server cannot hang
        # the cell indefinitely.
        response = requests.get(BASE_URL, params=params, timeout=(10, 300))
        response.raise_for_status()  # Raise an error for failed requests

        # Parse from the raw bytes so pandas decodes the payload itself rather
        # than relying on the encoding requests infers for `response.text`.
        # The export uses ";" as its field delimiter.
        df = pd.read_csv(io.BytesIO(response.content), sep=";")

        # If the response contains data, yield the dataframe
        if not df.empty:
            print(f"Fetched {len(df)} records")
            yield df
        else:
            print("No data found")

    except Exception as e:
        print(f"Failed to fetch data: {e}")

# Define a function to upload the data to GCS
def upload_to_gcs(df, file_name):
    """Upload a DataFrame to the GCS bucket as a CSV object.

    The frame is serialised without its index and stored under
    ``GCS_PREFIX + file_name`` in ``GCS_BUCKET_NAME``. Failures are printed
    rather than raised.

    Args:
        df: DataFrame to serialise; must provide ``to_csv``.
        file_name: Object name appended to ``GCS_PREFIX``.
    """
    try:
        # Serialise to CSV text (to_csv returns a str when no target is given)
        csv_text = df.to_csv(index=False)

        # Initialize the GCS client
        client = storage.Client()

        # Set the GCS blob (file) path
        gcs_blob_path = GCS_PREFIX + file_name
        bucket = client.get_bucket(GCS_BUCKET_NAME)
        blob = bucket.blob(gcs_blob_path)

        # upload_from_file expects a binary file handle, which a text-mode
        # StringIO is not; upload_from_string accepts the CSV text directly.
        blob.upload_from_string(csv_text, content_type="text/csv")
        print(f"File uploaded to GCS: {gcs_blob_path}")
    except Exception as e:
        print(f"Failed to upload to GCS: {e}")

# Configure the pipeline with the "filesystem" destination (no BigQuery here).
# NOTE(review): the destination is expected to be configured to point at the
# GCS bucket - confirm in the dlt configuration.
pipeline = dlt.pipeline(
    pipeline_name="french_courses_pipeline",
    destination="filesystem",
    dataset_name="french_courses_dataset",
    dev_mode=True
)

# Materialise the fetched frames once so the same data is both loaded by dlt
# and uploaded as CSV. The object returned by pipeline.run() describes the
# load; iterating it does not produce DataFrames, so it must not be fed to
# upload_to_gcs().
course_frames = list(fetch_courses())

if course_frames:
    # Run the pipeline and load data into the same table name as the resource
    load_info = pipeline.run(course_frames, table_name="french_courses")
    print(load_info)

    # Upload each fetched frame to GCS as its own CSV part
    for idx, df in enumerate(course_frames, start=1):
        file_name = f"french_courses_part_{idx}.csv"
        upload_to_gcs(df, file_name)

    print("Data upload to GCS completed.")
else:
    # fetch_courses() reports its own errors; there is nothing to load here
    load_info = None
    print("No course data was fetched; nothing to load or upload.")


In [None]:
import dlt
import requests
import io
import pandas as pd

url = "https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/moncompteformation_catalogueformation/exports/csv"

@dlt.resource(name="courses")
def fetch_courses_pipeline():
    """Stream the catalogue CSV export into memory and yield it as a DataFrame.

    The body is read in 1 MB chunks, parsed as a ";"-separated CSV, and
    yielded only when it has at least one row. Any exception is printed and
    not re-raised.
    """
    one_megabyte = 1024 * 1024
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            payload = io.BytesIO()
            # Append every chunk to the buffer, then rewind for parsing
            payload.writelines(response.iter_content(chunk_size=one_megabyte))
            payload.seek(0)
            table = pd.read_csv(payload, sep=";")
            row_count = len(table)
            print(f'Got data from {url} with {row_count} records')
            if row_count:
                yield table
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")

# Pipeline targeting the "filesystem" destination
pipeline = dlt.pipeline(destination="filesystem")

# Load a fresh instance of the resource with the "replace" write disposition
courses_resource = fetch_courses_pipeline()
load_info = pipeline.run(courses_resource, write_disposition="replace")
print(load_info)

In [1]:
import dlt
import requests
import io
import pandas as pd

url = "https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/moncompteformation_catalogueformation/exports/csv"

@dlt.resource(name="courses")
def fetch_courses_pipeline():
    """Download the catalogue CSV export and yield it as one DataFrame.

    The body is streamed in 1 MB chunks into an in-memory buffer and parsed
    as a ";"-separated CSV. The two region code columns are read as text.
    The frame is yielded only when it has at least one row. Any exception is
    printed and not re-raised, so a failure leaves the resource empty.

    Yields:
        pandas.DataFrame: the parsed export with ``code_region`` and
        ``coderegion_export`` as strings (missing values stay missing).
    """
    # Read the region codes as text at parse time instead of casting with
    # astype(str) afterwards. Casting after type inference turns missing
    # values into the literal "nan" and can render the same code differently
    # (e.g. "11" vs "11.0", or without leading zeros) depending on what
    # pandas inferred.
    region_code_dtypes = {"code_region": str, "coderegion_export": str}
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            buffer = io.BytesIO()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                buffer.write(chunk)
            buffer.seek(0)
            table = pd.read_csv(buffer, sep=";", dtype=region_code_dtypes)
            print(f'Got data from {url} with {len(table)} records')
            if len(table) > 0:
                yield table
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")

# Named pipeline writing to the "filesystem" destination
pipeline = dlt.pipeline(
    pipeline_name="moncompteformation_pipeline",
    dataset_name="courses_data",  # Top-level folder name
    destination="filesystem",
)

# Load the resource into the "courses_france" table with the "replace"
# write disposition, then show the load summary
courses_resource = fetch_courses_pipeline()
load_info = pipeline.run(
    courses_resource,
    table_name="courses_france",
    write_disposition="replace",
)
print(load_info)

  table = pd.read_csv(buffer, sep=";")


Got data from https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/moncompteformation_catalogueformation/exports/csv with 201175 records
Pipeline moncompteformation_pipeline load step completed in 13.91 seconds
1 load package(s) were loaded to destination filesystem and into dataset courses_data
The filesystem destination used gs://jugnu-france-course-enrollments location to store data
Load package 1742298130.7551758 is LOADED and contains no failed jobs


In [None]:
import dlt

# NOTE(review): this only shows whether dlt.pipeline(...) raised
# AttributeError for the "filesystem" destination; whether that proves the
# GCS bucket is reachable is not visible here - confirm.
try:
    pipeline = dlt.pipeline(destination="filesystem")
except AttributeError as err:
    print(f"GCS destination is not available: {err}")
else:
    print("GCS destination is available.")