In [1]:
import dlt
import requests
import io
import pandas as pd

url = "https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/moncompteformation_catalogueformation/exports/csv"

@dlt.resource(name="courses")
def fetch_courses_pipeline():
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            buffer = io.BytesIO()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                buffer.write(chunk)
            buffer.seek(0)
            table = pd.read_csv(buffer, sep=";")
            print(f'Got data from {url} with {len(table)} records')
            if len(table) > 0:
                table['code_region'] = table['code_region'].astype(str)
                table['coderegion_export'] = table['coderegion_export'].astype(str)
                yield table
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")

# Define new dlt pipeline
pipeline = dlt.pipeline(
    pipeline_name="moncompteformation_pipeline",
    destination="filesystem",
    dataset_name="courses_data"  # Top-level folder name
)

# Run the pipeline with the new resource, specify table name and destination path
load_info = pipeline.run(
    fetch_courses_pipeline(),
    write_disposition="replace",
    table_name="courses_france"
)
print(load_info)

Got data from https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/moncompteformation_catalogueformation/exports/csv with 195315 records
Pipeline moncompteformation_pipeline load step completed in 12.70 seconds
1 load package(s) were loaded to destination filesystem and into dataset courses_data
The filesystem destination used gs://jugnu-france-course-enrollments location to store data
Load package 1743088392.22626 is LOADED and contains no failed jobs


In [None]:
import dlt
import requests
import io
import pandas as pd
from datetime import datetime

url = "https://opendata.caissedesdepots.fr/api/explore/v2.1/catalog/datasets/entree_sortie_formation/exports/csv"

@dlt.resource(name="enrollments")
def fetch_courses_pipeline():
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            buffer = io.BytesIO()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                buffer.write(chunk)
            buffer.seek(0)
            table = pd.read_csv(buffer, sep=";")
            print(f'Got data from {url} with {len(table)} records')
            if len(table) > 0:
                yield table
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")

# Define new dlt pipeline
pipeline = dlt.pipeline(
    pipeline_name="moncompteformation_pipeline",
    destination="filesystem",
    dataset_name="enrollments_data"  # Top-level folder name
)

# Run the pipeline with the new resource, specify table name and destination path
load_info = pipeline.run(
    fetch_courses_pipeline(),
    write_disposition="replace",
    table_name="enrollments_raw_parquet_test"
)
print(load_info)

In [None]:
import requests
import zipfile
import io
import pandas as pd
import dlt
import os


In [None]:


zip_url = "https://formacode.centre-inffo.fr/IMG/zip/fcod_13_25_10_2019.zip"
##gcs_destination = os.environ.get("GCS_DESTINATION")
#gcp_creds = os.environ.get("GCP_CREDS")

@dlt.resource(name="formacode_data")
def fetch_and_process_formacode():
    try:
        response = requests.get(zip_url)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
            xls_data = zip_file.read("FCod.xls")

        df = pd.read_excel(io.BytesIO(xls_data), header=None)
        df = df.iloc[:, [0, 1, 6, 9]]
        df.columns = ['formacode', 'description', 'field', 'generic_term']

        yield df

    except requests.exceptions.RequestException as e:
        print(f"Failed to download or extract zip file from {zip_url}: {e}")
    except zipfile.BadZipFile as e:
        print(f"Failed to open zip file: {e}")
    except pd.errors.ParserError as e:
        print(f"Failed to parse XLS file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Define new dlt pipeline
pipeline = dlt.pipeline(
    pipeline_name="formacode_pipeline",
    destination="gcs",
    dataset_name="formacode_data"
)

# Run the pipeline with the new resource, specify table name and destination path
load_info = pipeline.run(
    fetch_and_process_formacode(),
    write_disposition="replace",
    table_name="formacode_translated"
)
print(load_info)

UnknownDestinationModule: Destination gcs is not one of the standard dlt destinations. Following fully qualified refs were tried in the registry:
	gcs
	destinations.gcs
	dlt.destinations.gcs
Modules and attributes were tried in the following order and failed to import:
	mod:destinations attr: gcs failed due to ModuleSpecNotFound
	mod:dlt.destinations attr: gcs failed due to AttrNotFound and causing exception: module 'dlt.destinations' has no attribute 'gcs'


In [10]:
text_file = "../data/FCod.xls"


In [7]:
csv_file = "../data/formacode_processed.csv"

In [9]:
df = pd.read_excel(xls_file, header=None, engine='xlrd')
df = df.iloc[:, [0, 1, 6, 9]]
df.columns = ['formacode', 'description', 'field', 'generic_term']
df.to_csv(csv_file, index=False)

XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'"15054"\t'

In [12]:
df = pd.read_csv(text_file, sep='\t', header=None, encoding='latin1') # Read as tab-delimited text

In [13]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,15054,DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,développement professionnel et personnel,,,DOM,150 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,,,00101 DEVELOPPEMENT DES COMPETENCES,15061 ACCOMPAGNEMENT VERS EMPLOI$15031 ADAPTAT...,32154 ENCADREMENT MANAGEMENT$44542 PEDAGOGIE,,Employer si possible un descripteur plus préci...,,"423 Vie familiale, vie sociale et autres forma...",,15054 DEVELOPPEMENT PERSONNEL ET PROFESSIONNEL,,A0A0
1,15061,ACCOMPAGNEMENT VERS EMPLOI,accompagnement vers emploi,,,DOM,150 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,insertion professionnelle$insertion socioprofe...,,15054 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,15070 TECHNIQUE RECHERCHE EMPLOI,,,Employé pour les formations visant l'insertion...,,"415 Développement des capacités d'orientation,...",,15061 ACCOMPAGNEMENT VERS EMPLOI,,A0A0A0
2,15070,TECHNIQUE RECHERCHE EMPLOI,technique recherche emploi,,,DOM,150 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,curriculum vitae$CV$lettre candidature$TRE,,15061 ACCOMPAGNEMENT VERS EMPLOI,,33003 CONDUITE ENTRETIEN RECRUTEMENT,,Employé pour les formations aux techniques de ...,,"415 Développement des capacités d'orientation,...",,15070 TECHNIQUE RECHERCHE EMPLOI,,A0A0A0A0
3,15031,ADAPTATION SOCIALE,adaptation sociale,,,DOM,150 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,exclusion$insertion sociale$marginalisation so...,,15054 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,,44072 TRAVAIL SOCIAL,,Employé pour les formations visant une inserti...,,"415 Développement des capacités d'orientation,...",,15031 ADAPTATION SOCIALE,,A0A0A1
4,15043,ALPHABETISATION,alphabétisation,,,DOM,150 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,formation illettré$illettré,,15054 DEVELOPPEMENT PROFESSIONNEL ET PERSONNEL,,15235 FRANCAIS LANGUE ETRANGERE$44077 MIGRANT$...,Enseignement de l'écriture et de la lecture à ...,Employé pour les formations destinées aux publ...,,412 Développement des capacités mentales et ap...,,15043 ALPHABETISATION,,A0A0A2


In [16]:
df = df.iloc[:, [0, 1, 6, 9]]

In [17]:
df.columns = ['formacode', 'description', 'field', 'generic_term']

In [19]:
df.shape

(3379, 4)

In [None]:
  - id: workingDirectory_f
    type: io.kestra.plugin.core.flow.WorkingDirectory
    tasks:
    - id: cloneRepository_f
      type: io.kestra.plugin.git.Clone
      url: https://github.com/jugnuarora/france_courses_enrollments.git
      branch: main

    - id: spark_job_f
      type: io.kestra.plugin.spark.SparkCLI
      inputFiles:
        gcs.json: "{{ kv('GCP_CREDS') }}" # Read GCP credentials from KV store
        formacode_data.csv: "{{ outputs.download_and_process.formacode_data.csv }}"
      docker:
        image: bitnami/spark
      beforeCommands:
        - pip install deep_translator
      commands:
        - /opt/bitnami/spark/bin/spark-submit --jars ./lib/gcs-connector-hadoop3-2.2.5.jar --name GCS_Spark_Job --master local[*] ./scripts/05_formacode_spark_translation.py --input formacode_data.csv --output {{render(vars.gcs_formacode_translated)}}_new
    
  - id: upload_bigquery_f
    type: io.kestra.plugin.gcp.bigquery.LoadFromGcs
    from:
      - "{{render(vars.gcs_formacode_translated)}}_new/*.parquet"
    destinationTable: "{{kv('GCP_DATASET')}}.source_tables_test.formacode"
    format: PARQUET