In [8]:
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Output,
                        Model,
                        Metrics,
                        Markdown,
                        HTML,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs

from src.config.gcp_config import load_config

In [9]:
# Load the project's GCP settings once for the notebook; later cells read
# attributes such as base_image, region and pipeline_root from this object.
config = load_config()

In [56]:
@component(
    base_image=config.base_image,
    output_component_file="get_and_process_data.yaml"
)
def get_and_process_data():
    """Read the main BigQuery table, preprocess it and overwrite the staging table.

    The whole main table is pulled into a DataFrame, passed through
    ``preprocess_data`` with ``target_class`` as the target column, and the
    result is written to the staging table with ``WRITE_TRUNCATE``.

    The component takes no inputs and declares no outputs; its only effect is
    the staging table it writes.
    """
    from google.cloud import bigquery
    from src.config.gcp_config import load_config
    from src.pipelines.utils import (
        create_table_from_df,
        preprocess_data
    )

    # A lightweight component is executed from its own source code only, so the
    # notebook-level `config` object does not exist inside the container.
    # Load the settings here instead of relying on that global.
    # NOTE(review): assumes `src.config` is shipped in the base image just like
    # `src.pipelines.utils` is - confirm.
    cfg = load_config()

    bq_client = bigquery.Client(location=cfg.region, project=cfg.gcp_project_id)
    query = f"SELECT * FROM `{cfg.main_table_id}`"
    df = bq_client.query(query).to_dataframe()

    X_processed = preprocess_data(df=df, target_column_name="target_class")

    create_table_from_df(
        bq_client=bq_client,
        df=X_processed,
        table_id=cfg.stg_table_id,
        write_disposition="WRITE_TRUNCATE"
    )

  @component(
  def get_test_data(


In [57]:
@component(
    base_image=config.base_image,
    install_kfp_package=False,
    output_component_file="train_model.yaml",
)
def train_model():
    """Train a model on the staging table and upload it, pickled, to Cloud Storage.

    The staging table is read in full, handed to ``model_train`` with
    ``target_class`` as the target column, and the returned model is pickled
    and uploaded to the configured bucket under a timestamp-based object name.

    The component takes no inputs and declares no outputs; its only effect is
    the uploaded blob.
    """
    import pickle
    import tempfile
    from datetime import datetime

    from google.cloud import (
        bigquery,
        storage
    )
    from src.config.gcp_config import load_config
    from src.pipelines.utils import model_train

    # A lightweight component is executed from its own source code only, so the
    # notebook-level `config` object does not exist inside the container.
    # Load the settings here instead of relying on that global.
    # NOTE(review): assumes `src.config` is shipped in the base image just like
    # `src.pipelines.utils` is - confirm.
    cfg = load_config()

    bq_client = bigquery.Client(location=cfg.region, project=cfg.gcp_project_id)
    query = f"SELECT * FROM `{cfg.stg_table_id}`"
    X_processed = bq_client.query(query).to_dataframe()

    model = model_train(X_processed=X_processed, target_column_name="target_class")

    with tempfile.NamedTemporaryFile() as tmp_pickle:
        # Object name is the current local time, minute resolution: two runs
        # within the same minute write to the same object.
        pickle_name = datetime.now().strftime("%d-%m-%Y:%H%M")
        pickle.dump(model, tmp_pickle)
        storage_client = storage.Client(project=cfg.gcp_project_id)
        bucket = storage_client.bucket(cfg.gcp_bucket)
        # NOTE(review): the leading "/" becomes part of the object name, so the
        # blob is not under a plain "models/" prefix. Left unchanged in case
        # consumers already read this path - confirm whether it is intended.
        upload_blob = bucket.blob(f"/models/{pickle_name}")
        # rewind=True seeks back to the start of the file so the bytes just
        # written by pickle.dump are what gets uploaded.
        upload_blob.upload_from_file(tmp_pickle, rewind=True)


  @component(
  def save_file(


In [59]:
@dsl.pipeline(
    pipeline_root=config.pipeline_root,
    name="pipeline-big-data-project"
)
def pipeline():
    """Two-step training pipeline: prepare the staging table, then train on it."""
    data_prep = get_and_process_data()
    model_training = train_model()

    # The steps exchange no artifacts - they communicate through the BigQuery
    # staging table - so nothing tells the orchestrator which one goes first.
    # Declare the ordering explicitly; otherwise training could read the
    # staging table before data preparation has (re)written it.
    model_training.after(data_prep)

In [60]:
# Compile the pipeline function into the job spec file that the PipelineJob
# cell loads as its template.
compiler.Compiler().compile(
    package_path='model_training_pipeline.json',
    pipeline_func=pipeline,
)

In [61]:
# Describe the Vertex AI pipeline run from the compiled spec. Caching is
# switched off, so every step is executed again on each run.
start_pipeline = pipeline_jobs.PipelineJob(
    template_path="model_training_pipeline.json",
    display_name="cc-project-pipeline",
    location=config.region,
    enable_caching=False,
)

In [62]:
# Submit the job; as the captured log shows, run() reports the job state
# repeatedly until the run has finished.
start_pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/pipeline-houseprice-20240328212516?project=502842367035
PipelineJob projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob run completed. Resource name: