In [1]:
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Output,
                        Model,
                        Metrics,
                        Markdown,
                        HTML,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs

from src.config.gcp_config import load_config

  from kfp.v2 import dsl


In [2]:
# Project configuration used by the cells below: base image, region, project id,
# BigQuery table ids, GCS bucket and pipeline root are all read from this object.
config = load_config()

In [3]:
@component(
    base_image=config.base_image,
    output_component_file="get_and_process_data.yaml"
)
def get_and_process_data():
    """Read the main BigQuery table, preprocess it and write it to the staging table.

    Takes no inputs and returns nothing. The source table, staging table,
    region and project id all come from the project configuration.
    """
    from google.cloud import bigquery
    from src.config.gcp_config import load_config
    from src.pipelines.utils import (
        create_table_from_df,
        preprocess_data
    )

    # A lightweight component runs from its own serialized source inside the
    # container, so the notebook-level `config` does not exist there and would
    # raise NameError. Load the configuration inside the function instead.
    # NOTE(review): assumes the base image ships `src.config` together with
    # whatever `load_config` reads -- confirm.
    config = load_config()

    bq_client = bigquery.Client(location=config.region, project=config.gcp_project_id)
    query = f"SELECT * FROM `{config.main_table_id}`"
    df = bq_client.query(query).to_dataframe()

    X_processed = preprocess_data(df=df, target_column_name="target_class")

    create_table_from_df(
        bq_client=bq_client,
        df=X_processed,
        table_id=config.stg_table_id
    )

  @component(
  def get_and_process_data(stg_table_id: OutputPath('str')):


TypeError: Artifacts must have both a schema_title and a schema_version, separated by `@`. Got: str

In [None]:
@component(
    base_image=config.base_image,
    install_kfp_package=False,
    output_component_file="train_model.yaml",
)
def train_model():
    """Train a model on the staging table and upload it, pickled, to the GCS bucket.

    Takes no inputs and returns nothing. The staging table, region, project id
    and bucket all come from the project configuration. The uploaded object is
    named after the current date and time.
    """
    import pickle
    import tempfile
    from datetime import datetime
    from google.cloud import (
        bigquery,
        storage
    )
    from src.config.gcp_config import load_config
    from src.pipelines.utils import model_train

    # A lightweight component runs from its own serialized source inside the
    # container, so the notebook-level `config` does not exist there and would
    # raise NameError. Load the configuration inside the function instead.
    # NOTE(review): assumes the base image ships `src.config` together with
    # whatever `load_config` reads -- confirm.
    config = load_config()

    bq_client = bigquery.Client(location=config.region, project=config.gcp_project_id)
    query = f"SELECT * FROM `{config.stg_table_id}`"
    X_processed = bq_client.query(query).to_dataframe()

    model = model_train(X_processed=X_processed, target_column_name="target_class")

    storage_client = storage.Client(project=config.gcp_project_id)
    bucket = storage_client.bucket(config.gcp_bucket)
    pickle_name = datetime.now().strftime("%d-%m-%Y:%H%M")
    # NOTE(review): the leading "/" makes the object name itself start with a
    # slash; kept unchanged in case readers of the bucket rely on this path.
    upload_blob = bucket.blob(f"/models/{pickle_name}")

    with tempfile.NamedTemporaryFile() as tmp_pickle:
        pickle.dump(model, tmp_pickle)
        # rewind=True seeks back to the start so the whole pickle is uploaded.
        upload_blob.upload_from_file(tmp_pickle, rewind=True)


  @component(
  def train_model():


In [None]:
# Two-step pipeline: stage the preprocessed data, then train the model on it.
@dsl.pipeline(
    name="pipeline-big-data-project",
    pipeline_root=config.pipeline_root,
)
def pipeline():
    # Step 1: preprocessing and staging.
    preprocessing_task = get_and_process_data()
    preprocessing_task.set_display_name('Data preprocessing and staging')

    # Step 2: training. The steps pass no data to each other, so the ordering
    # has to be declared explicitly with .after().
    training_task = train_model()
    training_task.after(preprocessing_task)
    training_task.set_display_name('Model training')

In [4]:
# Compile the pipeline function into the JSON job spec that the PipelineJob
# below loads through its template_path.
compiler.Compiler().compile(
    package_path='model_training_pipeline.json',
    pipeline_func=pipeline,
)

NameError: name 'pipeline' is not defined

In [7]:
# Describe the Vertex AI pipeline job from the compiled spec. Nothing is
# submitted here; the job is started by a separate .run() call.
start_pipeline = pipeline_jobs.PipelineJob(
    template_path="model_training_pipeline.json",
    display_name="cc-project-pipeline",
    location=config.region,
    enable_caching=False,
)

In [8]:
# Submit the job. As the captured output shows, the call logs the job state
# repeatedly until the run completes.
start_pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/33891971032/locations/europe-west3/pipelineJobs/pipeline-big-data-project-20240330204814
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/33891971032/locations/europe-west3/pipelineJobs/pipeline-big-data-project-20240330204814')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/pipeline-big-data-project-20240330204814?project=33891971032
PipelineJob projects/33891971032/locations/europe-west3/pipelineJobs/pipeline-big-data-project-20240330204814 current state:
PipelineState.PIPELINE_STATE_PENDING
PipelineJob projects/33891971032/locations/europe-west3/pipelineJobs/pipeline-big-data-project-20240330204814 current state:
PipelineState.PIPELINE_STATE_PENDING
PipelineJob projects/33891971032/locations/europe-west3/pipelineJobs/pipeline-big-data-project-20240330204814 current state:
PipelineState.PIPELINE_STATE_PENDING


Exception ignored in: <function _ChannelCallState.__del__ at 0x7f20af075510>
Traceback (most recent call last):
  File "/home/telejkoi/.local/lib/python3.10/site-packages/grpc/_channel.py", line 1717, in __del__
    self.channel.close(
  File "src/python/grpcio/grpc/_cython/_cygrpc/channel.pyx.pxi", line 542, in grpc._cython.cygrpc.Channel.close
  File "src/python/grpcio/grpc/_cython/_cygrpc/channel.pyx.pxi", line 428, in grpc._cython.cygrpc._close
  File "src/python/grpcio/grpc/_cython/_cygrpc/channel.pyx.pxi", line 458, in grpc._cython.cygrpc._close
  File "/usr/lib/python3.10/threading.py", line 389, in notify_all
    def notify_all(self):
KeyboardInterrupt: 


PipelineJob run completed. Resource name: projects/33891971032/locations/europe-west3/pipelineJobs/pipeline-big-data-project-20240330204814
