In [2]:
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Output,
                        Model,
                        Metrics,
                        Markdown,
                        HTML,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs
import pandas as pd

In [54]:
# Google Cloud project and region shared by the image path and the pipeline job.
PROJECT_ID = "cloud-computing-project-418718"
REGION = "europe-west3"

# Cloud Storage bucket URI and the folder inside it used as the pipeline root.
BUCKET_NAME = "gs://houseprice"
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"

In [55]:
# Custom base image (built with Docker) that the pipeline components run in.
# The path points at the "houseprice" repository on the regional
# `<REGION>-docker.pkg.dev` host of PROJECT_ID. No tag is given here.
IMAGE_NAME = "training"
BASE_IMAGE = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/houseprice/{IMAGE_NAME}"

In [56]:
@component(
    base_image=BASE_IMAGE,
    output_component_file="get_and_process_data.yaml"
)
def get_and_process_data():
    """Read the main BigQuery table, clean its features and write them to staging.

    Steps:
      1. Load every row of ``MAIN_TABLE_ID`` into a DataFrame.
      2. Split off the ``target_class`` column.
      3. Drop the columns reported by ``remove_highly_correlated_features``
         (threshold 0.7), the ``visitorid`` column, and the columns reported
         by ``remove_constant_features``.
      4. Pass the remaining features through ``standarise_float_columns`` and
         ``normalise_int_columns`` (helpers from ``src.utils`` in the image).
      5. Re-attach ``target_class`` and write the processed frame to
         ``STG_TABLE_ID``.

    Takes no arguments and returns nothing; the staging table is the output.
    """
    import pandas as pd
    from src.utils import (
        remove_highly_correlated_features,
        remove_constant_features,
        standarise_float_columns,
        normalise_int_columns,
        create_table_from_df
    )
    from google.cloud import bigquery

    # NOTE(review): REGION, GCP_PROJECT_ID, MAIN_TABLE_ID and STG_TABLE_ID are
    # not defined inside this function. KFP lightweight components are expected
    # to be self-contained, so these names likely raise NameError when the
    # component runs in its container - confirm, and pass them in as component
    # parameters (or define them here) once the intended values are known.
    bq_client = bigquery.Client(location=REGION, project=GCP_PROJECT_ID)
    query = f"SELECT * FROM `{MAIN_TABLE_ID}`"
    df = bq_client.query(query).to_dataframe()

    y = df['target_class']
    X = df.drop(columns=['target_class'])

    # NOTE(review): correlation is computed on `df`, which still contains
    # 'target_class'. If the helper can ever return that column the drop below
    # would raise, because X no longer has it - verify against src.utils.
    X_processed = X.drop(remove_highly_correlated_features(df, threshold=0.7), axis=1)
    X_processed = X_processed.drop(['visitorid'], axis=1)
    X_processed = X_processed.drop(remove_constant_features(X_processed), axis=1)
    X_processed = standarise_float_columns(X_processed)
    X_processed = normalise_int_columns(X_processed)

    X_processed["target_class"] = y

    # Persist the *processed* frame. Writing the raw `df` here would discard
    # all of the cleaning above and hand unprocessed data to the training step.
    create_table_from_df(
        bq_client=bq_client,
        df=X_processed,
        table_id=STG_TABLE_ID
    )

  @component(
  def get_test_data(


In [57]:
@component(
    base_image=BASE_IMAGE,
    install_kfp_package=False,
    output_component_file="train_model.yaml",
)
def train_model():
    """Train a random forest on the staging table and upload the pickled model.

    Loads every row of ``STG_TABLE_ID`` from BigQuery, fits a
    ``RandomForestClassifier`` (max_depth=10, 100 estimators, balanced class
    weights, fixed ``random_state``) on all columns except ``target_class``,
    then pickles the fitted model and uploads it to the ``GCP_BUCKET`` bucket
    as ``models/<dd-mm-YYYY:HHMM>``.

    Takes no arguments and returns nothing; the uploaded object is the output.
    """
    from sklearn.ensemble import RandomForestClassifier
    import pickle
    import tempfile
    from google.cloud import (
        bigquery,
        storage
    )
    from datetime import datetime

    # NOTE(review): REGION, GCP_PROJECT_ID, STG_TABLE_ID and GCP_BUCKET are not
    # defined inside this function. KFP lightweight components are expected to
    # be self-contained, so these names likely raise NameError when the
    # component runs in its container - confirm, and pass them in as component
    # parameters (or define them here) once the intended values are known.
    bq_client = bigquery.Client(location=REGION, project=GCP_PROJECT_ID)
    query = f"SELECT * FROM `{STG_TABLE_ID}`"
    X_processed = bq_client.query(query).to_dataframe()

    y = X_processed['target_class']
    X = X_processed.drop(columns=['target_class'])

    clf = RandomForestClassifier(max_depth=10, random_state=1307, n_estimators=100, class_weight='balanced')
    clf.fit(X, y)

    with tempfile.NamedTemporaryFile() as tmp_pickle:
        # Timestamp (container local time) used as the object name.
        pickle_name = datetime.now().strftime("%d-%m-%Y:%H%M")
        pickle.dump(clf, tmp_pickle)
        storage_client = storage.Client(project=GCP_PROJECT_ID)
        bucket = storage_client.bucket(GCP_BUCKET)
        # No leading slash: Cloud Storage object names are flat, so "/models/x"
        # would be stored under an empty-named top-level folder
        # (gs://<bucket>//models/x) instead of gs://<bucket>/models/x.
        upload_blob = bucket.blob(f"models/{pickle_name}")
        # rewind=True seeks back to the start of the temp file before reading,
        # so the bytes just written by pickle.dump are what gets uploaded.
        upload_blob.upload_from_file(tmp_pickle, rewind=True)


  @component(
  def save_file(


In [59]:
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="pipeline-big-data-project"
)
def pipeline(
    data_filepath: str = f"{BUCKET_NAME}/data",
    out_filepath: str = f"{BUCKET_NAME}/out",
):
    """Prepare the data, then train the model.

    Args:
        data_filepath: Currently unused by the steps below; kept so the
            pipeline's parameter interface stays unchanged.
        out_filepath: Currently unused by the steps below; kept so the
            pipeline's parameter interface stays unchanged.
    """
    data_prep = get_and_process_data()
    model_training = train_model()
    # The two tasks exchange no artifacts, so without an explicit ordering the
    # training task may start before (or while) the staging table it reads is
    # being written by the data-preparation task.
    model_training.after(data_prep)

In [60]:
# Compile the pipeline function into the JSON spec that the PipelineJob
# cell below loads through `template_path`.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="model_training_pipeline.json",
)

In [61]:
# Build the Vertex AI job from the compiled spec. The project is passed
# explicitly so the job runs in PROJECT_ID rather than in whatever default
# project the ambient credentials happen to resolve to.
start_pipeline = pipeline_jobs.PipelineJob(
    display_name="cc-project-pipeline",
    template_path="model_training_pipeline.json",
    enable_caching=False,  # always re-run every step instead of reusing cached results
    project=PROJECT_ID,
    location=REGION,
)

In [62]:
# Submit the job to Vertex AI. The output below shows the job state being
# polled until the run completes, so this cell appears to block while it runs.
start_pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/pipeline-houseprice-20240328212516?project=502842367035
PipelineJob projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/502842367035/locations/europe-west3/pipelineJobs/pipeline-houseprice-20240328212516 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob run completed. Resource name: