In [1]:
# import libraries
import os

# load all environment variables
from dotenv import load_dotenv

# Read the local .env file into the process environment before any lookup below.
load_dotenv()

# Settings read with os.environ[...] are mandatory: a missing variable raises
# KeyError here. The two image settings are read with os.getenv and therefore
# fall back to None when the variable is absent.
pipeline_root = os.environ["PIPELINE_ROOT"]
base_image = os.getenv("CONTAINER_IMAGE")
serving_image = os.getenv("SERVING_IMAGE")
project_id = os.environ["PROJECT_ID"]
region = os.environ["REGION"]
service_account = os.environ["SERVICE_ACCOUNT"]
artifact_repo = os.environ["ARTIFACT_REPO"]
model_display_name = os.environ["MODEL_DISPLAY_NAME"]
model_name = os.environ["MODEL_NAME"]
endpoint_name = os.environ["ENDPOINT_NAME"]
gcs_url = os.environ["GCS_URL"]
# The ratio arrives as text and is converted once so later cells get a float.
train_ratio = float(os.environ["TRAIN_RATIO"])
bucket_name = os.environ["BUCKET_NAME"]

In [2]:
# %%writefile ../components/load_data.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Output
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

@dsl.component(base_image=base_image)
def load_data(
    gcs_url: str, 
    output_dataset: Output[Dataset]
):
    """Download a CSV object from GCS and store it as the output Dataset.

    Args:
        gcs_url: Source object location in the form ``gs://<bucket>/<object>``.
        output_dataset: Output artifact; the data is written as CSV (without
            the index) to ``output_dataset.path``.

    Raises:
        ValueError: If ``gcs_url`` does not start with ``gs://`` or is missing
            the bucket name or the object name.
    """
    
    # Logic-specific Imports
    from google.cloud import storage
    import pandas as pd

    # Extract bucket and blob info from GCS URL
    if not gcs_url.startswith("gs://"):
        raise ValueError("Invalid GCS URL format")
    bucket_name, separator, blob_name = gcs_url[5:].partition("/")
    # Reject "gs://bucket", "gs://bucket/" and "gs:///object": each would
    # otherwise only fail later inside the storage client.
    if not separator or not bucket_name or not blob_name:
        raise ValueError("Invalid GCS URL format")

    # Create a GCS client
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(blob_name)

    # Read the contents into a DataFrame; the context manager guarantees the
    # blob reader is closed even if parsing fails.
    with blob.open("rb") as source:
        df = pd.read_csv(source)

    # Save to the specified path as Dataset
    df.to_csv(output_dataset.path, index=False)
    output_dataset.metadata['dataset_metadata'] = {'format': 'csv'}

  from kfp.v2 import dsl


In [3]:
# %%writefile ../components/preprocess_data.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Output
from dotenv import load_dotenv
import os

@dsl.component(base_image=base_image)
def preprocess_data(
    input_dataset: Input[Dataset], 
    train_dataset: Output[Dataset],
    test_dataset: Output[Dataset],
    train_ratio: float = 0.7,  # default corresponds to a 70:30 train/test split
):
    """Clean the input CSV and partition it into training and testing CSVs.

    Rows containing missing values are dropped. When the last column holds
    exactly the two labels 'Yes' and 'No', it is re-encoded as 1 and 0.
    The split uses a fixed random_state of 42.

    Args:
        input_dataset: Dataset artifact whose ``path`` is a CSV file.
        train_dataset: Output artifact receiving the training rows as CSV.
        test_dataset: Output artifact receiving the remaining rows as CSV.
        train_ratio: Value passed to ``train_test_split`` as ``train_size``.
    """

    # Logic-specific Imports
    from sklearn.model_selection import train_test_split
    import pandas as pd

    # Load the data and discard incomplete rows in one go
    df = pd.read_csv(input_dataset.path).dropna()

    # Re-encode a strictly Yes/No last column as 1/0
    last_column = df.iloc[:, -1]
    if {'Yes', 'No'} == set(last_column.unique()):
        df.iloc[:, -1] = last_column.map({'Yes': 1, 'No': 0})

    # Partition, then write each part to its artifact (train first, test second)
    partitions = train_test_split(df, train_size=train_ratio, random_state=42)
    for artifact, part in zip((train_dataset, test_dataset), partitions):
        part.to_csv(artifact.path, index=False)

In [4]:
# %%writefile ../components/random_forest_train.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Model, Output
import os

@dsl.component(base_image=base_image)
def train_random_forest(
    train_dataset: Input[Dataset], 
    model: Output[Model]
):
    """Fit a Random Forest pipeline, tuned by randomized search, on the training CSV.

    The last column of the CSV is used as the target and all other columns as
    features. int64/float64 columns are standardized and object/category
    columns are one-hot encoded before reaching the classifier.

    Args:
        train_dataset: Dataset artifact whose ``path`` is the training CSV.
        model: Output artifact. Framework name and scores are stored in its
            metadata; the fitted pipeline is written with joblib to
            ``model.path + ".joblib"``.
    """

    # Logic-specific Imports
    import logging

    import joblib
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    logging.basicConfig(level=logging.INFO)

    # Features are every column but the last; the last column is the target.
    frame = pd.read_csv(train_dataset.path)
    X_train, y_train = frame.iloc[:, :-1], frame.iloc[:, -1]

    # Column groups are chosen by dtype.
    num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns

    scale_step = Pipeline(steps=[('scaler', StandardScaler())])
    encode_step = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', scale_step, num_cols),
            ('cat', encode_step, cat_cols),
        ])

    # Preprocessing and classifier are tuned and persisted together.
    estimator = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=0)),
    ])

    # Candidate hyperparameter values sampled by the search
    search_space = {
        'classifier__n_estimators': [10, 50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }

    # 20 sampled candidates, 5-fold cross-validation, all available cores
    random_search = RandomizedSearchCV(
        estimator,
        search_space,
        n_iter=20,
        cv=5,
        n_jobs=-1,
        random_state=0,
    )
    random_search.fit(X_train, y_train)

    # Accuracy of the refitted best candidate on the data it was trained on
    training_accuracy = accuracy_score(y_train, random_search.predict(X_train))
    logging.info(f"Training Accuracy: {training_accuracy}")

    logging.info(f"Best Parameters: {random_search.best_params_}")

    model.metadata["framework"] = "RandomForest"
    model.metadata["metrics"] = {
        "best_score": random_search.best_score_,
        "training_accuracy": training_accuracy
    }

    # The file lands next to the artifact path, with a ".joblib" suffix.
    joblib.dump(random_search.best_estimator_, model.path + ".joblib")

In [5]:
# %%writefile ../components/decision_tree_train.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Model, Output
import os

@dsl.component(base_image=base_image)
def train_decision_tree(
    train_dataset: Input[Dataset], 
    model: Output[Model]
):
    """Train a Decision Tree model with Random Search.

    The last column of the training CSV is used as the target and all other
    columns as features. int64/float64 columns are standardized and
    object/category columns are one-hot encoded before reaching the classifier.

    Args:
        train_dataset: Dataset artifact whose ``path`` is the training CSV.
        model: Output artifact. Framework name and scores are stored in its
            metadata; the fitted pipeline is written with joblib to
            ``model.path + ".joblib"``.
    """

    # Logic-specific Imports
    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.metrics import accuracy_score
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    import joblib
    import logging

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Load training dataset
    train_df = pd.read_csv(train_dataset.path)

    # Separate features and target. Assuming target is the last column.
    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]

    # Preprocess features
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Decision Tree classifier
    classifier = DecisionTreeClassifier(random_state=0)

    # Pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

    # Define search space for hyperparameters.
    # 'auto' is intentionally not listed for max_features: scikit-learn
    # deprecated it in 1.1 and removed it in 1.3, so candidates using it fail
    # to fit on current releases. For a classifier tree it meant 'sqrt',
    # which is still part of the search space.
    param_distributions = {
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': [None, 'sqrt', 'log2']
    }

    # Random search with cross-validation
    random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=20, cv=5, n_jobs=-1, random_state=0)

    # Train the model
    random_search.fit(X_train, y_train)

    # Calculate training accuracy
    y_train_pred = random_search.predict(X_train)
    training_accuracy = accuracy_score(y_train, y_train_pred)
    logging.info(f"Training Accuracy: {training_accuracy}")

    # Best model
    best_model = random_search.best_estimator_
    logging.info(f"Best Parameters: {random_search.best_params_}")

    model.metadata["framework"] = "DecisionTree"
    model.metadata["metrics"] = {
        "best_score": random_search.best_score_,
        "training_accuracy": training_accuracy
    }

    # Save the model using joblib; the file lands next to the artifact path,
    # with a ".joblib" suffix.
    file_name = model.path + ".joblib"
    joblib.dump(best_model, file_name)


In [6]:
# %%writefile ../components/evaluate_model.py

from kfp.v2 import dsl
from typing import NamedTuple
from kfp.v2.dsl import Dataset, Input, Metrics, Model, Output
from dotenv import load_dotenv
import os

@dsl.component(base_image=base_image)
def evaluate_model(
    test_dataset: Input[Dataset], 
    dt_model: Input[Model],
    rf_model: Input[Model],
    metrics: Output[Metrics]
) -> NamedTuple("output", [("optimal_model", str)]):
    """Evaluate models on test data and determine the best one based on accuracy.

    Args:
        test_dataset: Dataset artifact whose ``path`` is the test CSV; the last
            column is used as the target, all other columns as features.
        dt_model: Decision tree model artifact; the estimator is loaded from
            ``dt_model.path + ".joblib"``.
        rf_model: Random forest model artifact; the estimator is loaded from
            ``rf_model.path + ".joblib"``.
        metrics: Output artifact receiving ``dt_accuracy`` and ``rf_accuracy``,
            each rounded to two decimals.

    Returns:
        A one-element tuple holding ``"decision_tree"`` or ``"random_forest"``,
        whichever scored the higher (unrounded) accuracy. On a tie the result
        is ``"decision_tree"`` because it is listed first.
    """
    
    import pandas as pd
    import joblib
    import sklearn.metrics as skmetrics
    import logging
    
    # Set up logging
    logging.basicConfig(level=logging.INFO)

    def load_model(model_artifact):
        # The training components store the estimator beside the artifact path
        # with a ".joblib" suffix.
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; presumably only artifacts produced by this
        # pipeline's own training steps are passed in - confirm.
        model_path = model_artifact.path + ".joblib"
        return joblib.load(model_path)

    def evaluate(model, X, y):
        predictions = model.predict(X)
        return skmetrics.accuracy_score(y, predictions)

    # Load the test dataset
    df = pd.read_csv(test_dataset.path)
    # Take an explicit copy so the dtype conversion below modifies an
    # independent frame instead of assigning into a slice of df.
    X_test = df.iloc[:, :-1].copy()
    y_test = df.iloc[:, -1]

    # Convert categorical columns to 'category' data type for X_test
    categorical_cols = X_test.select_dtypes(include=['object']).columns
    X_test[categorical_cols] = X_test[categorical_cols].astype('category')

    # Load models
    rf = load_model(rf_model)
    dt = load_model(dt_model)

    # Evaluate models
    dt_accuracy = evaluate(dt, X_test, y_test)
    rf_accuracy = evaluate(rf, X_test, y_test)

    # Log metrics
    metrics.log_metric("dt_accuracy", round(dt_accuracy, 2))
    metrics.log_metric("rf_accuracy", round(rf_accuracy, 2))

    # Determine the best model; max() returns the first key among equal scores.
    accuracies = {"decision_tree": dt_accuracy, "random_forest": rf_accuracy}
    optimal_model = max(accuracies, key=accuracies.get)
    
    logging.info(f"Optimal Model: {optimal_model} with accuracy: {accuracies[optimal_model]}")

    return (optimal_model,)

In [7]:
from kfp.v2 import dsl
from typing import NamedTuple
from kfp.v2.dsl import Dataset, Input, Metrics, Model, Output, Artifact
from dotenv import load_dotenv
import os

@dsl.component(base_image=base_image)
def deploy_model(
    optimal_model_name: str,
    project: str,
    region: str,
    serving_image : str, 
    rf_model: Input[Model],
    dt_model: Input[Model],
    vertex_model: Output[Artifact] 
)-> NamedTuple('Outputs', [('model_resource_name', str)]):
    """Upload the optimal model to the Vertex AI Model Registry.

    This component only registers the model via ``aiplatform.Model.upload``;
    it does not create an endpoint or deploy the model to one.

    Args:
        optimal_model_name: ``"decision_tree"`` or ``"random_forest"``;
            selects which of the two model artifacts is uploaded.
        project: Project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.
        serving_image: Container image URI used to serve the uploaded model.
        rf_model: Random forest model artifact.
        dt_model: Decision tree model artifact.
        vertex_model: Output artifact; the uploaded model's resource name is
            recorded in its metadata under ``"resourceName"``.

    Returns:
        A one-element tuple holding the uploaded model's resource name.

    Raises:
        ValueError: If ``optimal_model_name`` is not one of the known names.
    """
    
    from google.cloud import aiplatform
    aiplatform.init(project=project, location=region)
    
    
    DISPLAY_NAME  = "pet_model"
    MODEL_NAME = "pet_model_v1"
    
    # Select the optimal model based on the name
    model_mapping = {
            "decision_tree": dt_model,
            "random_forest": rf_model
        }
    if optimal_model_name not in model_mapping:
        # Fail with the list of accepted names instead of a bare KeyError.
        raise ValueError(
            f"Unknown model name {optimal_model_name!r}; "
            f"expected one of {sorted(model_mapping)}"
        )
    model_to_deploy = model_mapping[optimal_model_name]

    # Upload model to vertex model registry.
    # artifact_uri is everything before the last '/' of the artifact URI,
    # i.e. the location that contains the artifact rather than the artifact
    # itself.
    model_upload = aiplatform.Model.upload(
        display_name = DISPLAY_NAME, 
        artifact_uri = model_to_deploy.uri.rpartition('/')[0],
        serving_container_image_uri=serving_image,
        serving_container_health_route=f"/v1/models/{MODEL_NAME}",
        serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
        serving_container_environment_variables={"MODEL_NAME": MODEL_NAME},       
    )

    # Populate the declared output artifact, which was previously left empty.
    vertex_model.metadata["resourceName"] = model_upload.resource_name
    
    return (model_upload.resource_name,)

In [8]:
# %%writefile ../components/mlplatform_pipeline.py

from kfp.v2 import dsl
from kfp.v2 import compiler
import os

@dsl.pipeline(
    name="Data Loading and Preprocessing",
    description="A pipeline that loads data, preprocesses it, and deploys the best model.",
    pipeline_root=pipeline_root
)
def mlplatform_pipeline(
    gcs_url: str = gcs_url,
    train_ratio: float = train_ratio,
    ):
    # Stage 1: fetch the raw CSV, then split it into train and test sets.
    ingest_task = load_data(gcs_url=gcs_url)
    split_task = preprocess_data(
        input_dataset=ingest_task.output,
        train_ratio=train_ratio,
    )

    # Stage 2: both candidate models are trained on the same training split.
    rf_task = train_random_forest(train_dataset=split_task.outputs["train_dataset"])
    dt_task = train_decision_tree(train_dataset=split_task.outputs["train_dataset"])

    # Stage 3: compare the candidates on the held-out test split.
    comparison_task = evaluate_model(
        test_dataset=split_task.outputs["test_dataset"],
        dt_model=dt_task.output,
        rf_model=rf_task.output,
    )

    # Stage 4: hand both models plus the winner's name to the deploy step,
    # which picks the matching artifact itself.
    deploy_model(
        optimal_model_name=comparison_task.outputs["optimal_model"],
        project=project_id,
        region=region,
        serving_image=serving_image,
        dt_model=dt_task.output,
        rf_model=rf_task.output,
    )

In [9]:
# Compile the pipeline function into "pipeline.json", the template file the
# pipeline job is created from.
compiler.Compiler().compile(
    pipeline_func=mlplatform_pipeline,
    package_path="pipeline.json",
)

In [10]:
from datetime import datetime
from google.cloud import aiplatform, aiplatform_v1

# Second-resolution timestamp used to make each job id distinct per run.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# Build the Vertex pipeline job from the compiled template. Only the two
# parameters declared by the pipeline function are supplied.
api_client = aiplatform.PipelineJob(
    project=project_id,
    location=region,
    display_name="ml-pipeline",
    job_id=f"ml-pipeline-{TIMESTAMP}",
    template_path="pipeline.json",
    parameter_values={
        "gcs_url": gcs_url,
        "train_ratio": train_ratio,
    },
    enable_caching=True,
)

# Submit the job so that it runs as the configured service account.
api_client.submit(service_account=service_account)

Creating PipelineJob
PipelineJob created. Resource name: projects/546917293276/locations/europe-west1/pipelineJobs/ml-pipeline-20231231214942
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/546917293276/locations/europe-west1/pipelineJobs/ml-pipeline-20231231214942')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west1/pipelines/runs/ml-pipeline-20231231214942?project=546917293276
