In [33]:
# import libraries
import os

# load all environment variables
from dotenv import load_dotenv

# Load environment variables from the .env file into os.environ
load_dotenv()

# Every variable listed here is mandatory. They are checked together so that a
# misconfigured environment is reported in one go rather than one KeyError per run.
_REQUIRED_ENV_VARS = (
    'PIPELINE_ROOT',
    'PROJECT_ID',
    'REGION',
    'SERVICE_ACCOUNT',
    'ARTIFACT_REPO',
    'MODEL_DISPLAY_NAME',
    'MODEL_NAME',
    'ENDPOINT_NAME',
    'GCS_URL',
    'TRAIN_RATIO',
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if name not in os.environ]
if _missing_env_vars:
    raise KeyError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

pipeline_root = os.environ['PIPELINE_ROOT']
# Optional: stays None when CONTAINER_IMAGE is not set.
base_image = os.environ.get("CONTAINER_IMAGE")
project_id = os.environ['PROJECT_ID']
region = os.environ['REGION']
service_account = os.environ['SERVICE_ACCOUNT']
artifact_repo = os.environ['ARTIFACT_REPO']
model_display_name = os.environ['MODEL_DISPLAY_NAME']
model_name = os.environ['MODEL_NAME']
endpoint_name = os.environ['ENDPOINT_NAME']
gcs_url = os.environ['GCS_URL']

# float() raises ValueError on non-numeric text; the range check below catches
# values that parse but cannot be used as a train/test split fraction, so the
# problem shows up here instead of inside a submitted pipeline run.
train_ratio = float(os.environ['TRAIN_RATIO'])
if not 0.0 < train_ratio < 1.0:
    raise ValueError(
        f"TRAIN_RATIO must be strictly between 0 and 1, got {train_ratio}"
    )

In [34]:
# %%writefile ../components/load_data.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Output
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

# Image used for the components; "python:3.8" is the fallback when
# CONTAINER_IMAGE is not set in the environment
container_image = os.getenv("CONTAINER_IMAGE", "python:3.8")

@dsl.component(base_image=container_image)
def load_data(
    gcs_url: str,
    output_dataset: Output[Dataset]
):
    """Download a CSV file from a GCS URL and save it as the output Dataset.

    Args:
        gcs_url: Location of the CSV object, in the form
            ``gs://<bucket>/<object path>``.
        output_dataset: Output artifact; the data is written as CSV to its
            ``path`` and its metadata records the format.

    Raises:
        ValueError: If ``gcs_url`` does not start with ``gs://`` or is missing
            the bucket name or the object path.
    """

    # Logic-specific Imports
    from google.cloud import storage
    import pandas as pd

    # Extract bucket and blob info from GCS URL
    scheme = "gs://"
    if not gcs_url.startswith(scheme):
        raise ValueError("Invalid GCS URL format")
    bucket_name, _, blob_name = gcs_url[len(scheme):].partition("/")
    # Reject URLs such as "gs://bucket", "gs://bucket/" or "gs:///object":
    # both the bucket and the object path must be non-empty.
    if not bucket_name or not blob_name:
        raise ValueError("Invalid GCS URL format")

    # Create a GCS client and resolve the blob
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(blob_name)

    # Read the contents into a Pandas DataFrame; the context manager makes
    # sure the blob reader is closed even if parsing fails.
    with blob.open("rb") as source_file:
        df = pd.read_csv(source_file)

    # Save to the specified path as Dataset
    df.to_csv(output_dataset.path, index=False)
    output_dataset.metadata['dataset_metadata'] = {'format': 'csv'}

In [35]:
# %%writefile ../components/preprocess_data.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Output
from dotenv import load_dotenv
import os

@dsl.component(base_image=base_image)
def preprocess_data(
    input_dataset: Input[Dataset],
    train_dataset: Output[Dataset],
    test_dataset: Output[Dataset],
    train_ratio: float = 0.7,  # Updated to reflect the 70:30 split
):
    """Clean the input data and partition it into training and testing sets.

    Rows containing missing values are dropped. If the last column holds
    exactly the labels 'Yes' and 'No', it is encoded as 1 and 0.

    Args:
        input_dataset: CSV dataset to preprocess.
        train_dataset: Output artifact that receives the training split as CSV.
        test_dataset: Output artifact that receives the testing split as CSV.
        train_ratio: Fraction of the rows assigned to the training split.
    """

    # Logic-specific Imports
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Load dataset and discard incomplete rows
    df = pd.read_csv(input_dataset.path).dropna()

    # The last column is treated as the target; convert 'Yes'/'No' labels to 1/0
    target_column = df.columns[-1]
    if set(df[target_column].unique()) == {'Yes', 'No'}:
        df[target_column] = df[target_column].map({'Yes': 1, 'No': 0})

    # Splitting data into training and testing sets. The split fraction comes
    # from the train_ratio parameter (the previous code referenced an
    # undefined name here); random_state keeps the split reproducible.
    train_data, test_data = train_test_split(
        df, train_size=train_ratio, random_state=42
    )

    # Saving the datasets
    train_data.to_csv(train_dataset.path, index=False)
    test_data.to_csv(test_dataset.path, index=False)

In [36]:
# %%writefile ../components/random_forest_train.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Model, Output
import os

@dsl.component(base_image=container_image)
def random_forest_train(
    train_dataset: Input[Dataset],
    model: Output[Model]
):
    """Fit a Random Forest classifier, tuned by randomized search, and save it.

    The last column of the training CSV is used as the target and all other
    columns as features. Columns selected as int64/float64 are standardized
    and columns selected as object/category are one-hot encoded before the
    classifier. The best pipeline found by the search is written with joblib
    to ``model.path + ".joblib"``, and the search's best score and the
    training accuracy are recorded in the model metadata.

    Args:
        train_dataset: Training data as CSV.
        model: Output artifact for the fitted pipeline and its metadata.
    """

    # Logic-specific Imports
    import logging

    import joblib
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    logging.basicConfig(level=logging.INFO)

    # Read the training split; features are all columns but the last one
    frame = pd.read_csv(train_dataset.path)
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1]

    # Pick the columns handled by each transformer based on their dtype
    numeric_columns = features.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = features.select_dtypes(include=['object', 'category']).columns

    # Feature preprocessing: scale numeric columns, one-hot encode categorical ones
    scaling = Pipeline(steps=[('scaler', StandardScaler())])
    encoding = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    column_prep = ColumnTransformer(
        transformers=[
            ('num', scaling, numeric_columns),
            ('cat', encoding, categorical_columns),
        ]
    )

    # Preprocessing followed by the Random Forest classifier
    estimator = Pipeline(
        steps=[
            ('preprocessor', column_prep),
            ('classifier', RandomForestClassifier(random_state=0)),
        ]
    )

    # Hyperparameter candidates sampled by the random search
    search_space = {
        'classifier__n_estimators': [10, 50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }

    # 20 sampled configurations, each evaluated with 5-fold cross-validation
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=search_space,
        n_iter=20,
        cv=5,
        n_jobs=-1,
        random_state=0,
    )
    search.fit(features, target)

    # Accuracy on the data the model was trained on
    training_accuracy = accuracy_score(target, search.predict(features))
    logging.info(f"Training Accuracy: {training_accuracy}")
    logging.info(f"Best Parameters: {search.best_params_}")

    # Attach summary information to the model artifact
    model.metadata["framework"] = "RandomForest"
    model.metadata["metrics"] = {
        "best_score": search.best_score_,
        "training_accuracy": training_accuracy
    }

    # Persist the best pipeline next to the artifact path with a .joblib suffix
    joblib.dump(search.best_estimator_, model.path + ".joblib")


In [40]:
# %%writefile ../components/mlplatform_pipeline.py

from kfp.v2 import dsl
from kfp.v2 import compiler
import os

@dsl.pipeline(
    name="Data Loading and Preprocessing",
    description="A pipeline that loads data, preprocesses it, and deploys the best model.",
    pipeline_root=pipeline_root
)
def mlplatform_pipeline(
    gcs_url: str = gcs_url,
    train_ratio: float = train_ratio,
):
    # Step 1: fetch the raw data from the given GCS URL
    raw_data_task = load_data(gcs_url=gcs_url)

    # Step 2: preprocess the loaded data and split it using train_ratio
    split_task = preprocess_data(
        train_ratio=train_ratio,
        input_dataset=raw_data_task.output,
    )

    # Step 3: train the Random Forest model on the training split
    training_task = random_forest_train(
        train_dataset=split_task.outputs["train_dataset"],
    )

In [41]:
# Compile the pipeline function into the JSON file that is used to run the pipeline
compiler.Compiler().compile(
    package_path="pipeline.json",
    pipeline_func=mlplatform_pipeline,
)

In [42]:
from datetime import datetime
from google.cloud import aiplatform, aiplatform_v1

# Current local time, formatted to the second; embedded in the job id below
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

# Create the Vertex pipeline job from the compiled template.
# Only the two parameters declared by mlplatform_pipeline are passed.
api_client = aiplatform.PipelineJob(
    project=project_id,
    location=region,
    display_name="ml-pipeline",
    job_id=f"ml-pipeline-{TIMESTAMP}",
    template_path="pipeline.json",
    parameter_values=dict(gcs_url=gcs_url, train_ratio=train_ratio),
    enable_caching=False,
)

# Run the Vertex pipeline job under the configured service account
api_client.submit(service_account=service_account)

Creating PipelineJob
PipelineJob created. Resource name: projects/546917293276/locations/europe-west1/pipelineJobs/ml-pipeline-20231230164810
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/546917293276/locations/europe-west1/pipelineJobs/ml-pipeline-20231230164810')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west1/pipelines/runs/ml-pipeline-20231230164810?project=546917293276
