In [1]:
import os
import sys

# Make the parent of the notebook's working directory importable; later cells
# import the `config` and `components` packages that live there.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
# Only append once so that re-running this cell in the same kernel does not
# pile up duplicate sys.path entries.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [2]:
# %%writefile ../config/config.py

import os
from dotenv import load_dotenv

# Load settings via python-dotenv before reading them from the environment
# (presumably from a local .env file — confirm where it is expected to live).
load_dotenv()

_env = os.environ

# Settings read with _env[...] are mandatory: a missing variable raises
# KeyError. Settings read with os.getenv(...) are optional and are None
# when the variable is unset.
pipeline_root = _env["PIPELINE_ROOT"]
base_image = os.getenv("CONTAINER_IMAGE")
serving_image = os.getenv("SERVING_IMAGE")
project_id = _env["PROJECT_ID"]
region = _env["REGION"]
service_account = _env["SERVICE_ACCOUNT"]
artifact_repo = _env["ARTIFACT_REPO"]
gcs_url = _env["GCS_URL"]
# Stored as text in the environment; converted to a float here.
train_ratio = float(_env["TRAIN_RATIO"])
bucket_name = _env["BUCKET_NAME"]

In [None]:
# # Commands to build the Docker image
# First, authenticate to gcloud

# # gcloud auth login
# gcloud auth configure-docker

# # Build the image using Docker
# docker build -f docker/Dockerfile.requirement -t {region}-docker.pkg.dev/{gcp-project-id}/{gcp-artifact-repo}/{image-name}:latest .

# # Push to Artifact Registry
# docker push {region}-docker.pkg.dev/{gcp-project-id}/{gcp-artifact-repo}/{image-name}:latest

In [None]:
%%writefile ../components/load_data.py

from config.config import base_image, gcs_url
from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Output

@dsl.component(base_image=base_image)
def load_data(gcs_url: str,
              output_dataset: Output[Dataset]
              ):
    """
    Download a CSV file from a GCS URL and save it as the output Dataset.

    Args:
        gcs_url: Source location in the form ``gs://<bucket>/<object>``.
        output_dataset: Output artifact. The CSV is re-written to its ``path``
            without the index column and tagged with ``format: csv`` metadata.

    Raises:
        ValueError: If ``gcs_url`` does not start with ``gs://`` or is missing
            either the bucket name or the object name.
    """
    from google.cloud import storage
    import pandas as pd

    scheme = "gs://"
    if not gcs_url.startswith(scheme):
        raise ValueError("Invalid GCS URL format")
    # Split on the first "/" only: object names may themselves contain slashes.
    bucket_name, _, blob_name = gcs_url[len(scheme):].partition("/")
    # Reject "gs://bucket", "gs://bucket/" and "gs:///object" up front rather
    # than failing later inside the storage client.
    if not bucket_name or not blob_name:
        raise ValueError("Invalid GCS URL format")

    client = storage.Client()
    blob = client.bucket(bucket_name).blob(blob_name)

    # Close the remote reader deterministically once parsing is done.
    with blob.open("rb") as source:
        df = pd.read_csv(source)

    df.to_csv(output_dataset.path, index=False)
    output_dataset.metadata['dataset_metadata'] = {'format': 'csv'}

In [None]:
%%writefile ../components/preprocess_data.py

from config.config import base_image
from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Output

@dsl.component(base_image=base_image)
def preprocess_data(
    input_dataset: Input[Dataset],
    train_dataset: Output[Dataset],
    test_dataset: Output[Dataset],
    train_ratio: float = 0.7,
):
    """
    Clean the input CSV and split it into training and testing CSVs.

    Rows containing missing values are dropped. When the last column holds
    exactly the two labels 'Yes' and 'No', they are encoded as 1 and 0.

    Args:
        input_dataset: CSV artifact to read.
        train_dataset: Output artifact receiving the training split.
        test_dataset: Output artifact receiving the testing split.
        train_ratio: Passed to ``train_test_split`` as ``train_size``.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    frame = pd.read_csv(input_dataset.path).dropna()

    # Binary label encoding, applied only when both labels are present and
    # nothing else appears in the last column.
    label_encoding = {'Yes': 1, 'No': 0}
    labels = frame.iloc[:, -1]
    if set(labels.unique()) == set(label_encoding):
        frame.iloc[:, -1] = labels.map(label_encoding)

    # Fixed seed keeps the split reproducible between runs.
    splits = train_test_split(frame, train_size=train_ratio, random_state=42)
    train_part, test_part = splits

    for part, artifact in ((train_part, train_dataset), (test_part, test_dataset)):
        part.to_csv(artifact.path, index=False)

In [None]:
%%writefile ../components/random_forest_train.py

from config.config import base_image
from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Model, Output

@dsl.component(base_image=base_image)
def train_random_forest(
    train_dataset: Input[Dataset],
    model: Output[Model]
):
    """
    Fit a Random Forest classifier, tuning hyperparameters by randomized search.

    The last column of the training CSV is treated as the target and every
    other column as a feature. The best pipeline found is written with joblib
    to ``model.path + ".joblib"``.

    Args:
        train_dataset: CSV artifact holding the training split.
        model: Output artifact; receives framework/metrics metadata and
            determines where the joblib file is stored.
    """
    import logging

    import joblib
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    logging.basicConfig(level=logging.INFO)

    # Features are all columns but the last; the last column is the target.
    training_frame = pd.read_csv(train_dataset.path)
    features = training_frame.iloc[:, :-1]
    target = training_frame.iloc[:, -1]

    # Columns are routed by dtype: int64/float64 are standardised,
    # object/category are one-hot encoded.
    numeric_columns = features.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = features.select_dtypes(include=['object', 'category']).columns
    scaling = Pipeline(steps=[('scaler', StandardScaler())])
    # Categories unseen during fitting are ignored at prediction time.
    encoding = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    column_router = ColumnTransformer(
        transformers=[
            ('num', scaling, numeric_columns),
            ('cat', encoding, categorical_columns),
        ])

    estimator = Pipeline(steps=[
        ('preprocessor', column_router),
        ('classifier', RandomForestClassifier(random_state=0)),
    ])

    # Candidate values for the classifier step of the pipeline.
    search_space = {
        'classifier__n_estimators': [10, 50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }

    # 20 sampled candidates, each scored with 5-fold cross-validation.
    random_search = RandomizedSearchCV(
        estimator,
        search_space,
        n_iter=20,
        cv=5,
        n_jobs=-1,
        random_state=0,
    )
    random_search.fit(features, target)

    # Accuracy on the data the model was fitted on.
    training_accuracy = accuracy_score(target, random_search.predict(features))
    logging.info(f"Training Accuracy: {training_accuracy}")

    logging.info(f"Best Parameters: {random_search.best_params_}")

    model.metadata["framework"] = "RandomForest"
    model.metadata["metrics"] = {
        "best_score": random_search.best_score_,
        "training_accuracy": training_accuracy
    }

    # Persist the refitted best pipeline next to the artifact path.
    joblib.dump(random_search.best_estimator_, model.path + ".joblib")

In [None]:
%%writefile ../components/decision_tree_train.py

from config.config import base_image
from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Input, Model, Output

@dsl.component(base_image=base_image)
def train_decision_tree(
    train_dataset: Input[Dataset],
    model: Output[Model]
):
    """
    Train a Decision Tree model with Random Search.

    The last column of the training CSV is treated as the target and every
    other column as a feature. The best pipeline found is written with joblib
    to ``model.path + ".joblib"``.

    Args:
        train_dataset: CSV artifact holding the training split.
        model: Output artifact; receives framework/metrics metadata and
            determines where the joblib file is stored.
    """
    # Import necessary libraries within the function
    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.metrics import accuracy_score
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    import joblib
    import logging

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Load training dataset
    train_df = pd.read_csv(train_dataset.path)

    # Separate features and target. Assuming target is the last column.
    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]

    # Preprocess features: standardise int64/float64 columns and one-hot
    # encode object/category columns (unseen categories are ignored).
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Decision Tree classifier
    classifier = DecisionTreeClassifier(random_state=0)

    # Pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

    # Define search space for hyperparameters.
    # 'auto' is intentionally absent from max_features: scikit-learn deprecated
    # it in 1.1 and removed it in 1.3, and for classifiers it was an alias of
    # 'sqrt', so keeping it only produced failed or duplicate candidates.
    param_distributions = {
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': [None, 'sqrt', 'log2']
    }

    # Random search with cross-validation
    random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=20, cv=5, n_jobs=-1, random_state=0)

    # Train the model
    random_search.fit(X_train, y_train)

    # Calculate training accuracy
    y_train_pred = random_search.predict(X_train)
    training_accuracy = accuracy_score(y_train, y_train_pred)
    logging.info(f"Training Accuracy: {training_accuracy}")

    # Best model
    best_model = random_search.best_estimator_
    logging.info(f"Best Parameters: {random_search.best_params_}")

    model.metadata["framework"] = "DecisionTree"
    model.metadata["metrics"] = {
        "best_score": random_search.best_score_,
        "training_accuracy": training_accuracy
    }

    # Save the model using joblib
    file_name = model.path + ".joblib"
    joblib.dump(best_model, file_name)

In [None]:
%%writefile ../components/evaluate_model.py

from config.config import base_image
from kfp.v2 import dsl
from typing import NamedTuple
from kfp.v2.dsl import Dataset, Input, Model, Metrics

@dsl.component(base_image=base_image)
def evaluate_model(
    test_dataset: Input[Dataset], 
    dt_model: Input[Model],
    rf_model: Input[Model],
) -> NamedTuple("EvaluationOutput", [("optimal_model", str)]):
    """
    Evaluate models on test data and determine the best one based on accuracy.

    Args:
        test_dataset: CSV artifact holding the test split; the last column is
            used as the target.
        dt_model: Decision Tree artifact; loaded from ``path + ".joblib"``.
        rf_model: Random Forest artifact; loaded from ``path + ".joblib"``.

    Returns:
        A one-element tuple with the winning model's name: "decision_tree"
        when its accuracy is strictly higher, otherwise "random_forest"
        (ties therefore go to the Random Forest).
    """
    # Import necessary libraries within the function
    import pandas as pd
    import joblib
    import sklearn.metrics as skmetrics
    import logging

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    def load_model(model_dir):
        """Load a joblib-serialised model stored next to the artifact path."""
        model_path = model_dir.path + ".joblib"
        return joblib.load(model_path)

    def evaluate(model, X, y):
        """Evaluate a model and return the accuracy score."""
        predictions = model.predict(X)
        return skmetrics.accuracy_score(y, predictions)

    # Load the test dataset
    df = pd.read_csv(test_dataset.path)
    # Work on an explicit copy: assigning columns on a slice of `df` triggers
    # pandas' SettingWithCopyWarning and depends on view-vs-copy behaviour.
    X_test = df.iloc[:, :-1].copy()
    y_test = df.iloc[:, -1]

    # Convert categorical columns to 'category' data type for X_test
    categorical_cols = X_test.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        X_test[categorical_cols] = X_test[categorical_cols].astype('category')

    # Load models
    dt = load_model(dt_model)
    rf = load_model(rf_model)

    # Evaluate models
    dt_accuracy = evaluate(dt, X_test, y_test)
    rf_accuracy = evaluate(rf, X_test, y_test)

    # Log metrics
    logging.info(f"Decision Tree Accuracy: {dt_accuracy}")
    logging.info(f"Random Forest Accuracy: {rf_accuracy}")

    # Determine the best model
    # You can modify the logic here to compare all your models
    optimal_model = "decision_tree" if dt_accuracy > rf_accuracy else "random_forest"
    optimal_accuracy = max(dt_accuracy, rf_accuracy)
    logging.info(f"Optimal Model: {optimal_model} with accuracy: {optimal_accuracy}")

    return (optimal_model,)

In [None]:
%%writefile ../components/deploy_model.py

from config.config import base_image, project_id, region, serving_image
from kfp.v2 import dsl
from typing import NamedTuple
from kfp.v2.dsl import Model, Input

@dsl.component(base_image=base_image)
def deploy_model(
    optimal_model_name: str,
    project: str,
    region: str,
    serving_image: str,
    rf_model: Input[Model],
    dt_model: Input[Model],
) -> NamedTuple('Outputs', [('endpoint_name', str)]):
    """
    Upload the selected model to Vertex AI and serve it from a new endpoint.

    Args:
        optimal_model_name: "decision_tree" or "random_forest"; any other
            value raises KeyError.
        project: Project passed to the Vertex AI SDK.
        region: Location passed to the Vertex AI SDK.
        serving_image: Container image URI used to serve the model.
        rf_model: Random Forest model artifact.
        dt_model: Decision Tree model artifact.

    Returns:
        A one-element tuple holding the resource name of the created endpoint.
    """
    import logging

    from google.cloud import aiplatform

    logging.basicConfig(level=logging.INFO)
    aiplatform.init(project=project, location=region)

    # Pick the artifact that matches the evaluator's verdict.
    model_to_deploy = {
        "decision_tree": dt_model,
        "random_forest": rf_model,
    }[optimal_model_name]
    model_name = 'pet-adoption'
    logging.info(f"Model URI: {model_to_deploy.uri}")

    # Register the model using the URI with its last path segment removed,
    # i.e. the parent location rather than the artifact itself.
    artifact_directory, _, _ = model_to_deploy.uri.rpartition('/')
    health_route = f"/v1/models/{model_name}"
    predict_route = f"/v1/models/{model_name}:predict"
    model_upload = aiplatform.Model.upload(
        display_name=model_name,
        artifact_uri=artifact_directory,
        serving_container_image_uri=serving_image,
        serving_container_health_route=health_route,
        serving_container_predict_route=predict_route,
        serving_container_environment_variables={"MODEL_NAME": model_name},
    )
    logging.info(f"Model uploaded: {model_upload.resource_name}")

    # A new endpoint is created on every call.
    endpoint = aiplatform.Endpoint.create(
        display_name=model_name,
        project=project,
        location=region,
    )

    # Send all traffic to the model being deployed ("0" is the key used for it).
    endpoint.deploy(
        model=model_upload,
        deployed_model_display_name=model_name,
        traffic_split={"0": 100},
        machine_type="n1-standard-4",
    )
    logging.info(f"Model deployed to endpoint: {endpoint.resource_name}")

    return (endpoint.resource_name,)


In [None]:
# %%writefile ../mlplatform_pipeline.py

# Integrate into a self-contained script for automated runs

from kfp.v2 import dsl, compiler
from kfp.v2.dsl import pipeline
from components.load_data import load_data
from components.preprocess_data import preprocess_data
# Module names must match the files produced by the %%writefile cells:
# components/random_forest_train.py and components/decision_tree_train.py.
from components.random_forest_train import train_random_forest
from components.decision_tree_train import train_decision_tree
from components.evaluate_model import evaluate_model
from components.deploy_model import deploy_model
from config.config import gcs_url, train_ratio, project_id, region, serving_image, service_account, pipeline_root

@pipeline(
    name="ml-platform-pipeline",
    description="A pipeline that performs data loading, preprocessing, model training, evaluation, and deployment",
    pipeline_root=pipeline_root,
)
def mlplatform_pipeline(
    gcs_url: str = gcs_url,
    train_ratio: float = train_ratio,
    ):
    """
    Wire the components together: load, preprocess, train both models,
    evaluate them on the test split, then deploy the winner.

    Args:
        gcs_url: Source CSV location; defaults to the configured value.
        train_ratio: Train split fraction; defaults to the configured value.
    """
    ingest_task = load_data(gcs_url=gcs_url)
    split_task = preprocess_data(
        input_dataset=ingest_task.output,
        train_ratio=train_ratio,
    )
    train_split = split_task.outputs['train_dataset']
    test_split = split_task.outputs['test_dataset']

    # Both candidate models are trained on the same training split.
    forest_task = train_random_forest(train_dataset=train_split)
    tree_task = train_decision_tree(train_dataset=train_split)

    evaluation_task = evaluate_model(
        test_dataset=test_split,
        dt_model=tree_task.output,
        rf_model=forest_task.output,
    )

    # Both artifacts are passed; the component selects one by name.
    deployment_task = deploy_model(
        optimal_model_name=evaluation_task.outputs['optimal_model'],
        project=project_id,
        region=region,
        serving_image=serving_image,
        rf_model=forest_task.output,
        dt_model=tree_task.output,
    )

if __name__ == "__main__":
    # Step 1: compile the pipeline definition to a local JSON template.
    pipeline_filename = "mlplatform_pipeline.json"
    compiler.Compiler().compile(
        pipeline_func=mlplatform_pipeline,
        package_path=pipeline_filename,
    )

    # Step 2: submit the compiled template to Vertex AI as a pipeline job.
    from google.cloud import aiplatform

    aiplatform.init(project=project_id, location=region)
    run_parameters = {
        "gcs_url": gcs_url,
        "train_ratio": train_ratio
    }
    job = aiplatform.PipelineJob(
        display_name="ml-platform-pipeline",
        template_path=pipeline_filename,
        parameter_values=run_parameters,
        enable_caching=True,
    )
    # The job runs under the configured service account.
    _ = job.submit(service_account=service_account)