<a href="https://colab.research.google.com/github/kootr/ml-study-session/blob/mlops_session_for_june/vertexai_pipelines_bqml_titanic_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Overview

The pipeline will:

1. Read the Titanic data from Google Cloud Storage
2. Preprocess it with Dataflow and ingest it into BigQuery
3. Train a logistic regression model that predicts whether a passenger survived
4. Evaluate the model

### Dataset
GCSに以下のバケットとフォルダを作成して、CSVファイルを配置してください。
データセットは https://www.kaggle.com/competitions/titanic/data からダウンロードしてください。


gs://session11/titanic/train.csv

gs://session11/titanic/test.csv

Google Colab での実行を想定しています。

In [1]:
import os

# The Google Cloud Notebook product has specific requirements.
# The presence of this metadata file is used as the marker for that environment.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"

# Only when the IS_TESTING environment variable is set: touch the METADATA file
# of the google_api_core 2.7.1 dist-info directory.
# NOTE(review): the path is hard-coded to a python3.9 site-packages layout under
# /builder/home — presumably specific to a test builder image; confirm.
if os.getenv("IS_TESTING"):
    ! touch /builder/home/.local/lib/python3.9/site-packages/google_api_core-2.7.1.dist-info/METADATA

### Install additional packages


In [None]:
# Install the packages this notebook needs. {USER_FLAG} is interpolated from the
# Python namespace (it is "--user" only on Google Cloud Notebooks, otherwise empty).
# The Beam / KFP / Vertex AI packages are pinned; fsspec and gcsfs are not.
! pip3 install {USER_FLAG} --upgrade "apache-beam[gcp]==2.36.0"
! pip3 install {USER_FLAG} --upgrade "kfp==1.8.2"
! pip3 install {USER_FLAG} --upgrade "google-cloud-aiplatform==1.10.0"
! pip3 install {USER_FLAG} --upgrade "google_cloud_pipeline_components==1.0.1"
! pip3 install {USER_FLAG} --upgrade "fsspec"
! pip3 install {USER_FLAG} --upgrade "gcsfs"

### Restart the kernel

パッケージのインストール後にカーネルを再起動する。

In [3]:
# Restart the kernel after the package installation above.
import os

# The restart is skipped whenever IS_TESTING is set to a non-empty value.
if not os.getenv("IS_TESTING"):
    import IPython

    # do_shutdown(True) asks the running kernel to shut down and restart.
    IPython.Application.instance().kernel.do_shutdown(True)

### Authenticate your Google Cloud account

Google Colab を使用する場合は認証が必要。

In [4]:
import os
import sys

# Same environment probe as the setup cell: this metadata file marks a
# Google Cloud Notebook instance.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Google Cloud Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    # Colab is detected by google.colab already being loaded in this process;
    # in that case run the interactive user authentication.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### 環境変数の設定


In [1]:
import google.cloud.aiplatform as vertex_ai

# Notebook-wide configuration. Each name is assigned exactly once so that editing
# a value here cannot be silently overridden by a later duplicate assignment.
PROJECT_NAME = "ml-session"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}
# GCS prefix used as the Vertex AI staging bucket, the pipeline root, and the
# upload location for the ingest script.
ROOT_BUCKET = "gs://session11/bqml_pipeline"
# BigQuery dataset, tables and model name referenced by the SQL built later on.
BQ_DATASET = "session11"
BQ_TRAINING_TABLE = "train"
BQ_TESTING_TABLE = "test"
BQ_ML_MODEL = "model_titanic"
# NOTE(review): JOB_NAME is not referenced anywhere else in this notebook.
JOB_NAME = "titanic"

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


### Initialize client

In [2]:
# Configure the Vertex AI SDK defaults (project, region, staging bucket) from the
# configuration constants defined in the previous cell.
vertex_ai.init(project=PROJECT_NAME, location=REGION, staging_bucket=ROOT_BUCKET)

## Pipeline formalization

### BQML components

1) BigQuery で データセット session11を作成

2) DataflowでGCS上のCSVファイルを加工し、BigQueryのtrain, testテーブルにロードする

3) BigQuery MLでモデル作成

4) 予測

In [None]:
%%writefile ./requirement.txt
pandas
fsspec
gcsfs

In [None]:
!cat requirement.txt

In [None]:
# ここはGCSからBigQuery へロードするための処理
# https://dev.classmethod.jp/articles/cloud-dataflow_gcs2bq_python/　を参考に

%%writefile ingest_pipeline.py
import logging
import csv
from datetime import datetime, timezone, timedelta
import pandas as pd
import json

import apache_beam as beam
from apache_beam.io.gcp.bigquery import WriteToBigQuery, BigQueryDisposition
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions

from apache_beam.dataframe import convert

class BeamOptions:
    """Bundle the Beam pipeline options used by the ingest job.

    The Google Cloud settings used to be hard-coded; they are now keyword
    arguments whose defaults are the previous literal values, so
    ``BeamOptions(runner)`` behaves exactly as before.

    Args:
        runner: Name of the Beam runner (this file uses "DataflowRunner" and
            mentions "DirectRunner" for local execution).
        project: Google Cloud project the job runs in.
        region: Google Cloud region for the job.
        temp_location: GCS URI where temporary files created during
            processing are stored.
        job_name: Name given to the job.

    Attributes:
        options: The ``PipelineOptions`` instance to hand to ``beam.Pipeline``.
        gcloud_options: The ``GoogleCloudOptions`` view of ``options``.
    """

    def __init__(
        self,
        runner,
        project="ml-session",
        region="us-central1",
        temp_location="gs://session11/bqml_pipeline/tmp",
        job_name="loadtobq",
    ):
        self.options = PipelineOptions()

        # Google Cloud options
        self.gcloud_options = self.options.view_as(GoogleCloudOptions)
        self.gcloud_options.job_name = job_name
        self.gcloud_options.project = project
        self.gcloud_options.temp_location = temp_location
        self.gcloud_options.region = region

        # Setup options
        self.options.view_as(SetupOptions).save_main_session = True

        # Standard options
        self.options.view_as(StandardOptions).runner = runner


# 前処理用
import numpy as np

def process(df):
    """Run the whole preprocessing chain on a raw Titanic frame.

    The "Ticket" and "Cabin" columns are dropped first; the remaining steps
    (add_title, label_encode, age, is_alone, port, fare) are then applied in
    that order, each receiving the previous step's result.

    Args:
        df: Raw passenger DataFrame.

    Returns:
        The DataFrame returned by the last step.
    """
    frame = df.drop(["Ticket", "Cabin"], axis=1)
    for step in (add_title, label_encode, age, is_alone, port, fare):
        frame = step(frame)
    return frame


def add_title(df):
    """Replace the "Name" column with an integer-coded "Title" column.

    The title is the run of letters that follows a space and precedes a period
    in the name (e.g. "Mr" in "Braund, Mr. Owen"). Uncommon titles are grouped
    as "Rare", French/alternate spellings are folded into "Miss"/"Mrs", and the
    result is mapped to the codes Mr=1, Miss=2, Mrs=3, Master=4, Rare=5. Names
    without a recognised title get 0.

    Args:
        df: DataFrame with a string "Name" column. It is not modified.

    Returns:
        A new DataFrame without "Name" and with an integer "Title" column
        appended as the last column.
    """
    rare_titles = [
        "Lady",
        "Countess",
        "Capt",
        "Col",
        "Don",
        "Dr",
        "Major",
        "Rev",
        "Sir",
        "Jonkheer",
        "Dona",
    ]
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

    title_codes = (
        # Raw string: "\." is a regex escape, not a Python string escape.
        df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
        .replace(rare_titles, "Rare")
        .replace({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})
        .map(title_mapping)
        .fillna(0)
        # map() yields floats because of the NaN placeholders; the BigQuery
        # schema in this file declares Title as an integer column.
        .astype(int)
    )

    result = df.drop(["Name"], axis=1)
    result["Title"] = title_codes
    return result


def label_encode(df):
    """Encode the "Sex" column in place as integers (female -> 1, male -> 0).

    Args:
        df: DataFrame with a "Sex" column holding "female" / "male".

    Returns:
        The same DataFrame object, with "Sex" replaced by its integer code.
    """
    sex_codes = {"female": 1, "male": 0}
    encoded = df["Sex"].map(sex_codes)
    df["Sex"] = encoded.astype(int)
    return df


def age(df):
    """Impute missing ages, bucket "Age" into 0-4 and add "Age_Class".

    Missing ages are filled per (Sex, Pclass) group with that group's median
    age rounded to the nearest 0.5. Ages are then truncated to integers and
    replaced by a band index:

        <=16 -> 0, 17-32 -> 1, 33-48 -> 2, 49-64 -> 3, >64 -> 4

    Finally "Age_Class" is set to the band index multiplied by "Pclass".

    Compared with the earlier version this (a) actually assigns band 4 to ages
    above 64 (the statement previously had no assignment, leaving raw ages in
    the column) and (b) no longer fails when some (Sex, Pclass) combination has
    no known ages: groups with nothing to impute are skipped, and a group with
    missing ages but no known ones falls back to the overall median.

    Args:
        df: DataFrame with "Sex" (0/1), "Pclass" (1-3) and "Age" columns.
            It is modified in place.

    Returns:
        The same DataFrame object.

    Raises:
        ValueError: If an age has to be imputed but the frame contains no
            known age at all.
    """
    # Taken before any filling so the fallback is not skewed by imputed values.
    overall_median = df["Age"].median()

    for sex in range(2):
        for pclass in range(1, 4):
            in_group = (df["Sex"] == sex) & (df["Pclass"] == pclass)
            missing = in_group & df["Age"].isnull()
            if not missing.any():
                continue

            known_ages = df.loc[in_group, "Age"].dropna()
            age_guess = overall_median if known_ages.empty else known_ages.median()

            # Round the guess to the nearest .5; int() raises ValueError on NaN.
            df.loc[missing, "Age"] = int(age_guess / 0.5 + 0.5) * 0.5

    whole_years = df["Age"].astype(int)
    # right=True makes each upper bound inclusive: (16, 32] -> 1, and so on.
    df["Age"] = np.digitize(whole_years.to_numpy(), [16, 32, 48, 64], right=True)
    df["Age_Class"] = df["Age"] * df["Pclass"]
    return df


def is_alone(df):
    """Derive the "IsAlone" flag and drop the family-count columns.

    "FamilySize" (SibSp + Parch + 1) and "IsAlone" (1 when FamilySize is 1,
    otherwise 0) are added to the given frame; the returned frame is that frame
    without "Parch", "SibSp" and "FamilySize".

    Args:
        df: DataFrame with "SibSp" and "Parch" columns.

    Returns:
        A DataFrame containing "IsAlone" but not the three dropped columns.
    """
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    travelling_solo = df["FamilySize"] == 1
    df["IsAlone"] = travelling_solo.astype(int)
    return df.drop(["Parch", "SibSp", "FamilySize"], axis=1)


def port(df):
    """Fill missing "Embarked" values and encode the port in place as an integer.

    Missing entries take the most frequent port; the letters are then mapped
    with S -> 0, C -> 1, Q -> 2.

    Args:
        df: DataFrame with an "Embarked" column.

    Returns:
        The same DataFrame object.
    """
    port_codes = {"S": 0, "C": 1, "Q": 2}
    most_common = df["Embarked"].dropna().mode()[0]
    df["Embarked"] = df["Embarked"].fillna(most_common)
    df["Embarked"] = df["Embarked"].map(port_codes).astype(int)
    return df


def fare(df):
    """Impute missing fares with the median and bucket "Fare" into 0-3.

    Bands (upper bounds inclusive):

        <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3

    The imputation assigns the filled column back instead of calling
    ``fillna(..., inplace=True)`` on a column selection, which pandas
    deprecates and which has no effect on the frame under Copy-on-Write.

    Args:
        df: DataFrame with a numeric "Fare" column. It is modified in place.

    Returns:
        The same DataFrame object, with "Fare" replaced by its band index.

    Raises:
        ValueError: If "Fare" has missing values and no known value to impute
            them from.
    """
    filled = df["Fare"].fillna(df["Fare"].median())
    if filled.isnull().any():
        raise ValueError("Fare has missing values but no known fare to impute from")

    # right=True makes each upper bound inclusive, matching the <= comparisons.
    df["Fare"] = np.digitize(filled.to_numpy(), [7.91, 14.454, 31], right=True)
    df["Fare"] = df["Fare"].astype(int)
    return df

# 前処理終わり


# (name, type, mode) triples shared by the train and test tables.
_ID_COLUMN = ('PassengerId', 'integer', 'nullable')
_LABEL_COLUMN = ('Survived', 'integer', 'nullable')
_FEATURE_COLUMNS = (
    ('Pclass', 'integer', 'nullable'),
    ('Sex', 'integer', 'nullable'),
    ('Age', 'integer', 'nullable'),
    ('Fare', 'integer', 'nullable'),
    ('Embarked', 'integer', 'nullable'),
    ('Title', 'integer', 'nullable'),
    ('Age_Class', 'integer', 'nullable'),
    ('IsAlone', 'integer', 'nullable'),
)


def _build_table_schema(columns):
    """Build a ``bigquery.TableSchema`` from (name, type, mode) triples, in order."""
    table_schema = bigquery.TableSchema()
    for name, field_type, mode in columns:
        field = bigquery.TableFieldSchema()
        field.name = name
        field.type = field_type
        field.mode = mode
        table_schema.fields.append(field)
    return table_schema


def get_bigquery_schema_train():
    """
    Get the BigQuery schema of the training table.

    The fields are PassengerId, the Survived label, then the feature columns.

    Returns:
        A ``bigquery.TableSchema`` with one field per column.
    """
    return _build_table_schema((_ID_COLUMN, _LABEL_COLUMN) + _FEATURE_COLUMNS)


def get_bigquery_schema_test():
    """
    Get the BigQuery schema of the test table.

    Same as the training schema but without the Survived label column.

    Returns:
        A ``bigquery.TableSchema`` with one field per column.
    """
    return _build_table_schema((_ID_COLUMN,) + _FEATURE_COLUMNS)

def run(runner="DataflowRunner"):
    """Preprocess the Titanic CSVs and load them into BigQuery with Beam.

    Both CSV files are read with pandas and passed through ``process`` directly
    inside the ``with beam.Pipeline(...)`` block (i.e. in the process that
    builds the pipeline, not inside a Beam transform). Each resulting
    DataFrame is converted to a PCollection, its rows are turned into
    dictionaries, and the rows are written to the train / test tables using
    the CREATE_IF_NEEDED and WRITE_TRUNCATE dispositions.

    Args:
        runner: Beam runner name handed to ``BeamOptions``.
    """
    with beam.Pipeline(options=BeamOptions(runner).options) as pipeline:
        gcs_uri_train="gs://session11/titanic/train.csv"
        gcs_uri_test="gs://session11/titanic/test.csv"

        # Read the files from GCS and preprocess them
        df_train = pd.read_csv(gcs_uri_train)
        df_train = process(df_train)

        df_test = pd.read_csv(gcs_uri_test)
        df_test = process(df_test)

        # Destination tables in "project:dataset.table" form
        table_spec_train = 'ml-session:session11.train'
        table_spec_test = 'ml-session:session11.test'

        # NOTE(review): to_pcollection is called without an explicit label —
        # presumably Beam derives one per call; confirm before folding the
        # train/test branches below into a loop or helper.
        trans_datas_train = (
          # Convert the Pandas DataFrame to a PCollection.
          convert.to_pcollection(df_train, pipeline=pipeline)
          
          # We get named tuples, we can convert them to dictionaries like this.
          | 'To dictionaries train' >> beam.Map(lambda x: dict(x._asdict()))
         )

        trans_datas_test = (
          # Convert the Pandas DataFrame to a PCollection.
          convert.to_pcollection(df_test, pipeline=pipeline)
          
          # We get named tuples, we can convert them to dictionaries like this.
          | 'To dictionaries test' >> beam.Map(lambda x: dict(x._asdict()))
         )        

        # Write the data to BigQuery
        trans_datas_train | "Write to BigQuery train" >> WriteToBigQuery(
            table_spec_train,
            schema=get_bigquery_schema_train(),
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE
        )
        trans_datas_test | "Write to BigQuery test" >> WriteToBigQuery(
            table_spec_test,
            schema=get_bigquery_schema_test(),
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE
        )


if __name__ == "__main__":
    # Show INFO-level log records from the root logger.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Runs on Cloud Dataflow; pass runner="DirectRunner" for local execution.
    run(runner="DataflowRunner")


In [None]:
!echo $ROOT_BUCKET/script/

In [None]:
!gsutil cp ingest_pipeline.py $ROOT_BUCKET/script/
!gsutil cp requirement.txt $ROOT_BUCKET/script/

#### Create BQ queries

In [7]:
# SQL text for the pipeline steps, built from the configuration constants
# (PROJECT_NAME, BQ_DATASET, BQ_TRAINING_TABLE, BQ_TESTING_TABLE, BQ_ML_MODEL).

# Creates the dataset if it does not exist yet.
create_bq_dataset_query = f"""
CREATE SCHEMA IF NOT EXISTS `{PROJECT_NAME}.{BQ_DATASET}`;
"""

# Trains the logistic regression model on training rows with PassengerId 1-712;
# the remaining training rows are the ones selected by the evaluate query below.
create_bq_model_query = f"""
CREATE or REPLACE MODEL `{PROJECT_NAME}.{BQ_DATASET}.{BQ_ML_MODEL}`
OPTIONS (
  model_type = 'logistic_reg',
  input_label_cols=['Survived']
  -- num_trials=20,
  -- max_parallel_trials=2
  ) AS (
SELECT
  Pclass, Title, isAlone, Age, Sex, Embarked, Fare, Age_Class, Survived
FROM
  `{PROJECT_NAME}.{BQ_DATASET}.{BQ_TRAINING_TABLE}`
WHERE PassengerId BETWEEN 1 AND 712
)
"""

# Runs ML.PREDICT over every row of the test table.
create_bq_prediction_query = f"""
SELECT
 * 
FROM
  ML.PREDICT(MODEL `{PROJECT_NAME}.{BQ_DATASET}.{BQ_ML_MODEL}`, (
  SELECT
    PassengerId, Pclass, Title, Sex, Age, Fare, Embarked, IsAlone, Age_Class
  FROM
    `{PROJECT_NAME}.{BQ_DATASET}.{BQ_TESTING_TABLE}`
  )
)
"""

# Runs ML.EVALUATE on the training rows excluded from training (PassengerId > 712).
# NOTE(review): this variable is not referenced by the pipeline definition
# visible in this notebook — confirm whether it should be passed to the
# evaluation step.
create_bq_evaluate_query = f"""
SELECT
 * 
FROM
  ML.EVALUATE(MODEL `{PROJECT_NAME}.{BQ_DATASET}.{BQ_ML_MODEL}`, (
  SELECT
    Pclass, Title, Sex, Age, Fare, Embarked, IsAlone, Age_Class, Survived
  FROM
    `{PROJECT_NAME}.{BQ_DATASET}.{BQ_TRAINING_TABLE}`
  WHERE PassengerId > 712
  )
)
"""

### Build Pipeline

#### Create the pipeline

In [8]:
from kfp.v2 import dsl, compiler

DEPLOY_IMAGE = "us-docker.pkg.dev/vertex-ai-restricted/prediction/tf_opt-cpu.2-8:latest" # See https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers

@dsl.pipeline(name="mlops-bqml-titanic",
              description="A batch pipeline to generate RG model",
              pipeline_root=ROOT_BUCKET+"/kfp_root")
def pipeline(
    create_bq_dataset_query: str,
    python_file_path: str,
    requeirement_file_path: str,
    temp_location: str = ROOT_BUCKET + "/kfp_tmp",
    project: str = PROJECT_NAME,
):
    """Ingest the Titanic data, train a BigQuery ML model and deploy it.

    Steps, each chained to the previous one: create the BigQuery dataset, run
    the Dataflow ingest script and wait for it, create the model, evaluate it,
    run a prediction query, export the model to GCS, upload it to Vertex AI,
    create an endpoint and deploy the model to it.

    Args:
        create_bq_dataset_query: SQL that creates the BigQuery dataset.
        python_file_path: GCS path of the Dataflow ingest script.
        requeirement_file_path: GCS path of the requirements file for the
            Dataflow job. (The parameter name is misspelled; it is kept as is
            because callers pass it by name.)
        temp_location: GCS location for Dataflow temporary files.
        project: Google Cloud project to run every step in.
    """

    from google_cloud_pipeline_components.types import artifact_types
    from google_cloud_pipeline_components.v1.bigquery import (
        BigqueryQueryJobOp, BigqueryCreateModelJobOp,
        BigqueryEvaluateModelJobOp,
        BigqueryPredictModelJobOp,
        BigqueryExportModelJobOp)
    from google_cloud_pipeline_components.v1.dataflow import \
        DataflowPythonJobOp
    from google_cloud_pipeline_components.v1.wait_gcp_resources import \
        WaitGcpResourcesOp
    from google_cloud_pipeline_components.v1.model import ModelUploadOp
    from google_cloud_pipeline_components.v1.endpoint import (
        EndpointCreateOp, ModelDeployOp)
    from kfp.v2.components import importer_node

    # create the dataset (the tables themselves are written by the Dataflow job)
    bq_tables_op = BigqueryQueryJobOp(
        query=create_bq_dataset_query,
        project=project,
        location="US",
    )

    # run dataflow job
    dataflow_python_training_data_op = DataflowPythonJobOp(
        python_module_path=python_file_path,
        project=project,
        temp_location=temp_location,
        requirements_file_path=requeirement_file_path
    ).after(bq_tables_op)

    # wait for the Dataflow job launched above before continuing
    dataflow_wait_training_data_op = WaitGcpResourcesOp(
        gcp_resources=dataflow_python_training_data_op.outputs["gcp_resources"]
    ).after(dataflow_python_training_data_op)

    # create the logistic regression model
    bq_model_op = BigqueryCreateModelJobOp(
        query=create_bq_model_query,
        project=project,
        location="US",
    ).after(dataflow_wait_training_data_op)

    # evaluate the logistic regression model; results go to the "evaluation" table
    # NOTE(review): create_bq_evaluate_query is defined in the query cell but is
    # not passed here — confirm which rows this step evaluates on.
    bq_evaluate_op = BigqueryEvaluateModelJobOp(
        project=project, location="US", model=bq_model_op.outputs["model"],
        job_configuration_query={
            "destinationTable": {
                "projectId": PROJECT_NAME,
                "datasetId": BQ_DATASET,
                "tableId": "evaluation",
            }
        }
    ).after(bq_model_op)

    # simulate prediction; results go to the "result" table
    bq_predict_op = BigqueryPredictModelJobOp(
        model=bq_model_op.outputs["model"],
        query_statement=create_bq_prediction_query,
        job_configuration_query={
            "destinationTable": {
                "projectId": PROJECT_NAME,
                "datasetId": BQ_DATASET,
                "tableId": "result",
            }
        },
        project=project,
        location="US",
    ).after(bq_evaluate_op)

    # export the trained model to GCS
    bq_export = BigqueryExportModelJobOp(
        project=project,
        location="US",
        model=bq_model_op.outputs["model"],
        model_destination_path=ROOT_BUCKET + "/kfp_model",
    ).after(bq_predict_op)

    # wrap the exported files as an unmanaged container model served by DEPLOY_IMAGE
    import_unmanaged_model_task = importer_node.importer(
        artifact_uri=ROOT_BUCKET + "/kfp_model",
        artifact_class=artifact_types.UnmanagedContainerModel,
        metadata={
            "containerSpec": {
                "imageUri": DEPLOY_IMAGE,
            },
        },
    ).after(bq_export)

    model_upload = ModelUploadOp(
        project=project,
        display_name="bqml",
        unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
    ).after(import_unmanaged_model_task)

    endpoint = EndpointCreateOp(
        project=project,
        location=REGION,
        display_name="titanic_predict",
    ).after(model_upload)

    _ = ModelDeployOp(
        model=model_upload.outputs["model"],
        endpoint=endpoint.outputs["endpoint"],
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
        dedicated_resources_machine_type="n1-standard-2",
        # dedicated_resources_accelerator_type=accelerator_type, # if you want to set GPU see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#acceleratortype
        # dedicated_resources_accelerator_count=accelerator_count,
        traffic_split={"0": 100},
    )

## Compile and Run the pipeline

In [None]:
from pathlib import Path as path

# Compile the `pipeline` function into the JSON file that the PipelineJob cell
# below uses as its template_path.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="mlops_bqml_titanic_pipeline.json")

In [None]:
!cat mlops_bqml_titanic_pipeline.json

In [None]:
# Submit the compiled pipeline to Vertex AI Pipelines.
# The job is bound to `pipeline_job` rather than `pipeline` so the `pipeline`
# function stays available and the compile cell can be re-run afterwards.
pipeline_job = vertex_ai.PipelineJob(
    display_name="data_preprocess",
    template_path="mlops_bqml_titanic_pipeline.json",
    pipeline_root=ROOT_BUCKET+"/kfp_root",
    parameter_values={
        "create_bq_dataset_query": create_bq_dataset_query,
        "temp_location": ROOT_BUCKET+"/kfp_tmp",
        "python_file_path": ROOT_BUCKET+"/script/ingest_pipeline.py",
        # The key matches the (misspelled) pipeline parameter name.
        "requeirement_file_path": ROOT_BUCKET+"/script/requirement.txt"
    },
    enable_caching=False,
)

pipeline_job.run()

In [None]:
%%writefile command.sh
#!/bin/bash
# POSTs the JSON in INPUT_DATA_FILE to the :predict method of a Vertex AI
# endpoint in us-central1, authenticating with the current gcloud access token.

# Replace both placeholders with real values before running.
ENDPOINT_ID="<ENDPOINT_ID>"
PROJECT_ID="<YOUR_PROJECT_ID>"
# Request body; the "@" prefix on -d below makes curl read it from this file.
INPUT_DATA_FILE="./request.json"

curl \
-X POST \
-H "Authorization: Bearer $(gcloud auth print-access-token)" \
-H "Content-Type: application/json" \
https://us-central1-aiplatform.googleapis.com/v1/projects/${PROJECT_ID}/locations/us-central1/endpoints/${ENDPOINT_ID}:predict \
-d "@${INPUT_DATA_FILE}"



In [None]:
%%writefile request.json
{
  "instances": [
    {
      "Age": 0,
      "Age_Class": 1,
      "Embarked": 2,
      "Fare": 0,
      "Pclass": 1,
      "Sex": 0,
      "Title": 1,
      "isAlone": 0
    }
  ]
}


In [None]:
!bash command.sh

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.


In [None]:
# Delete components in vertex ai
# TBA

In [None]:
# Delete the pipeline's GCS files. ROOT_BUCKET is a prefix inside the bucket
# (gs://session11/bqml_pipeline), so this removes that prefix only — the bucket
# itself and the CSV files under gs://session11/titanic are not targeted here.
! gsutil -m rm -r $ROOT_BUCKET

# Delete the BigQuery dataset
! bq rm -r -f -d $PROJECT_NAME:$BQ_DATASET