## Set-up

In [1]:
# Upgrade pip
!pip install --upgrade pip



In [1]:
#!pip install --user google-cloud-bigquery==3.4.1
#!pip install --user pandas
#!pip install google-cloud-bigquery
#!pip install google-cloud-bigquery-storage
#!pip install google-cloud-storage
#!pip install pyarrow
#!pip install db-dtypes
#!pip install tqdm
#!pip install matplotlib
#!pip install ipywidgets
#!pip install ipywidgets
#!pip install google-cloud-aiplatform
#!pip install numpy
#!jupyter nbextension enable --py widgetsnbextension

!pip install autokeras
!pip install scikit-learn 
! pip install tensorflow==2.13.*

Collecting autokeras
  Downloading autokeras-1.1.0-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.6/148.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow>=2.8.0 (from autokeras)
  Obtaining dependency information for tensorflow>=2.8.0 from https://files.pythonhosted.org/packages/5a/f2/5c2f878c62c8b79c629b11b33516bb55054d7677eba6f56f3a20296b56bd/tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting keras-tuner>=1.1.0 (from autokeras)
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-nlp>=0.4.0 (from autokeras)
  Obtaining dependency information for keras-nlp>=0.4.0 from https://files.pythonhosted.org/packages/13/fc/258d2a78

In [8]:
from google.cloud import bigquery
from google.cloud import storage
from tqdm import tqdm
import matplotlib.pyplot as plt
import ipywidgets
import numpy as np
from google.cloud import aiplatform

import pandas as pd

from sklearn.model_selection import train_test_split
import tensorflow as tf
import autokeras as ak


In [12]:
!PROJECT_ID=$(gcloud config get-value project)
PROJECT_ID = "bqml-sandbox-396011"
VERTEX_AI_LOCATION = 'europe-west4'

In [20]:
%load_ext google.cloud.bigquery

In [13]:
aiplatform.init(project=PROJECT_ID, location=VERTEX_AI_LOCATION)

## Owerview

There are four ways to export BigQueryMl models:
1. by using the Google Cloud Console,
2. by using `EXPORT MODEL` statement,
3. by using `bq extract` command,
4. Using API or Client Library.

Most of the time the model is saved by default as `TensorfFlow SavedModel`

In [5]:
# list all models
!bq ls -m --format=pretty $PROJECT_ID:BQ_ML_ID


Welcome to BigQuery! This script will walk you through the 
process of initializing your .bigqueryrc configuration file.

First, we need to set up your credentials if they do not 
already exist.

Setting project_id bqml-sandbox-396011 as the default.

BigQuery configuration complete! Type "bq" to get started.

+--------------------------+--------------------------------+--------+-----------------+
|            Id            |           Model Type           | Labels |  Creation Time  |
+--------------------------+--------------------------------+--------+-----------------+
| BASE_LOGISTIC_REGRESSION | LOGISTIC_REGRESSION            |        | 10 Sep 08:38:52 |
| DNN                      | DNN_LINEAR_COMBINED_CLASSIFIER |        | 10 Sep 11:43:48 |
+--------------------------+--------------------------------+--------+-----------------+


In [18]:
# Create gcs bucket to store models

project_id = "bqml-sandbox-396011"
bucket_name = "bq-ml-store"
default_storage_class = "STANDARD" 

# Initialize the client
client = storage.Client(project=project_id)

# Create the bucket with the specified default storage class
bucket = client.bucket(bucket_name)
bucket.location = "EU"
bucket.storage_class = default_storage_class
    # Try to create the bucket (it will raise an error if it already exists)
try:
    bucket.create()
    print(f"Bucket '{bucket_name}' created with default storage class '{default_storage_class}'.")
except Exception as e:
    print(f"Error creating bucket: {e}")


  bucket.location = "EU"


Bucket 'bq-ml-store' created with default storage class 'STANDARD'.


In [19]:
!bq extract --model 'BQ_ML_ID.BASE_LOGISTIC_REGRESSION' gs://bq-ml-store/base-logistic-regression

Waiting on bqjob_r53868589930e1046_0000018a7f0057a2_1 ... (33s) Current status: DONE   


In [21]:
%%bigquery

 EXPORT MODEL `BQ_ML_ID.DNN`
 OPTIONS(URI = 'gs://bq-ml-store/dnn')
 

Query is running:   0%|          |

## Register the model in Vertex AI

In [25]:
%%bigquery
ALTER MODEL BQ_ML_ID.BASE_LOGISTIC_REGRESSION SET OPTIONS (vertex_ai_model_id="base_logistic_regression");

Query is running:   0%|          |

## Deploying Model in Vertex AI

In [15]:
# create an endpoint

endpoint = aiplatform.Endpoint.create(
        display_name= "base_logistic_regression",
        project= PROJECT_ID,
        location= VERTEX_AI_LOCATION,
    )


Creating Endpoint
Create Endpoint backing LRO: projects/115333740492/locations/europe-west4/endpoints/8224984692508590080/operations/4462031765648703488
Endpoint created. Resource name: projects/115333740492/locations/europe-west4/endpoints/8224984692508590080
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/115333740492/locations/europe-west4/endpoints/8224984692508590080')


In [18]:
# deploy a model
model = aiplatform.Model(model_name = "base_logistic_regression")
model.deploy(
    endpoint = endpoint,
    deployed_model_display_name = "base_logistic_regression",
    traffic_percentage = 100, # only one model in the endpoint so it must be 100%
    machine_type = "n1-standard-2",
    min_replica_count = 1,
    max_replica_count = 4,
    accelerator_type = None ,
    accelerator_count = None ,
    sync=True,
    )

model.wait()

Deploying model to Endpoint : projects/115333740492/locations/europe-west4/endpoints/8224984692508590080
Deploy Endpoint model backing LRO: projects/115333740492/locations/europe-west4/endpoints/8224984692508590080/operations/3963821056870842368


Endpoint model deployed. Resource name: projects/115333740492/locations/europe-west4/endpoints/8224984692508590080


<google.cloud.aiplatform.models.Endpoint object at 0x7f66b34ce710> 
resource name: projects/115333740492/locations/europe-west4/endpoints/8224984692508590080

In [None]:
## Sample prediction
# To TEST !
example = {
    "island": "Dream",
    "culmen_length_mm": 36.6,
    "culmen_depth_mm": 18.4,
    "flipper_length_mm": 184.0,
    "body_mass_g": 3475.0,
    "sex": "FEMALE",
}

prediction = endpoint.predict([example])
print(prediction)


## Importing a Model to BigQuery ML

The models defined and trained outside of the BigQuery ML can be also imported into the service
The possible extensions are:
1. XGBoost,
2. Tensorflow,
3. Tensorflow light,
4. Open Neural Network Exchange (ONNX)


### Importing a XGBoost model <br>

I tried importing the XGBoost models but the were many errors especially because at the moment the BigQuery ML does not support the current version of XGBoost, but only below 1.5.1.

### Importing the Tensorflow model 

BigQueryMl should work well with Tensorflow because there are both created by Google. <br>
Keras is a high-level interface for tensorflow which simplifies building the deep-learning models. <br>
AutoKeras is library that perform automatic machine learning using Keras. <br>
It was developed by Texas A@M University. <br>
Check out the website "https://autokeras.com and the paper https://jmlr.org/papers/v24/20-1355.html .

In [3]:
data = pd.read_csv("../data/default-of-credit-card-clients.csv",delimiter=';')

In [4]:
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [10]:
y = data["default payment next month"]
X = data.drop(["default payment next month","ID","MARRIAGE"], axis=1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [13]:
auto_keras_classifier = ak.StructuredDataClassifier(
    overwrite=True, max_trials=3
)  # It tries 3 different models

In [14]:
auto_keras_classifier.fit(X_train, y_train, epochs=10)

Trial 3 Complete [00h 00m 21s]
val_accuracy: 0.8116817474365234

Best val_accuracy So Far: 0.8116817474365234
Total elapsed time: 00h 01m 01s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


<keras.src.callbacks.History at 0x7f611cbb6a40>

In [15]:
# export model
model = auto_keras_classifier.export_model()

print(type(model))  

try:
    model.save("auto_keras_classifier", save_format="tf")
except Exception:
    print("unable to save the model in tensorflow format, try legacy h5 format")


<class 'keras.src.engine.functional.Functional'>
INFO:tensorflow:Assets written to: auto_keras_classifier/assets


INFO:tensorflow:Assets written to: auto_keras_classifier/assets


In [18]:
# Coping the model to gcs
# It turns out that you can copy to folder that does not exist and it will be created automatically
!gsutil cp -r auto_keras_classifier/* gs://default-credit-clients-2023/auto_keras_classifier

Copying file://auto_keras_classifier/fingerprint.pb [Content-Type=application/octet-stream]...
Copying file://auto_keras_classifier/keras_metadata.pb [Content-Type=application/octet-stream]...
Copying file://auto_keras_classifier/saved_model.pb [Content-Type=application/octet-stream]...
Copying file://auto_keras_classifier/variables/variables.index [Content-Type=application/octet-stream]...
- [4 files][454.7 KiB/454.7 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://auto_keras_classifier/variables/variables.data-00000-of-00001 [Content-Type=application/octet-stream]...
- [5 files][607.4 KiB/607.4 KiB]                                                
Operation completed over 5 objects/607.4 KiB.                           

In [22]:
%%bigquery

CREATE OR REPLACE MODEL `BQ_ML_ID.AUTO_KERAS_MODEL`
 OPTIONS(MODEL_TYPE='TENSORFLOW',
         MODEL_PATH="gs://default-credit-clients-2023/auto_keras_classifier/*")

Query is running:   0%|          |

In [23]:
# checking if model was properly imported 

!PROJECT_ID=$(gcloud config get-value project)
!bq ls -m --format=pretty $PROJECT_ID:BQ_ML_ID

+-------------------------------------+--------------------------------+--------+-----------------+
|                 Id                  |           Model Type           | Labels |  Creation Time  |
+-------------------------------------+--------------------------------+--------+-----------------+
| AUTO_KERAS_MODEL                    | TENSORFLOW                     |        | 16 Sep 17:29:09 |
| AUTO_ML                             | AUTOML_CLASSIFIER              |        | 10 Sep 13:27:47 |
| BASE_LOGISTIC_REGRESSION            | LOGISTIC_REGRESSION            |        | 10 Sep 08:38:52 |
| DNN                                 | DNN_LINEAR_COMBINED_CLASSIFIER |        | 10 Sep 11:43:48 |
| LOGISTIC_REGRESSION_WITH_HP_TUNNING | LOGISTIC_REGRESSION            |        | 10 Sep 11:58:26 |
+-------------------------------------+--------------------------------+--------+-----------------+
