# This notebook builds an experimental design for benchmarking the featurestore and then runs the benchmark

In [1]:
# Install the extra packages this notebook needs into the kernel's environment
# (-q keeps pip's output quiet). doepy provides the design builder imported below;
# google-cloud-aiplatform is presumably what backs the featurestore clients — TODO confirm.
%pip install -q doepy google-cloud-aiplatform

Note: you may need to restart the kernel to use updated packages.


In [1]:
# set up the featurestore client
import datetime as datetime_class
import json
import os
import time
from datetime import datetime

import pandas as pd
from dateutil.parser import parse
from google.cloud import bigquery

from helper_fns.helpers import *


# --- Settings: edit these to point at your own project and resources ---
PROJECT_ID = 'matching-engine-blog'
REGION = 'us-central1'
BUCKET = "matching-engine-demo-blog"
BQ_DATASET = 'movielens'
FEATURESTORE_ID = "performance_testing"
API_ENDPOINT = "us-central1-aiplatform.googleapis.com"  # @param {type:"string"}

# Module-level benchmark sizes. NOTE(review): the cells below pass explicit values
# to repeat_measure, so these look like defaults for the helpers — confirm.
n_iterations = 2
n_predictions = 100
n_workers = 1

# Featurestore clients, both bound to the same regional API endpoint:
# one for admin operations, one for online serving.
admin_client = FeaturestoreServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)
data_client = FeaturestoreOnlineServingServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

# Parent resource path (project + location) derived from the admin client.
BASE_RESOURCE_PATH = admin_client.common_location_path(PROJECT_ID, REGION)

# BigQuery client used for building the benchmark datasets for the featurestore.
client = bigquery.Client()


## Define the repeated-measurement run function

In [3]:
def repeat_measure(n_iterations, n_predictions, n_workers, n_repeats=30):
    """Set up one featurestore run, then time it `n_repeats` times.

    `create_a_fs_run` is called once and its result is recorded against every
    repeat; `measure_fs` is called once per repeat.

    Args:
        n_iterations: Forwarded to `create_a_fs_run` and `measure_fs`.
        n_predictions: Forwarded to `create_a_fs_run` and `measure_fs`;
            also recorded per repeat.
        n_workers: Forwarded to `create_a_fs_run`; also recorded per repeat.
        n_repeats: Number of timed measurements to take (default 30).

    Returns:
        A dict of equal-length lists keyed by 'create_stats', 'n_features',
        'n_predictions', 'n_workers' and 'total_seconds', with one entry per
        repeat. All lists are empty when `n_repeats` is 0.
    """
    data = {'create_stats': [],
            'n_features': [],
            'n_predictions': [],
            'n_workers': [],
            'total_seconds': [],
           }
    # One-off setup; NOTE(review): presumably (re)creates and loads the
    # featurestore — confirm against helper_fns.helpers.
    stats = create_a_fs_run(n_iterations, n_predictions, n_workers)

    total_time = None
    for _ in range(n_repeats):
        total_time, n_features = measure_fs(n_iterations, n_predictions)
        data['create_stats'].append(stats)
        data['n_features'].append(n_features)
        data['n_predictions'].append(n_predictions)
        data['n_workers'].append(n_workers)
        data['total_seconds'].append(total_time)

    # Echo the last measurement; skipped when no repeat ran, which previously
    # raised NameError because total_time was never bound.
    if total_time is not None:
        print(total_time)
    return data

In [4]:
from doepy import build
import pandas as pd

# Sample 3 design points with doepy's `build.lhs` over the three experiment
# factors; each factor is given as a two-element list of bounds.
factor_ranges = {
    'Nodes': [2, 10],
    'N_Rows': [1, 10],
    'N_Iterations': [1, 10],
}
sampled_design = build.lhs(factor_ranges, num_samples=3)

# Keep the factor columns in a fixed order and cast the sampled values to int.
design_data = sampled_design[['Nodes', 'N_Rows', 'N_Iterations']].astype(int)
design_data

Unnamed: 0,Nodes,N_Rows,N_Iterations
0,7,4,3
1,6,9,4
2,2,2,9


In [17]:
# NOTE(review): `repeat_run_data` comes from the single-run `repeat_measure` cell,
# which sits further down in this notebook; on a fresh kernel run that cell first.
cols = ['create_stats',
        'n_features',
        'n_predictions',
        'n_workers',
        'total_seconds']

# Build the results frame directly from the measurement dict.
# DataFrame.append was removed in pandas 2.0, so nothing is appended to an empty frame.
append_frame = pd.DataFrame.from_dict(repeat_run_data)
data = append_frame.reindex(columns=cols)
data

Unnamed: 0,create_stats,n_features,n_predictions,n_workers,total_seconds
0,-387.904502,8,1,1,0.222427
1,-387.904502,8,1,1,0.182707
2,-387.904502,8,1,1,0.137839
3,-387.904502,8,1,1,0.017539
4,-387.904502,8,1,1,0.139842
5,-387.904502,8,1,1,0.184758
6,-387.904502,8,1,1,0.14239
7,-387.904502,8,1,1,0.134935
8,-387.904502,8,1,1,0.01384
9,-387.904502,8,1,1,0.163715


In [7]:
# Single smallest-size run: one iteration, one prediction, one worker (default 30 repeats).
repeat_run_data = repeat_measure(n_iterations=1, n_predictions=1, n_workers=1)

name: "projects/180938242395/locations/us-central1/featurestores/performance_testing"

name: "projects/180938242395/locations/us-central1/featurestores/performance_testing/entityTypes/movies"
etag: "AMEw9yMQBKbyLuAcUXafun0DROoZgk-iNeCohjEEwH78chbqTq0q"

Ran for a total of 0:05:52.755053
imported_entity_count: 1
imported_feature_value_count: 7

Ran for a total of: 0.2224266529083252 seconds for 1 streaming predictions 
 Per prediction seconds: 0.2224266529083252
Ran for a total of: 0.1827068328857422 seconds for 1 streaming predictions 
 Per prediction seconds: 0.1827068328857422
Ran for a total of: 0.13783884048461914 seconds for 1 streaming predictions 
 Per prediction seconds: 0.13783884048461914
Ran for a total of: 0.017539262771606445 seconds for 1 streaming predictions 
 Per prediction seconds: 0.017539262771606445
Ran for a total of: 0.13984203338623047 seconds for 1 streaming predictions 
 Per prediction seconds: 0.13984203338623047
Ran for a total of: 0.18475842475891113 second

In [5]:
# Run the experiment: one repeated measurement per design row, collecting the results.

cols = ['create_stats',
        'n_features',
        'n_predictions',
        'n_workers',
        'total_seconds']

# Make sure the output directory exists before any expensive run starts, so a
# missing folder cannot discard a finished measurement.
os.makedirs('data', exist_ok=True)

# Stays an empty, correctly-shaped frame if the design has no rows.
data = pd.DataFrame([], columns=cols)
run_frames = []

for _, row in design_data.iterrows():
    print(f"Testing for following row: \n{row}")
    repeat_run_data = repeat_measure(row['N_Iterations'],
                                     row['N_Rows'],
                                     row['Nodes'], n_repeats=30)
    run_frames.append(pd.DataFrame.from_dict(repeat_run_data))
    # DataFrame.append was removed in pandas 2.0; rebuild the combined frame
    # from the per-row frames collected so far.
    data = pd.concat(run_frames, ignore_index=True).reindex(columns=cols)
    # Checkpoint after every design row so an interrupted run keeps its results.
    ts = datetime.now()
    data.to_csv(f'data/experiment-{ts}.csv')

# Save the final results to CSV; `ts` identifies this file for the upload cell.
ts = datetime.now()
data.to_csv(f'data/experiment-{ts}.csv')

data

Testing for following row: 
Nodes           7
N_Rows          4
N_Iterations    3
Name: 0, dtype: int64
Deleted featurestore 'performance_testing'.
name: "projects/180938242395/locations/us-central1/featurestores/performance_testing"

name: "projects/180938242395/locations/us-central1/featurestores/performance_testing/entityTypes/movies"
etag: "AMEw9yO8gMqPBcdpLNJ4DSIL0_F_rCBQblDURlEmW5yGKJzUIR92"

Ran for a total of 0:06:04.743344
imported_entity_count: 4
imported_feature_value_count: 84

Ran for a total of: 0.26798510551452637 seconds for 4 streaming predictions 
 Per prediction seconds: 0.06699627637863159
Ran for a total of: 0.2606830596923828 seconds for 4 streaming predictions 
 Per prediction seconds: 0.0651707649230957
Ran for a total of: 0.21416187286376953 seconds for 4 streaming predictions 
 Per prediction seconds: 0.05354046821594238
Ran for a total of: 0.2054762840270996 seconds for 4 streaming predictions 
 Per prediction seconds: 0.0513690710067749
Ran for a total of: 0

KeyboardInterrupt: 

In [None]:
!gsutil cp f'experiment-$ts.csv' gs://$BUCKET/benchmarking_data