# ML Engineering Exercise 1:
In this section of the demo, we will utilize Snowpark's Python client-side Dataframe API and server-side runtime to build an **ML ops monitoring process**.  For ML governance we need to monitor model performance over time. We will be building hundreds of models (one per station), so as part of the pipeline we will add a step to evaluate model performance and save metrics for each training/inference run.

Additionally, since the data science teams may use many different model frameworks, we want to have a standard evaluation framework instead of using the model's built-in evaluation, which may differ for each framework or version.  We will deploy the evaluation functions to the Snowpark Python server-side runtime as a UDF so that all projects will have a **standard, centralized framework for evaluation and monitoring**.  We will save the model performance metrics in tables for historical analysis and drift detection as well as full reproducibility to support the company's GDPR policies.

Input: Historical trips in the `TRIPS` table. Predictions in `PRED_<model_id>` table. Unique model ID number.  
Output: Evaluation metrics in `EVAL_<model_id>` table. 

In [None]:
#!pip install -q rexmex

### 1. Generate features for one station

In [None]:
import snowflake.snowpark as snp
import json
import getpass 

# Read the Snowflake connection settings from the local credentials file.
with open('creds.json') as f:
    data = json.load(f)

# Map the credential entries onto the keys handed to the session builder.
# Note that the login is stored under 'username' in the file.
connection_parameters = {
    'account': data['account'],
    'user': data['username'],
    'password': data['password'],  # alternatively: getpass.getpass()
    'role': data['role'],
    'schema': data['schema'],
    'database': data['database'],
    'warehouse': data['warehouse'],
}

session = snp.Session.builder.configs(connection_parameters).create()

# Source tables referenced by the later cells.
trips_table_name = 'TRIPS'
holiday_table_name = 'HOLIDAYS'
precip_table_name = 'WEATHER'

In [None]:
import uuid

# A uuid1 string is hyphen-separated; join its parts with underscores instead.
model_id = '_'.join(str(uuid.uuid1()).split('-'))

# Every artifact name of this run carries the model id as its suffix.
feature_view_name = f'TRIPS_FEATURES_<station_id>_{model_id}'
pred_table_name = f'PRED_{model_id}'
eval_table_name = f'EVAL_{model_id}'

In [None]:
from snowflake.snowpark import functions as F
from citibike_ml.feature_engineering import generate_features

# Station used for this single-station walkthrough.
station_id = '519'

# Keep only the trips whose START_STATION_ID matches the chosen station.
input_df = session.table(trips_table_name).filter(F.col('START_STATION_ID') == station_id)
# Build the feature DataFrame with the project helper; the holiday and weather
# table names are passed through to it.
feature_df = generate_features(session=session, 
                               input_df=input_df, 
                               holiday_table_name=holiday_table_name, 
                               precip_table_name=precip_table_name)

# Preview five rows of the features ordered by DATE.
feature_df.sort('DATE').show(5)

Train a model for these features

In [None]:
def train(X, y, cutpoint=365, cat_idxs=None):
    """Fit a TabNet regressor, validating on the trailing rows of the data.

    Args:
        X: Feature matrix supporting slice indexing on its first axis. The
            caller in this notebook passes rows sorted by DATE.
        y: Target values aligned row-for-row with ``X``.
        cutpoint: Number of trailing rows held out as the validation set;
            all earlier rows are used for training.
        cat_idxs: Indices of categorical feature columns, forwarded to
            ``TabNetRegressor``. ``None`` (the default) means no categorical
            columns.

    Returns:
        The fitted ``TabNetRegressor``.
    """
    from pytorch_tabnet.tab_model import TabNetRegressor

    # Use a None sentinel rather than a mutable ``[]`` default so that no
    # list object is shared between calls.
    if cat_idxs is None:
        cat_idxs = []

    # The last ``cutpoint`` rows validate; everything before them trains.
    X_train, X_valid = X[:-cutpoint], X[-cutpoint:]
    y_train, y_valid = y[:-cutpoint], y[-cutpoint:]

    max_epochs = 1000
    regression_model = TabNetRegressor(cat_idxs=cat_idxs)

    regression_model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        max_epochs=max_epochs,
        patience=100,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=0,
        drop_last=False)

    return regression_model

def predict(model, X):
    """Return ``model.predict(X)`` flattened to a one-dimensional result."""
    return model.predict(X).reshape(-1)
    
def plot(df, x_lab:str, y_true_lab:str, y_pred_lab:str):
    """Draw the true and predicted columns of ``df`` as two lines over ``x_lab``.

    Args:
        df: Frame holding the x column and both value columns.
        x_lab: Name of the column used for the x axis.
        y_true_lab: Name of the column with the observed values.
        y_pred_lab: Name of the column with the predicted values.
    """
    # NOTE(review): relies on `plt`, `pd` and `sns` existing as globals
    # (presumably matplotlib.pyplot, pandas and seaborn). `plt` and `sns` are
    # not imported in the visible cells -- confirm before calling.
    plt.figure(figsize=(15, 8))
    # Reshape to long form: one row per (x, variable, value) so that the two
    # series can be distinguished by hue.
    long_df = pd.melt(df, id_vars=[x_lab], value_vars=[y_true_lab, y_pred_lab])
    sns.lineplot(x=x_lab, y='value', hue='variable', data=long_df)

In [None]:
import pandas as pd

# Label column, kept as a one-element list so it can be used to select from df.
target = ['COUNT']
# Strip double quotes from the column names reported by the feature DataFrame.
feature_columns = [feature.replace('\"', '') for feature in feature_df.columns]
# The label and the DATE / STATION_ID columns are not passed to the model.
feature_columns.remove(target[0])
feature_columns.remove('DATE')
feature_columns.remove('STATION_ID')

# Pull the features into pandas in ascending DATE order, so the trailing rows
# that train() holds out for validation are the latest ones.
df = feature_df.sort('DATE', ascending=True).toPandas()

# Fit on the feature matrix, then store integer-cast predictions for every row.
model = train(df[feature_columns].values, df[target].values)
df['PRED'] = predict(model, df[feature_columns].values).astype('int')

In [None]:
df

### 2. Evaluation: 
We will use [rexmex](https://rexmex.readthedocs.io/en/latest/index.html) for consistent evaluation rather than the models' built-in eval metrics.  Evaluation metrics will be saved as table output tagged with the model_id.  

In [None]:
from rexmex import RatingMetricSet, ScoreCard

# Score card built on rexmex's rating metric set.
metric_set = RatingMetricSet()
score_card = ScoreCard(metric_set)

# Keep the columns that are scored and rename them to the y_true / y_score
# names used for the report input.
input_column_names = ['COUNT', 'PRED', 'STATION_ID']
eval_df = df[input_column_names].rename(columns={'COUNT': 'y_true', 'PRED': 'y_score'})

# One report row per station. After reset_index() the frame carries a
# 'level_1' column (presumably the unnamed inner index level) that is dropped.
eval_df = (
    score_card.generate_report(eval_df, grouping=['STATION_ID'])
    .reset_index()
    .drop('level_1', axis=1)
)

In [None]:
eval_df

### 3. Deploy Evaluation UDF
We will create a UDF for the evaluation with Rexmex.

In [None]:
def eval_model_output_func(input_data: list, 
                           y_true_name: str, 
                           y_score_name: str,
                           group_id_name: str) -> str:
    """Score predictions with the rexmex rating metrics, grouped by an id column.

    Args:
        input_data: Rows of values, each ordered to match
            (y_true_name, y_score_name, group_id_name).
        y_true_name: Column name given to the first value of each row.
        y_score_name: Column name given to the second value of each row.
        group_id_name: Column name given to the third value; the report is
            grouped by it.

    Returns:
        A two-element list: the report rows as a list of lists, followed by
        the report's column names.
    """
    import pandas as pd
    from rexmex import RatingMetricSet, ScoreCard

    # NOTE(review): the annotation says ``str`` while a list is returned; the
    # cells that call the deployed UDF parse its output with
    # ast.literal_eval. Confirm how the runtime converts the value before
    # changing either side.
    score_card = ScoreCard(RatingMetricSet())

    # Name the incoming values, then switch to the y_true / y_score column
    # names used for the report input.
    frame = pd.DataFrame(input_data,
                         columns=[y_true_name, y_score_name, group_id_name])
    frame = frame.rename(columns={y_true_name: 'y_true', y_score_name: 'y_score'})

    report = score_card.generate_report(frame, grouping=[group_id_name]).reset_index()
    report = report.drop('level_1', axis=1)

    return [report.values.tolist(), report.columns.tolist()]

Deploying the UDF to Snowflake makes it available for all users.  This is a regression evaluation.  We will likely also want to deploy an evaluation function for categorical (classification) models, or add if/then logic to this single function.

In [None]:
#from citibike_ml.model_eval import eval_model_output_func

# Start from an empty import list, then attach the rexmex archive and the
# project package to the session.
session.clearImports()
session.addImport('./include/rexmex.zip')
session.addImport('citibike_ml')

# Stage passed to the UDF registration as its stage location.
model_stage_name = 'model_stage'
_ = session.sql(f'CREATE STAGE IF NOT EXISTS {model_stage_name}').collect()

eval_model_output_udf = session.udf.register(
    eval_model_output_func,
    name="eval_model_output_udf",
    is_permanent=True,
    stage_location=f'@{model_stage_name}',
    replace=True,
)

# Display the registered UDF's name.
eval_model_output_udf.name

### 4. Test the output of the model eval UDF

In [None]:
from citibike_ml.mlops_pipeline import generate_feature_views, train_predict_feature_views, deploy_pred_train_udf

# Deploy the station train/predict UDF through the project helper and keep the
# value it returns (presumably the registered UDF name -- confirm in
# citibike_ml.mlops_pipeline) for the training step in the next cell.
station_train_pred_udf_name = deploy_pred_train_udf(session=session,
                                                    function_name='station_train_predict_udf', 
                                                    model_stage_name='model_stage')

In [None]:
# Create feature views from the trips table with the project helper.
# NOTE(review): top_n=2 presumably limits the run to two stations -- confirm
# in generate_feature_views.
feature_view_names = generate_feature_views(session=session, 
                                            clone_table_name=trips_table_name, 
                                            feature_view_name=feature_view_name,
                                            holiday_table_name=holiday_table_name,
                                            precip_table_name=precip_table_name,
                                            target_column='COUNT',
                                            top_n=2)

# Run the train/predict UDF over those feature views; the helper's return
# value replaces pred_table_name.
pred_table_name = train_predict_feature_views(session=session, 
                                              station_train_pred_udf_name=station_train_pred_udf_name,
                                              feature_view_names=feature_view_names, 
                                              pred_table_name=pred_table_name)

# Display the prediction table name.
pred_table_name

In [None]:
import ast

# Collapse the prediction table into a single INPUT_DATA value: an array of
# [COUNT, PRED, STATION_ID] entries, one per prediction row.
eval_df = session.table(pred_table_name)\
                 .select(F.array_agg(F.array_construct('COUNT', 'PRED', 'STATION_ID')).alias('INPUT_DATA'))

# Call the deployed evaluation UDF, passing the column names as literals.
output_df = eval_df.select(F.call_udf('eval_model_output_udf',
                                      'INPUT_DATA',
                                      F.lit('COUNT'), 
                                      F.lit('PRED'),
                                      F.lit('STATION_ID')).alias('OUTPUT_DATA')).collect()

# The UDF output is the text form of [rows, column_names]. Parse it once and
# unpack, rather than re-parsing the same string for each part.
eval_rows, eval_columns = ast.literal_eval(output_df[0][0])
df = pd.DataFrame(data=eval_rows, columns=eval_columns)

# Persist the metrics. The write's return value is not kept, so eval_df is no
# longer rebound to it.
session.createDataFrame(df).write.mode('overwrite').saveAsTable(eval_table_name)

# Read the saved table back to confirm what was stored.
df = session.table(eval_table_name).toPandas()
df

Consolidate all functions for orchestration.

In [None]:
%%writefile citibike_ml/model_eval.py

def eval_model_output_func(input_data: list, 
                           y_true_name: str, 
                           y_score_name: str,
                           group_id_name: str) -> str:
    """Build a rexmex rating-metric report for predictions, grouped by an id.

    Args:
        input_data: Rows of values, each ordered to match
            (y_true_name, y_score_name, group_id_name).
        y_true_name: Column name given to the first value of each row.
        y_score_name: Column name given to the second value of each row.
        group_id_name: Column name given to the third value; the report is
            grouped by it.

    Returns:
        A two-element list: the report rows as a list of lists, followed by
        the report's column names.
    """
    import pandas as pd
    from rexmex import RatingMetricSet, ScoreCard

    # NOTE(review): the annotation says ``str`` while a list is returned;
    # evaluate_station_predictions parses the UDF output with
    # ast.literal_eval. Confirm how the runtime converts the value before
    # changing either side.
    score_card = ScoreCard(RatingMetricSet())

    # Name the incoming values, then switch to the y_true / y_score column
    # names used for the report input.
    frame = pd.DataFrame(input_data,
                         columns=[y_true_name, y_score_name, group_id_name])
    frame = frame.rename(columns={y_true_name: 'y_true', y_score_name: 'y_score'})

    report = score_card.generate_report(frame, grouping=[group_id_name]).reset_index()
    report = report.drop('level_1', axis=1)

    return [report.values.tolist(), report.columns.tolist()]

def deploy_eval_udf(session, function_name, model_stage_name) -> str:
    """Register the evaluation function as a permanent UDF and return its name.

    Args:
        session: Snowpark session used for the imports and the registration.
        function_name: Name under which the UDF is registered.
        model_stage_name: Stage name (without the leading '@') used as the
            UDF's stage location.

    Returns:
        The ``name`` attribute of the registered UDF object.
    """
    from citibike_ml.model_eval import eval_model_output_func

    # Reset the session's import list, then attach the rexmex archive and the
    # project package.
    session.clearImports()
    session.addImport('./include/rexmex.zip')
    session.addImport('citibike_ml')

    registered_udf = session.udf.register(
        eval_model_output_func,
        name=function_name,
        is_permanent=True,
        stage_location=f'@{model_stage_name!s}',
        replace=True,
    )

    return registered_udf.name

def evaluate_station_predictions(session, pred_table_name, eval_model_udf_name, eval_table_name) -> str:
    """Evaluate a prediction table with the eval UDF and save the metrics.

    Args:
        session: Snowpark session used to read predictions and write metrics.
        pred_table_name: Table holding COUNT, PRED and STATION_ID columns.
        eval_model_udf_name: Name of the deployed evaluation UDF to call.
        eval_table_name: Table the evaluation report is written to.

    Returns:
        ``eval_table_name``, unchanged.
    """
    from snowflake.snowpark import functions as F
    import pandas as pd
    import ast

    # Collapse the prediction table into a single INPUT_DATA value: an array
    # of [COUNT, PRED, STATION_ID] entries. The alias is spelled in the same
    # case as the column reference passed to the UDF call below.
    eval_df = session.table(pred_table_name)\
                     .select(F.array_agg(F.array_construct('COUNT', 'PRED', 'STATION_ID')).alias('INPUT_DATA'))

    output_rows = eval_df.select(F.call_udf(eval_model_udf_name,
                                            'INPUT_DATA',
                                            F.lit('COUNT'),
                                            F.lit('PRED'),
                                            F.lit('STATION_ID'))).collect()

    # The UDF output is the text form of [rows, column_names]. Parse it once
    # and unpack, rather than re-parsing the same string for each part.
    report_rows, report_columns = ast.literal_eval(output_rows[0][0])
    report_df = pd.DataFrame(data=report_rows, columns=report_columns)

    # NOTE(review): no explicit save mode is set, so the writer's default
    # applies. Confirm whether re-running for an existing table should
    # overwrite it.
    session.createDataFrame(report_df).write.saveAsTable(eval_table_name)

    return eval_table_name