In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 8,
    "spark.driver.maxResultSize": "20g"
   }
}

In [ ]:
import pyspark.sql.functions as F
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import FloatType
import numpy as np
from io import BytesIO
import joblib
import pandas as pd

In [ ]:
# Notebook parameters. Every value is declared as an empty string here and is
# presumably overridden by the calling pipeline at run time -- TODO confirm.
batch_id = ''                 # only logged with the run-time parameters in the visible cells
prepped_data_path = ''        # parquet dataset read as the input DataFrame
iFor_data_prefix = ''         # parquet location of the serialized models
overhead_data_path = ''       # parquet location the overhead sample is written to
overhead_results_prefix = ''  # parquet location the model predictions are written to
id_feat = ''                  # comma-separated identifier column names (split into a list later)
id_feat_types = ''            # comma-separated list (split later); not used further in the visible cells
seed = ''                     # cast to int later and used as the sampling seed
overhead_size = ''            # cast to float later and used as the sampling fraction
time_slice_folder = ''        # folder name inserted before the last segment of each non-empty path

In [ ]:
# Initiate logging
# Sets up a module logger that forwards records to an AzureLogHandler, creates
# a tracer with an always-on sampler, and logs the notebook's run-time
# parameters once.
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Connection string is read from the secret "AppInsightsConnectionString";
# "keyvault" is presumably the linked-service / vault name -- TODO confirm.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
# Enable the opencensus 'logging' trace integration before the handler is attached.
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
# Records below INFO are dropped by this logger.
logger.setLevel(logging.INFO)

# NOTE(review): `tracer` is created here but not referenced by any of the
# visible cells -- confirm whether it is still needed.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters
# Passed through `extra=` below; the handler presumably attaches the
# 'custom_dimensions' mapping as custom properties of the log record -- TODO confirm.
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'prepped_data_path': prepped_data_path,
    'iFor_data_prefix': iFor_data_prefix,
    'overhead_data_path': overhead_data_path,
    'overhead_results_prefix': overhead_results_prefix,
    'id_feat': id_feat,
    'id_feat_types': id_feat_types,
    'seed': seed,
    'overhead_size': overhead_size,
    'time_slice_folder': time_slice_folder,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }
  
# The parameters are logged with the values they hold at this point in the notebook.
logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
def _with_time_slice(path, folder):
    """Return *path* with *folder* inserted before its last "/"-separated segment.

    ``"a/b/data"`` with folder ``"2020"`` becomes ``"a/b/2020/data"``. A path
    that contains no "/" becomes ``"/<folder>/<path>"``.
    """
    parent, _, leaf = path.rpartition("/")
    return "/".join((parent, folder, leaf))


# Redirect every configured location into the time-slice folder. Parameters
# left as empty strings are not touched (and nothing is logged for them).
if prepped_data_path != "":
    prepped_data_path = _with_time_slice(prepped_data_path, time_slice_folder)
    logger.info(f'prepped_data_path = {prepped_data_path}')
if iFor_data_prefix != "":
    iFor_data_prefix = _with_time_slice(iFor_data_prefix, time_slice_folder)
    logger.info(f'iFor_data_prefix = {iFor_data_prefix}')
if overhead_data_path != "":
    overhead_data_path = _with_time_slice(overhead_data_path, time_slice_folder)
    logger.info(f'overhead_data_path = {overhead_data_path}')
if overhead_results_prefix != "":
    overhead_results_prefix = _with_time_slice(overhead_results_prefix, time_slice_folder)
    logger.info(f'overhead_results_prefix = {overhead_results_prefix}')

In [ ]:
# Casting parameters
# The parameters arrive as strings: the two comma-separated ones become lists
# of strings, the numeric ones are converted to their working types.
id_feat = id_feat.split(",")
id_feat_types = id_feat_types.split(",")
seed = int(seed)
overhead_size = float(overhead_size)

In [ ]:
# Load the prepared dataset from parquet and log how many records it holds.
df = spark.read.parquet(prepped_data_path)
m = df.count()  # Spark action: evaluates the read to count the rows
logger.info("Number of records: {:,}".format(m))

In [ ]:
# Creation of overhead sample
# Draw a seeded sample (without replacement) of the input and persist it as
# parquet. `overhead_size` is passed as Spark's sampling `fraction`.
# NOTE(review): `df_W` is never cached in the visible code, so the write,
# `take`, `count` and every later use each evaluate the sample again --
# confirm this is acceptable or read the written sample back instead.
df_W = df.sample(withReplacement=False, fraction=overhead_size, seed=seed)
df_W.write.mode('overwrite').parquet(overhead_data_path)

# Width of the 'scaled' vector, taken from the first sampled row.
# NOTE(review): `take(1)[0]` raises IndexError when the sample is empty.
num_feats = len(df_W.take(1)[0]['scaled'])
# Expand the 'scaled' vector into one column per element (f[0], f[1], ...)
# and keep the identifier columns listed in `id_feat` in front of them.
df_W_unassembled = df_W.withColumn('f', vector_to_array("scaled")).select(id_feat + [F.col("f")[i] for i in range(num_feats)])
logger.info("Number of records of overhead dataset: {:,}".format(df_W.count()))

In [ ]:
# Builds the function handed to mapInPandas: it runs every model of a
# dictionary over each pandas batch and stacks the per-model scores.
def clf_predict(models=None):
    """Return a ``mapInPandas``-style function that scores batches with many models.

    Args:
        models: Optional mapping whose keys are indexable as
            ``(tree_size, subsample_size, group_num)`` and whose values expose
            ``score_samples(frame)``. When omitted, the module-level
            ``clf_dict`` is used, looked up at the moment ``clf_predict`` is
            called (so existing ``clf_predict()`` callers keep working).

    Returns:
        A function taking an iterator of pandas DataFrames. For every batch it
        yields one DataFrame with the columns ``issuer_id_indexed``,
        ``issued_date``, ``tree_size``, ``subsample_size``, ``group_num`` and
        ``predict``, holding one block of rows per model. When ``models`` is
        empty nothing is yielded for the batch.
    """
    if models is None:
        models = clf_dict
    id_cols = ["issuer_id_indexed", "issued_date"]

    def _fun(iterator):
        for pdf in iterator:
            # Split identifiers from features once per batch (not once per
            # model) and without mutating the incoming frame.
            ids = pdf[id_cols].reset_index(drop=True)
            features = pdf.set_index(id_cols)
            scored = []
            for key, clf in models.items():
                # `assign` returns a new frame, so no column is ever written
                # onto a slice of the input batch.
                scored.append(ids.assign(
                    tree_size=key[0],
                    subsample_size=key[1],
                    group_num=key[2],
                    predict=clf.score_samples(features),
                ))
            # A single concat at the end avoids re-copying the accumulated
            # output for every model; skip the batch if there were no models.
            if scored:
                yield pd.concat(scored)
    return _fun

In [ ]:
# Read the stored models and tag every row with a generated `model_id`.
# TODO: add select by partition
df_iFor = (
    spark.read.parquet(iFor_data_prefix)
    .withColumn("model_id", F.monotonically_increasing_id())
)
# Bring the ids to the driver and order them ascending so they can be sliced
# into contiguous chunks later.
ids = df_iFor.select('model_id').collect()
ids.sort()
num_models = len(ids)
logger.info("Number of models: {:,}".format(num_models))

In [ ]:
# Main idea: loading models has overhead, so many are loaded at once; running
# models through mapInPandas has overhead too, so many are run at once.
# Each pass of the while loop collects one 'chunk' of models, builds a python
# dictionary keyed by the model parameters, scores the overhead sample with
# mapInPandas + clf_predict, and writes that chunk's predictions straight away
# so the DAG does not keep growing across chunks.
# M is the number of models held in memory at once: too many and the executors
# run out of memory, too few and the job will not run optimally.
M = 250
chunk = 0
schema = "issuer_id_indexed integer, issued_date timestamp, tree_size integer, subsample_size integer, group_num integer, predict float"
df_predict_exists = False
while M*chunk < num_models:
    # `ids` is sorted, so the first and last entries bound this chunk's id range.
    rows = ids[M*chunk:(chunk+1)*M]
    chunk_models = df_iFor.where(F.col("model_id") >= rows[0][0]).where(F.col("model_id") <= rows[-1][0]).collect()
    clf_dict = {}
    for model in chunk_models:
        # NOTE: joblib.load unpickles the stored bytes; only safe while the
        # model store is trusted.
        clf_dict[(model.tree_size, model.subsample_size, model.id)] = joblib.load(BytesIO(model.model))
    df_predict = df_W_unassembled.mapInPandas(clf_predict(), schema = schema)
    if not df_predict_exists:
        # First chunk replaces any previous results at the target location.
        df_predict_exists = True
        df_predict.write.mode('overwrite').parquet(overhead_results_prefix)
    else:
        try:
            df_predict.write.mode('append').parquet(overhead_results_prefix)
        except Exception:
            # Best effort, as before: record the failure and move on to the
            # next chunk. The handler used to call .keys() on an int and on a
            # Row, which raised AttributeError and hid the real write error.
            # NOTE(review): a failed chunk leaves its predictions missing from
            # the results -- confirm that continuing is the desired behaviour.
            logger.exception("Error in writing df_predict in 5_3.")
            logger.error("Number of models in chunk: {:,}".format(len(clf_dict)))
            logger.error("Models in chunk: {}".format(list(clf_dict)))
    chunk += 1