In [ ]:
%%configure -f
{
"conf": {
    "spark.sql.autoBroadcastJoinThreshold": -1,
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 8,
    "spark.rpc.message.maxSize": 1024
   }
}

In [ ]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib
from io import BytesIO
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, LongType

In [ ]:
# Notebook parameters. Every value is a string (empty by default here); the
# "Cast parameters" cell further down converts them to their working types.
# Presumably overridden by the caller at run time - TODO confirm.
batch_id = ''  # within this notebook only recorded in the logging custom dimensions
prepped_data_path = ''  # parquet path of the input data; time_slice_folder is inserted before its last segment
iFor_data_prefix = ''  # parquet path the trained models are written to; time_slice_folder is inserted the same way
subsample_list = ''  # comma-separated integers: subsample sizes to iterate over
trees_list = ''  # comma-separated integers: tree counts to iterate over
train_size = ''  # float, used as the sampling fraction for each training draw
id_feat = ''  # comma-separated identifier column names
seed = ''  # integer base seed for sampling, shuffling and model training
time_slice_folder = ''  # folder name inserted into both paths above

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# The Application Insights connection string is read from the linked key vault.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would otherwise stack one more AzureLogHandler each time
# and export every record repeatedly. Attach the handler only once.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters: the raw (still uncast) notebook parameters are attached to
# the start-up log record as custom dimensions.
notebook_name = mssparkutils.runtime.context['notebookname']
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'prepped_data_path': prepped_data_path,
    'iFor_data_prefix': iFor_data_prefix,
    'subsample_list': subsample_list,
    'trees_list': trees_list,
    'train_size': train_size,
    'id_feat': id_feat,
    'seed': seed,
    'time_slice_folder': time_slice_folder,
    'notebook_name': notebook_name
} }

logger.info(f"{notebook_name}: INITIALISED", extra=run_time_parameters)

In [ ]:
def _insert_time_slice(path):
    """Return *path* with ``time_slice_folder`` inserted before its last "/"-separated segment."""
    parent, _, leaf = path.rpartition("/")
    return parent + "/" + time_slice_folder + "/" + leaf


# Empty paths are left untouched; non-empty ones get the time-slice folder
# spliced in and the resulting path is logged.
if prepped_data_path != "":
    prepped_data_path = _insert_time_slice(prepped_data_path)
    logger.info(f'prepped_data_path = {prepped_data_path}')
if iFor_data_prefix != "":
    iFor_data_prefix = _insert_time_slice(iFor_data_prefix)
    logger.info(f'iFor_data_prefix = {iFor_data_prefix}')

In [ ]:
# Cast parameters
# Every parameter arrives as a string; convert each to the type the rest of the
# notebook works with. The list parameters are comma-separated.
subsample_list = [int(size) for size in subsample_list.split(",")]
trees_list = [int(count) for count in trees_list.split(",")]
train_size = float(train_size)
# int() above already tolerates whitespace around each value; column names do
# not, so strip them to let "a, b" resolve to the columns "a" and "b".
id_feat = [name.strip() for name in id_feat.split(",")]
seed = int(seed)

In [ ]:
# Load the prepared data from parquet and log how many records it holds.
# In the visible cells `m` is only used for this log line.
df = spark.read.parquet(prepped_data_path)
m = df.count()
logger.info(f'Number of records: {m}')

In [ ]:
# Add a row id ('_id') so the randomly assigned group table built in the
# training-set cell can be joined back onto the full rows.
df_id = df.withColumn('_id',F.monotonically_increasing_id())

In [ ]:
def ijungle_train(id_feat, seed):
    """Build the per-group training function used with ``applyInPandas``.

    Args:
        id_feat: list of identifier column names; they become the index of the
            group's frame and are therefore not used as features.
        seed: value passed as ``random_state`` to every ``IsolationForest``.

    Returns:
        A function ``(key, pdf)`` that fits one isolation forest on a group and
        returns a one-row pandas DataFrame holding
        ``(trees, subsample_size, group, serialized model bytes)``.
    """
    def _fun(key, pdf):
        # The grouping key carries (tree count, subsample size, group number).
        trees, subsample_size, group = key[0], key[1], key[2]

        # Move the identifier columns into the index, then drop the three
        # bookkeeping columns so that only feature columns remain.
        pdf.set_index(id_feat, inplace=True)
        feature_cols = list(pdf.columns)
        for bookkeeping in ('_group', '_tree_size', '_subsample_size'):
            feature_cols.remove(bookkeeping)
        features = pdf[feature_cols]

        # max_samples is capped at the number of rows actually in this group.
        forest = IsolationForest(
            n_estimators=trees,
            max_samples=min(subsample_size, features.shape[0]),
            random_state=seed,
            n_jobs=-1,
        )
        forest.fit(features)

        # Serialize the fitted model into bytes for the binary output column.
        buffer = BytesIO()
        joblib.dump(forest, buffer)
        model_bytes = buffer.getvalue()
        return pd.DataFrame([(trees, subsample_size, group, model_bytes)])
    return _fun

In [ ]:
# Build one long DataFrame holding, for every (trees, subsample_size)
# combination, the sampled rows tagged with the group they will be trained in.
df_unassembled_exists = False
num_feats = len(df_id.take(1)[0]['scaled'])
# The output projection is identical for every combination, so build it once.
unassembled_cols = id_feat + ['_tree_size', '_subsample_size', '_group'] + [F.col("f")[i] for i in range(num_feats)]
tot_models = 0
idx = 0
for trees in trees_list:
    for subsample_size in subsample_list:
        # Each combination uses its own seed offset; the counter advances even
        # when a combination is skipped so later combinations keep their seed.
        combo_seed = seed + idx
        idx += 1

        # Get random train_size fraction sample of data
        df_id_group = df_id.sample(withReplacement=False, fraction=train_size, seed=combo_seed)

        # Calculate how many groups can fit. Each group is a distinct subset of the data that the model will be trained on.
        rand_group_count = df_id_group.count()
        num_groups = rand_group_count // subsample_size
        logger.info(f"For trees:{trees} and subsample_size:{subsample_size}, the number of rows is {rand_group_count} and the number of groups is {num_groups}.")
        if num_groups == 0:
            # Fewer sampled rows than one subsample: "% num_groups" below would
            # be a modulo by zero (NULL groups and silently no rows, or an error
            # under ANSI SQL mode). Skip explicitly instead.
            logger.warning(f"Skipping trees:{trees} and subsample_size:{subsample_size}: only {rand_group_count} sampled rows, not enough for a single group.")
            continue
        tot_models += num_groups

        # Throw a random number for each row, sort by the random number, give it a monotonically increasing id and take the modulus of that id against the number of groups
        # The modulus and the fact that the row numbers are not consecutive numbers will lead to some variation in the number of samples in the groups.
        # However, this variation will be small and keeps the spirit and intent of the isolation forest methodology intact. Getting the exact number of samples for each model
        # in a random fashion would be a much more expensive operation, at least for large datasets.
        df_id_group = df_id_group.withColumn("_rand", F.rand(combo_seed)).select('_id', '_rand')
        df_id_group = df_id_group.orderBy('_rand').withColumn("shuffled_index", F.monotonically_increasing_id()).drop('_rand')
        df_id_group = df_id_group.withColumn("_group", F.col("shuffled_index") % num_groups)

        # Join of random selection of groups with training data
        df_subsamples = df_id.join(df_id_group, on='_id').where(F.col('_group') >= 0).select(id_feat + ['scaled', '_group'])
        df_subsamples = df_subsamples.withColumn("_tree_size", F.lit(trees)).withColumn("_subsample_size", F.lit(subsample_size))

        # Vector to individual columns to prepare for parallel training
        df_combo = df_subsamples.withColumn('f', vector_to_array("scaled")).select(unassembled_cols)
        if df_unassembled_exists:
            df_unassembled = df_unassembled.union(df_combo)
        else:
            df_unassembled = df_combo
            df_unassembled_exists = True

if not df_unassembled_exists:
    # Without this, df_unassembled would be undefined for the training cell.
    raise ValueError("No (trees, subsample_size) combination produced a training group: every sampled subset was smaller than its subsample size.")
logger.info(f'Number of models: {tot_models}')

In [ ]:
# Train one isolation forest per (tree size, subsample size, group) and persist
# the serialized models as parquet, replacing any previous output at that path.
model_schema = "tree_size long, subsample_size long, id long, model binary"
grouped_subsamples = df_unassembled.groupBy('_tree_size', '_subsample_size', '_group')
df_iFor = grouped_subsamples.applyInPandas(ijungle_train(id_feat, seed), schema=model_schema)

model_writer = df_iFor.write.mode('overwrite')
model_writer.parquet(iFor_data_prefix)