## Locating the PC Boundaries for Anomalies

* Create a simplified Isolation Forest algorithm that works within PySpark

In [1]:
### Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors

import ipyleaflet as ipy

from pyspark import SparkContext
import pyspark.sql.functions as F
import pyspark.ml.functions as M
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import Window as W
from pyspark.sql.types import *
from pyspark.ml.feature import PCA, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

import time
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

Matplotlib created a temporary cache directory at /tmp/matplotlib-6q6ir_m9 because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
### For server
# 32 nodes
# 64g

## Start Spark context
total_nodes = 10
memory_per_node = 100

driver_memory = f"{memory_per_node}g"
executor_memory = f"{int(memory_per_node/total_nodes)}g"
n_executors = total_nodes - 1
print(f"Driver memory: {driver_memory}\nExecutor memory: {executor_memory}\nNumber of executors: {n_executors}\n")
try:
    print("Initializing SparkContext")
    sc = SparkSession.builder.config("spark.driver.memory", driver_memory) \
                             .config("spark.executor.memory", executor_memory) \
                             .config('spark.local.dir', "test_dir/") \
                             .config("spark.driver.maxResultSize", "16g") \
                             .config("spark.executor.instances", n_executors) \
                             .getOrCreate()
except:
    print("Starting new SparkContext")
    sc.stop()
    sc = SparkSession.builder.config("spark.driver.memory", driver_memory) \
                             .config("spark.executor.memory", executor_memory) \
                             .config('spark.local.dir', "test_dir/") \
                             .config("spark.driver.maxResultSize", "16g") \
                             .config("spark.executor.instances", n_executors) \
                             .appName("MyApp") \
                             .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:0.9.4") \
                             .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") \
                             .getOrCreate()
print(sc)

# Start SQL Context
sqlContext = SQLContext(sc)

sc.getActiveSession()

Driver memory: 100g
Executor memory: 10g
Number of executors: 9

Initializing SparkContext
<pyspark.sql.session.SparkSession object at 0x15551e220350>




In [3]:
%%time
### Load datasets

### Open the preprocessed dataset
################################################################################### Ensure that the path here matches the path of the file from preprocessing
df = sqlContext.read.load("preprocessed_df_subset/preprocessed_df_subset.parquet") \
               .select("id", "ss_id", "stamp_date", "power_kW_values", "reconstructions", "recon_PC1", "recon_PC2")
# df_count = df.count()

# metadata
meta_filename = "metadata_preprocessed.csv"
df_meta = sc.read.csv(meta_filename, inferSchema=True, header=True)

### Since metadata table is so small, convert to Pandas
df_meta = df_meta.toPandas()

df.show(1)


+---+-----+----------+--------------------+--------------------+--------------------+--------------------+
| id|ss_id|stamp_date|     power_kW_values|     reconstructions|           recon_PC1|           recon_PC2|
+---+-----+----------+--------------------+--------------------+--------------------+--------------------+
|  0| 2405|2012-01-21|[0.0, 0.0, 0.0, 0...|[0.00410907621307...|-0.41782595826965074|-0.02458836598737569|
+---+-----+----------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row

CPU times: user 96 ms, sys: 8.13 ms, total: 104 ms
Wall time: 4.27 s


## Initial filtering of close outliers

Set initial close-outlier cutoffs at PC1 > 2 and PC2 > 2

In [4]:
%%time
### Add a condition to filter out these outliers
cond = (F.col("recon_PC1")>1) | (F.col("recon_PC2")>1)
df2 = df.where(~cond)

CPU times: user 1.16 ms, sys: 124 µs, total: 1.29 ms
Wall time: 12.9 ms


# Isolation Forest
Algorithm and dataset processing in this notebook only

In [5]:
### Get n splits on m columns randomly chosen

def get_min_max_values(data, inputCols):
    # Get the min and max value of columns in inputCols, output dictionary of dict[col] = (min, max)
    return {i:data.select(F.min(col), F.max(col)).collect()[0] for i,col in enumerate(inputCols)}

def get_line_params(data, inputCols, num_splits, offset_size, min_max_dict=False):
    # Get a list of line parameters to randomly split values

    def randomLine_parameters(data, min_max_dict, offset_size):
        # Pick two random points within the boundaries of inputCols, then determine the parameters for the resulting line
        # y-y1 = (y2-y1)/(x2-x1) * (x-x1)
        # --> (y1-y2)*x + (x2-x1)*y + (x1-x2)*y1 - (y2-y1)*x1 = 0
    
        # Get offsets --> This ensures the selected points are closer to the mean (and away from boundaries)
        # Having higher offsets is better when the data is more concentrated
        col1_offset = (min_max_dict[0][1] - min_max_dict[0][0]) * offset_size
        col2_offset = (min_max_dict[1][1] - min_max_dict[1][0]) * offset_size
        
        x1 = np.random.uniform(min_max_dict[0][0] + col1_offset, min_max_dict[0][1] - col1_offset)
        x2 = np.random.uniform(min_max_dict[0][0] + col1_offset, min_max_dict[0][1] - col1_offset)
        y1 = np.random.uniform(min_max_dict[1][0] + col2_offset, min_max_dict[1][1] - col2_offset)
        y2 = np.random.uniform(min_max_dict[1][0] + col2_offset, min_max_dict[1][1] - col2_offset)
    
        # Get line parameters in standardized form
        A = y2 - y1
        B = x1 - x2
        C = y1*(x2-x1) + x1*(y1-y2)
        line_params = (A, B, C)
        
        return line_params
    ############################################################################
    # Variables
    output = []
    if min_max_dict is False:
        min_max_dict = get_min_max_values(data, inputCols)
    
    ### Get a list of (A,B,C) tuples
    for i in np.arange(0, num_splits):
        output.append(randomLine_parameters(data, min_max_dict, offset_size))

    return output

def build_isolation_forest_simple(data, inputCols, outputCol, num_trees, num_splits, min_max_dict=False, offset_size = 0.001):
    ### Build a custom isolation forest (simplified without sampling or recursive structure)
    # Inputs:
    #    Data --> Input dataframe
    #    inputCols --> The columns to split on (recon_PC1 and recon_PC2 in this code)
    #    num_splits --> The number of splits to run on each tree
    #    random_offset --> An offset from the edges of the column split ranges to ensure the split stays within the range
    #
    # Determine a list of tuple-pairings of randomly selected columns and respective values to split on
    # For each split:
    #     Determine whether each point falls to the left (group 0) or right (group 1) of the split
    #     Count the number of points in group 0 and group 1
    #     Record the resulting fraction for each point and add the fraction to the running sum (tree_scoreSum)
    # After all splits are complete, take the average splitScore as scoreSum
    # After all trees are complete, take the average scoreSum and output as score
    #
    # Points that are outliers should have smaller scores than normal points

    # Record the columns and get row count
    print("Model Start")
    print()
    model_start = time.time()
    df_columns = data.schema.names

    # Initialize the score column and necessary columns to handle the sampling, id column
    output = data.withColumn(outputCol, F.lit(0))  \
                 .withColumn("scoreSum", F.lit(0))

    # Get the number of rows in the data
    rowCount = output.count()

    ### Start the trees
    for j in np.arange(0, num_trees):
        print(f"Tree {j+1}/{num_trees} start:")
        tree_start = time.time()
        # Find the columns and respective values to split on
        line_param_list = get_line_params(output, inputCols, num_splits, offset_size, min_max_dict)
        # Instantiate the tree_scoreSum column
        output = output.withColumn("tree_scoreSum", F.lit(0))
        
        ### Start the splits
        split_start = time.time()
        for i,(line_params) in enumerate(line_param_list):
            if (((i-1)%10 == 0) & (i!=0)):
                split_start = time.time()
            A,B,C = line_params
            # Mark each selected row as being on one side of the split
            output = output.withColumn("which_side", F.when(F.lit(A)*F.col("recon_PC1") + F.lit(B)*F.col("recon_PC2") + C >=0, 1).otherwise(0))
            # Get the fraction of rows on either side of the split
            group1 = output.select(F.sum("which_side")).collect()[0][0] / rowCount
            group0 = 1 - group1
    
            # For each selected row, assign group0 or group1 to splitScore
            output = output.withColumn("splitScore", F.when(F.col("which_side")==1, F.lit(group1)).otherwise(F.lit(group0))) \
    
            # Add the splitScore to the running total tree_scoreSum
            output = output.withColumn("tree_scoreSum", F.col("tree_scoreSum") + F.col("splitScore"))

            if ((i+1)%10) == 0:
                print(f"  Split {i+1}/{num_splits}: Time elapsed --> {time.time() - split_start}")
        ### Add the score to the overall running score total (scoreSum)
        output = output.withColumn("scoreSum", F.col("scoreSum") + (F.col("tree_scoreSum") / F.lit(num_splits)))
        
        print(f"Tree {j+1}:  Time elapsed --> {time.time() - tree_start}")
        print()

    # Add the average score
    output = output.withColumn(outputCol, F.round(F.col("scoreSum") / F.lit(num_trees), 5))

    ### Calculate the overall score and output it
    output = output.select(df_columns + [outputCol]) \
                   .orderBy(outputCol, ascending=True)
    
    print(f"Model complete: Time elapsed --> {time.time() - model_start}")
    return output


In [9]:
%%time
### Run the model on a number of splits
### Setting parameters
#################################################################################
inputCols = ["recon_PC1", "recon_PC2"]
# num_trees = 5
# num_splits = 100
outputCol = "score"
#################################################################################

# Initialize df_iforest
df_iforest = df2

# Get input columns and corresponding min/max values
min_max_dict = get_min_max_values(df_iforest, inputCols)

for item in min_max_dict.items():
    print(item)

(0, Row(min(recon_PC1)=-7.031215404680069, max(recon_PC1)=0.9817973693571098))
(1, Row(min(recon_PC2)=-0.7223526339584964, max(recon_PC2)=0.7194646672990974))
CPU times: user 3.61 ms, sys: 1.94 ms, total: 5.55 ms
Wall time: 10.9 s


In [21]:
%%time
# Run a series of tests at differing numbers of trees and splits

# 1t_10s #
# 3t_10s #
# 5t_10s #
# 7t_10s #

# 1t_30s
# 3t_30s #
# 5t_30s #
# 7t_30s #

# 1t_20s
# 3t_20s
# 5t_20s
# 7t_20s

run_num = 7
run_vec = ((1,10), (3,10), (5,10), (7,10), (7,30), (5,30), (3,30), (1,30), (10, 50))
num_trees = run_vec[run_num][0]
num_splits = run_vec[run_num][1]

print(run_num)

# Initialize df_iforest
select_columns = ["id", "recon_PC1", "recon_PC2"]
output_columns = []
df_iforest = df2.select(select_columns).cache()

for i in np.arange(0,5):
    run_start = time.time()
    print(f"\n\nRun {i+1} --> Start")
    outputCol = f"score_{i}_{num_trees}t_{num_splits}s"
    df_iforest = build_isolation_forest_simple(data=df_iforest, 
                                               inputCols=inputCols, 
                                               outputCol=outputCol,
                                               num_trees=num_trees,
                                               num_splits=num_splits,
                                               min_max_dict=min_max_dict)
    select_columns = select_columns + [outputCol]
    output_columns.append(outputCol)
    print("\nCaching the output...")
    df_iforest = df_iforest.cache()
    df_iforest.count()
    print(f"Done --> Total runtime elapsed: {time.time() - run_start}")

print(f"\nAll Runs complete --> Saving model to outliers_df_iforest/df_iforest_{num_trees}trees_{num_splits}splits")
df_iforest.select(["id"] + output_columns).repartition(1).write.mode("overwrite").parquet(f"outliers_df_iforest/df_iforest_{num_trees}trees_{num_splits}splits")

print("Complete\n")

7


Run 1 --> Start
Model Start

Tree 1/1 start:
  Split 10/30: Time elapsed --> 48.442001819610596
  Split 20/30: Time elapsed --> 24.06851315498352
  Split 30/30: Time elapsed --> 32.95797419548035
Tree 1:  Time elapsed --> 114.57231664657593

Model complete: Time elapsed --> 117.5252640247345

Caching the output...
Done --> Total runtime elapsed: 130.64413213729858


Run 2 --> Start
Model Start

Tree 1/1 start:
  Split 10/30: Time elapsed --> 3.6160566806793213
  Split 20/30: Time elapsed --> 3.516334295272827
  Split 30/30: Time elapsed --> 4.014955282211304
Tree 1:  Time elapsed --> 12.262625694274902

Model complete: Time elapsed --> 12.576621532440186

Caching the output...
Done --> Total runtime elapsed: 27.6690571308136


Run 3 --> Start
Model Start

Tree 1/1 start:
  Split 10/30: Time elapsed --> 4.7903361320495605
  Split 20/30: Time elapsed --> 4.091628313064575
  Split 30/30: Time elapsed --> 4.08524489402771
Tree 1:  Time elapsed --> 14.265088319778442

Model complete: Ti

In [22]:
df_iforest.unpersist().count()

28087969

In [27]:
df3.show(1)

+------------+-----+----------+--------------------+--------------------+-------------------+------------------+--------------+--------------+--------------+--------------+--------------+
|          id|ss_id|stamp_date|     power_kW_values|     reconstructions|          recon_PC1|         recon_PC2|score_0_1t_10s|score_1_1t_10s|score_2_1t_10s|score_3_1t_10s|score_4_1t_10s|
+------------+-----+----------+--------------------+--------------------+-------------------+------------------+--------------+--------------+--------------+--------------+--------------+
|652835274355|13015|2016-05-24|[0.0, 0.0, 0.0, 0...|[6.09746580686240...|-2.2372372455956455|0.7194646672990974|       0.49986|       0.62795|       0.61327|       0.30982|       0.39662|
+------------+-----+----------+--------------------+--------------------+-------------------+------------------+--------------+--------------+--------------+--------------+--------------+
only showing top 1 row



In [59]:
df2.show(1)

+---+-----+----------+--------------------+--------------------+--------------------+--------------------+
| id|ss_id|stamp_date|     power_kW_values|     reconstructions|           recon_PC1|           recon_PC2|
+---+-----+----------+--------------------+--------------------+--------------------+--------------------+
|  0| 2405|2012-01-21|[0.0, 0.0, 0.0, 0...|[0.00410907621307...|-0.41782595826965074|-0.02458836598737569|
+---+-----+----------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



In [61]:
### Calculate aggregate stats and save as a condensed file

filenames = ["outliers_df_iforest/df_iforest_1t10s.parquet", "outliers_df_iforest/df_iforest_1t30s.parquet",
             "outliers_df_iforest/df_iforest_3t10s.parquet", "outliers_df_iforest/df_iforest_3t30s.parquet",
             "outliers_df_iforest/df_iforest_5t10s.parquet", "outliers_df_iforest/df_iforest_5t30s.parquet",
             "outliers_df_iforest/df_iforest_7t10s.parquet", "outliers_df_iforest/df_iforest_7t30s.parquet"]

score_SScolumns = ((F.col("score_5t_20s")-F.col("mean_score"))**2 +
                   (F.col("score_5t_50s")-F.col("mean_score"))**2 +
                   (F.col("score_3t_20s")-F.col("mean_score"))**2 +
                   (F.col("score_3t_50s")-F.col("mean_score"))**2 +
                   (F.col("score_3t_100s")-F.col("mean_score"))**2)

df2_withScores = df2

for i,(file, save) in enumerate(zip(filenames, savelocs)):
    run_config = file.split(".")[0][-5:-3] + "_" + file.split(".")[0][-3:]
    score_cols = [f"\'score_{j}_{run_config}\'" for j in np.arange(0,5)]
    score_columns = [f"F.col(\'score_{j}_{run_config}\')" for j in np.arange(0,5)]
    mean_func = eval(" + ".join(score_columns)) / len(score_columns)
    std_string = f"F.col(\'{run_config}_mean_score\'))**2"
    std_func =  eval(f"((F.col({score_cols[0]})-{std_string} + (F.col({score_cols[1]})-{std_string} + (F.col({score_cols[2]})-{std_string} + (F.col({score_cols[3]})-{std_string} + (F.col({score_cols[4]})-{std_string})")

    df3 = sqlContext.read.load(file).withColumn(f"{run_config}_mean_score", mean_func).withColumn(f"{run_config}_std_score", std_func)

    df2_withScores = df2_withScores.join(df3.select("id", f"{run_config}_mean_score", f"{run_config}_std_score"), on="id", how="inner")

df2_withScores.repartition(1).write.mode("overwrite").parquet("outliers_df_iforest/df_iforest_all_scores")
