---
title: "Inference Analysis"
date: 2021-04-25
type: technical_note
draft: false
---

## Check monitoring analysis

Collect the inference statistics from Parquet files, and the outlier and data drift detections from Kafka.

In [1]:
from hops import hdfs
import pyarrow.parquet as pq
from hops import kafka
from hops import tls
from confluent_kafka import Producer, Consumer
import json

import pandas as pd
# Lift pandas' display limits so frames are rendered without column/row truncation.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)
del _display_option

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
14,application_1620032599856_0032,pyspark,idle,Link,Link


SparkSession available as 'spark'.


### Inference Statistics

Read the inference statistics from Parquet files.

In [2]:
# Base HDFS directory holding this project's monitoring output.
MONITORING_DIR = "".join(
    ["hdfs:///Projects/", hdfs.project_name(), "/Resources/CardFraudDetection/Monitoring/"]
)
# Sub-directory the statistics are read from.
LOGS_STATS_DIR = "".join([MONITORING_DIR, "credit_card_activity_stats-parquet/"])
# Load every matching Parquet file in that directory into one Spark DataFrame.
credit_card_activity_stats = spark.read.parquet("".join([LOGS_STATS_DIR, "*.parquet"]))

In [3]:
# Register the statistics DataFrame as a temporary view so it can be queried with spark.sql.
credit_card_activity_stats.createOrReplaceTempView("credit_card_activity_stats")

In [4]:
# Split the statistics view into one DataFrame per kind of statistic, each ordered by window.
# Descriptive statistics (min, max, mean, stddev) per window and feature.
desc_stats_df = spark.sql("SELECT window, feature, min, max, mean, stddev FROM credit_card_activity_stats ORDER BY window")
# Distribution per feature; the window column is used for ordering only and is not selected.
distr_stats_df = spark.sql("SELECT feature, distr FROM credit_card_activity_stats ORDER BY window")
# Correlations per window and feature.
corr_stats_df = spark.sql("SELECT window, feature, corr FROM credit_card_activity_stats ORDER BY window")
# Covariance per feature; the window column is used for ordering only and is not selected.
cov_stats_df = spark.sql("SELECT feature, cov FROM credit_card_activity_stats ORDER BY window")

#### Descriptive statistics

In [5]:
# show() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would emit a trailing "None".
desc_stats_df.show(6, truncate=False)

+------------------------------------------+-----------------+------+------------------+------+-------+
|window                                    |feature          |min   |max               |mean  |stddev |
+------------------------------------------+-----------------+------+------------------+------+-------+
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|num_trans_per_10m|1.2   |8491.63           |197.45|1480.08|
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|avg_amt_per_10m  |1.0   |3.0               |0.05  |0.34   |
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|num_trans_per_1h |1.44  |9441.65           |219.54|1863.76|
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|avg_amt_per_12h  |3.0   |15.0              |0.28  |2.85   |
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|num_trans_per_12h|34.489|1956.6783333333333|44.7  |557.39 |
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|avg_amt_per_1h   |1.0   |3.0               |0.05  |0.67   |
+------------------------------------------+-----------------+--

#### Distributions

In [6]:
# show() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would emit a trailing "None".
distr_stats_df.show(6, truncate=False)

+-----------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|feature          |distr                                                                                                                                                                                                                                                      |
+-----------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|num_trans_per_10m|[5894264.812000004 -> 0.0, 14735662.030000009 -> 0.0, 8841397.218000006 -> 0.0, 23577059.248000015 -> 0.0, 11788529.624000007 -> 0.0, 0.0 -> 43.0, 17682794.43600001 

#### Correlations

In [7]:
# show() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would emit a trailing "None".
corr_stats_df.show(6, truncate=False)

+------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------+
|window                                    |feature          |corr                                                                                                                                 |
+------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------+
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|num_trans_per_10m|[avg_amt_per_10m -> 0.01, avg_amt_per_12h -> 0.0, num_trans_per_1h -> -0.09, avg_amt_per_1h -> -0.18, num_trans_per_12h -> 0.07]     |
|[2021-05-04 06:41:20, 2021-05-04 06:41:26]|avg_amt_per_10m  |[avg_amt_per_12h -> -0.14, num_trans_per_1h -> 0.24, avg_amt_per_1h -> -0.13, num_trans_per_12h -> -0.26, num_trans_per_10m -> 0.01] |
|[2021-05-04 06

#### Covariance

In [8]:
# show() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would emit a trailing "None".
cov_stats_df.show(6, truncate=False)

+-----------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|feature          |cov                                                                                                                                                                                                              |
+-----------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|num_trans_per_10m|[avg_amt_per_10m -> 3.0957043261568E11, avg_amt_per_12h -> -4.520635835511398E13, num_trans_per_1h -> 2.4112523343981465E15, avg_amt_per_1h -> 1.75275420088811E12, num_trans_per_12h -> 1.3834256019623732E16]  |
|avg_amt_per_10m  |[avg_amt_per_12h -> 1.226308168475E10, num_trans_per_1h -> 4.

## Outliers and Data Drift Detection (Kafka)

In [9]:
def get_consumer(topic):
    """Create a Kafka consumer subscribed to a single topic.

    Starts from the default Kafka configuration provided by ``hops.kafka`` and
    sets ``auto.offset.reset`` to ``'earliest'`` in the default topic config
    before building the consumer.

    Args:
        topic: Name of the topic to subscribe to.

    Returns:
        The ``Consumer`` instance, already subscribed to ``topic``.
    """
    settings = kafka.get_kafka_default_config()
    settings.update({'default.topic.config': {'auto.offset.reset': 'earliest'}})

    client = Consumer(settings)
    client.subscribe([topic])
    return client

In [22]:
def poll(consumer, n=2):
    """Poll a Kafka consumer up to ``n`` times and collect the messages in a DataFrame.

    Each message value is decoded as UTF-8 JSON and becomes one row whose
    columns are the JSON keys, minus the ``window`` column, which is dropped.
    Polls that time out (5 seconds each) contribute nothing. Messages that
    carry a Kafka error, or whose payload cannot be parsed, are reported on
    stdout and skipped.

    Args:
        consumer: Object exposing ``poll(timeout=...)`` that returns ``None`` or
            a message with ``error()`` and ``value()`` methods (e.g. a
            ``confluent_kafka.Consumer``).
        n: Maximum number of poll attempts.

    Returns:
        pandas.DataFrame with one row per parsed message (each row keeps the
        index label 1); an empty DataFrame when nothing was parsed.
    """
    frames = []
    for _ in range(n):
        msg = consumer.poll(timeout=5.0)
        if msg is None:
            # Poll timed out without delivering anything.
            continue
        if msg.error():
            # Error events carry no JSON payload; report them as what they are.
            print("A message was read but it carried a Kafka error")
            print(msg.error())
            continue
        try:
            d = json.loads(msg.value().decode('utf-8'))
            # Two-row frame: row 0 holds the keys, row 1 holds the values.
            df_msg = pd.DataFrame(list(d.items())).transpose()
            df_msg.columns = df_msg.iloc[0]
            df_msg = df_msg.drop('window', axis=1)
            # Keep only the values row; the keys row is now the header.
            frames.append(df_msg.iloc[1:])
        except Exception as e:
            print("A message was read but there was an error parsing it")
            print(e)
    if not frames:
        # pd.concat raises on an empty list, so return the empty frame directly.
        return pd.DataFrame([])
    # Concatenate once instead of growing the frame message by message.
    return pd.concat(frames)

### Outliers detected

In [11]:
outliers_consumer = get_consumer("credit_card_activity_outliers")

In [12]:
outliers = poll(outliers_consumer, 20)

In [13]:
outliers.head(10)

0            feature    value  type           outlier  \
1  num_trans_per_12h  1680.89   max  descriptiveStats   
1     avg_amt_per_1h        1  mean  descriptiveStats   
1    avg_amt_per_12h        9  mean  descriptiveStats   
1    avg_amt_per_12h        9   min  descriptiveStats   
1  num_trans_per_12h  264.259   max  descriptiveStats   
1     avg_amt_per_1h        2  mean  descriptiveStats   
1    avg_amt_per_12h        5  mean  descriptiveStats   
1    avg_amt_per_12h        5   min  descriptiveStats   
1  num_trans_per_12h  1331.38  mean  descriptiveStats   
1  num_trans_per_12h  1331.38   max  descriptiveStats   

0             detectionTime  
1  2021-05-04T08:53:51.427Z  
1  2021-05-04T08:53:51.429Z  
1  2021-05-04T08:53:51.429Z  
1  2021-05-04T08:53:51.429Z  
1  2021-05-04T08:53:51.429Z  
1  2021-05-04T08:53:51.430Z  
1  2021-05-04T08:53:51.430Z  
1  2021-05-04T08:53:51.430Z  
1  2021-05-04T08:53:51.430Z  
1  2021-05-04T08:53:51.430Z

### Data drift detected

In [19]:
drift_consumer = get_consumer("credit_card_activity_drift")

In [25]:
drift = poll(drift_consumer, 20)

In [36]:
drift.head(5)

0           feature            drift     value             detectionTime
1    avg_amt_per_1h  kullbackLeibler   7.38321  2021-05-04T10:05:23.014Z
1    avg_amt_per_1h    jensenShannon  0.690542  2021-05-04T10:05:23.014Z
1  num_trans_per_1h      wasserstein  0.766589  2021-05-04T10:05:23.015Z
1  num_trans_per_1h  kullbackLeibler  0.677257  2021-05-04T10:05:23.015Z
1  num_trans_per_1h    jensenShannon  0.182436  2021-05-04T10:05:23.015Z