In [None]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import *

import requests
from datetime import datetime, timedelta
from functools import partial
import math
import ipaddress
import re
import json
import pandas as pd


def round_datetime(timestamp, min_interval=0, sec_interval=0):
    """Convert an epoch timestamp to a ``datetime``, optionally flooring it.

    The resolution of ``timestamp`` is inferred from its number of digits:
    13 digits are treated as milliseconds, 16 as microseconds and 19 as
    nanoseconds; anything else is taken to be seconds already.

    Args:
        timestamp: Epoch timestamp (anything ``int()`` accepts). A falsy value
            (``None``, ``0``, ``""``) is treated as ``0``.
        min_interval: When > 0, the minute is floored to a multiple of this.
        sec_interval: When > 0, the second is floored to a multiple of this.

    Returns:
        A naive ``datetime`` built with ``datetime.fromtimestamp`` (local time
        of the process running the function).
    """
    timestamp = int(timestamp) if timestamp else 0

    # Scale the timestamp down to seconds resolution. The magnitude is only
    # inspected for positive values: the previous math.log10(timestamp) call
    # raised "math domain error" for the 0 that null inputs are mapped to.
    # Integer scales keep the floor division exact for large values.
    scale = 1
    if timestamp > 0:
        digits = len(str(timestamp))
        if digits == 13:
            scale = 10 ** 3
        elif digits == 16:
            scale = 10 ** 6
        elif digits == 19:
            scale = 10 ** 9
    timestamp = timestamp // scale

    tm = datetime.fromtimestamp(timestamp)
    if min_interval > 0 and sec_interval > 0:
        tm = tm - timedelta(minutes=tm.minute % min_interval, seconds=tm.second % sec_interval)
    elif min_interval > 0:
        # Minute-only rounding drops the seconds entirely.
        tm = tm - timedelta(minutes=tm.minute % min_interval, seconds=tm.second)
    elif sec_interval > 0:
        # NOTE(review): this also subtracts the whole minute component, so the
        # result lands on hh:00:ss. Kept as-is; confirm whether that is intended.
        tm = tm - timedelta(minutes=tm.minute, seconds=tm.second % sec_interval)
    return tm

def date_list(endDate, delta=14):
    """Build a brace-delimited, comma-separated list of calendar dates.

    The list starts at ``endDate`` and walks backwards one day at a time for
    ``delta`` further days, e.g. ``'{2022-04-26,2022-04-25}'`` for ``delta=1``.
    """
    days_back = range(1, delta + 1)
    window = [endDate] + [endDate - timedelta(days=offset) for offset in days_back]
    return '{' + ','.join(str(day.date()) for day in window) + '}'


# Spark UDF wrapper around round_datetime; the result column is a timestamp.
udf_rounddate = F.udf(round_datetime, returnType=TimestampType())

def getDateString(dates):
    """Turn a ``'{d1,d2,...}'`` date list into a compact label.

    A single date yields that date; several dates yield ``first_last`` in the
    order they appear inside the braces.
    """
    after_brace = dates.split('{')[1]
    if ',' in dates:
        parts = after_brace.split(',')
        return parts[0] + "_" + parts[-1].split('}')[0]
    return after_brace.split('}')[0]

def get_ssr_app_ids(env, timeout=30):
    """Fetch the SSR app id constants from the internal PAPI endpoint.

    Args:
        env: Environment name substituted into the PAPI host name.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` waits indefinitely, which would stall the notebook
            if the endpoint is unreachable.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
    """
    PAPI_URL = 'http://papi-internal-{}.mist.pvt/internal/const/ssr_app_ids?format=json'
    papi_url = PAPI_URL.format(env)
    res = requests.get(papi_url, timeout=timeout)

    if res.status_code != 200:
        raise Exception(
            'Fail to get app sle application thresholds from PAPI:{}: {}'.format(res.status_code,
                                                                                 res.text))
    return res.json()
############################################################

# endDate = datetime.today()
# Most recent day of the analysis window; date_list() walks backwards from it.
endDate = datetime.strptime('2022-04-26', "%Y-%m-%d")
interval = 1
dates=date_list(endDate, delta=interval)
date_string = getDateString(dates)
hour = '{*}'
# Raw string: '\{' and '\}' are invalid escape sequences in a normal literal
# and trigger a DeprecationWarning/SyntaxWarning; the regex itself is unchanged.
hour_string = re.split(r'\{|\}', hour)[1]
if hour_string=="*":
    hour_string = "all"
print(date_string, '', hour)
storage = 's3'
env = 'production'
local = False
sle_path = 'cv-sle-app-health'
appsum_path = 'ssr-application-summary-analytics'

# Org and site dimension tables for the selected environment.
df_org = spark.read.parquet(f"{storage}://mist-secorapp-{env}/dimension/org/*.parquet")
df_site = spark.read.parquet(f"{storage}://mist-secorapp-{env}/dimension/site/*.parquet")

# App thresholds from PAPI; the `_sle_threshold` column is expanded into
# additional columns (presumably one per key of each entry -- verify against
# the PAPI payload).
df_app_thresholds = pd.DataFrame(get_ssr_app_ids(env))
df_app_thresholds = pd.concat([df_app_thresholds, df_app_thresholds._sle_threshold.transform(pd.Series)], axis=1)
# Lower-cased so that later comparisons against F.lower('device_app_name') match.
CURATED_APPS = df_app_thresholds.device_app_name.unique()
CURATED_APPS = [x.lower() for x in CURATED_APPS]
print(CURATED_APPS)

In [None]:
def get_S3_data(path, dates, hour, env='staging', storage='s3', filtering=True, local=False):
    """Load one of the two supported datasets and give both a common column naming.

    Args:
        path: Dataset name; ``'cv-sle-app-health'`` (read as ORC) or
            ``'ssr-application-summary-analytics'`` (read as parquet).
        dates: Value substituted into the ``dt=`` partition of the path.
        hour: Value substituted into the ``hr=`` partition of the path.
        env: Environment name used in the bucket and folder names.
        storage: URI scheme of the storage location.
        filtering: When true, keep only the info/metric columns listed below.
        local: When true, ``-local`` is appended to the top-level folder name.

    Returns:
        A Spark DataFrame with an extra ``hour`` column: ``time`` rounded to
        the nearest full hour (``F.round``, not a floor).

    Raises:
        ValueError: If ``path`` is not one of the supported dataset names.
    """
    supported_paths = ('cv-sle-app-health', 'ssr-application-summary-analytics')
    if path not in supported_paths:
        # Without this check neither branch below assigns `df`, and the
        # function used to fail later with an UnboundLocalError.
        raise ValueError(f"Unsupported path {path!r}; expected one of {supported_paths}")

    S3Filepath = f"{storage}://mist-secorapp-{env}/{path}{'-local' if local else ''}/{path}-{env}/dt={dates}/hr={hour}"
    print(f"{S3Filepath=}")

    if path == 'cv-sle-app-health':

        info_cols = ["time","site_id","switch_mac","switch_version",
                     "device_app_name", "app_name", "client", "client_wcid",
                     "category", "classifier_reason", "classifier",
                     "interface_name","custom","internal_flags","org_id",
                     "traffic_class"]

        wan_metrics_cols = ["latency", "jitter", "loss"]
        app_metrics_cols = ["tx_tcp_retransmission_packets", "tx_tcp_reset",
                            "tx_packets", "tx_bytes",
                            "rx_tcp_retransmission_packets", "rx_tcp_reset",
                            "rx_packets", "rx_bytes",
                            "time_to_first_packet","session_length"]
        df = spark.read.orc(S3Filepath)
        # Keep only rows whose switch_model starts with 'SSR'. This must run
        # before the select below, which does not retain switch_model.
        df = df.filter((F.col('switch_model').startswith('SSR')))
        if filtering:
            cols = info_cols + wan_metrics_cols + app_metrics_cols
            df = df.select(cols)

    else:  # 'ssr-application-summary-analytics'
        info_cols = ["when", "site_id", "mac", "src_address",
                     "device_app_name", "app_key", "app_summary_type",
                     "category", "egress_network_interface",
                     "custom_app", "org_id", "traffic_class", 
                     "protocol", "ingress_network_interface"]

        wan_metrics_cols = []
        app_metrics_cols = ["tcp_retransmission_packets_from_client", 
                            "tcp_reset_from_client",
                            "packets_from_client", "bytes_from_client",
                            "tcp_retransmission_packets_from_server",
                            "tcp_reset_from_server",
                            "packets_from_server", "bytes_from_server",
                            "new_sessions", "active_sessions",
                            "time_to_first_data_packet_avg", "session_length_avg"]

        # Maps app-summary column names onto the names used by the SLE dataset.
        # NOTE(review): "wcid" is not in info_cols, so with filtering=True it is
        # dropped before the rename and never becomes "client_wcid" -- confirm.
        rename_dict = dict(zip(["when", "mac","src_address", "wcid",
                                "egress_network_interface", "ingress_network_interface",
                                "tcp_retransmission_packets_from_client", 
                                "tcp_reset_from_client",
                                "packets_from_client", "bytes_from_client",
                                "tcp_retransmission_packets_from_server",
                                "tcp_reset_from_server",
                                "packets_from_server", "bytes_from_server",
                               "time_to_first_data_packet_avg", "session_length_avg"], 
                              ["time", "switch_mac","client", "client_wcid",
                               "interface_name", "ingress_interface_name",
                               "tx_tcp_retransmission_packets", "tx_tcp_reset",
                                "tx_packets", "tx_bytes",
                                "rx_tcp_retransmission_packets", "rx_tcp_reset",
                                "rx_packets", "rx_bytes",
                                "time_to_first_packet","session_length"]))
        df = spark.read.parquet(S3Filepath)
        if filtering:
            cols = info_cols + wan_metrics_cols + app_metrics_cols
            df = df.select(cols)
        for old_name, new_name in rename_dict.items():
            df = df.withColumnRenamed(old_name, new_name)
        # Convert the renamed epoch column into a Spark timestamp.
        df = df.withColumn('time', udf_rounddate('time'))

    df = df.withColumn("hour", (F.round(F.unix_timestamp("time")/3600)*3600).cast("timestamp"))
    return df
    

In [None]:
filter_curated_apps = False
MIN_PACKETS = 10 # from app sle topology
group_cols = ['switch_mac', 'device_app_name', 'hour', 'client']


################################################################################################
# 1 when the larger of rx/tx packets is below MIN_PACKETS, else 0. The
# expression refers to columns by name only, so it is shared by both frames
# instead of being written out twice.
below_min_packets = F.when(F.greatest('rx_packets', 'tx_packets') < MIN_PACKETS, 1).otherwise(0)

# `storage` and `local` from the configuration cell are passed through
# explicitly; previously the loader silently used its own defaults, so
# changing them in the configuration had no effect on these two loads.
df_sle = get_S3_data(path=sle_path, dates=dates, hour=hour, env=env,
                     storage=storage, local=local)
df_sle = df_sle.withColumn('min_packets', below_min_packets)
df_appsum = get_S3_data(path=appsum_path, dates=dates, hour=hour, env=env,
                        storage=storage, local=local)
df_appsum = df_appsum.withColumn('min_packets', below_min_packets)


if filter_curated_apps:
    # Case-insensitive match: CURATED_APPS holds lower-cased names.
    is_curated_app = F.lower('device_app_name').isin(CURATED_APPS)
    df_sle = df_sle.filter(is_curated_app)
    df_appsum = df_appsum.filter(is_curated_app)

# Per-group record counts (non-null traffic_class) and number of low-packet records.
df_sle_count = df_sle.groupby(group_cols).agg(F.count('traffic_class').alias('count_sle'),
                                              F.sum('min_packets').alias('sum_sle_min_packets'))

df_appsum_count = df_appsum.groupby(group_cols).agg(F.count('traffic_class').alias('count_appsum'),
                                                    F.sum('min_packets').alias('sum_appsum_min_packets'))




### Analyze SLE missing and mismatches from App summary

In [3]:
from datetime import datetime, timedelta
import re
import json
import pandas as pd

# Dataset names and the location of the pre-aggregated per-group counts.
sle_path = 'cv-sle-app-health'
appsum_path = 'ssr-application-summary-analytics'
inpath = f'{storage}://mist-data-science-dev/ruchitm'

date_string = '2022-04-26_2022-04-25'
env = 'production'
group_cols = ['switch_mac', 'device_app_name', 'hour', 'client']

# Folder name encodes the grouping key, e.g. agg_switch_mac-device_app_name-...
agg_dir = "agg_" + '-'.join(group_cols)

# SLE aggregates, collected to the driver as a pandas frame.
filepath = f"{inpath}/app-health-{env}/{sle_path}/{agg_dir}/dt={date_string}/"
df_sle_pd = spark.read.parquet(filepath).toPandas()

# App-summary aggregates, same layout.
filepath = f"{inpath}/app-health-{env}/{appsum_path}/{agg_dir}/dt={date_string}/"
df_appsum_pd = spark.read.parquet(filepath).toPandas()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [60]:
# Outer join keeps groups that appear in only one of the two datasets.
df_merge = df_sle_pd.merge(df_appsum_pd, on=group_cols, how='outer')
# Drop the first and last hour of the window (strict inequalities on both ends).
first_hour, last_hour = df_merge.hour.min(), df_merge.hour.max()
idx_time = (df_merge.hour > first_hour) & (df_merge.hour < last_hour)
df_merge = df_merge[idx_time].copy()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [82]:
# Per-group differences: app summary minus SLE (NaN where either side is missing).
df_merge['diff_appsum_sle'] = df_merge['count_appsum'].sub(df_merge['count_sle'])
df_merge['diff_min_packets'] = df_merge['sum_appsum_min_packets'].sub(df_merge['sum_sle_min_packets'])

n_groups = len(df_merge)
n_sle_nan = df_merge['count_sle'].isna().sum()
n_appsum_nan = df_merge['count_appsum'].isna().sum()
print(f"Total count: {n_groups}")
print(f"SLE NaN count: {n_sle_nan}")
print(f"APPSUM NaN count: {n_appsum_nan}")

# Denominator for every percentage reported below.
TOTAL_APPSUM_RECORDS = df_merge['count_appsum'].sum()
print(f"APPSUM record count: {TOTAL_APPSUM_RECORDS}")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Total count: 1262268
SLE NaN count: 65201
APPSUM NaN count: 0
APPSUM record count: 32071581.0

In [75]:
# Groups present in app summary but with no SLE row at all.
idx_sle_missing = df_merge.count_sle.isna() & df_merge.count_appsum.notna()
missing_share = df_merge.loc[idx_sle_missing, 'count_appsum'].sum()*100/TOTAL_APPSUM_RECORDS
print(f"SLE missing: {missing_share: .3f}%")

# True where every app-summary record of the group is below the packet minimum.
all_below_min = df_merge.count_appsum == df_merge.sum_appsum_min_packets

idx_missing_min_packets = all_below_min & idx_sle_missing
min_packets_share = df_merge.loc[idx_missing_min_packets, 'count_appsum'].sum()*100/TOTAL_APPSUM_RECORDS
print(f"SLE missing due to min packets: {min_packets_share: .3f}%")

idx_missing_other = ~all_below_min & idx_sle_missing
unexplained_share = df_merge.loc[idx_missing_other, 'count_appsum'].sum()*100/TOTAL_APPSUM_RECORDS
print(f"SLE missing unexplained: {unexplained_share: .3f}%")
print("\n","SLE missing unexplained")
df_merge.loc[idx_missing_other, ['count_appsum', 'sum_appsum_min_packets']].describe()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SLE missing:  3.182%
SLE missing due to min packets:  3.182%
SLE missing unexplained:  0.000%

 SLE missing unexplained
       count_appsum  sum_appsum_min_packets
count           0.0                     0.0
mean            NaN                     NaN
std             NaN                     NaN
min             NaN                     NaN
25%             NaN                     NaN
50%             NaN                     NaN
75%             NaN                     NaN
max             NaN                     NaN

In [76]:
# Groups where app summary holds more records than SLE (NaN diffs compare False).
idx_sle_mismatch = df_merge.diff_appsum_sle.gt(0.)
mismatch_share = df_merge.loc[idx_sle_mismatch, 'count_appsum'].sum()*100/TOTAL_APPSUM_RECORDS
print(f"SLE mismatch: {mismatch_share: .1f}%", "\n")
df_merge.loc[idx_sle_mismatch, 'diff_appsum_sle'].describe()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SLE mismatch:  4.8% 

count    64654.000000
mean        11.831565
std         17.547245
min          1.000000
25%          2.000000
50%          4.000000
75%         13.000000
max        148.000000
Name: diff_appsum_sle, dtype: float64

In [79]:
# Mismatches where the number of low-packet records covers the whole difference.
covers_diff = df_merge.sum_appsum_min_packets.ge(df_merge.diff_appsum_sle)
idx_mismatch_min_packets = covers_diff & idx_sle_mismatch
min_packets_mismatch_share = df_merge.loc[idx_mismatch_min_packets, 'count_appsum'].sum()*100/TOTAL_APPSUM_RECORDS
print(f"sle_mismatch(min packets): {min_packets_mismatch_share:.1f}%", "\n")
df_merge.loc[idx_mismatch_min_packets, 'diff_appsum_sle'].describe()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

sle_mismatch(min packets): 4.8% 

count    64646.000000
mean        11.832735
std         17.548010
min          1.000000
25%          2.000000
50%          4.000000
75%         13.000000
max        148.000000
Name: diff_appsum_sle, dtype: float64

In [81]:
# Mismatches where low-packet records are fewer than the difference.
short_of_diff = df_merge.sum_appsum_min_packets.lt(df_merge.diff_appsum_sle)
idx_mismatch_other = short_of_diff & idx_sle_mismatch
other_mismatch_share = df_merge.loc[idx_mismatch_other, 'count_appsum'].sum()*100/TOTAL_APPSUM_RECORDS
print(f"sle_mismatch(other): {other_mismatch_share:.3f}%", "\n")
df_merge.loc[idx_mismatch_other, 'diff_appsum_sle'].describe()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

sle_mismatch(other): 0.002% 

count    8.00000
mean     2.37500
std      1.30247
min      1.00000
25%      1.75000
50%      2.00000
75%      3.00000
max      5.00000
Name: diff_appsum_sle, dtype: float64

In [66]:
# Store each boolean mask on the frame as a flag column (by position, not by index).
flag_columns = {
    'sle_mismatch': idx_sle_mismatch,
    'sle_missing(min packets)': idx_missing_min_packets,
    'sle_missing(other)': idx_missing_other,
    'sle_mismatch(min packets)': idx_mismatch_min_packets,
    'sle_mismatch(other)': idx_mismatch_other,
}
for flag_name, flag_mask in flag_columns.items():
    df_merge[flag_name] = flag_mask.to_numpy()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [68]:
# Record totals for the mismatches that the packet minimum does not cover.
mismatch_other_rows = df_merge[idx_mismatch_other]
print(f"sle_mismatch(other): #appsum records = {mismatch_other_rows['count_appsum'].sum()}")
print(f"sle_mismatch(other): #sle records = {mismatch_other_rows['count_sle'].sum()}")
print(f"sle_mismatch(other): #diff records = {mismatch_other_rows['diff_appsum_sle'].sum()}")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

sle_mismatch(other): #appsum records = 619.0
sle_mismatch(other): #sle records = 600.0
sle_mismatch(other): #diff records = 19.0

In [67]:
# from io import StringIO # python3; python2: BytesIO 
# import boto3

# bucket = 'mist-data-science-dev' # already created on S3
# csv_buffer = StringIO()
# df_temp = df_merge.to_csv(csv_buffer)
# s3_resource = boto3.resource('s3')
# temp_filepath = f"ruchitm/app-health-{env}/diff/{'-'.join(group_cols)}/dt={date_string}/sle_mismatch.csv"
# s3_resource.Object(bucket, temp_filepath).put(Body=csv_buffer.getvalue())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [69]:
# Groups where SLE holds more records than app summary.
df_merge.loc[df_merge.diff_appsum_sle.lt(0.), 'diff_appsum_sle'].describe()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: diff_appsum_sle, dtype: float64

In [70]:
# Hours in which SLE holds more records than app summary.
df_merge.loc[df_merge.diff_appsum_sle.lt(0.), 'hour'].describe()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

count       0
unique      0
top       NaN
freq      NaN
Name: hour, dtype: object