In [None]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
from functools import partial
import math

def round_datetime(timestamp, min_interval=0, sec_interval=0):
    """Convert an epoch timestamp to a ``datetime`` floored to a time bucket.

    The unit of ``timestamp`` (seconds, milliseconds, microseconds or
    nanoseconds) is guessed from its order of magnitude and the value is
    scaled down to whole seconds before conversion.

    Args:
        timestamp: Epoch timestamp, anything accepted by ``int()``. Falsy
            values (``None``, 0, empty string) are treated as 0.
        min_interval: Minute bucket size; 0 disables minute flooring.
        sec_interval: Second bucket size; 0 disables second flooring.

    Returns:
        A naive ``datetime`` built with ``datetime.fromtimestamp`` (i.e. in
        the local timezone of the executing process), floored as requested.
    """
    timestamp = int(timestamp) if timestamp else 0

    ## scaling timestamp to seconds resolution
    # log10 is undefined for values <= 0 (the original code raised
    # ValueError for a missing/zero timestamp), so only inspect the
    # magnitude of positive timestamps.
    scale = 1
    if timestamp > 0:
        resolution = int(math.log10(timestamp))
        if resolution == 12:      # 13 digits -> milliseconds
            scale = 10 ** 3
        elif resolution == 15:    # 16 digits -> microseconds
            scale = 10 ** 6
        elif resolution == 18:    # 19 digits -> nanoseconds
            scale = 10 ** 9
    # Integer scales keep the floor division exact and integer-typed.
    timestamp = timestamp // scale

    tm = datetime.fromtimestamp(timestamp)
    if min_interval > 0 and sec_interval > 0:
        tm = tm - timedelta(minutes=tm.minute % min_interval, seconds=tm.second % sec_interval)
    elif min_interval > 0:
        # Minute-only bucket: drop the seconds entirely.
        tm = tm - timedelta(minutes=tm.minute % min_interval, seconds=tm.second)
    elif sec_interval > 0:
        # NOTE(review): this branch also removes *all* minutes, i.e. it floors
        # to the hour plus a seconds bucket. Kept as-is for compatibility;
        # confirm whether a seconds-only floor was intended.
        tm = tm - timedelta(minutes=tm.minute, seconds=tm.second % sec_interval)
    return tm

def date_list(endDate, delta=14):
    """Build a brace-wrapped, comma-separated list of calendar dates.

    The list starts at ``endDate`` and walks backwards one day at a time for
    ``delta`` additional days, e.g. ``'{2022-02-10,2022-02-09}'``.

    Args:
        endDate: ``datetime`` of the most recent day to include.
        delta: Number of earlier days to append after ``endDate``.

    Returns:
        A string of the form ``'{YYYY-MM-DD,YYYY-MM-DD,...}'``.
    """
    earlier_days = [endDate - timedelta(days=back) for back in range(1, delta + 1)]
    all_days = [endDate] + earlier_days
    return '{%s}' % ','.join(str(day.date()) for day in all_days)

@F.udf(IntegerType())
def ts_resolution(timestamp):
    """Return the number of decimal digits of a positive timestamp.

    Computed as ``int(log10(timestamp)) + 1``. Returns ``None`` (SQL NULL)
    when the timestamp is missing or not positive, because ``log10`` is
    undefined there and an exception inside a UDF fails the Spark job.
    """
    # Spark passes SQL NULL to Python UDFs as None.
    if timestamp is None or timestamp <= 0:
        return None
    return int(math.log10(timestamp)) + 1


# Spark UDF wrappers around round_datetime; each returns a TimestampType column.
# udf_rounddate applies no flooring, the others floor to the bucket in their name.
udf_rounddate = F.udf(round_datetime, returnType=TimestampType())
udf_round_1hour_floor = F.udf(
    partial(round_datetime, min_interval=60), returnType=TimestampType()
)
udf_round_10min_floor = F.udf(
    partial(round_datetime, min_interval=10), returnType=TimestampType()
)
udf_round_1min_floor = F.udf(
    partial(round_datetime, min_interval=1), returnType=TimestampType()
)
udf_round_3min_floor = F.udf(
    partial(round_datetime, min_interval=3), returnType=TimestampType()
)
udf_round_30sec_floor = F.udf(
    partial(round_datetime, min_interval=1, sec_interval=30), returnType=TimestampType()
)

# Lookup from aggregation-column name to the UDF that produces that column.
time_agg_map = dict([
    ("When_1min", udf_round_1min_floor),
    ("When_30sec", udf_round_30sec_floor),
    ("When_10min", udf_round_10min_floor),
    ("When_3min", udf_round_3min_floor),
    ("When_60min", udf_round_1hour_floor),
])

def magic_median(varname):
    """Return a Spark SQL expression for the approximate median of ``varname``.

    ``varname`` is interpolated verbatim into a ``percentile_approx`` call.
    """
    median_sql = f'percentile_approx({varname}, 0.5)'
    return F.expr(median_sql)

def magic_iqr(varname):
    """Return a Spark SQL expression for the approximate IQR of ``varname``.

    The expression is the approximate 75th percentile minus the approximate
    25th percentile; ``varname`` is interpolated verbatim.
    """
    upper_quartile = f'percentile_approx({varname}, 0.75)'
    lower_quartile = f'percentile_approx({varname}, 0.25)'
    return F.expr(f'{upper_quartile}-{lower_quartile}')

def getDateString(dates):
    """Condense a brace-wrapped date list into a short label.

    ``'{d}'`` becomes ``'d'``; ``'{first,...,last}'`` becomes
    ``'first_last'`` (in the order the dates appear in the input).
    """
    after_brace = dates.split('{')[1]
    if ',' not in dates:
        # Single entry: everything between the braces.
        return after_brace.split('}')[0]
    entries = after_brace.split(',')
    first_entry = entries[0]
    last_entry = entries[-1].split('}')[0]
    return first_entry + "_" + last_entry

def get_agg_funcs(data_cols):
    """Build the aggregate expressions applied to every column in ``data_cols``.

    For each statistic (avg, std, median, iqr, cnt) one aliased expression
    per column is produced, named ``'<stat>_<column>'``. The result is grouped
    by statistic: all ``avg_*`` first, then ``std_*``, ``median_*``, ``iqr_*``
    and finally ``cnt_*``.
    """
    # (alias prefix, expression builder) in output order.
    stat_builders = [
        ('avg', F.avg),
        ('std', F.stddev),
        ('median', magic_median),
        ('iqr', magic_iqr),
        ('cnt', F.count),
    ]

    return [
        build(col).alias(f'{prefix}_{col}')
        for prefix, build in stat_builders
        for col in data_cols
    ]

@F.udf(returnType=FloatType())
def calculate_loss(mark_count, ack_count):
    """Loss percentage: ``100 * (mark_count - ack_count) / mark_count``.

    Returns the sentinel ``-99.`` when the value cannot be computed, i.e.
    when ``mark_count`` is not positive or either input is missing.
    """
    # Spark passes SQL NULL to Python UDFs as None; comparing None with a
    # number raises TypeError in Python 3 and would fail the whole job.
    if mark_count is None or ack_count is None:
        return -99.
    if mark_count > 0.:
        return 100 * (mark_count - ack_count) / mark_count
    return -99.

@F.udf(returnType=FloatType())
def calculate_tagged_pct(conf_count, mark_count):
    """Tagged percentage: ``100 * mark_count / conf_count``.

    Returns the sentinel ``-99.`` when the value cannot be computed, i.e.
    when ``conf_count`` is not positive or either input is missing.
    """
    # Spark passes SQL NULL to Python UDFs as None; comparing None with a
    # number raises TypeError in Python 3 and would fail the whole job.
    if conf_count is None or mark_count is None:
        return -99.
    if conf_count > 0.:
        return 100 * mark_count / conf_count
    return -99.

############################################################

# ---- Run configuration ----
# Most recent day to read, and how many additional earlier days to include.
endDate = datetime.today()
interval = 0
dates = date_list(endDate, delta=interval)
date_string = getDateString(dates)
# Brace-wrapped hour value substituted into the hr= partition of the input path.
hour = '{19}'
print(date_string, '', hour)
storage = 's3'
# env = 'production'
env = 'staging'
source = 'oc-stats-analytics'
# `username` must be defined: `outpath` below interpolates it, and leaving it
# commented out raises NameError on a fresh kernel.
username = 'ruchitm'  # change to personal folder name
outpath = f'{storage}://mist-data-science-dev/{username}'

# Columns kept from the raw records.
cols = ["when","site_id","mac","firmware_version","t128.router_name",
        "t128.node_name","t128.plugin_version","interfaces","org_id","model","ssr_peer_path_stats"]

# ---- Load input ----
df = spark.read.parquet(f"{storage}://mist-secorapp-{env}/{source}/{source}-{env}/dt={dates}/hr={hour}")

# Keep only the selected columns for devices whose model starts with 'SSR'.
df = df.select(cols).filter(F.col('model').startswith('SSR'))

##### Define aggregation window #####
# Must be one of the keys of time_agg_map.
#time_agg_col = 'When_30sec'
time_agg_col = 'When_1min'


In [None]:
# df.printSchema()

## Process peer-path data

In [29]:
# Minimum plugin / firmware versions a device must report to pass the filter
# below.
# NOTE(review): the filter compares these with >= against the version
# columns; if those columns are strings the comparison is lexicographic
# (e.g. '3.10.0' sorts before '3.2.0') — confirm this is acceptable.
FPM_PLUGIN_VERSION = '3.2.0-207'
FPM_FW_VERSION = '5.5.0-0'

# Only devices whose MAC is listed here are processed.
SPOKE_MACS = ['020001e9707f']

# Grouping keys for the two aggregations; the FPM one additionally splits by
# traffic class and protocol.
bfd_group_cols = [time_agg_col,'site_id','peer_path_named','mac','device_interface', 'router_name']
fpm_group_cols = [time_agg_col,'site_id','peer_path_named','mac','device_interface','router_name',
                  'fpm_traffic_class','fpm_protocol']

# Metric columns that get_agg_funcs is applied to.
bfd_cols = [f'bfd_{metric}' for metric in ['latency','jitter','loss']]                  
fpm_cols = [f'fpm_{metric}' for metric in ['latency','jitter','tx_loss', 'rx_loss', 'tx_mark_delta', 'rx_mark_delta']]


# Keep records of the selected MACs that carry at least one peer-path entry
# and satisfy the minimum plugin / firmware versions.
df_peer_path = df.filter(F.col('mac').isin(SPOKE_MACS))\
                 .filter(F.size('ssr_peer_path_stats')>0)\
                 .filter((F.col('plugin_version')>=FPM_PLUGIN_VERSION)&\
                         (F.col('firmware_version')>=FPM_FW_VERSION))

# One row per element of ssr_peer_path_stats.
df_peer_path = df_peer_path.select('site_id','mac','router_name','node_name','org_id','firmware_version','plugin_version',
                          F.explode('ssr_peer_path_stats').alias('ssr_peer_path_stats'))

# Flatten the peer-path struct into top-level columns.
df_peer_path = df_peer_path.select('site_id','mac','router_name','node_name','org_id','firmware_version','plugin_version',
                                   'ssr_peer_path_stats.*')

# Keep the peer-path attributes of interest, record how many samples each
# peer path carried (n_samples) and explode `samples` into one row per
# sample (the exploded column gets the default name `col`).
df_peer_path = df_peer_path.select('site_id','mac','router_name','node_name','org_id',
                                   'firmware_version','plugin_version',
                                    'peer_name','adjacent_address', 'device_interface',
                                    'network_interface','vlan_id','uptime',
                                    'peer_site_id', 'peer_mac', 'peer_network',
                                    'fpmconfigured_tx_mark_count', 'fpmconfigured_rx_mark_count',
                                    F.col('t128explicit_interface_type').alias('explicit_interface_type'),
                                     F.size('samples').alias('n_samples'),
                                     F.explode(F.col('samples')))


# Flatten each sample struct, derive the un-floored timestamp (`When`) and the
# floored aggregation bucket (time_agg_col), and prefix the sample-level
# latency / jitter / loss columns with `bfd_`.
df_peer_path = df_peer_path.select('site_id','mac','router_name','node_name','org_id','firmware_version','plugin_version',
                                    'peer_name','adjacent_address', 'device_interface',
                                    'network_interface','vlan_id','uptime',
                                   'peer_site_id', 'peer_mac', 'peer_network','explicit_interface_type',
                                   'fpmconfigured_tx_mark_count', 'fpmconfigured_rx_mark_count',
                                    F.col('col.*'),'n_samples') \
                           .withColumn('When', udf_rounddate(F.col('timestamp'))) \
                           .withColumn(time_agg_col, time_agg_map[time_agg_col](F.col('timestamp'))) \
                           .withColumnRenamed('latency','bfd_latency') \
                           .withColumnRenamed('jitter','bfd_jitter')\
                           .withColumnRenamed('loss','bfd_loss')

# df_peer_path = df_peer_path.filter(F.col('is_active')==True)
# df_peer_path = df_peer_path.filter(F.col('is_up')==True)

# Peer-path identifiers: one keyed by MAC addresses, one by router/peer names.
df_peer_path = df_peer_path.withColumn('peer_path', F.concat_ws("__", 'mac', 'peer_mac',
                                                                'network_interface', 'adjacent_address'))
df_peer_path = df_peer_path.withColumn('peer_path_named', F.concat_ws("__", 'router_name', 'peer_name',
                                                                'network_interface', 'adjacent_address'))


# BFD statistics per time bucket and peer path.
df_bfd_agg = df_peer_path.groupby(bfd_group_cols).agg(*get_agg_funcs(bfd_cols))

# Every column except `fpmstats` (presumably a per-sample array that arrived
# via the `col.*` expansion above — verify against the schema).
other_cols = [col for col in df_peer_path.columns if col!='fpmstats']

# One row per fpmstats element, for samples that carry any.
df_peer_path_fpm = df_peer_path.filter(F.size('fpmstats')>0)\
                            .select(*other_cols, F.explode('fpmstats').alias('fpmstats'))

# Flatten the fpmstats struct and prefix its latency / jitter / traffic_class /
# protocol columns with `fpm_`.
df_peer_path_fpm = df_peer_path_fpm.select(*other_cols, 'fpmstats.*') \
                .withColumnRenamed('latency','fpm_latency') \
                .withColumnRenamed('jitter','fpm_jitter')\
                .withColumnRenamed('traffic_class','fpm_traffic_class')\
                .withColumnRenamed('protocol','fpm_protocol')

# Derived FPM metrics from the calculate_tagged_pct / calculate_loss UDFs.
# NOTE(review): both UDFs return the sentinel -99. when their denominator is
# not positive, and these columns are then averaged / counted below — confirm
# that the sentinel rows should take part in the aggregates.
df_peer_path_fpm = df_peer_path_fpm.withColumn('fpm_tx_mark_delta', calculate_tagged_pct('fpmconfigured_tx_mark_count', 'tx_mark_count'))\
                        .withColumn('fpm_rx_mark_delta', calculate_tagged_pct('fpmconfigured_rx_mark_count', 'rx_mark_count'))\
                        .withColumn('fpm_tx_loss', calculate_loss('tx_mark_count', 'tx_acknowledged_count'))\
                        .withColumn('fpm_rx_loss', calculate_loss('rx_mark_count', 'rx_acknowledged_count'))

# FPM statistics per time bucket, peer path, traffic class and protocol.
df_fpm_agg = df_peer_path_fpm.groupby(fpm_group_cols).agg(*get_agg_funcs(fpm_cols))

# Output locations; the folder name carries the bucket size taken from
# time_agg_col (e.g. '1min' for 'When_1min').
bfd_outpath = f"{outpath}/ssr_peer_path/data/{env}/bfd_stats_agg_{time_agg_col.split('_')[1]}/dt={date_string}/"
fpm_outpath = f"{outpath}/ssr_peer_path/data/{env}/fpm_stats_agg_{time_agg_col.split('_')[1]}/dt={date_string}/"

# Writes are currently disabled; nothing is persisted by this cell.
# df_bfd_agg.write.csv(bfd_outpath, mode="overwrite", header=True)
# df_fpm_agg.write.csv(fpm_outpath, mode="overwrite", header=True)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [30]:
# Preview the exploded per-fpmstats rows (triggers a Spark job).
df_peer_path_fpm.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+---+-----------+---------+------+----------------+--------------+---------+----------------+----------------+-----------------+-------+------+------------+--------+------------+-----------------------+---------------------------+---------------------------+---------+---------+------+---+-----------+--------+----------+---+-----+---------+----+---------+---------+---------------+-----------+----+----------+-----------------+------------+-------------+---------------------+-------------+---------------------+-----------------+-----------------+-----------+-----------+
|site_id|mac|router_name|node_name|org_id|firmware_version|plugin_version|peer_name|adjacent_address|device_interface|network_interface|vlan_id|uptime|peer_site_id|peer_mac|peer_network|explicit_interface_type|fpmconfigured_tx_mark_count|fpmconfigured_rx_mark_count|timestamp|is_active|status|mtu|bfd_latency|bfd_loss|bfd_jitter|mos|is_up|n_samples|When|When_1min|peer_path|peer_path_named|fpm_latency|loss|fpm_jitter|

In [10]:
# Preview the aggregated BFD and FPM statistics (each call triggers a Spark job).
df_bfd_agg.show()
df_fpm_agg.show()



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------------------+--------------------+------------+----------------+-----------+------------------+--------------+------------------+-------------------+-----------------+-----------------+------------------+-----------------+---------------+---------------+--------------+------------+---------------+--------------+------------+
|          When_1min|             site_id|     peer_path_named|         mac|device_interface|router_name|   avg_bfd_latency|avg_bfd_jitter|      avg_bfd_loss|    std_bfd_latency|   std_bfd_jitter|     std_bfd_loss|median_bfd_latency|median_bfd_jitter|median_bfd_loss|iqr_bfd_latency|iqr_bfd_jitter|iqr_bfd_loss|cnt_bfd_latency|cnt_bfd_jitter|cnt_bfd_loss|
+-------------------+--------------------+--------------------+------------+----------------+-----------+------------------+--------------+------------------+-------------------+-----------------+-----------------+------------------+-----------------+---------------+---------------+-----

In [9]:
# Show where the BFD aggregates would be written.
# NOTE(review): the recorded output below (3min bucket, multi-day range) does
# not match the current settings (When_1min, interval = 0) — it looks like it
# came from an earlier run; re-run on a fresh kernel to refresh it.
print(bfd_outpath)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

s3://mist-data-science-dev/ruchitm/ssr_peer_path/data/staging/bfd_stats_agg_3min/dt=2022-02-10_2022-02-03/

In [9]:
# Inspect samples that carry FPM stats, next to their BFD latency and link state.
(
    df_peer_path
    .filter(F.size('fpmstats') > 0)
    .select(
        'When', 'mac', 'peer_path_named', 'plugin_version', 'device_interface',
        'bfd_latency', 'fpmconfigured_tx_mark_count', 'fpmconfigured_rx_mark_count',
        'fpmstats', 'is_active', 'is_up',
    )
    .show(truncate=True)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+------------+--------------------+--------------+----------------+-----------+---------------------------+---------------------------+--------------------+---------+-----+
|               When|         mac|     peer_path_named|plugin_version|device_interface|bfd_latency|fpmconfigured_tx_mark_count|fpmconfigured_rx_mark_count|            fpmstats|is_active|is_up|
+-------------------+------------+--------------------+--------------+----------------+-----------+---------------------------+---------------------------+--------------------+---------+-----+
|2022-01-14 16:48:35|02000107c3cb|cupertino-branch-...|     3.2.0-207|             wan|       44.0|                        500|                        500|[{57.0, 69.0, 8.0...|     true| true|
|2022-01-14 16:48:50|02000107c3cb|cupertino-branch-...|     3.2.0-207|             wan|       50.0|                        500|                        500|[{53.0, 47.0, 5.0...|     true| true|
|2022-01-14 16:49:05|02000107c3cb|c