In [3]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
from functools import partial
import math


def round_datetime(timestamp, min_interval=0, sec_interval=0):
    """Convert an epoch timestamp to a local naive ``datetime``, optionally floored.

    The unit of ``timestamp`` is detected from its number of decimal digits:
    13 digits are treated as milliseconds, 16 as microseconds, 19 as
    nanoseconds, and anything else as seconds. Sub-second precision is
    discarded.

    Args:
        timestamp: Epoch timestamp (anything ``int()`` accepts). A missing or
            falsy value (``None``, ``0``, ``""``) is treated as ``0``, i.e. the
            Unix epoch.
        min_interval: When > 0, the minute field is floored to a multiple of
            this many minutes (60 floors to the hour).
        sec_interval: When > 0, the second field is floored to a multiple of
            this many seconds.

    Returns:
        ``datetime`` in the local timezone of the interpreter. When both
        intervals are 0 the converted value is returned unrounded. When both
        are given, the minute and second fields are floored independently.
    """
    # Missing input maps to the epoch instead of failing further down.
    timestamp = int(timestamp) if timestamp else 0

    # Scale to seconds resolution. Counting digits on the integer is exact,
    # whereas log10 is undefined for 0 and can round up just below a power of
    # ten. Integer divisors keep the floor division exact for 19-digit values.
    digits = len(str(timestamp)) if timestamp > 0 else 0
    if digits == 13:
        scale = 1000
    elif digits == 16:
        scale = 1000000
    elif digits == 19:
        scale = 1000000000
    else:
        scale = 1
    tm = datetime.fromtimestamp(timestamp // scale)

    if min_interval > 0 and sec_interval > 0:
        return tm - timedelta(minutes=tm.minute % min_interval,
                              seconds=tm.second % sec_interval)
    if min_interval > 0:
        # Minute-only flooring also drops the seconds entirely.
        return tm - timedelta(minutes=tm.minute % min_interval, seconds=tm.second)
    if sec_interval > 0:
        # Second-only flooring must leave the minute field untouched.
        return tm - timedelta(seconds=tm.second % sec_interval)
    return tm

def date_list(endDate, delta=14):
    """Return a brace-enclosed, comma-separated list of calendar dates.

    The list starts at ``endDate`` and walks backwards one day at a time for
    ``delta`` further days, e.g. ``'{2021-10-06,2021-10-05}'`` for ``delta=1``.

    Args:
        endDate: ``datetime`` whose date is the first entry.
        delta: Number of earlier days appended after ``endDate``.

    Returns:
        String of the form ``'{YYYY-MM-DD,YYYY-MM-DD,...}'``.
    """
    earlier_days = [endDate - timedelta(days=offset) for offset in range(1, delta + 1)]
    joined = ','.join(str(day.date()) for day in [endDate] + earlier_days)
    return '{' + joined + '}'

@F.udf(IntegerType())
def ts_resolution(timestamp):
    """Return the number of decimal digits in the integer part of ``timestamp``.

    Args:
        timestamp: Numeric epoch timestamp, or ``None`` for a null cell.

    Returns:
        Digit count as an ``int``, or ``None`` (a null in the result column)
        when the input is null or its integer part is not positive.
    """
    if timestamp is None:
        return None
    whole = int(timestamp)
    if whole <= 0:
        # A digit count is undefined here; log10 would raise on this input.
        return None
    # Counting characters is exact; log10 can round up just below a power of ten.
    return len(str(whole))


# Spark UDF wrappers around round_datetime; each one returns a timestamp column.
# The un-parameterised wrapper only converts the epoch value, the others also
# floor it to the interval named in the variable.
udf_rounddate = F.udf(f=round_datetime, returnType=TimestampType())
udf_round_1hour_floor = F.udf(
    f=partial(round_datetime, min_interval=60),
    returnType=TimestampType(),
)
udf_round_10min_floor = F.udf(
    f=partial(round_datetime, min_interval=10),
    returnType=TimestampType(),
)
udf_round_1min_floor = F.udf(
    f=partial(round_datetime, min_interval=1),
    returnType=TimestampType(),
)
udf_round_30sec_floor = F.udf(
    f=partial(round_datetime, min_interval=1, sec_interval=30),
    returnType=TimestampType(),
)

@F.udf(StringType())
def is_privateIP(ip):
    """Classify an IP address string as private or public.

    Args:
        ip: Value to parse as an IPv4 or IPv6 address.

    Returns:
        ``"private_IP"`` or ``"public_IP"`` for a parsable address, and
        ``"other"`` when ``ipaddress.ip_address`` rejects the value.
    """
    # Imported here because the cell's import block does not bring in
    # ``ipaddress``; without it the call below fails with NameError, which the
    # ValueError handler does not catch.
    import ipaddress

    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        return "other"
    return "private_IP" if parsed.is_private else "public_IP"

def magic_median(varname):
    """Build a column expression for the approximate median of ``varname``.

    Args:
        varname: Column name, inserted verbatim into the SQL expression.

    Returns:
        Result of ``F.expr`` on ``percentile_approx`` at the 0.5 percentile.
    """
    sql_text = f'percentile_approx({varname}, 0.5)'
    return F.expr(sql_text)

def magic_iqr(varname):
    """Build a column expression for the approximate interquartile range.

    Args:
        varname: Column name, inserted verbatim into the SQL expression.

    Returns:
        Result of ``F.expr`` on the difference between ``percentile_approx``
        at 0.75 and at 0.25.
    """
    sql_text = f'percentile_approx({varname}, 0.75)-percentile_approx({varname}, 0.25)'
    return F.expr(sql_text)

def getDateString(dates):
    """Condense a brace-enclosed date list into a short label.

    Args:
        dates: String such as ``'{2021-10-06}'`` or
            ``'{2021-10-06,2021-10-05}'``.

    Returns:
        The single entry when ``dates`` contains no comma, otherwise the first
        and last entries joined by ``"_"``.

    Raises:
        IndexError: If ``dates`` contains no ``'{'``.
    """
    after_brace = dates.split('{')[1]
    if ',' not in dates:
        return after_brace.split('}')[0]
    entries = after_brace.split(',')
    first_entry = entries[0]
    last_entry = entries[-1].split('}')[0]
    return first_entry + "_" + last_entry

def get_agg_funcs(data_cols):
    """Build the aggregation expressions applied to every column in ``data_cols``.

    Args:
        data_cols: Column names to aggregate.

    Returns:
        List of aliased column expressions, grouped by statistic: all
        ``avg_<col>`` first, then ``std_<col>``, ``median_<col>`` and
        ``iqr_<col>``.
    """
    builders = (
        ('avg', F.avg),
        ('std', F.stddev),
        ('median', magic_median),
        ('iqr', magic_iqr),
    )
    expressions = []
    for prefix, build in builders:
        # One full pass over data_cols per statistic keeps the output grouped.
        for name in data_cols:
            expressions.append(build(name).alias(f'{prefix}_{name}'))
    return expressions
############################################################

# --- Run configuration and raw data load -------------------------------------
# The window is anchored at the moment this cell runs, so the dates read (and
# the printed label) change from run to run.
endDate = datetime.today()
# Number of additional days before endDate to include (passed as `delta`).
interval = 1
# Brace-enclosed, comma-separated dates; interpolated into the dt= path segment.
dates=date_list(endDate, delta=interval)
# Short "<first>_<last>" label derived from `dates`; reused in the output paths
# of the write cell.
date_string = getDateString(dates)
print(date_string)
# Interpolated into the hr= path segment; presumably a glob matching every
# hour partition -- confirm.
hour = '{*}'
storage = 's3'
env = 'production'
# env = 'staging'  (alternative environment, currently disabled)
# When True, '-local' is appended to the first `path` segment of the read location.
local = False
path = 'oc-stats-analytics'
user = 'test'  # change to your personal username; used in the output paths
# 't128.router_name' is referred to as 'router_name' by the processing cell;
# presumably the dotted name resolves to a nested field -- confirm against the schema.
cols = ["site_id","mac","t128.router_name","interfaces","org_id","model","ssr_peer_path_stats"]

# `spark` is not defined in this cell; presumably the session is provided by
# the notebook kernel.
df = spark.read.parquet(f"{storage}://mist-secorapp-{env}/{path}{'-local' if local else ''}/{path}-{env}/dt={dates}/hr={hour}")

# Keep only the columns listed above and rows whose model is '128T-Router' or 'SSR'.
df2 = df.select(cols).filter((F.col('model')=='128T-Router')|(F.col('model')=='SSR'))

# Name of the time-bucket column used as the first grouping key by the
# aggregations in the processing cell.
time_agg_col = 'When_30sec'


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2021-10-06_2021-10-05

## Process peer-path data

In [4]:
# Grouping keys: the FPM aggregation uses the BFD keys plus traffic class and protocol.
bfd_group_cols = [time_agg_col, 'site_id', 'peer_path_named', 'mac', 'device_interface']
fpm_group_cols = bfd_group_cols + ['fpm_traffic_class', 'fpm_protocol']
# Metric columns fed to get_agg_funcs.
bfd_cols = ['bfd_' + metric for metric in ('latency', 'jitter', 'loss')]
fpm_cols = ['fpm_' + metric for metric in ('latency', 'jitter', 'loss')]

df_peer_path = (
    df2
    # One row per element of ssr_peer_path_stats, then flatten that struct.
    .select('site_id', 'mac', 'router_name', 'org_id',
            F.explode('ssr_peer_path_stats').alias('ssr_peer_path_stats'))
    .select('site_id', 'mac', 'router_name', 'org_id', 'ssr_peer_path_stats.*')
    # One row per element of samples (exploded into the default column 'col'),
    # keeping the per-path sample count alongside.
    .select('site_id', 'mac', 'router_name', 'org_id',
            'peer_name', 'adjacent_address', 'device_interface',
            'network_interface', 'vlan_id', 'uptime',
            'peer_site_id', 'peer_mac', 'peer_network',
            F.col('t128explicit_interface_type').alias('explicit_interface_type'),
            F.size('samples').alias('n_samples'),
            F.explode(F.col('samples')))
    # Flatten the sample struct into top-level columns.
    .select('site_id', 'mac', 'router_name', 'org_id',
            'peer_name', 'adjacent_address', 'device_interface',
            'network_interface', 'vlan_id', 'uptime',
            'peer_site_id', 'peer_mac', 'peer_network', 'explicit_interface_type',
            F.col('col.*'), 'n_samples')
    # Time columns derived from the sample timestamp: unrounded, 1-minute and 30-second floors.
    .withColumn('When', udf_rounddate(F.col('timestamp')))
    .withColumn('When_60sec', udf_round_1min_floor(F.col('timestamp')))
    .withColumn('When_30sec', udf_round_30sec_floor(F.col('timestamp')))
    # Prefix the sample-level metrics so they stay distinct from the FPM ones below.
    .withColumnRenamed('latency', 'bfd_latency')
    .withColumnRenamed('jitter', 'bfd_jitter')
    .withColumnRenamed('loss', 'bfd_loss')
    # Keep only samples flagged both active and up.
    .filter((F.col('is_active') == True) & (F.col('is_up') == True))
    # Path identifiers keyed by mac and by router name respectively.
    .withColumn('peer_path', F.concat_ws("__", 'mac', 'peer_name',
                                         'network_interface', 'adjacent_address'))
    .withColumn('peer_path_named', F.concat_ws("__", 'router_name', 'peer_name',
                                               'network_interface', 'adjacent_address'))
)

# BFD statistics are aggregated before fpmstats is exploded.
df_bfd_agg = df_peer_path.groupby(bfd_group_cols).agg(*get_agg_funcs(bfd_cols))

# Explode and flatten fpmstats while carrying every other column along.
other_cols = [name for name in df_peer_path.columns if name != 'fpmstats']
df_peer_path = (
    df_peer_path
    .select(*other_cols, F.explode('fpmstats').alias('fpmstats'))
    .select(*other_cols, 'fpmstats.*')
    .withColumnRenamed('latency', 'fpm_latency')
    .withColumnRenamed('jitter', 'fpm_jitter')
    .withColumnRenamed('loss', 'fpm_loss')
    .withColumnRenamed('traffic_class', 'fpm_traffic_class')
    .withColumnRenamed('protocol', 'fpm_protocol')
)

df_fpm_agg = df_peer_path.groupby(fpm_group_cols).agg(*get_agg_funcs(fpm_cols))


# df_site = spark.read.parquet("s3://mist-secorapp-staging/dimension/site") \
#                 .withColumnRenamed('id','site_id') \
#                 .withColumnRenamed('name','site_name') \
#                 .select('site_id','address','country_code','lat','lng')

# df_site_peer = df_site
# for col in df_site.columns:
#     df_site_peer = df_site_peer.withColumnRenamed(col, 'peer_'+col)
# #df_site.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
#df_bfd_agg.select('router_name','peer_path').distinct().show()
#df_fpm_agg.select('router_name','peer_path').distinct().show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Output locations, one directory per statistic family, labelled with the date range.
bfd_outpath = f'{storage}://mist-data-science-dev/{user}/ssr_peer_path/data/bfd_stats_dt={date_string}/'
fpm_outpath = f'{storage}://mist-data-science-dev/{user}/ssr_peer_path/data/fpm_stats_dt={date_string}/'

# Write both aggregates as CSV with a header row, replacing any existing output.
df_bfd_agg.write.mode("overwrite").csv(bfd_outpath, header=True)
df_fpm_agg.write.mode("overwrite").csv(fpm_outpath, header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…