* global recommendation based on geo-location
* local update based on site, router-level telemetry
* Ranking of peer-paths for each policy
* Probability of next violation if first violation happens
* Feedback mechanism: Get some labels, application flow events
* Cost of user impact?

In [1]:
from functools import partial
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import os
import re
import glob
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
pd.plotting.register_matplotlib_converters()
pd.set_option('display.max_colwidth', 1)
from statsmodels.distributions.empirical_distribution import ECDF



import seaborn as sns
#sns.set(rc={'figure.figsize':(15,6)})
sns.set_style("darkgrid")
sns.set_context("notebook", font_scale=1.4, rc={"lines.linewidth": 2.5})


def getFileName(WDIR,pathName):
    """Return the most recently modified file matching a glob pattern.

    Parameters:
        WDIR: Directory prefix; it is concatenated with pathName as-is.
        pathName: Glob pattern (or file name) appended to WDIR.

    Returns:
        (path, mtime) of the newest match, where mtime is os.path.getmtime(path).

    Raises:
        IndexError: if nothing matches the pattern.
    """
    newest_first = sorted(glob.glob(WDIR + pathName),
                          key=os.path.getmtime,
                          reverse=True)
    latest = newest_first[0]
    return latest, os.path.getmtime(latest)

def timedelta_to_time(duration):
    """Format a timedelta as a zero-padded "HH:MM" string.

    Hours are not wrapped at 24, so a two-day duration yields "48:00".
    Seconds are dropped, not rounded.
    """
    hours, remainder = divmod(duration.total_seconds(), 3600)
    minutes = remainder // 60
    return f"{int(hours):02d}:{int(minutes):02d}"
    
def round_datetime(timestamp, interval=0):
    """Convert a timestamp to a datetime, optionally flooring it to a minute interval.

    Parameters:
        timestamp: Either a datetime (used as-is) or a numeric epoch value
            in seconds, which is converted with datetime.fromtimestamp
            (i.e. to naive local time).
        interval: Bucket size in minutes. When > 0 the result is floored to
            the start of its interval-minute bucket within the hour;
            when 0 (default) no rounding is applied.

    Returns:
        The (possibly floored) datetime.

    Raises:
        TypeError: if timestamp is neither a datetime nor a value accepted
            by datetime.fromtimestamp.
    """
    # Previously only `int` was handled, so any other input left `tm`
    # unbound and failed with UnboundLocalError.
    if isinstance(timestamp, datetime):
        tm = timestamp
    else:
        tm = datetime.fromtimestamp(timestamp)

    if interval > 0:
        # Clear microseconds too, otherwise datetime/float inputs would not
        # land exactly on the bucket boundary.
        tm = tm - timedelta(minutes=tm.minute % interval,
                            seconds=tm.second,
                            microseconds=tm.microsecond)
    return tm

In [2]:
# Per-application SLA limits. A metric that is absent from a profile is
# unconstrained and shows up as NaN in `policy` after the DataFrame round-trip.
gaming = {'latency': 100}
voip_audio_video = {'latency': 150, 'jitter': 30, 'loss': 1}
remote_desktop = {'latency': 225}
voip_signaling = {'latency': 250}
management_interactive = {'latency': 200}
management_m2m = {'latency': 1000}
video_streaming = {'latency': 4000, 'loss': 5}
data_critical = {'latency': 250}
data_interactive = {'latency': 250}

_profiles = {
    'gaming': gaming,
    'voip_audio_video': voip_audio_video,
    'remote_desktop': remote_desktop,
    'voip_signaling': voip_signaling,
    'management_interactive': management_interactive,
    'management_m2m': management_m2m,
    'video_streaming': video_streaming,
    'data_critical': data_critical,
    'data_interactive': data_interactive,
}

# {profile_name: {'latency': ..., 'jitter': ..., 'loss': ...}}
policy = pd.DataFrame(list(_profiles.values()),
                      index=list(_profiles)).to_dict('index')

## Analytic functions

$$med \leftarrow Median\left(differences\_from\_limit\right)$$
$$iqr \leftarrow IQR\left(differences\_from\_limit\right)$$


$$Median\_percent = \left(\frac{med}{limit}\right)*100$$

$$relative\_iqr = \left(\frac{iqr}{med+limit}\right)$$

$$percent\_violation = \left(\frac{\#violations}{\#timepoints}\right)*100$$

$$duration\_violation = \#contiguous\_timepoints\_with\_violation$$


In [3]:
def count_contiguous(A, bit=1):
    """Return the lengths of consecutive runs of `bit` in A.

    Parameters:
        A: Iterable of values to scan.
        bit: Value whose runs are measured (default 1).

    Returns:
        List of run lengths in order of appearance, or None when `bit`
        never occurs.
    """
    runs = []
    current = 0
    for value in A:
        if value != bit:
            # A run (if any) just ended: record it and start over.
            if current:
                runs.append(current)
            current = 0
        else:
            current += 1

    # Flush a run that reaches the end of the sequence.
    if current:
        runs.append(current)

    return runs or None

# Sample bit sequence for manually checking count_contiguous; the prints are disabled.
A = [0,0,1,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,1,0,1,1]
#print(f"Length of contiguous ones: {count_contiguous(A,1)}")
#print(f"Length of contiguous zeros: {count_contiguous(A,0)}")


def diff_over(A, limit):
    """Return (value - limit) for every value in A that is >= limit.

    Parameters:
        A: Sequence of numeric values.
        limit: Threshold; values equal to the limit are included.

    Returns:
        List of non-negative differences, or None when no value qualifies.
    """
    values = np.array(A)
    at_or_above = values >= limit
    if not sum(at_or_above) > 0:
        return None
    return list(values[at_or_above] - limit)

# Sample values and limit for manually checking diff_over; the print is disabled.
A = [101,99,50,34,103,220,120,45,107]
limit=100
#print(f"Differences over {limit} = {diff_over(A, limit)}")


def diff_under(A, limit):
    """Return (value - limit) for every value in A that is strictly below limit.

    Parameters:
        A: Sequence of numeric values.
        limit: Threshold; values equal to the limit are excluded.

    Returns:
        List of negative differences, or None when no value qualifies.
    """
    values = np.array(A)
    below = values < limit
    if not sum(below) > 0:
        return None
    return list(values[below] - limit)

# Sample values and limit for manually checking diff_under; the print is disabled.
A = [101,99,50,34,103,220,120,45,107]
limit=100
#print(f"Differences under {limit} = {diff_under(A, limit)}")

def median_percent(x, limit):
    """Return the median of x expressed as a percentage of limit."""
    median = x.quantile(0.5)
    return median * 100 / limit
    
# 90th Percentile
def q90(x):
    """Return the 90th percentile of x."""
    return x.quantile(q=0.9)

# 10th Percentile
def q10(x):
    """Return the 10th percentile of x."""
    return x.quantile(q=0.1)

def iqr(x):
    """Return the interquartile range (75th minus 25th percentile) of x."""
    upper = x.quantile(0.75)
    lower = x.quantile(0.25)
    return upper - lower

def relative_iqr(x, limit):
    """Return the IQR of x scaled by (median of x + limit).

    x holds differences from the limit, so median + limit is the median of
    the underlying metric. Returns NaN when that denominator is exactly 0.
    """
    denominator = x.quantile(0.5) + limit
    if denominator == 0:
        return np.nan
    return iqr(x) / denominator

def features_by_metric(df, metric, limit):
    """ 
    Generate features (summary stats) for given metric and policy definition. 
    Grouping is done by ('site_id','mac','peer_name','interface','time').
    
    Three families of features are produced per group:
      * duration_violation_<metric>: median and IQR of the lengths of
        contiguous runs of violating samples (value > limit).
      * over_limit_<metric> / within_limit_<metric>: median (as a percent of
        the limit), relative IQR and count of the differences from the limit
        for samples >= limit and < limit respectively.
      * percent_violation_<metric>: share of samples counted as over limit.
    
    Parameters:
        df: Pandas DataFrame with ssr_peer_path_stats data. Must contain the
            grouping columns and `metric`. NOTE: a 'violation' column is
            added to (or overwritten in) this frame in place.
        metric: Path metric, options=['latency', 'jitter', 'loss']
        limit: SLA policy limit for metric
        
    Returns:
        df_summary: Dataframe with features, one row per group, with the
            two-level column index flattened to "<feature> (<stat>)".
    
    """

    ### Duration of violations
    agg_funcs = ['median', iqr]
    group_cols = ['site_id','mac','peer_name','interface','time']
    # Side effect on the caller's frame: flag each sample that exceeds the limit.
    # NOTE(review): this uses a strict '>' while diff_over below uses '>=',
    # so samples exactly at the limit are treated differently — confirm intent.
    df['violation'] = df[metric].apply(lambda x: 1 if x>limit else 0)

    duration_summary = pd.DataFrame()
    # Only runs of violations (bit=1) are summarised; the 'good' naming branch
    # is kept for bit=0 but is not exercised by this loop.
    for bit in [1]:
        col_name=f"duration_{'good' if bit==0 else 'violation'}_{metric}"
        # One list of run lengths per group, then one row per run.
        temp = df.groupby(group_cols)['violation'] \
                    .apply(count_contiguous, bit=bit).reset_index() \
                    .explode('violation').rename(columns={'violation':col_name}) \

        temp[col_name] = temp[col_name].apply(pd.to_numeric, errors="ignore")
        temp_agg = temp.groupby(group_cols).agg({col_name:agg_funcs})

        if duration_summary.empty: 
            duration_summary = temp_agg
        else: 
            duration_summary = duration_summary.join(temp_agg)

    duration_summary = duration_summary.apply(np.round, decimals=2)        
    
    ### Magnitude and frequency of violations
    # Bind the limit so the helpers can be used as single-argument aggregators.
    _median_percent = partial(median_percent, limit=limit)
    _relative_iqr = partial(relative_iqr, limit=limit)

    agg_funcs = [_median_percent, _relative_iqr,'count']
    group_cols = ['site_id','mac','peer_name','interface','time']

    magnitude_summary = pd.DataFrame()
    col_name=f'over_limit_{metric}'
    # Differences from the limit for samples at or above it, one row per sample.
    temp = df.groupby(group_cols)[metric] \
                .apply(diff_over, limit=limit).reset_index() \
                .explode(metric).rename(columns={metric:col_name}) \

    temp[col_name] = temp[col_name].apply(pd.to_numeric, errors="ignore")
    temp_agg = temp.groupby(group_cols).agg({col_name:agg_funcs})
    # NOTE(review): the notebook output shows these columns named
    # median_percent / relative_iqr / count, so this 'median' mask appears to
    # select nothing and the two lines look like no-ops — confirm.
    temp_agg.loc[:,temp_agg.columns.get_level_values(1)=='median'] /= limit
    temp_agg.loc[:,temp_agg.columns.get_level_values(1)=='median'] *= 100

    magnitude_summary = temp_agg
    
    col_name=f'within_limit_{metric}'
    # Same for samples below the limit. Unlike the over-limit branch, groups
    # with no such sample are dropped here, so they come back as NaN after the join.
    temp = df.groupby(group_cols)[metric] \
                .apply(diff_under, limit=limit).reset_index() \
                .explode(metric).rename(columns={metric:col_name}) \
                .dropna(subset=[col_name])

    temp[col_name] = temp[col_name].apply(pd.to_numeric, errors="ignore")
    temp_agg = temp.groupby(group_cols).agg({col_name:agg_funcs})
    # NOTE(review): same apparent no-op as in the over-limit branch — confirm.
    temp_agg.loc[:,temp_agg.columns.get_level_values(1)=='median'] /= limit
    temp_agg.loc[:,temp_agg.columns.get_level_values(1)=='median'] *= 100

    magnitude_summary  = magnitude_summary.join(temp_agg)
    magnitude_summary = magnitude_summary.apply(np.round, decimals=2)        

    # Percent of samples over the limit; N is NaN where the within-limit count is missing.
    N = magnitude_summary[(f'over_limit_{metric}','count')] + magnitude_summary[(f'within_limit_{metric}','count')]
    magnitude_summary[f'percent_violation_{metric}'] = np.round(magnitude_summary[(f'over_limit_{metric}','count')] *100 / N, 1)

    # Groups with over-limit samples but no within-limit row: every sample violated.
    idx_violate = (magnitude_summary[(f'over_limit_{metric}','count')]>0) \
                    & (magnitude_summary[(f'within_limit_{metric}','count')].isna())

    magnitude_summary.loc[idx_violate, (f'percent_violation_{metric}','')] = 100
    
    df_summary = magnitude_summary.join(duration_summary, how='outer')
    df_summary.reset_index(inplace=True)
    # Flatten the (feature, stat) column tuples into single strings.
    df_summary.columns = [f"{col[0]} ({col[1]})" if col[1]!='' else col[0] for col in df_summary.columns.values]

    return df_summary

def get_peer_paths(df_features, site, mac_idx=0):
    """List the distinct (site_id, peer_name, interface) paths of one router.

    Parameters:
        df_features: Feature frame with at least site_id, mac, peer_name
            and interface columns.
        site: site_id to select.
        mac_idx: Position of the router within the site's unique MACs
            (in order of first appearance).

    Returns:
        DataFrame with columns site_id, peer_name, interface, one row per path.
    """
    path_cols = ['site_id', 'peer_name', 'interface']
    in_site = df_features.site_id == site
    site_macs = df_features[in_site].mac.unique()
    selected = df_features[in_site & (df_features.mac == site_macs[mac_idx])]
    per_path = selected.groupby(path_cols).count()
    return per_path.reset_index()[path_cols]

def get_features_site_mac(df_features, site, mac_idx=0):
    """Select the identifier and feature columns for one router of one site.

    Parameters:
        df_features: Feature frame whose first five columns are the
            identifier columns, followed by feature columns.
        site: site_id to select.
        mac_idx: Position of the router within the site's unique MACs
            (in order of first appearance).

    Returns:
        DataFrame restricted to the chosen site/MAC, holding the first five
        columns plus every column whose name matches
        'limit\\w+', 'percent_violation' or 'duration_violation' and does
        not contain 'count'.

    Raises:
        IndexError: if the site has no MAC at position mac_idx.

    Side effects:
        Prints the selected site and MAC.
    """
    cols = list(df_features.columns[:5])
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    feature_cols = df_features.filter(
        regex=r'limit\w+|percent_violation|duration_violation').columns
    cols.extend(col for col in feature_cols if 'count' not in col)

    idx = df_features.site_id == site
    macs = df_features[idx].mac.unique()
    idx &= df_features.mac == macs[mac_idx]
    # Report the `site` argument itself; the earlier version read the notebook
    # globals `sites[site_idx]`, which fails (or mislabels) outside that cell.
    print(f"site={site}\nmac={macs[mac_idx]}")
    return df_features.loc[idx, cols]

def plot_features_trend(df_features):
    """Show one plotly line chart per feature column, plotted against 'time'.

    Every column after the first five is treated as a feature. Each chart is
    coloured by 'peer_name' and faceted into columns by 'interface'.

    Parameters:
        df_features: Frame with 'time', 'peer_name' and 'interface' columns
            among its first five columns, followed by the feature columns.

    Returns:
        None; figures are displayed with fig.show().
    """
    for feature in df_features.columns[5:]:
        fig = px.line(df_features,
                      x='time',
                      y=feature,
                      color='peer_name',
                      facet_col='interface',
                      facet_row_spacing=0.04)
        fig.update_traces(mode='lines+markers')

        # Cap the number of x ticks at 20, or fewer when there are fewer time buckets.
        n_time_points = df_features['time'].drop_duplicates().shape[0]
        fig.update_xaxes(tickformat="%Y-%m-%d|%H:%M",
                         tickangle=30,
                         nticks=min(20, n_time_points))
        fig.update_layout(title=feature)
        fig.update_yaxes(title='value')

        fig.show()
        
#     temp = df_features.melt(id_vars=id_cols, value_vars=feature_cols)
#     idx = temp.site_id==site
#     macs = temp[idx].mac.unique()
#     idx &= temp.mac==macs[mac_idx]
#     fig = px.line(temp.loc[idx,:], x='day', y='value',
#                 color='peer_name', 
#                 facet_col='interface', 
#                 facet_row='variable')
#     fig.update_layout(height=300*len(feature_cols), width=1000)
#     fig.update_traces(mode='lines+markers')
#     fig.update_xaxes(tickformat="%Y-%m-%d")
#     fig.update_yaxes(matches=None)
#     fig.show()


def generate_features(df, policy, agg_window='3600min'):
    """Build the feature table for every metric that a policy constrains.

    Parameters:
        df: Sample frame with a datetime 'date_sample' column plus whatever
            features_by_metric needs. A 'time' column (date_sample floored
            to agg_window) is added to it in place.
        policy: Mapping of metric name -> limit; metrics whose limit is NaN
            are skipped.
        agg_window: Pandas frequency string for the time buckets. The default
            '3600min' is 60 hours. NOTE(review): possibly meant to be one
            hour — confirm.

    Returns:
        DataFrame of features; the first metric's table is left-joined with
        each following one on (site_id, mac, peer_name, interface, time).
        Empty when no metric has a limit.
    """
    df['time'] = df['date_sample'].dt.floor(agg_window)
    join_cols = ['site_id', 'mac', 'peer_name', 'interface', 'time']

    df_all = pd.DataFrame()
    for metric, limit in policy.items():
        if np.isnan(limit):
            continue
        df_features = features_by_metric(df, metric, limit)
        df_all = (df_features if df_all.empty
                  else df_all.merge(df_features, on=join_cols, how='left'))

    return df_all
    

In [9]:
# Location of the exported ssr_peer_path CSV files.
WDIR = r'/Users/ruchitm/Documents/Work/ssr-peer-path/aws-data/ssr_peer_path/'
folder = 'dt=2021-03-29_2021-03-15'

# Read every CSV in the folder lazily and stack them into a single frame.
all_files = glob.glob(os.path.join(WDIR, folder, "*.csv"))
df_from_each_file = (pd.read_csv(f) for f in all_files)
df = pd.concat(df_from_each_file, ignore_index=True)

# Parse both timestamp columns.
df['date'] = pd.to_datetime(df['date'])
df['date_sample'] = pd.to_datetime(df['date_sample'])

# Drop samples that carry none of the three path metrics.
df.dropna(subset=['latency', 'loss', 'jitter'], how='all', inplace=True)

# A path's interface is identified by network and device interface together.
df['interface'] = df['network_interface'] + "__" + df['device_interface']
df.head()

Unnamed: 0,mac,date,org_id,site_id,peer_name,adjacent_address,device_interface,network_interface,vlan_id,uptime,timestamp,is_active,status,mtu,latency,loss,jitter,mos,date_sample,interface
5,020001108a80,2021-03-15 20:59:54+00:00,0c160b7f-1027-4cd1-923b-744534c4b070,725a8d34-a126-4f2c-b990-d1219421cb75,AZDCBBP1,12.51.52.30,StoreLTE,Lte,0,123835634,1615842000000000.0,True,UP,1500.0,67.0,0.0,0.0,437.0,2021-03-15 20:59:30+00:00,Lte__StoreLTE
11,020001108a80,2021-03-15 20:59:54+00:00,0c160b7f-1027-4cd1-923b-744534c4b070,725a8d34-a126-4f2c-b990-d1219421cb75,AZDCBBP1,12.51.52.30,StoreWAN,Broadband,0,123834924,1615842000000000.0,True,UP,1500.0,63.0,0.0,0.0,437.0,2021-03-15 20:59:30+00:00,Broadband__StoreWAN
17,020001108a80,2021-03-15 20:59:54+00:00,0c160b7f-1027-4cd1-923b-744534c4b070,725a8d34-a126-4f2c-b990-d1219421cb75,AZDCLTEP1,12.51.52.22,StoreLTE,Lte,0,372500833,1615842000000000.0,True,UP,1500.0,102.0,0.0,0.0,435.0,2021-03-15 20:59:30+00:00,Lte__StoreLTE
23,020001108a80,2021-03-15 20:59:54+00:00,0c160b7f-1027-4cd1-923b-744534c4b070,725a8d34-a126-4f2c-b990-d1219421cb75,AZDCLTEP1,12.51.52.22,StoreWAN,Broadband,0,372500934,1615842000000000.0,True,UP,1500.0,102.0,0.0,0.0,435.0,2021-03-15 20:59:30+00:00,Broadband__StoreWAN
29,020001108a80,2021-03-15 20:59:54+00:00,0c160b7f-1027-4cd1-923b-744534c4b070,725a8d34-a126-4f2c-b990-d1219421cb75,RIDCBBP1,12.230.70.254,StoreLTE,Lte,0,486401823,1615842000000000.0,True,UP,1500.0,98.0,0.0,0.0,435.0,2021-03-15 20:59:30+00:00,Lte__StoreLTE


In [10]:
# Columns that together identify one sample of one peer path.
index_cols = ['date','date_sample','site_id','mac','peer_name','adjacent_address','interface', 'vlan_id']
df2 = df.drop_duplicates(subset=index_cols)
df3 = df2.sort_values('date_sample')
#df3['day'] = df3.date_sample.dt.date
#df3['time_of_day'] = df3.date_sample.dt.time
# Keep only samples with a positive latency. The explicit copy makes df3 an
# independent frame: generate_features later assigns new columns to it, which
# on a filtered slice triggers pandas' SettingWithCopyWarning.
df3 = df3[df3['latency']>0.].copy()

In [47]:
# Build features in one-hour buckets against the 'gaming' policy limits.
df_features = generate_features(df3, agg_window='1H', policy=policy['gaming'])
df_features.head()

Unnamed: 0,site_id,mac,peer_name,interface,time,over_limit_latency (median_percent),over_limit_latency (relative_iqr),over_limit_latency (count),within_limit_latency (median_percent),within_limit_latency (relative_iqr),within_limit_latency (count),percent_violation_latency,duration_violation_latency (median),duration_violation_latency (iqr)
0,395c7060-2ac0-4783-81cc-90df3a8f2dd4,0200012356c3,Prajwal-router,mpls1__mpls,2021-03-29 14:00:00+00:00,,,0,-99.0,0.0,9.0,0.0,,
1,395c7060-2ac0-4783-81cc-90df3a8f2dd4,020001237172,Prajwal-router,mpls1__mpls,2021-03-29 14:00:00+00:00,,,0,-99.0,0.0,3.0,0.0,,
2,395c7060-2ac0-4783-81cc-90df3a8f2dd4,020001237172,Prajwal-router,mpls1__mpls,2021-03-29 15:00:00+00:00,,,0,-99.0,0.0,42.0,0.0,,
3,395c7060-2ac0-4783-81cc-90df3a8f2dd4,020001237172,Prajwal-router,mpls1__mpls,2021-03-29 16:00:00+00:00,,,0,-99.0,0.0,37.0,0.0,,
4,395c7060-2ac0-4783-81cc-90df3a8f2dd4,020001237172,Prajwal-router,mpls1__mpls,2021-03-29 17:00:00+00:00,,,0,-99.0,0.0,44.0,0.0,,


## Visualizations

In [63]:
# Number of feature rows (time buckets) available for each peer path.
df_summary = df_features.groupby(['site_id','mac','peer_name','interface'])['time'].count().reset_index()
df_summary.head(50)

Unnamed: 0,site_id,mac,peer_name,interface,time
0,395c7060-2ac0-4783-81cc-90df3a8f2dd4,0200012356c3,Prajwal-router,mpls1__mpls,1
1,395c7060-2ac0-4783-81cc-90df3a8f2dd4,020001237172,Prajwal-router,mpls1__mpls,8
2,395c7060-2ac0-4783-81cc-90df3a8f2dd4,02000196a2b5,128t-router,mpls1__mpls,1
3,725a8d34-a126-4f2c-b990-d1219421cb75,020001108a80,AZDCBBP1,Broadband__StoreWAN,34
4,725a8d34-a126-4f2c-b990-d1219421cb75,020001108a80,AZDCBBP1,Lte__StoreLTE,34
5,725a8d34-a126-4f2c-b990-d1219421cb75,020001108a80,AZDCLTEP1,Broadband__StoreWAN,34
6,725a8d34-a126-4f2c-b990-d1219421cb75,020001108a80,AZDCLTEP1,Lte__StoreLTE,34
7,725a8d34-a126-4f2c-b990-d1219421cb75,020001108a80,RIDCBBP1,Broadband__StoreWAN,34
8,725a8d34-a126-4f2c-b990-d1219421cb75,020001108a80,RIDCBBP1,Lte__StoreLTE,34
9,725a8d34-a126-4f2c-b990-d1219421cb75,020001108a80,RIDCLTEP1,Broadband__StoreWAN,34


In [48]:
# Pick a site by its position among the unique site_ids, and a router by its
# position among that site's MACs, then list that router's peer paths.
site_idx = 6
mac_idx = 0
sites = df_features.site_id.unique()
get_peer_paths(df_features, sites[site_idx], mac_idx)

Unnamed: 0,site_id,peer_name,interface
0,ae0802ba-327f-4031-9da5-35fd43ef4c2f,atlanta-site-01,WAN1-vlan0__WAN1
1,ae0802ba-327f-4031-9da5-35fd43ef4c2f,aws-site-01,WAN1-vlan0__WAN1
2,ae0802ba-327f-4031-9da5-35fd43ef4c2f,berlin-site-02,WAN1-vlan0__WAN1
3,ae0802ba-327f-4031-9da5-35fd43ef4c2f,berlin-site-07,WAN1-vlan0__WAN1
4,ae0802ba-327f-4031-9da5-35fd43ef4c2f,boston-site-01,WAN1-vlan0__WAN1
5,ae0802ba-327f-4031-9da5-35fd43ef4c2f,boulder-site-01,WAN1-vlan0__WAN1
6,ae0802ba-327f-4031-9da5-35fd43ef4c2f,nuremberg-dc-01,WAN1-vlan0__WAN1
7,ae0802ba-327f-4031-9da5-35fd43ef4c2f,seattle-site-02,WAN1-vlan0__WAN1
8,ae0802ba-327f-4031-9da5-35fd43ef4c2f,seattle-site-02-lte,WAN1-vlan0__WAN1
9,ae0802ba-327f-4031-9da5-35fd43ef4c2f,seattle-site-02-ztp,WAN1-vlan0__WAN1


In [62]:
# Same site/router selection as the previous cell.
site_idx = 6
mac_idx = 0
sites = df_features.site_id.unique()
temp = get_features_site_mac(df_features, site=sites[site_idx], mac_idx=mac_idx)
# For three chosen peers, show every 100th row starting at row 50, transposed
# so that each sampled row becomes a column.
temp[temp.peer_name.isin(['atlanta-site-01','virginia-site-01','seattle-site-02'])][50::100].T

site=ae0802ba-327f-4031-9da5-35fd43ef4c2f
mac=0200016a96fa


Unnamed: 0,5237,5337,6395,6495,7211,7311,7411
site_id,ae0802ba-327f-4031-9da5-35fd43ef4c2f,ae0802ba-327f-4031-9da5-35fd43ef4c2f,ae0802ba-327f-4031-9da5-35fd43ef4c2f,ae0802ba-327f-4031-9da5-35fd43ef4c2f,ae0802ba-327f-4031-9da5-35fd43ef4c2f,ae0802ba-327f-4031-9da5-35fd43ef4c2f,ae0802ba-327f-4031-9da5-35fd43ef4c2f
mac,0200016a96fa,0200016a96fa,0200016a96fa,0200016a96fa,0200016a96fa,0200016a96fa,0200016a96fa
peer_name,atlanta-site-01,atlanta-site-01,seattle-site-02,seattle-site-02,virginia-site-01,virginia-site-01,virginia-site-01
interface,WAN1-vlan0__WAN1,WAN1-vlan0__WAN1,WAN1-vlan0__WAN1,WAN1-vlan0__WAN1,WAN1-vlan0__WAN1,WAN1-vlan0__WAN1,WAN1-vlan0__WAN1
time,2021-03-22 02:00:00+00:00,2021-03-26 23:00:00+00:00,2021-03-21 05:00:00+00:00,2021-03-26 02:00:00+00:00,2021-03-20 08:00:00+00:00,2021-03-25 05:00:00+00:00,2021-03-29 09:00:00+00:00
over_limit_latency (median_percent),2,1,15.5,33,2,1.5,2
over_limit_latency (relative_iqr),0.03,0.02,0.56,0.81,0.03,0.03,0.02
within_limit_latency (median_percent),-3,-3,-46,-39.5,-4,-4,-4
within_limit_latency (relative_iqr),0.02,0.01,0.14,0.33,0.02,0.04,0.04
percent_violation_latency,45,46.2,30,40,35,35,30


In [50]:
# Plot the trend of each feature column for the router selected above.
plot_features_trend(temp)