In [1]:
from collections import Counter
from functools import partial
from datetime import datetime, timedelta
import timeit
import boto3
import json
import copy
import pickle
import io
import pandas as pd
import numpy as np
import networkx as nx
import pyspark.sql.functions as F
from pyspark.sql import Row, Window
from pyspark.sql.types import *


"""
Utility functions 
"""
def date_list(endDate, delta=14):
    """Build a brace-delimited, comma-separated list of calendar dates.

    The list starts at ``endDate`` and walks backwards one day at a time for
    ``delta`` additional days, e.g. ``{2021-04-12,2021-04-11}`` for
    ``delta=1``.  The result is used as a glob alternative in partition paths.

    Args:
        endDate: ``datetime`` marking the most recent day to include.
        delta: number of earlier days to include after ``endDate``.

    Returns:
        A string of the form ``{YYYY-MM-DD,YYYY-MM-DD,...}``.
    """
    earlier_days = [endDate - timedelta(days=offset) for offset in range(1, delta + 1)]
    joined = ','.join(str(day.date()) for day in [endDate] + earlier_days)
    return f'{{{joined}}}'


def readParquet(prefix, dates="{*}", hour="{*}", fields=None):
    """Read date/hour-partitioned parquet data through the session's ``spark``.

    The path read is ``<prefix>/dt=<dates>/hr=<hour>/``.

    Args:
        prefix: base location of the dataset; a trailing ``/`` is tolerated.
        dates: value (or glob alternative such as the output of ``date_list``)
            for the ``dt=`` partition directory.  Defaults to every date.
        hour: value (or glob) for the ``hr=`` partition directory.  Defaults
            to every hour.
        fields: optional column name or iterable of column names to project.
            ``None`` (the default) keeps every column.

    Returns:
        The DataFrame produced by ``spark.read.parquet`` (projected to
        ``fields`` when given).
    """
    # Strip trailing slashes so a prefix ending in "/" does not yield "//dt=".
    path = prefix.rstrip("/") + "/dt=" + dates + "/hr=" + hour + "/"
    df = spark.read.parquet(path)
    if fields is None:
        return df
    if isinstance(fields, str):
        # A bare string would otherwise be unpacked character by character.
        fields = [fields]
    return df.select(*fields)

def df_to_s3(df, path, filename, bucket='mist-data-science-dev'):
    """Serialize ``df`` and upload it to S3 as ``<path>/<filename>``.

    The serialization format is chosen from the extension of ``filename``:

    * ``.pickle`` - ``pickle.dumps(df)`` (binary)
    * ``.json``   - ``df.to_json(orient='index')``
    * ``.csv``    - ``df.to_csv()``

    Args:
        df: object to upload; must provide ``to_json`` / ``to_csv`` for the
            json / csv formats (e.g. a pandas DataFrame).
        path: key prefix inside the bucket.
        filename: object name; its extension selects the format.
        bucket: destination bucket name.

    Raises:
        ValueError: if the extension is not one of pickle, json or csv.
    """
    file_type = filename.split(".")[-1]

    # Serialize before touching S3 so a bad extension fails fast instead of
    # uploading an empty object.
    if file_type == "pickle":
        # pickle emits bytes, so it cannot be written to a text buffer.
        body = pickle.dumps(df)
    elif file_type == "json":
        body = df.to_json(orient='index').encode('utf-8')
    elif file_type == "csv":
        body = df.to_csv().encode('utf-8')
    else:
        raise ValueError(
            f"Unsupported file type '{file_type}' for '{filename}'; "
            "expected one of: pickle, json, csv"
        )

    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, f'{path}/{filename}')
    obj.put(Body=body)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1618773637871_0003,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def get_site_graph(df_site, site_id=None):
    """Collapse an edge list into one adjacency map per site.

    Args:
        df_site: Spark DataFrame with at least the columns ``siteId``,
            ``src`` and ``dst`` (one row per directed edge).
        site_id: optional site to restrict the result to; when falsy every
            site is kept.

    Returns:
        A DataFrame with columns ``siteId`` and ``graph``, where ``graph`` is
        a map from each ``src`` to the set (array) of its ``dst`` values.
    """
    if site_id:
        df_site = df_site.filter(F.col('siteId') == site_id)

    # One row per (site, src) carrying the distinct destinations of src.
    df_site_adj_list = (
        df_site.select('siteId', 'src', 'dst')
        .groupby('siteId', 'src')
        .agg(F.collect_set(F.col('dst')).alias('dst'))
    )

    # Collect (src, dst) as a single struct so each key stays attached to its
    # own value; two independent collect_list calls are each documented as
    # non-deterministic in order, so zipping them is not a safe contract.
    df_site_adj_list = df_site_adj_list.groupby("siteId").agg(
        F.map_from_entries(F.collect_list(F.struct("src", "dst"))).alias("graph")
    )

    return df_site_adj_list

def _isCycle(graph, v, done, stack): 

    done[v] = True
    stack[v] = True

    for neighbour in graph[v] : 
        if neighbour in graph :
            if done[neighbour] == False : 
                if _isCycle(graph, neighbour, done, stack) == True: 
                    print(f'Part of cycle: {neighbour}')
                    return True
            elif stack[neighbour] == True : 
                print(f'Last neighbour found on stack: {neighbour}')
                return True

    stack[v] = False
    return False


def isCycle(graph):
    """Return True when the directed ``graph`` contains at least one cycle.

    Args:
        graph: mapping of vertex -> iterable of neighbour vertices.  Only
            vertices that appear as keys are used as search roots.

    Returns:
        True if a depth-first search from any not-yet-entered key finds a
        cycle, otherwise False.
    """
    done = dict.fromkeys(graph, False)
    stack = dict.fromkeys(graph, False)

    # The generator is lazy, so `done` is consulted only when each root is
    # reached and the search stops at the first cycle found.
    return any(
        not done[node] and _isCycle(graph, node, done, stack)
        for node in graph
    )

# Spark UDF wrapper around isCycle; yields a boolean column.
udf_isCycle = F.udf(isCycle, returnType=BooleanType())

class Cycles:
    """Collect cycles of a directed graph with a depth-first search.

    ``graph`` maps each node to an iterable of its neighbours.  A cycle is
    recorded every time the search meets a neighbour that is still on the
    current DFS stack (a back edge), so the result holds one cycle per back
    edge encountered rather than every simple cycle of the graph.

    Each recorded cycle lists node labels from the top of the DFS stack down
    to the node that closes the cycle, i.e. in reverse traversal order.
    """

    def __init__(self, graph):
        self._graph = graph
        self.nodes = self.getNodes()
        # Label -> position in self.nodes; visited/stack work on positions.
        self.nodeMap = {node: i for i, node in enumerate(self.nodes)}
        self.visited = ['NOT_VISITED'] * len(self.nodes)  # all nodes start unvisited
        self.stack = []   # positions of the nodes on the current DFS path
        self.cycles = []  # cycles found so far, each a list of node labels

    def getNodes(self):
        """Return every node label (keys and neighbours) without duplicates.

        Labels are returned in first-seen order so repeated runs on the same
        graph explore roots in the same order.
        """
        seen = {}
        for src, neighbours in self._graph.items():
            seen[src] = None
            for dst in neighbours:
                seen[dst] = None
        return list(seen)

    def printCycles(self):
        """Return the list of cycles collected so far."""
        return self.cycles

    def addCycle(self, v):
        """Record the cycle that closes at the node labelled ``v``.

        Walks the DFS stack from the top downwards, collecting labels until
        ``v`` is reached (or the stack is exhausted).
        """
        cycle = []
        for position in reversed(self.stack):
            label = self.nodes[position]
            cycle.append(label)
            if label == v:
                break
        self.cycles.append(cycle)

    def dfs(self):
        """Explore from the node on top of the stack, recording back edges."""
        curr = self.stack[-1]
        curr_label = self.nodes[curr]
        # Nodes that only ever appear as neighbours have no outgoing edges.
        if curr_label in self._graph:
            for neighbour in self._graph[curr_label]:
                to = self.nodeMap[neighbour]
                if self.visited[to] == 'ON_STACK':
                    # addCycle compares labels, so pass the label and not the
                    # position; passing the position never matched and made
                    # every "cycle" the whole DFS stack.
                    self.addCycle(neighbour)
                elif self.visited[to] == 'NOT_VISITED':
                    self.stack.append(to)
                    self.visited[to] = 'ON_STACK'
                    self.dfs()

        self.visited[curr] = 'DONE'
        self.stack.pop()

    def findCycles(self):
        """Run the search from every unvisited node and return the cycles."""
        for i in range(len(self.nodes)):
            if self.visited[i] == 'NOT_VISITED':
                self.stack = [i]
                self.visited[i] = 'ON_STACK'
                self.dfs()

        return self.printCycles()
    
def find_cycles(graph):
    """Return the cycles that a ``Cycles`` search finds in ``graph``.

    Args:
        graph: mapping of node -> list of neighbour nodes.

    Returns:
        The list of cycles returned by ``Cycles(graph).findCycles()``.
    """
    return Cycles(graph).findCycles()

# Spark UDF wrapper around find_cycles; yields an array of string arrays.
udf_findCycles = F.udf(find_cycles, returnType=ArrayType(ArrayType(StringType())))

def find_cycles_nx(graph):
    """Enumerate the simple cycles of ``graph`` using networkx.

    Args:
        graph: mapping of node -> list of neighbour nodes, passed to
            ``nx.DiGraph`` as-is.

    Returns:
        A list with every cycle yielded by ``nx.simple_cycles``.
    """
    digraph = nx.DiGraph(graph)
    return [cycle for cycle in nx.simple_cycles(digraph)]

# Spark UDF wrapper around find_cycles_nx; yields an array of string arrays.
udf_findCyclesNX = F.udf(find_cycles_nx, returnType=ArrayType(ArrayType(StringType())))

# g = nx.DiGraph({'a':['b','c','e'], 'b':['a','c','d'], 'c':['a','d','f'], 'd':['a','f','e']})
# list(nx.simple_cycles(g))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
"""
Variables
"""
ENV = 'production'
component = 'device'
prefix_edges = f"s3://mist-aggregated-stats-{ENV}/aggregated-stats/graph/snapshots/{component}-edges/"
prefix_nodes = f"s3://mist-aggregated-stats-{ENV}/aggregated-stats/graph/snapshots/{component}-nodes/"

# NOTE(review): using today's date makes the run depend on when it is executed;
# pin END_DATE (see the commented line) to reproduce a specific snapshot.
END_DATE = datetime.today()
#END_DATE = datetime.strptime('2021-04-12', '%Y-%m-%d')
LAG = 0

hour = '18'

# ---- Load edge and node snapshots -------------------------------------------

dates = date_list(END_DATE, delta=LAG)
df_edges = readParquet(prefix_edges, dates, hour=hour)
# isExpired is a *string* flag ('True' / 'False') derived from expiredAt.
df_edges = df_edges.withColumn('isExpired', F.when(F.isnull(F.col('expiredAt')),'False').otherwise('True')) \
                    .filter(F.col('relType') == 'uplink')
# Compare against the string literal: the column holds strings, so comparing
# with the Python bool False only works through an implicit string->boolean cast.
df_active_edges = df_edges.filter(F.col('isExpired') == 'False')
# NOTE(review): nodes are read for every hour while edges use hour=hour only -
# confirm that this asymmetry is intended.
df_nodes = readParquet(prefix_nodes, dates)
df_nodes = df_nodes.withColumn('isExpired', F.when(F.isnull(F.col('expiredAt')),'False').otherwise('True'))
df_active_nodes = df_nodes.filter(F.col('isExpired') == 'False')
# Renamed so the site column of the node table does not clash with the edge table.
df_active_nodes = df_active_nodes.withColumnRenamed("siteId","siteId_2")

# ---- Filter out deviceType == 'ap' ------------------------------------------

# Attach the source node's attributes (including deviceType) to every edge.
df_join = df_active_edges.join(df_active_nodes, [df_active_edges.source==df_active_nodes.mac,
                                                 df_active_edges.siteId==df_active_nodes.siteId_2]).drop('siteId_2')
# Expose the node columns under *_target names for the join on the edge target.
df_active_nodes = df_active_nodes.withColumn('mac_target', df_active_nodes.mac) \
                                .withColumn('deviceType_target', df_active_nodes.deviceType)
df_join = df_join.join(df_active_nodes.select('mac_target','deviceType_target','siteId_2'),
                       [df_join.target==df_active_nodes.mac_target,
                        df_join.siteId==df_active_nodes.siteId_2])

# Keep only edges where neither endpoint is an access point.
df_active_edges_filtered = df_join.filter((F.col('deviceType')!='ap')&(F.col('deviceType_target')!='ap'))\
        .select('siteId','source','target','sourcePort','targetPort','sourceVendor','targetVendor','deviceType','deviceType_target', 'relType')

# Graph vertices are "<mac>__<port>" strings for both ends of an edge.
df_active_edges_filtered = df_active_edges_filtered.withColumn('src',F.concat_ws("__",F.col('source'),F.col('sourcePort'))) \
                        .withColumn('dst',F.concat_ws("__",F.col('target'),F.col('targetPort')))


#df_active_edges_filtered.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# df_join.filter((F.col('deviceType')=='ap')|(F.col('deviceType_target')=='ap')) \
#         .select('source','target',
#                 'sourcePort','targetPort',
#                 'deviceType','deviceType_target', 'relType') \
#         .show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Sanity check: list the (source, target) device-type pairs left after filtering.
(
    df_active_edges_filtered
    .select('deviceType', 'deviceType_target')
    .distinct()
    .show()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-----------------+
|deviceType|deviceType_target|
+----------+-----------------+
|    switch|           switch|
|   gateway|          gateway|
|   gateway|           switch|
|    switch|            third|
|   gateway|            third|
|    switch|          gateway|
+----------+-----------------+

In [26]:
# Build one adjacency map per site, flag the sites whose graph has a cycle,
# and report how many such sites there are.
df_site_graph = get_site_graph(df_active_edges_filtered)
df_site_graph_cycle = df_site_graph.withColumn(
    'cycle_detected',
    udf_isCycle(F.col('graph')),
)
df_site_graph_cycle_true = df_site_graph_cycle.where(F.col('cycle_detected') == True)
cycle_detect = df_site_graph_cycle_true.count()
print(f'Sites with cycles in {component} collection(relType==uplink) = {cycle_detect}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Sites with cycles in device collection(relType==uplink) = 59

In [27]:
# Enumerate cycles with the hand-written DFS for the sites flagged above and
# bring the (small) result to the driver as a pandas frame.
df_site_graph_all_cycles = df_site_graph_cycle_true.withColumn(
    'cycles',
    udf_findCycles(F.col('graph')),
)
df_graph_cycles_pd = (
    df_site_graph_all_cycles
    .select('siteId', 'cycles')
    .toPandas()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Same enumeration, this time with networkx's simple_cycles, again collected
# to the driver as a pandas frame.
df_site_graph_all_cycles_nx = df_site_graph_cycle_true.withColumn(
    'cycles',
    udf_findCyclesNX(F.col('graph')),
)
df_graph_cycles_nx_pd = (
    df_site_graph_all_cycles_nx
    .select('siteId', 'cycles')
    .toPandas()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
# Print every networkx cycle per site, working on a copy of the collected frame.
temp = df_graph_cycles_nx_pd.copy()
for site in temp['siteId']:
    print(f'Site={site}')
    site_cycles = temp.loc[temp['siteId'] == site, 'cycles']
    for cycles in site_cycles:
        for cycle in cycles:
            if len(cycle) < 1:
                continue
            print(f'-->{cycle}, len={len(cycle)}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Site=341880de-2d4d-430f-9b5e-928577cd6c84
-->['840328014ed0__ge-0/2/0', '840328015f95__ge-0/2/0'], len=2
Site=94650a18-f0a9-4852-a709-d12a2ce23eea
-->['e8a24550deb2__ge-0/0/0', '20d80ba74102__ge-0/0/0'], len=2
Site=aba6715c-92f9-4828-9080-22dfbc2c2053
-->['5800bb98c690__ge-0/1/0', '80acac531b40__ge-0/2/1'], len=2
Site=d24cc1e5-c5d1-45e9-aa93-0f2ded05748a
-->['fc964368dcc0__xe-0/0/19', 'e8a2452c66c0__xe-0/2/0'], len=2
Site=b3610203-d3ca-4511-803c-dca6f17c470c
-->['0c599c6bacc7__ge-0/1/0', '0c599c8333fa__ge-0/1/0'], len=2
Site=c6ba09d0-bf78-46a3-832e-491f3b8441b7
-->['fc964328d895__xe-0/1/1', 'fc964328b25f__xe-0/1/1'], len=2
-->['fc964328b25f__xe-1/1/1', 'fc964328d895__xe-1/1/1'], len=2
Site=a4b10ceb-22df-4272-97dd-e2f88da828b1
-->['fc3342b92b2e__xe-2/2/2', '50c709a51a64__xe-0/2/0'], len=2
-->['50c709a51a64__xe-1/2/0', 'fc3342b92b2e__xe-1/2/2'], len=2
Site=af909340-6a2e-4b38-8e6c-1196b6afac8d
-->['fc964367a6c0__ge-0/0/0', '544b8c17cc0e__ge-0/1/0'], len=2
Site=f03f310f-42d4-4bd2-bcbe-d8f0

In [43]:
# Per site, count how many cycles there are of each length; the resulting
# columns are labelled by cycle length (e.g. column 2 = number of 2-node cycles).
_temp = temp['cycles'].apply(lambda cycles: dict(Counter([len(cycle) for cycle in cycles]))).apply(pd.Series)
# Rebuild from the two base columns rather than from all of `temp`, so that
# re-running this cell does not append the count columns a second time
# (duplicate labels would make the sort below fail).
temp = pd.concat([temp[['siteId', 'cycles']], _temp], axis=1)
temp.sort_values(by=2,ascending=False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                                  siteId  ...  2
18  218aa8e0-5d72-4d09-ac08-b966956e10ca  ...  2
12  3ac31149-7c0a-4bd4-84e0-22ebccb96e34  ...  2
17  70e22b5d-7351-4d6c-8f57-c67fa6aad368  ...  2
30  061f9433-7f3a-44f0-8344-542a2e373f62  ...  2
36  3a16da80-7cdb-4076-9774-23e6d2c2abcf  ...  2
5   c6ba09d0-bf78-46a3-832e-491f3b8441b7  ...  2
6   a4b10ceb-22df-4272-97dd-e2f88da828b1  ...  2
43  944b991b-ecdd-434e-a29b-6bd3ef876736  ...  2
25  580c3489-06ee-41be-8955-b9004a443d4c  ...  2
41  f766e5fb-0f43-481a-b9c5-ed4432e49e6e  ...  2
44  9eeb8211-c0d6-4c71-9ca8-fe650571f374  ...  1
42  58520b3b-938c-40b6-942f-cda40bd72139  ...  1
0   341880de-2d4d-430f-9b5e-928577cd6c84  ...  1
40  dccf8cb0-7b22-4cdd-9ef2-2793e79aadb9  ...  1
45  00df7e5c-64b8-4caa-8745-0c2395dd9f73  ...  1
38  7930e8a4-c4f2-4e8c-9cf6-42ce24a0ec61  ...  1
37  e10b6e4d-350c-4e4c-91fb-01e4ef07bbe6  ...  1
35  54154b5a-35e0-4ef4-9574-d95cb57b855d  ...  1
39  24233519-bcd1-48e1-b764-4321abcb3558  ...  1
46  630feeb6-fac3-4a