In [2]:
from functools import partial
from datetime import datetime, timedelta
import timeit
import boto3
import json
import copy
import pickle
import io
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql import Row, Window
from pyspark.sql.types import *

"""
Utility functions 
"""
def date_list(endDate, delta=14):
    """Build a brace-delimited, comma-separated list of consecutive dates.

    Args:
        endDate: ``datetime`` for the most recent day; it is always included.
        delta: How many additional days to step back from ``endDate``.

    Returns:
        A string such as ``'{2021-04-12,2021-04-11}'`` with the newest date
        first.
    """
    lookback = [endDate - timedelta(days=offset) for offset in range(1, delta + 1)]
    stamps = (day.date().isoformat() for day in [endDate, *lookback])
    return '{' + ','.join(stamps) + '}'


def readParquet(prefix, dates="{*}", hour="{*}", fields=None):
    """Read date/hour-partitioned parquet data through the global ``spark`` session.

    The path is built as ``<prefix>/dt=<dates>/hr=<hour>/``; the defaults are
    globs that match every ``dt`` and ``hr`` partition.

    Args:
        prefix: Base location of the dataset.
        dates: Value (or brace glob) for the ``dt=`` path component.
        hour: Value (or brace glob) for the ``hr=`` path component.
        fields: Optional column name or list of column names to keep. When
            omitted (or empty) every column is returned.

    Returns:
        The DataFrame returned by ``spark.read.parquet``, projected onto
        ``fields`` when given.
    """
    path = prefix + "/dt=" + dates + "/hr=" + hour + "/"
    df = spark.read.parquet(path)
    if fields:
        # A bare string would otherwise be unpacked character by character.
        if isinstance(fields, str):
            fields = [fields]
        df = df.select(*fields)
    return df

def df_to_s3(df, path, filename, bucket='mist-data-science-dev'):
    """Serialize a pandas DataFrame and upload it as a single S3 object.

    The serialization format is chosen from the extension of ``filename``:
    ``pickle`` (binary pickle), ``json`` (``orient='index'``) or ``csv``.

    Args:
        df: pandas DataFrame to upload.
        path: Key prefix inside the bucket; joined to ``filename`` with ``/``.
        filename: Object name; the text after its last ``.`` selects the format.
        bucket: Name of the destination bucket.

    Raises:
        ValueError: If the extension is not ``pickle``, ``json`` or ``csv``.
    """
    file_type = filename.split(".")[-1]
    if file_type == "pickle":
        # pickle emits bytes, so it cannot be written into a text buffer.
        body = pickle.dumps(df)
    elif file_type == "json":
        body = df.to_json(orient='index')
    elif file_type == "csv":
        body = df.to_csv()
    else:
        # Fail before touching S3 instead of uploading an empty object.
        raise ValueError(
            f"Unsupported file type '{file_type}' for '{filename}'; "
            "expected one of: pickle, json, csv"
        )

    s3 = boto3.resource('s3')
    s3.Object(bucket, f'{path}/{filename}').put(Body=body)



### Pure-Python functions for cycle detection and for listing cycles

In [35]:
def get_site_graph(df_site, site_id=None):
    """Collapse an edge DataFrame into one adjacency map per site.

    Args:
        df_site: Spark DataFrame with ``siteId``, ``src`` and ``dst`` columns.
        site_id: Optional site to restrict the result to; any falsy value
            keeps every site.

    Returns:
        A DataFrame with one row per ``siteId`` and a ``graph`` map column
        from each ``src`` to the collected set of its ``dst`` values.
    """
    edges = df_site.filter(F.col('siteId') == site_id) if site_id else df_site

    # One row per (site, src) holding the de-duplicated destinations.
    per_source = (
        edges.select('siteId', 'src', 'dst')
        .groupby('siteId', 'src')
        .agg(F.collect_set(F.col('dst')).alias('dst'))
    )

    # Fold each site's rows into a single map: src -> [dst, ...].
    graph_column = F.map_from_arrays(
        F.collect_list("src"),
        F.collect_list("dst"),
    ).alias("graph")
    return per_source.groupby("siteId").agg(graph_column)

def _isCycle(graph, v, done, stack): 

    done[v] = True
    stack[v] = True

    for neighbour in graph[v] : 
        if neighbour in graph :
            if done[neighbour] == False : 
                if _isCycle(graph, neighbour, done, stack) == True: 
                    print(f'Part of cycle: {neighbour}')
                    return True
            elif stack[neighbour] == True : 
                print(f'Last neighbour found on stack: {neighbour}')
                return True

    stack[v] = False
    return False


def isCycle(graph):
    """Tell whether the directed graph contains at least one cycle.

    Args:
        graph: Mapping of a vertex to an iterable of its neighbours. Only
            vertices that appear as keys are used as search roots.

    Returns:
        True if a search started from any not-yet-visited key finds a cycle,
        otherwise False.
    """
    sources = graph.keys()
    done = dict.fromkeys(sources, False)
    stack = dict.fromkeys(sources, False)

    # Short-circuits on the first root whose search closes a cycle.
    return any(
        not done[node] and _isCycle(graph, node, done, stack)
        for node in sources
    )

# Spark UDF wrapper around isCycle, declared to return a boolean column.
udf_isCycle = F.udf(isCycle, BooleanType())

class Cycles:
    """Collect the cycles closed by back edges during a depth-first search.

    The graph is an adjacency mapping ``{node: [neighbour, ...]}``. Nodes are
    numbered in first-seen order (keys and their neighbours, following the
    mapping's iteration order), and the search works on those integer ids.

    Each time the search meets an edge to a node that is still on the current
    path, the nodes from the top of the path down to that node are recorded
    as one cycle. Nodes already marked ``DONE`` are not explored again.

    Note: ``dfs`` is recursive, so very deep paths are bounded by Python's
    recursion limit.
    """

    def __init__(self, graph):
        """Index the nodes of ``graph`` and reset the search state."""
        self._graph = graph
        self.nodes = self.getNodes()
        self.nodeMap = {node: i for i, node in enumerate(self.nodes)}
        # Per-node state: 'NOT_VISITED' -> 'ON_STACK' -> 'DONE'.
        self.visited = ['NOT_VISITED'] * len(self.nodes)
        self.stack = []  # ids of the nodes on the current DFS path
        self.cycles = []

    def getNodes(self):
        """Return every node (keys and neighbours) once, in first-seen order."""
        # A dict keeps insertion order, so ids are stable between runs;
        # a set would order string nodes differently in each process.
        ordered = {}
        for source, neighbours in self._graph.items():
            ordered[source] = None
            for neighbour in neighbours or ():
                ordered[neighbour] = None
        return list(ordered)

    def printCycles(self):
        """Return the list of cycles recorded so far (nothing is printed)."""
        return self.cycles

    def addCycle(self, v):
        """Record the cycle closed by an edge back to the node with id ``v``.

        Args:
            v: Integer id (index into ``self.nodes``) of a node currently on
                the stack.

        The recorded cycle lists node labels from the top of the stack down
        to, and including, node ``v``.
        """
        cycle = []
        for node_id in reversed(self.stack):
            cycle.append(self.nodes[node_id])
            # Compare ids with ids: stop at the node that closes the cycle.
            if node_id == v:
                break
        self.cycles.append(cycle)

    def dfs(self):
        """Explore from the node on top of the stack, recording cycles."""
        curr = self.stack[-1]
        label = self.nodes[curr]
        if label in self._graph:
            for neighbour in self._graph[label] or ():
                to = self.nodeMap[neighbour]
                if self.visited[to] == 'ON_STACK':
                    # Back edge to a node on the current path.
                    self.addCycle(to)
                elif self.visited[to] == 'NOT_VISITED':
                    self.stack.append(to)
                    self.visited[to] = 'ON_STACK'
                    self.dfs()

        self.visited[curr] = 'DONE'
        self.stack.pop()

    def findCycles(self):
        """Run the search from every unvisited node and return the cycles.

        Returns:
            A list of cycles, each a list of node labels.
        """
        for i in range(len(self.nodes)):
            if self.visited[i] == 'NOT_VISITED':
                self.stack = [i]
                self.visited[i] = 'ON_STACK'
                self.dfs()

        return self.printCycles()
    
def find_cycles(graph):
    """Return the cycles that a fresh ``Cycles`` search finds in ``graph``."""
    return Cycles(graph).findCycles()

# Spark UDF wrapper around find_cycles, declared to return array<array<string>>.
udf_findCycles = F.udf(find_cycles, ArrayType(ArrayType(StringType())))

In [3]:
"""
Variables
"""
# ENV and component are interpolated into the two S3 snapshot prefixes below.
ENV = 'production'
component = 'device'
prefix_edges = f"s3://mist-aggregated-stats-{ENV}/aggregated-stats/graph/snapshots/{component}-edges/"
prefix_nodes = f"s3://mist-aggregated-stats-{ENV}/aggregated-stats/graph/snapshots/{component}-nodes/"

# NOTE(review): datetime.today() makes the result depend on the day the cell
# is run; the commented line below pins a fixed date instead.
END_DATE = datetime.today()
#END_DATE = datetime.strptime('2021-04-12', '%Y-%m-%d')
# Passed to date_list as delta; with 0 only END_DATE itself is listed.
LAG = 0

# hr= partition used when reading the edges.
hour = '14'

""""""

dates = date_list(END_DATE, delta=LAG)
df_edges = readParquet(prefix_edges, dates, hour=hour)
# isExpired holds the STRINGS 'False'/'True': 'False' when expiredAt is null.
df_edges = df_edges.withColumn('isExpired', F.when(F.isnull(F.col('expiredAt')),'False').otherwise('True'))
# NOTE(review): this compares the string column with the Python boolean False;
# presumably it relies on Spark casting the string implicitly — confirm.
df_active_edges = df_edges.filter(F.col('isExpired')==False)
# No hour is passed here, so readParquet's default "{*}" hour glob is used.
# NOTE(review): edges are read for a single hour but nodes for all — confirm intended.
df_nodes = readParquet(prefix_nodes, dates)
df_nodes = df_nodes.withColumn('isExpired', F.when(F.isnull(F.col('expiredAt')),'False').otherwise('True'))
df_active_nodes = df_nodes.filter(F.col('isExpired')==False)
# Presumably renamed so it does not clash with the edges' siteId after the join.
df_active_nodes = df_active_nodes.withColumnRenamed("siteId","siteId_2")

"""Filter out deviceType=='ap'"""   
# First join: attach the node row whose mac equals the edge's source.
df_join = df_active_edges.join(df_active_nodes, df_active_edges.source==df_active_nodes.mac)
# Copies of mac/deviceType under *_target names, used for the second join.
df_active_nodes = df_active_nodes.withColumn('mac_target', df_active_nodes.mac) \
                                .withColumn('deviceType_target', df_active_nodes.deviceType)
# Second join: attach the deviceType of the node whose mac equals the edge's target.
df_join = df_join.join(df_active_nodes.select('mac_target','deviceType_target'), df_join.target==df_active_nodes.mac_target)

# Keep only edges where neither endpoint has deviceType 'ap'.
df_active_edges_filtered = df_join.filter((F.col('deviceType')!='ap')&(F.col('deviceType_target')!='ap'))\
        .select('siteId','source','target','sourcePort','targetPort','sourceVendor','targetVendor','deviceType','deviceType_target')

# Graph vertex ids: "<source>__<sourcePort>" and "<target>__<targetPort>".
df_active_edges_filtered = df_active_edges_filtered.withColumn('src',F.concat_ws("__",F.col('source'),F.col('sourcePort'))) \
                        .withColumn('dst',F.concat_ws("__",F.col('target'),F.col('targetPort')))

#df_active_edges_filtered.show()

In [5]:
# Build one adjacency map per site from the filtered edges.
df_site_graph = get_site_graph(df_active_edges_filtered)
# Flag each site's graph with the boolean result of the isCycle UDF.
df_site_graph_cycle = df_site_graph.withColumn('cycle_detected', udf_isCycle(F.col('graph')))
# Keep only the sites where a cycle was detected; reused by later cells.
df_site_graph_cycle_true = df_site_graph_cycle.filter(F.col('cycle_detected')==True)
# count() is the action that actually runs the pipeline above.
cycle_detect = df_site_graph_cycle_true.count()
print(f'Sites with cycles in {component} collection = {cycle_detect}')

Sites with cycles in device collection = 347


In [39]:
#df_to_s3(df_site_graph_cycle_true.toPandas(),
#         path="ruchitm/graphdb",
#         filename=f"sites_with_cycles_{dates}_{hour}.csv")

False
True


In [36]:
# For the sites flagged as cyclic, add a 'cycles' column produced by the find_cycles UDF.
df_site_graph_all_cycles = df_site_graph_cycle_true.withColumn('cycles', udf_findCycles(F.col('graph')))
# Collect only siteId and cycles to the driver as a pandas DataFrame.
temp_pd = df_site_graph_all_cycles.select('siteId', 'cycles').toPandas()

In [9]:
#df_site_graph_all_cycles.show()

In [37]:
# Print the recorded cycles for every 10th site of the collected result.
for site in temp_pd.siteId[::10]:
    print(f'Site={site}')
    site_rows = temp_pd[temp_pd.siteId == site]
    for cycles in site_rows.cycles:
        for cycle in cycles:
            if len(cycle) < 1:
                continue  # nothing to show for an empty cycle
            print(f'-->{cycle}, len={len(cycle)}')

Site=60241098-9de0-4744-bdcd-595e25fd303d
-->['3c8c93ab2b7f__ge-0/0/8', '3c8c93ab2b7f__ge-1/0/21'], len=2
-->['3c8c93ab24fd__ge-1/0/20', '3c8c93ab24fd__ge-1/0/23'], len=2
-->['045c6ce8b61a__ge-0/0/46', '3c8c93954202__ge-0/0/46'], len=2
-->['045c6ce8b61a__ge-1/0/47', '3c8c93954202__ge-1/0/47'], len=2
-->['3c8c93ab096d__ge-0/0/20', '3c8c93ab096d__ge-0/0/10'], len=2
-->['3c8c93ab0da3__ge-0/1/3', '3c8c93ab378e__ge-0/1/3'], len=2
-->['3c8c93ab33eb__ge-0/0/23', 'e45d376cd7fd__ge-0/0/22'], len=2
-->['3c8c93aaf8f7__ge-1/0/43', '3c8c93aaf8f7__ge-1/0/42'], len=2
-->['3c8c93ab0c7d__ge-1/0/23', '3c8c93ab500e__ge-1/0/23'], len=2
Site=9e425d0f-d2b3-44c7-aa0e-bc5078ba4918
-->['182ad3558dd2__ge-0/0/1', 'd0dd49ebba7a__ge-0/0/1'], len=2
Site=aba6715c-92f9-4828-9080-22dfbc2c2053
-->['80acac541320__ge-0/2/1', 'c0bfa7b7c1f3__ge-0/1/0'], len=2
-->['5800bb98c690__ge-0/1/0', '80acac531b40__ge-0/2/1'], len=2
-->['80acac53f880__ge-0/2/1', 'c003809f99b0__ge-0/1/0'], len=2
Site=d24cc1e5-c5d1-45e9-aa93-0f2ded05748

In [19]:
# Persist the per-site graphs and cycles as parquet, replacing any existing output.
# NOTE(review): `dates` is the brace-wrapped string returned by date_list
# (e.g. '{2021-04-12}'), so the braces become part of the dt= path — confirm intended.
df_site_graph_all_cycles.write.parquet(f"s3://mist-data-science-dev/ruchitm/graphdb/site_graphdb_cycles/dt={dates}/hr={hour}",
                                  mode='overwrite')

+------+------+----------+----------+------------+------------+---+
|source|target|sourcePort|targetPort|sourceVendor|targetVendor|mac|
+------+------+----------+----------+------------+------------+---+
+------+------+----------+----------+------------+------------+---+



### Edges with an empty source, target, or port

In [32]:
# Edges where source, target, sourcePort or targetPort is the empty string.
# Null values are not matched by these equality tests.
df_empty = df_active_edges_filtered.filter((F.col('source')=='')|(F.col('target')=='')|(F.col('sourcePort')=='')|(F.col('targetPort')==''))

df_empty.show()

+--------------------+------------+------------+----------+----------+----------------+------------+----------+-----------------+--------------------+--------------+
|              siteId|      source|      target|sourcePort|targetPort|    sourceVendor|targetVendor|deviceType|deviceType_target|                 src|           dst|
+--------------------+------------+------------+----------+----------+----------------+------------+----------+-----------------+--------------------+--------------+
|af8d2d16-1bcc-4f6...|f4a739932000|00012e9681bb| ge-6/0/35|          |Juniper Networks|            |    switch|            third|f4a739932000__ge-...|00012e9681bb__|
|af8d2d16-1bcc-4f6...|f4a739932000|00012e9681bb| ge-6/0/35|          |Juniper Networks|            |    switch|            third|f4a739932000__ge-...|00012e9681bb__|
|af8d2d16-1bcc-4f6...|f4a739932000|00012e9681bb| ge-6/0/35|          |Juniper Networks|            |    switch|            third|f4a739932000__ge-...|00012e9681bb__|
|af8

In [31]:
# Display the filtered edges with fully identical rows collapsed (the result is not stored).
df_active_edges_filtered.dropDuplicates().show()

root
 |-- siteId: string (nullable = true)
 |-- source: string (nullable = true)
 |-- target: string (nullable = true)
 |-- sourcePort: string (nullable = true)
 |-- targetPort: string (nullable = true)
 |-- sourceVendor: string (nullable = true)
 |-- targetVendor: string (nullable = true)
 |-- deviceType: string (nullable = true)
 |-- deviceType_target: string (nullable = true)
 |-- src: string (nullable = false)
 |-- dst: string (nullable = false)

