In [2]:
from functools import partial
from datetime import datetime, timedelta
import timeit
import boto3
import json
import copy
import pickle
import io
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql import Row, Window
from pyspark.sql.types import *
#from graphframes import *

"""
Utility functions 
"""
def date_list(endDate, delta=14):
    """Return a brace-wrapped, comma-separated run of ISO dates.

    The run starts at ``endDate`` and steps back one day at a time for
    ``delta`` further days, e.g. ``{2021-03-18,2021-03-17}`` for ``delta=1``.
    A ``delta`` of zero or less yields just ``endDate``.
    """
    earlier_days = [endDate - timedelta(days=offset) for offset in range(1, delta + 1)]
    stamps = (str(day.date()) for day in [endDate] + earlier_days)
    return '{' + ','.join(stamps) + '}'


def readParquet(prefix, dates="{*}", hour="{*}", fields=None):
    """Load the Parquet data found under ``<prefix>/dt=<dates>/hr=<hour>/``.

    ``dates`` and ``hour`` are spliced into the path verbatim and both default
    to ``{*}``. ``fields`` is accepted but not used by this function.
    Relies on the notebook-global ``spark`` session.
    """
    partition_path = "".join([prefix, "/dt=", dates, "/hr=", hour, "/"])
    reader = spark.read
    return reader.parquet(partition_path)

def readCSV(prefix, dates="{*}", hour="{*}", fields=None):
    """Load the CSV data found under ``<prefix>/dt=<dates>/hr=<hour>/``.

    ``dates`` and ``hour`` are spliced into the path verbatim and both default
    to ``{*}``. ``fields`` is accepted but not used by this function.
    Relies on the notebook-global ``spark`` session.
    """
    partition_path = "".join([prefix, "/dt=", dates, "/hr=", hour, "/"])
    reader = spark.read
    return reader.csv(partition_path)

def readSeq(prefix, date, hour="*", fields=None, toPandas=False):
    """Read JSON records from a sequence-file partition into a DataFrame.

    The path read is ``<prefix>/dt=<date>/hr=<hour>/``. Every value in the
    sequence file is decoded from bytes and parsed with ``json.loads``.

    Args:
        prefix: Path prefix placed in front of the ``dt=``/``hr=`` folders.
        date: Value spliced into the ``dt=`` path component.
        hour: Value spliced into the ``hr=`` path component.
        fields: Optional non-empty list (or tuple) of record keys to keep.
            When given, the resulting columns are named after these keys, in
            order. ``None`` or an empty sequence keeps the records as parsed.
        toPandas: When true, return a pandas DataFrame instead of a Spark one.

    Returns:
        ``(frame, schema)`` where ``schema`` is a deep copy of the Spark
        DataFrame schema (taken before any pandas conversion).

    Raises:
        KeyError: (inside the Spark job) if a record lacks a requested field.

    Relies on the notebook-global ``sc`` and ``spark`` objects.
    """
    path = prefix + "/dt=" + date + "/hr=" + hour + "/"
    records = sc.sequenceFile(path).values() \
        .map(bytearray.decode).map(json.loads)

    # Only project when there is something to project: an empty selection
    # would produce zero-column rows that cannot be turned into a DataFrame.
    selected = list(fields) if isinstance(fields, (list, tuple)) and fields else None
    if selected is not None:
        # One plain tuple per record. The previous flatMap over a one-field
        # Row wrapping a list produced the same rows, but only by accident.
        records = records.map(lambda rec: tuple(rec[name] for name in selected))

    frame = spark.createDataFrame(records)
    if selected is not None:
        # Tuple rows get positional column names _1, _2, ...; rename them.
        for idx, name in enumerate(selected):
            frame = frame.withColumnRenamed("_" + str(idx + 1), name)

    _schema = copy.deepcopy(frame.schema)
    if toPandas:
        return frame.toPandas(), _schema
    return frame, _schema


def roundDatetime(timestamp, interval=0):
    """Convert a POSIX timestamp to a local datetime, optionally rounded down.

    Args:
        timestamp: Seconds since the epoch (int or float).
        interval: Bucket size in minutes. When greater than zero the result is
            floored to the nearest multiple of ``interval`` minutes within the
            hour. ``0`` (the default) returns the converted datetime untouched.

    Returns:
        A naive ``datetime`` in local time, as produced by
        ``datetime.fromtimestamp``.
    """
    tm = datetime.fromtimestamp(timestamp)
    if interval > 0:
        # Strip the sub-second part as well; otherwise a fractional timestamp
        # lands just past the bucket boundary instead of exactly on it.
        tm = tm - timedelta(minutes=tm.minute % interval,
                            seconds=tm.second,
                            microseconds=tm.microsecond)
    return tm


@F.pandas_udf("int", F.PandasUDFType.GROUPED_AGG)
def median_udf(v):
    """Grouped-aggregate pandas UDF returning the median of ``v``.

    The UDF is registered with an ``int`` return type.
    """
    group_median = v.median()
    return group_median


@F.pandas_udf("int", F.PandasUDFType.GROUPED_AGG)
def iqr_udf(v):
    """Grouped-aggregate pandas UDF returning the interquartile range of ``v``.

    Computed as the 0.75 quantile minus the 0.25 quantile. The UDF is
    registered with an ``int`` return type.
    """
    lower_quartile = v.quantile(0.25)
    upper_quartile = v.quantile(0.75)
    return upper_quartile - lower_quartile


def df_to_s3(df, path, filename, bucket='mist-data-science-dev'):
    """Serialize ``df`` and upload it to S3 under the key ``<path>/<filename>``.

    The serialization format is chosen from the extension of ``filename``:

    * ``pickle`` - ``pickle.dumps(df)``
    * ``json``   - ``df.to_json(orient='index')``
    * ``csv``    - ``df.to_csv()``

    Args:
        df: Object to upload (a pandas DataFrame for ``json``/``csv``).
        path: Key prefix inside the bucket.
        filename: Object name; its last dot-separated part selects the format.
        bucket: Destination bucket name.

    Raises:
        ValueError: If the extension is not one of the supported formats.
            Previously such a call silently uploaded an empty object.
    """
    file_type = filename.split(".")[-1]

    # Serialize before touching S3 so an unsupported extension fails fast.
    # Pickle output is bytes, which a text buffer (io.StringIO) cannot hold,
    # so every format is turned into a bytes payload here.
    if file_type == "pickle":
        payload = pickle.dumps(df)
    elif file_type == "json":
        payload = df.to_json(orient='index').encode('utf-8')
    elif file_type == "csv":
        payload = df.to_csv().encode('utf-8')
    else:
        raise ValueError(
            f"Unsupported file type '{file_type}' for '{filename}'; "
            "expected one of: pickle, json, csv"
        )

    s3 = boto3.resource('s3')
    s3.Object(bucket, f'{path}/{filename}').put(Body=payload)

"""
Spark UDFs
"""
# roundDatetime with interval fixed at 0, i.e. its rounding branch is never taken.
curried_roundDatetime = partial(roundDatetime, interval=0)
# Spark UDF wrapping the partial above; declared to return TimestampType.
udf_roundDate = F.udf(curried_roundDatetime, TimestampType())

# Divides the input by 1e6 and truncates to an integer
# (presumably microseconds -> seconds; confirm the input units).
udf_scaleToSeconds = F.udf(lambda tm : int(float(tm)/1E6), LongType())
# Length of the input value; raises inside the UDF when the value has no len() (e.g. None).
udf_numElem = F.udf(lambda x : len(x), IntegerType())

### Pure Python functions for cycle detection and cycle enumeration

In [3]:
def get_site_graph(df_site, site_id=None):
    """Build one adjacency map per site from an edge DataFrame.

    Args:
        df_site: DataFrame with at least the columns ``siteId``, ``src`` and
            ``dst`` (one row per directed edge).
        site_id: Optional site to restrict the result to. A falsy value
            (``None``, ``''``) keeps every site.

    Returns:
        A DataFrame with columns ``siteId`` and ``graph``, where ``graph`` is a
        map from each ``src`` to the array of distinct ``dst`` values reachable
        from it.
    """
    if site_id:
        df_site = df_site.filter(F.col('siteId') == site_id)

    # Distinct neighbours per (site, source vertex).
    df_site_adj_list = (
        df_site.select('siteId', 'src', 'dst')
        .groupby('siteId', 'src')
        .agg(F.collect_set(F.col('dst')).alias('dst'))
    )

    # Collect (src, dst) as a single struct so every key stays paired with its
    # own neighbour list. Two independent collect_list calls skip nulls per
    # column and can therefore end up misaligned.
    df_site_adj_list = (
        df_site_adj_list.groupby("siteId")
        .agg(F.map_from_entries(F.collect_list(F.struct("src", "dst"))).alias("graph"))
    )

    return df_site_adj_list

def _isCycle(graph, v, done, stack): 

    done[v] = True
    stack[v] = True

    for neighbour in graph[v] : 
        if neighbour in graph :
            if done[neighbour] == False : 
                if _isCycle(graph, neighbour, done, stack) == True: 
                    print(f'Part of cycle: {neighbour}')
                    return True
            elif stack[neighbour] == True : 
                print(f'Last neighbour found on stack: {neighbour}')
                return True

    stack[v] = False
    return False


def isCycle(graph):
    """Return True if the directed ``graph`` (vertex -> neighbours) has a cycle.

    Starts a depth-first search from every key of ``graph`` that an earlier
    search has not already reached.
    """
    src_vertices = graph.keys()
    done = dict.fromkeys(src_vertices, False)
    stack = dict.fromkeys(src_vertices, False)

    for node in src_vertices:
        if done[node]:
            continue
        if _isCycle(graph, node, done, stack):
            return True
    return False

# Spark UDF wrapper around isCycle; takes the per-site graph map column and yields a boolean.
udf_isCycle = F.udf(isCycle, BooleanType())

class Cycles:
    """Enumerate cycles of a directed graph given as ``{vertex: [neighbours]}``.

    A depth-first search is run from every vertex not yet visited. Whenever an
    edge leads back to a vertex on the current DFS path, the path segment from
    the current vertex back to that vertex is recorded as one cycle. Each
    recorded cycle is listed from the vertex that closes the cycle back to the
    vertex that was re-entered.

    Vertices are numbered in first-seen order (keys and their neighbours, in
    mapping order), so results are deterministic for a given input mapping.
    """

    def __init__(self, graph):
        self._graph = graph
        self.nodes = self.getNodes()
        self.nodeMap = {node: i for i, node in enumerate(self.nodes)}
        # Per-vertex state: 'NOT_VISITED' -> 'ON_STACK' -> 'DONE'.
        self.visited = ['NOT_VISITED' for _ in self.nodes]
        self.stack = []  # Indices of the vertices on the current DFS path
        self.cycles = []

    def getNodes(self):
        """Return every vertex (sources and neighbours) once, in first-seen order."""
        # A dict keeps insertion order; a set would make the vertex numbering,
        # and with it the DFS order, vary between runs.
        seen = {}
        for src, neighbours in self._graph.items():
            seen[src] = None
            for dst in neighbours or ():
                seen[dst] = None
        return list(seen)

    def printCycles(self):
        """Return the list of cycles recorded so far (nothing is printed)."""
        return self.cycles

    def addCycle(self, v):
        """Record the cycle that ends at node label ``v``.

        Walks the DFS path from its top downwards, collecting labels until
        ``v`` is reached or the path is exhausted.
        """
        cycle = [self.nodes[self.stack[-1]]]
        depth = 1
        while cycle[-1] != v and depth < len(self.stack):
            depth += 1
            cycle.append(self.nodes[self.stack[-depth]])

        self.cycles.append(cycle)

    def dfs(self):
        """Explore from the vertex on top of the stack, then pop it as DONE."""
        curr = self.stack[-1]
        label = self.nodes[curr]
        if label in self._graph:
            for neighbour in self._graph[label] or ():
                to = self.nodeMap[neighbour]
                if self.visited[to] == 'ON_STACK':
                    # Pass the node *label*: addCycle compares labels, so
                    # passing the index meant the stop condition never fired
                    # and the whole DFS path was reported as the cycle.
                    self.addCycle(neighbour)
                elif self.visited[to] == 'NOT_VISITED':
                    self.stack.append(to)
                    self.visited[to] = 'ON_STACK'
                    self.dfs()

        self.visited[curr] = 'DONE'
        self.stack.pop()

    def findCycles(self):
        """Run the search over all vertices and return the recorded cycles."""
        for i in range(len(self.nodes)):
            if self.visited[i] == 'NOT_VISITED':
                self.stack = [i]
                self.visited[i] = 'ON_STACK'
                self.dfs()

        return self.printCycles()
    
def find_cycles(graph):
    """Return the cycles that a ``Cycles`` search finds in ``graph``.

    ``graph`` is the ``{vertex: [neighbours]}`` mapping handed to ``Cycles``.
    """
    return Cycles(graph).findCycles()

# Spark UDF wrapper around find_cycles; yields an array of cycles, each an array of strings.
udf_findCycles = F.udf(find_cycles, ArrayType(ArrayType(StringType())))

In [8]:
"""
Variables
"""
# Environment and graph component; both are interpolated into the S3 prefixes below.
ENV = 'production'
component = 'device'
prefix_edges = f"s3://mist-aggregated-stats-{ENV}/aggregated-stats/graph/snapshots/{component}-edges/"
prefix_nodes = f"s3://mist-aggregated-stats-{ENV}/aggregated-stats/graph/snapshots/{component}-nodes/"

# END_DATE is taken from the wall clock, so re-running on another day reads a
# different snapshot. The commented-out line pins a fixed date instead.
END_DATE = datetime.today()
#END_DATE = datetime.strptime('2021-03-18', '%Y-%m-%d')
# Number of extra days before END_DATE to include (0 = END_DATE only).
LAG = 0

# Hour partition used when reading the edges snapshot.
hour = '14'

""""""

# Brace-wrapped date list used as the dt= component of the partition path.
dates = date_list(END_DATE, delta=LAG)
df_edges = readParquet(prefix_edges, dates, hour=hour)
# isExpired is stored as the *strings* 'False'/'True' (set when expiredAt is null / not null).
df_edges = df_edges.withColumn('isExpired', F.when(F.isnull(F.col('expiredAt')),'False').otherwise('True'))
# NOTE(review): this compares the string column with Python False and so relies
# on Spark coercing the two sides to a common type - confirm, or use a boolean column.
df_active_edges = df_edges.filter(F.col('isExpired')==False)
# Nodes are read without an hour argument, i.e. with readParquet's default
# hr={*} (every hour), while the edges above are restricted to `hour`.
df_nodes = readParquet(prefix_nodes, dates)
df_nodes = df_nodes.withColumn('isExpired', F.when(F.isnull(F.col('expiredAt')),'False').otherwise('True'))
df_active_nodes = df_nodes.filter(F.col('isExpired')==False)
# Rename the nodes' siteId so it does not clash with the edges' siteId after the join.
df_active_nodes = df_active_nodes.withColumnRenamed("siteId","siteId_2")

## Keep only edges whose joined node has deviceType != 'ap' (lowercase literal).
# NOTE(review): a list of join conditions is combined with AND, so a node row
# matches only when source == target == mac. With the left join, every other
# edge gets a null deviceType and is then dropped by the != 'ap' filter below.
# The cycles printed later in this notebook all pair two ports of one MAC,
# which is consistent with that. Confirm whether source OR target (or two
# separate joins) was intended.
df_join = df_active_edges.join(df_active_nodes,
                        [df_active_edges.source==df_active_nodes.mac, df_active_edges.target==df_active_nodes.mac],
                        how='left')

df_active_edges_filtered = df_join.filter(F.col('deviceType')!='ap')\
        .select('siteId','source','target','sourcePort','targetPort','sourceVendor','targetVendor','mac','model','deviceType')

# Graph vertices are "<mac>__<port>" strings for each end of an edge.
df_active_edges_filtered = df_active_edges_filtered.withColumn('src',F.concat_ws("__",F.col('source'),F.col('sourcePort'))) \
                        .withColumn('dst',F.concat_ws("__",F.col('target'),F.col('targetPort')))

#df_active_edges_filtered.show()

In [10]:
# One row per site holding its adjacency map, plus a boolean flag from the cycle-detection UDF.
df_site_graph = get_site_graph(df_active_edges_filtered)
df_site_graph_cycle = df_site_graph.withColumn('cycle_detected', udf_isCycle(F.col('graph')))

# Keep only the sites flagged as containing a cycle.
df_site_graph_cycle_true = df_site_graph_cycle.filter(F.col('cycle_detected')==True)
# NOTE(review): show() and count() are separate actions on an uncached DataFrame,
# so the UDF is presumably evaluated once per action - consider caching if this is slow.
df_site_graph_cycle_true.select('siteId').show(5, truncate=False)
cycle_detect = df_site_graph_cycle_true.count()
print(f'Sites with cycles in {component} collection = {cycle_detect}')

+------------------------------------+
|siteId                              |
+------------------------------------+
|9291ba26-6e1e-11e5-9cdd-02e208b2d34f|
|60241098-9de0-4744-bdcd-595e25fd303d|
|4e627c2b-0e88-4d80-a8aa-c45926508c04|
|0027af11-eb72-425f-a1b2-e8f3b4ce8d10|
|67f210b0-fc3f-4d95-a3ee-111e63ac4b77|
+------------------------------------+
only showing top 5 rows

Sites with cycles in device collection = 40


In [12]:
# For the sites flagged above, list the cycles themselves (array of arrays of vertex labels).
df_site_graph_all_cycles = df_site_graph_cycle_true.withColumn('cycles', udf_findCycles(F.col('graph')))
# Pull siteId + cycles to the driver as a pandas DataFrame for printing and upload.
temp_pd = df_site_graph_all_cycles.select('siteId', 'cycles').toPandas()

+--------------------+--------------------+
|              siteId|              cycles|
+--------------------+--------------------+
|60241098-9de0-474...|[[3c8c93ab24fd__g...|
|9291ba26-6e1e-11e...|[[d0dd49914188__g...|
|0027af11-eb72-425...|[[0c8126c7b987__g...|
|4e627c2b-0e88-4d8...|[[c8fe6a5db0ad__g...|
|67f210b0-fc3f-4d9...|[[0c8126c7da6f__g...|
|81c373db-9fcd-4a2...|[[c8fe6af63eb9__g...|
|4d9330f1-8816-449...|[[1c9c8cb9ea8c__m...|
|e47f249f-aca5-4a6...|[[b8c253a73c00__g...|
|2b6700bd-6c6e-409...|[[4c6d587afb23__m...|
|416bca36-2a01-4bf...|[[4c6d5866a596__m...|
|977cd71d-303d-41b...|[[80acac524640__g...|
|1782c519-9464-48c...|[[f4a739c5cd40__m...|
|4f177bae-b862-40a...|[[cce19414dd1e__g...|
|022d17f9-0236-469...|[[e45d3766ac8c__x...|
|85dd4350-3b60-462...|[[1039e97b2c5c__g...|
|a5092570-7713-4dd...|[[50c709a4ff2e__g...|
|f2b1c4c3-d2b3-498...|[[fc96432963ae__g...|
|06595606-0600-441...|[[0c599c833161__g...|
|c81f89f0-bd71-488...|[[84c1c1d4c080__m...|
|1a48ae56-588a-471...|[[0c8126c6

In [16]:
# Print every cycle under its site. temp_pd has one row per site (the graph is
# grouped by siteId), so walking the two columns in step avoids re-filtering
# the whole frame for every site.
for site, cycles in zip(temp_pd.siteId, temp_pd.cycles):
    print(f'Site={site}')
    for cycle in cycles:
        print(f'-->{cycle}')

Site=60241098-9de0-4744-bdcd-595e25fd303d
-->['3c8c93ab24fd__ge-1/0/20', '3c8c93ab24fd__ge-1/0/23']
Site=9291ba26-6e1e-11e5-9cdd-02e208b2d34f
-->['d0dd49914188__ge-0/0/47', 'd0dd49914188__me0']
Site=4e627c2b-0e88-4d80-a8aa-c45926508c04
-->['c8fe6a5db0ad__ge-0/0/4', 'c8fe6a5db0ad__ge-0/0/9']
Site=0027af11-eb72-425f-a1b2-e8f3b4ce8d10
-->['0c8126c7b987__ge-1/0/26', '0c8126c7b987__ge-0/0/24']
Site=67f210b0-fc3f-4d95-a3ee-111e63ac4b77
-->['0c8126c7da6f__ge-0/0/6', '0c8126c7da6f__ge-0/0/14']
Site=81c373db-9fcd-4a20-b9b2-557a6e7e1dcf
-->['c8fe6af63eb9__ge-0/0/8', 'c8fe6af63eb9__ge-0/0/10']
Site=e47f249f-aca5-4a60-baae-5a2034fa4ed3
-->['b8c253a73c00__ge-0/0/33', 'b8c253a73c00__ge-2/0/19']
Site=4d9330f1-8816-449a-b3cc-d4e4e0a3313b
-->['1c9c8cb9ea8c__me0', '1c9c8cb9ea8c__ge-0/0/10']
Site=2b6700bd-6c6e-4096-8f61-a9c0b614a497
-->['4c6d587afb23__me0', '4c6d587afb23__ge-0/0/47']
-->['4c6d587add65__me0', '4c6d587add65__ge-1/0/41']
Site=416bca36-2a01-4bf3-bdff-8994690a325b
-->['4c6d5866a596__me0', '4c

In [None]:
# Upload the per-site cycle table as CSV. `dates` is the brace-wrapped string
# from date_list, so the braces become part of the S3 key.
df_to_s3(
    temp_pd,
    path=f"ruchitm/graphdb/site_graphdb_cycles_{dates}_hr_{hour}",
    filename="sites_cycles.csv",
)

### Filter: edges with an empty source or target

In [19]:
# Edges whose source or target is an empty string.
df_empty = (
    df_active_edges_filtered
    .where((F.col('source') == '') | (F.col('target') == ''))
    .select('source', 'target', 'sourceVendor', 'targetVendor', 'mac')
)

df_empty.show()

+------+------+----------+----------+------------+------------+---+
|source|target|sourcePort|targetPort|sourceVendor|targetVendor|mac|
+------+------+----------+----------+------------+------------+---+
+------+------+----------+----------+------------+------------+---+

