In [1]:
from functools import partial
from datetime import datetime, timedelta
import timeit
import boto3
import json
import copy
import pickle
import io
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql import Row, Window
from pyspark.sql.types import *

"""
Utility function definitions
"""


def date_list(endDate, delta=14):
    temp = [endDate]
    for i in range(1, delta + 1):
        temp.append(endDate - timedelta(days=i))
    return '{' + ','.join([str(d.date()) for d in temp]) + '}'


def readParquet(prefix, dates="{*}", hour="{*}", fields=None):
    path = prefix + "/dt=" + dates + "/hr=" + hour + "/"
    return spark.read.parquet(path)

def readCSV(prefix, dates="{*}", hour="{*}", fields=None):
    path = prefix + "/dt=" + dates + "/hr=" + hour + "/"
    return spark.read.csv(path)

def readSeq(prefix, date, hour="*", fields=None, toPandas=False):
    path = prefix + "/dt=" + date + "/hr=" + hour + "/"
    temp = sc.sequenceFile(path).values() \
        .map(bytearray.decode).map(json.loads)
    if isinstance(fields, list):
        temp = temp.flatMap(lambda x: Row([x[field] for field in fields]))
    temp = spark.createDataFrame(temp)
    if isinstance(fields, list):
        for idx, field in enumerate(fields):
            temp = temp.withColumnRenamed("_" + str(idx + 1), field)
    _schema = copy.deepcopy(temp.schema)
    if toPandas:
        return temp.toPandas(), _schema
    return temp, _schema


def roundDatetime(timestamp, interval=0):
    tm = datetime.fromtimestamp(timestamp)
    if interval > 0:
        tm = tm - timedelta(minutes=tm.minute % interval, seconds=tm.second)
    return tm


curried_roundDatetime = partial(roundDatetime, interval=0)
udf_roundDate = F.udf(curried_roundDatetime, TimestampType())

udf_scaleToSeconds = F.udf(lambda tm : int(float(tm)/1E6), LongType())
udf_numElem = F.udf(lambda x : len(x), IntegerType())

@F.pandas_udf("int", F.PandasUDFType.GROUPED_AGG)
def median_udf(v):
    return v.median()


@F.pandas_udf("int", F.PandasUDFType.GROUPED_AGG)
def iqr_udf(v):
    iqr = v.quantile(0.75) - v.quantile(0.25)
    return iqr

def gaussian_smooth(df, groups, window=5, std=2):
    return df.set_index('time_of_day').groupby(groups)[['upper','lower']] \
                      .rolling(window, win_type='gaussian', min_periods=1, std=std) \
                      .mean().reset_index()

def df_to_s3(df, path, filename):
    s3 = boto3.resource('s3')
    file_type = filename.split(".")[-1]
    with io.StringIO() as outputBuffer:
        if file_type == "pickle":
            pickle.dump(df, outputBuffer)
        elif file_type == "json":
            df.to_json(outputBuffer, orient='index')
            #json.dump(df, buffer)
        print(outputBuffer.closed)
        outputBuffer.seek(0)
        obj = s3.Object('mist-data-science-dev', f'{path}/{filename}')
        obj.put(Body=outputBuffer.getvalue())
    print(outputBuffer.closed)

In [2]:
"""
Variables
"""
ENV = 'production'
prefix = f"s3://mist-aggregated-stats-{ENV}/aggregated-stats/graph/snapshots/client-edges/"

#END_DATE = datetime.today()
END_DATE = datetime.strptime('2021-02-12', '%Y-%m-%d')
LAG = 0
""""""

dates = date_list(END_DATE, delta=LAG)
df_stats = readParquet(prefix, dates)
df_stats = df_stats.withColumn('isExpired', F.when(F.isnull(F.col('expiredAt')),'False').otherwise('True'))
df_active = df_stats.filter(F.col('isExpired')==False)

In [3]:
def get_site_graph(df_site, site_id=None):
    if site_id:
        df_site = df_site.filter(F.col('siteId')==site_id)

    df_site_adj_list = df_site.select('siteId','from','to')\
                            .groupby('siteId','from')\
                            .agg(F.collect_set(F.col('to')).alias('to'))
    df_site_adj_list = df_site_adj_list.groupby("siteId")\
             .agg(F.map_from_arrays(F.collect_list("from"),F.collect_list("to")).alias("graph"))
    
    return df_site_adj_list

In [5]:
def _isCycle(graph, v, done, stack): 

    done[v] = True
    stack[v] = True

    for neighbour in graph[v] : 
        if neighbour in graph :
            if done[neighbour] == False : 
                if _isCycle(graph, neighbour, done, stack) == True: 
                    print(f'Part of cycle: {neighbour}')
                    return True
            elif stack[neighbour] == True : 
                print(f'Last neighbour found on stack: {neighbour}')
                return True

    stack[v] = False
    return False


def isCycle(graph): 
    src_vertices = graph.keys()
    done = {k:False for k in src_vertices} 
    stack = {k:False for k in src_vertices} 

    for node in src_vertices: 
        if done[node] == False: 
            if _isCycle(graph, node, done, stack) == True: 
                return True
    return False

def cycle_unittest():
    graph1 = {'a':['b','c','d'], 'b':['c','e'], 'd':['e','f','g'], 'e':['a','c']}
    df_graph1 = spark.createDataFrame(graph1, schema=['graph'])
    print(isCycle(graph1))
    print('\n')
    graph2 = {'a':['b','c','d'], 'b':['c'], 'd':['e','f','g'], 'e':['a','c']}
    print(isCycle(graph2))
    
udf_isCycle = F.udf(isCycle, BooleanType())

In [6]:
cycle_unittest()

Last neighbour found on stack: a
Part of cycle: e
Part of cycle: b
True


Last neighbour found on stack: a
Part of cycle: e
Part of cycle: d
True


In [9]:
#site_id = '03e668d2-7c72-4451-ae99-2e32b2b97b71'
site_id = None

df_site_graph = get_site_graph(df_active, site_id=site_id)

df_site_graph = df_site_graph.withColumn('cycle_detected', udf_isCycle(F.col('graph')))

In [11]:
df_site_graph.filter(F.col('cycle_detected')==False).show()
df_site_graph.filter(F.col('cycle_detected')==True).show()

+------+-----+--------------+
|siteId|graph|cycle_detected|
+------+-----+--------------+
+------+-----+--------------+



In [13]:
df_site_graph_pandas = df_site_graph.show(10).toPandas()

+--------------------+--------------------+--------------+
|              siteId|               graph|cycle_detected|
+--------------------+--------------------+--------------+
|0014d8a6-dfa0-4f3...|[ded55039f50b7879...|         false|
|0049c3ef-3406-4da...|[8179035e64109b48...|         false|
|0101c2ab-c573-4ac...|[a5ea2b030443565b...|         false|
|023eb151-afe0-408...|[608dfa66627474b9...|         false|
|02a73d0a-06fe-491...|[892301edde69d17b...|         false|
|0351532d-4adb-417...|[f8bf3ac422c8d596...|         false|
|036da01d-c2ac-49b...|[58c201529841ab55...|         false|
|03cdcb4d-0932-4fc...|[81437fc18e762cf8...|         false|
|046a45e5-6e83-470...|[26f382d0327189d8...|         false|
|0471f96e-a3dd-496...|[5e66612e75190e26...|         false|
+--------------------+--------------------+--------------+
only showing top 10 rows



AttributeError: 'NoneType' object has no attribute 'toPandas'