In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
from functools import partial

def round_datetime(timestamp, scale=1, interval=0):
    """Convert an epoch value to a naive ``datetime``, optionally floored to a minute bucket.

    Args:
        timestamp: Epoch value (anything ``int()`` accepts). Any falsy value
            (``None``, ``0``, ``''``) is treated as ``0``, i.e. the epoch itself.
        scale: Divisor applied with floor division to bring ``timestamp`` down
            to whole seconds (e.g. ``1000000`` for microseconds).
        interval: When greater than zero, the result is moved back to the
            nearest earlier minute that is a multiple of ``interval`` and the
            seconds are zeroed. When zero or negative the value is returned
            at one-second resolution.

    Returns:
        A naive ``datetime`` produced by ``datetime.fromtimestamp`` (so it is
        expressed in the local timezone of the process that runs it).
    """
    # Falsy input is mapped to 0 rather than rejected.
    epoch_value = int(timestamp) if timestamp else 0
    whole_seconds = epoch_value // scale
    moment = datetime.fromtimestamp(whole_seconds)

    if interval <= 0:
        return moment

    # Strip the minutes past the last bucket boundary and all seconds.
    overshoot = timedelta(minutes=moment.minute % interval, seconds=moment.second)
    return moment - overshoot

def date_list(endDate, delta=14):
    """Build a brace-delimited, comma-separated list of calendar dates.

    The list starts at ``endDate`` and walks backwards one day at a time for
    ``delta`` further days, so it contains ``delta + 1`` entries when
    ``delta >= 0``. ``endDate`` itself is always included.

    Args:
        endDate: A ``datetime`` (must provide ``.date()``); the newest day.
        delta: Number of additional earlier days to include.

    Returns:
        A string such as ``'{2024-01-03,2024-01-02,2024-01-01}'``, newest
        date first.
    """
    earlier_days = (endDate - timedelta(days=offset) for offset in range(1, delta + 1))
    formatted = [str(day.date()) for day in (endDate, *earlier_days)]
    return '{' + ','.join(formatted) + '}'


# round_datetime pre-bound for microsecond epochs: divide by 1,000,000 to get
# whole seconds, and leave minute bucketing off (interval=0).
curried_roundDatetime_from_microsecond = partial(
    round_datetime,
    scale=1_000_000,
    interval=0,
)

# Spark UDF wrapper around the pre-bound converter; yields a timestamp column.
udf_roundDate_microsecond = F.udf(
    f=curried_roundDatetime_from_microsecond,
    returnType=TimestampType(),
)

############################################################

# Look-back window: the run date plus the `interval` days before it.
endDate = datetime.today()
interval = 14
dates = date_list(endDate, delta=interval)

# Output label "<first date in the glob>_<last date in the glob>".
date_tokens = dates.strip('{}').split(',')
date_string = f"{date_tokens[0]}_{date_tokens[-1]}"

# Wildcard for the hr= path component; an explicit zero-padded list such as
# {00,01,...,23} can be substituted here to restrict the hours read.
hour = '{*}'
storage = 's3'
env = 'staging'

# Columns retained from the raw records.
cols = [
    "mac",
    "org_id",
    "site_id",
    "when",
    "model",
    "ssr_peer_path_stats",
    "interfaces",
]

input_path = (
    f"{storage}://mist-secorapp-{env}/oc-stats-analytics/"
    f"oc-stats-analytics-{env}/dt={dates}/hr={hour}"
)
df = spark.read.parquet(input_path)

# Columns identifying the reporting record; carried through every stage below.
key_cols = ['mac', 'date', 'org_id', 'site_id']
# Per-peer-path attributes kept next to each exploded sample.
peer_path_cols = ['peer_name', 'adjacent_address', 'device_interface',
                  'network_interface', 'vlan_id', 'uptime']

# Keep only 128T-Router records that carry at least one peer-path entry, and
# derive a second-resolution `date` from the microsecond `when` field.
df = (
    df.select(cols)
    .filter(F.col('model') == '128T-Router')
    .withColumn('num_entries_peer_path', F.size(F.col('ssr_peer_path_stats')))
    .withColumn('date', udf_roundDate_microsecond(F.col('when')))
    .where(F.col('num_entries_peer_path') > 0)
)

# One row per peer-path entry (the exploded struct lands in column `col`).
df2 = df.select(*key_cols, F.explode(F.col('ssr_peer_path_stats')))

# Flatten the peer-path struct into top-level columns.
df_peer_path = df2.select(*key_cols, F.col('col.*'))

# Keep only the last (up to) six samples of each peer path.
# A bounded slice is used instead of six element_at(samples, -k) lookups:
# element_at yields NULL (or raises under ANSI mode) when the array has fewer
# than six elements, and those NULL entries would explode into rows with no
# sample data whose `date_sample` is the epoch (round_datetime maps None to 0).
# For arrays with six or more elements the result is the same six elements in
# the same order.
df_peer_path = df_peer_path.withColumn(
    'samples_dedupe',
    F.expr('slice(samples, greatest(size(samples) - 5, 1), 6)'),
)

# One row per retained sample.
df_peer_path = df_peer_path.select(
    *key_cols,
    *peer_path_cols,
    F.explode(F.col('samples_dedupe')),
)

# Flatten the sample struct and add a second-resolution timestamp for it.
df_peer_path = (
    df_peer_path
    .select(*key_cols, *peer_path_cols, F.col('col.*'))
    .withColumn('date_sample', udf_roundDate_microsecond(F.col('timestamp')))
)


In [2]:
# Destination prefix, labelled with the date range covered by this run.
output_path = f'{storage}://mist-data-science-dev/ruchitm/ssr_peer_path/dt={date_string}'

# Write at most five CSV part files with a header row, replacing any
# existing output at the same prefix.
(
    df_peer_path
    .coalesce(5)
    .write
    .mode('overwrite')
    .option('header', True)
    .csv(output_path)
)