In [1]:
# Register the zipped project code with the SparkContext before importing from it.
# The path below is a placeholder and must be replaced with the real zip location.
# NOTE(review): presumably this zip provides the `common` / `analytics` packages imported below — confirm.
sc.addPyFile("<path_spark_jobs zip file>")

import os
import pandas as pd
from datetime import datetime
import numpy as np
import json
from operator import itemgetter
from pprint import pprint
from itertools import groupby
from functools import partial
from elasticsearch_dsl import Search, Q, A
from elasticsearch import Elasticsearch

import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import Row

from common import epoch2datetime
from analytics.utils.time_util import current_epoch_seconds, to_seconds, to_milliseconds
from analytics.jobs.utils import *
from analytics.data_access.utils import parse_search_results
from analytics.transformer.rdd_row_group_transformer import RddRowGroupTransformer
from analytics.transformer.df_2_rdd_transformer import DF2RddTransformer
from analytics.transformer.util import recursive_get_field
from analytics.utils.fs_util import get_file_content


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
300,application_1652123621130_0302,pyspark,idle,Link,Link,ruchitm,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Extract data and execute the job

In [None]:
# ---- Run configuration: everything a reader might tune for a replay ----
date = '2022-04-28'   # calendar day to replay; the epoch arithmetic below treats it as UTC
start_seconds = int((datetime.strptime(date, '%Y-%m-%d') - datetime(1970, 1, 1)).total_seconds())
start_hour = 1        # hour offset into `date`
n_step = 30           # window length, in minutes (multiplied by 60 below)
start_min = 30        # minute offset within `start_hour`
input_env = 'staging'
gen_category = 't128_log_analysis'
config_file = 'ap-events'

#################################################################
########### Transform source data and generate event ##########
#################################################################

# Replay window [start_time, end_time) expressed in epoch seconds.
start_time = start_seconds + start_hour * 3600 + start_min * 60
end_time = start_time + n_step * 60

# NOTE(review): debug_mode is enabled only when input_env is 'production';
# confirm this is intended and not inverted.
job = start_debug_job(data_source=config_file, start_epoch=start_time, end_epoch=end_time,
                      debug_mode=(input_env == 'production'))

source_data = job.data_source_inst.get_data()

# Fail with a clear message instead of an AttributeError on None when the
# requested generator category is not configured for this data source.
operations = job.generators.get(gen_category)
if operations is None:
    raise KeyError("generator category %r not found for data source %r"
                   % (gen_category, config_file))

# Apply the configured transformers in order; each one consumes the previous output.
for transformer in operations.get('transformers') or []:
    source_data = transformer.transform(source_data)

# Only the first event generator of the category is exercised in this notebook.
event_generators = operations.get('events')
if not event_generators:
    raise ValueError("generator category %r defines no event generators" % gen_category)
event_gen = event_generators[0]


In [5]:
# Pretty-print up to two transformed source records to eyeball their structure
# before handing them to the event generator.
pprint(source_data.take(2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[{'event_type': 'ep-error-events', 'timestamp': 1651109626000, 'source': {'EPEvent': {'topic': 'ep-error-events-', 'hashkey': '3434a400-c693-11ec-aaaa-a6f504427bd9', 'Model': 'SSR', 'Firmware': '5.5.1-1', 'Version': 1, 'ID': '3f42e49b-c693-11ec-8ece-fdf9d8aa11d3', 'Type': 57005, 'Filetype': '', 'Title': 'Error Log message from []', 'Text': '', 'When': '2022-04-28 01:33:46.548277619 +0000 UTC', 'Uptime': 0}, 'Uptime': 0, 'S3filename': 'ssr/3434a400-c693-11ec-aaaa-a6f504427bd9/2022/04/28/01-33-46.548275768_errlog.dump', 'Type': '497659291Z.stack', 'Filetype': 'stack-trace', 'Client_MAC': None, 'BSSID': None, 'EPID': '02-00-01-72-de-91', 'Reason': '', 'DynamicCapture': False}, 'gateway_id': '02000172de91', 'site_id': None, 'org_id': None, 'filetype': 'stack-trace', 'filename': 'ssr/3434a400-c693-11ec-aaaa-a6f504427bd9/2022/04/28/01-33-46.548275768_errlog.dump', 'model': 'SSR', 'firmware': '5.5.1-1', 'ep_timestamp': '2022-04-28 01:33:46.548277619 +0000 UTC'}]

In [None]:
# Run the selected event generator over the transformed source records.
# NOTE(review): job.spark is presumably the job's SparkSession — confirm.
event_rdd = event_gen.generate_event(source_data, job.spark)

In [8]:
# Pretty-print up to three generated events for manual inspection.
pprint(event_rdd.take(3))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[{'batch_count': 1,
  'category': 'ssr-availability',
  'details': {'filename': 'ssr/3434a400-c693-11ec-aaaa-a6f504427bd9/2022/04/28/01-33-46.548275768_errlog.dump',
              'firmware': '5.5.1-1',
              'issue_subtype': 'unknown',
              'issue_type': 'unknown',
              'model': 'SSR',
              'signature_id': '000',
              'stack_frames': []},
  'detection_delay': 2222510,
  'detection_time': 1653332136111,
  'display_entity_id': '02000172de91',
  'display_entity_type': 'gateway',
  'enable_action': False,
  'end_time': 1651109626000,
  'entity_id': 'none_02000172de91',
  'entity_type': 'gateway',
  'event_duration': 0,
  'event_name': '<placeholder>',
  'event_type': '<placeholder>',
  'gateway': {'firmware': '5.5.1-1',
              'gateway_id': '02000172de91',
              'model': 'SSR'},
  'gateway_id': '02000172de91',
  'mist_only': True,
  'modification_time': 1651111200000,
  'occurrence': 1,
  'org_id': 'none',
  'row_key': 'none_02000

In [None]:
# DISABLED: stage-by-stage run of the event generator, kept for debugging.
# Uncomment to inspect intermediate RDDs (data_rdd, group_rdd, feature_rdd).
# NOTE(review): presumably mirrors what event_gen.generate_event does internally —
# confirm against the generator before relying on it.

# data_rdd = source_data.map(lambda x: event_gen.filter_and_enrich_event(x)) \
#     .filter(lambda x: x.get('site_id', None) is not None) \
#     .map(lambda x: event_gen.get_and_classify_trace(x))

# event_gen.logger.info("SSR stack trace count : %d" % (data_rdd.count()))

# group_rdd = data_rdd.groupBy(lambda x: ('_'.join([x['site_id'], x['gateway_id']]), x['timestamp'])) \
#     .map(lambda x: (x[0][0], x[1]))
# feature_rdd = event_gen.compose_entity_features(group_rdd)
# event_rdd = event_gen.gen_intra_batch_event(feature_rdd)
# event_rdd = event_gen.cross_batch_event_correlation(event_rdd)

In [5]:
# DISABLED: feature_rdd is only assigned in commented-out code in this notebook,
# so this line cannot run until that code is re-enabled.
# pprint(event_gen.gen_intra_batch_event(feature_rdd).take(3))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Process the category over consecutive time windows

In [None]:
# DISABLED: replays `n_loops` consecutive windows of `n_step` minutes and calls
# process_event on every event generator of the category for each window.
# Notes for whoever re-enables it:
#   - the notebook imports `from datetime import datetime`, so the class is used
#     directly below (`datetime.datetime.strptime` would raise AttributeError);
#   - the `start_min = 30` setting is overwritten by the loop variable of the same name;
#   - persist() sits under the 'transformers' guard while unpersist() sits under the
#     'events' guard — NOTE(review): confirm this asymmetry is intended.

# date = '2022-04-28'
# start_seconds = int((datetime.strptime(date,'%Y-%m-%d')-datetime(1970,1,1)).total_seconds())
# start_hour = 1
# n_step = 30
# start_min = 30
# n_loops = 3*8
# input_env = 'staging'
# gen_category='t128_log_analysis'
# config_file = 'ap-events'

# ##################################################################

# for start_min in range(0, n_loops*n_step, n_step):
#     start_time = start_seconds + start_hour*3600 + start_min*60
#     end_time = start_time + n_step*60
#     job = start_debug_job(data_source=config_file, start_epoch=start_time, end_epoch=end_time,
#                           debug_mode=True if input_env=='production' else False)
#     source_data = job.data_source_inst.get_data()
#     operations = job.generators.get(gen_category)
#     if operations.get('transformers'):
#         for n, operator in enumerate(operations.get('transformers')):
#             operator.spark = spark
#             source_data = operator.transform(source_data)

#         source_data.persist()

#     if operations.get('events'):
#         for operator in operations.get('events'):
#             operator.process_event(source_data, spark)

#         source_data.unpersist()
#     print(f"########### {start_min} #########")