In [21]:
import pandas as pd
import os
import random
import numpy as np
import itertools

from pprint import pprint

from difflib import SequenceMatcher

import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt

# Folder holding the annotated sample traces. Set the SSR_TRACES_DIR environment
# variable to point somewhere else; the fallback is the original local path.
INPUT_FOLDER = os.environ.get(
    "SSR_TRACES_DIR",
    "/Users/ruchitm/Documents/Work/ssr-log-analysis/sample-traces",
)
df_st_highway_funcs = pd.read_excel(os.path.join(INPUT_FOLDER, 'highway_funcs_030422_annotated.xlsx'))
# Remove every column whose name matches "Unnamed" (reassign instead of
# mutating in place so the step is explicit and chain-friendly).
df_st_highway_funcs = df_st_highway_funcs.drop(
    columns=df_st_highway_funcs.filter(regex="Unnamed").columns
)


def get_subcategory(x):
    """Return the second '-'-separated field of a label, stripped and upper-cased.

    Labels without a '-' (including NaN, whose str() is 'nan') yield 'GENERIC'.
    """
    parts = str(x).split('-')
    return parts[1].strip().upper() if len(parts) > 1 else 'GENERIC'


def get_category(x):
    """Return the text before the first '-' as UPPER_SNAKE_CASE, or None for NaN."""
    if pd.isna(x):
        return None
    return '_'.join(str(x).split('-')[0].strip().upper().split())


# Sub_category must be derived BEFORE Category is overwritten: both read the
# raw "<category> - <sub category>" label stored in the Category column.
df_st_highway_funcs['Sub_category'] = df_st_highway_funcs.Category.apply(get_subcategory)
df_st_highway_funcs['Category'] = df_st_highway_funcs.Category.apply(get_category)
# Comma-separated keyword string -> list of (still unstripped) keywords.
df_st_highway_funcs['keywords'] = df_st_highway_funcs.keywords.str.split(',')
# Rows without a category got 'GENERIC' from get_subcategory; blank that out.
df_st_highway_funcs.loc[df_st_highway_funcs.Category.isna(), 'Sub_category'] = None

df_st_highway_funcs.Category.value_counts()

SESSION_SETUP        6
CONFIG_PROCESSING    3
PACKET_PROCESSING    2
FAST_LANE_LOCK       1
Name: Category, dtype: int64

In [22]:
# Distinct (Category, Sub_category) combinations present in the annotated sheet.
df_st_highway_funcs.loc[:, ['Category', 'Sub_category']].drop_duplicates()

Unnamed: 0,Category,Sub_category
0,CONFIG_PROCESSING,GENERIC
1,CONFIG_PROCESSING,UPDATE
2,FAST_LANE_LOCK,GENERIC
3,,
10,SESSION_SETUP,GENERIC
14,SESSION_SETUP,PACKET_HANDLER
17,PACKET_PROCESSING,ICMP
20,PACKET_PROCESSING,GENERIC
22,SESSION_SETUP,APP_ID_MODIFY


In [23]:
# Gather the keyword lists of all annotated rows per (Category, Sub_category)
# and flatten each group into one set of stripped keywords.
# - Rows whose keywords are missing (NaN) are dropped first; otherwise
#   itertools.chain would be handed a float and raise TypeError.
# - Empty keywords (e.g. from a trailing comma) are discarded: '' is a
#   substring of every trace and would count as a match everywhere.
df_st_highway_keywords = (
    df_st_highway_funcs
    .dropna(subset=['keywords'])
    .groupby(['Category', 'Sub_category'])['keywords']
    .agg(list)
    .apply(lambda lists: {kw.strip() for kw in itertools.chain.from_iterable(lists) if kw.strip()})
)
# to_dict() already returns a plain dict keyed by (Category, Sub_category);
# no positional argument is passed. Groups left without any usable keyword are skipped.
keywords_map = {cat: words for cat, words in df_st_highway_keywords.to_dict().items() if words}

In [24]:
def getCategoryMap(trace, keywords=None):
    """Score a trace against every category's keyword set.

    Parameters
    ----------
    trace : object
        The trace text. Anything that is not a ``str`` (e.g. NaN) yields ``{}``.
    keywords : mapping, optional
        ``{category_key: iterable of keyword strings}``. Defaults to the
        notebook-level ``keywords_map``.

    Returns
    -------
    dict
        ``{category_key: fraction}`` where fraction is the share of that
        category's keywords occurring as a substring of ``trace``, rounded
        to two decimals. A category with no keywords scores 0.0.
    """
    if keywords is None:
        keywords = keywords_map
    match = {}
    if isinstance(trace, str):
        for cat, words in keywords.items():
            if not words:
                # np.mean([]) would return NaN (with a RuntimeWarning) and
                # break the max() comparison downstream.
                match[cat] = 0.0
                continue
            match[cat] = np.round(np.mean([word in trace for word in words]), 2)
    return match

def getCategory(mapping, thresh):
    """Return the highest-scoring key of ``mapping`` if its score beats ``thresh``.

    The comparison is strict (``score > thresh``). ``None`` is returned for an
    empty/falsy mapping or when no score exceeds the threshold. On ties the
    first key in iteration order wins.
    """
    if not mapping:
        return None
    best_key = max(mapping, key=mapping.get)
    if mapping[best_key] > thresh:
        return best_key
    return None

    

In [25]:
# Per-row dict of keyword-match scores, one entry per key of keywords_map.
df_st_highway_funcs['Category_map'] = df_st_highway_funcs['processed_traces'].apply(getCategoryMap)

In [26]:
# Display the keyword set built for each (Category, Sub_category) key.
keywords_map

{('CONFIG_PROCESSING', 'GENERIC'): {'ConfigData',
  'RegistryTable',
  'applyDeviceConfig',
  'handleDeviceConfigApplyRequest',
  'processConfig',
  'processConfigStateMachineResult'},
 ('CONFIG_PROCESSING', 'UPDATE'): {'updateConfigDone'},
 ('FAST_LANE_LOCK', 'GENERIC'): {'LockupDetector', 'runLockupDetection'},
 ('PACKET_PROCESSING', 'GENERIC'): {'fastlane',
  'processFlowPacketActions',
  'processInterfacePackets',
  'processPacketFromWire'},
 ('PACKET_PROCESSING', 'ICMP'): {'DivertedPacketHandler',
  'getKeyFromIcmpPayload'},
 ('SESSION_SETUP', 'APP_ID_MODIFY'): {'processAppIdModifyPackets',
  'processAppIdModifyPacketsImpl',
  'processPacketWithSessionBuilder'},
 ('SESSION_SETUP', 'GENERIC'): {'SessionBuilder',
  'addOrUpdateSession',
  'processCreateNewSessionAction'},
 ('SESSION_SETUP', 'PACKET_HANDLER'): {'SessionBuilder',
  'addOrUpdateSession',
  'processCreateNewSessionAction',
  'processPacketWithSessionBuilder'}}

In [27]:
# A category is predicted only when its keyword-match score is strictly
# greater than this fraction.
MATCH_THRESH = 0.3
df_st_highway_funcs['Category_pred'] = df_st_highway_funcs['Category_map'].apply(
    getCategory, thresh=MATCH_THRESH
)

In [28]:
# Annotated labels next to the predicted key, for rows that received a prediction.
df_st_highway_funcs.loc[
    df_st_highway_funcs['Category_pred'].notna(),
    ['Category', 'Sub_category', 'Category_pred'],
]

Unnamed: 0,Category,Sub_category,Category_pred
0,CONFIG_PROCESSING,GENERIC,"(CONFIG_PROCESSING, GENERIC)"
1,CONFIG_PROCESSING,UPDATE,"(CONFIG_PROCESSING, UPDATE)"
2,FAST_LANE_LOCK,GENERIC,"(FAST_LANE_LOCK, GENERIC)"
10,SESSION_SETUP,GENERIC,"(SESSION_SETUP, GENERIC)"
11,SESSION_SETUP,GENERIC,"(SESSION_SETUP, GENERIC)"
12,CONFIG_PROCESSING,GENERIC,"(CONFIG_PROCESSING, GENERIC)"
13,SESSION_SETUP,GENERIC,"(SESSION_SETUP, GENERIC)"
14,SESSION_SETUP,PACKET_HANDLER,"(SESSION_SETUP, PACKET_HANDLER)"
15,SESSION_SETUP,PACKET_HANDLER,"(SESSION_SETUP, PACKET_HANDLER)"
17,PACKET_PROCESSING,ICMP,"(PACKET_PROCESSING, ICMP)"


In [29]:
# df_st_highway_funcs.to_excel(os.path.join(INPUT_FOLDER, 'highway_funcs_030422_predicted.xlsx'))