In [1]:
import os
import re
import pandas as pd
import numpy as np
from pprint import pprint
import difflib
from collections import namedtuple
import itertools



# NOTE(review): absolute, machine-specific path; point this at your own copy of
# the sample traces before running the notebook.
INPUT_FOLDER = "/Users/ruchitm/Documents/Work/ssr-log-analysis/sample-traces"
OUTPUT_FOLDER = INPUT_FOLDER

# Collect every file below INPUT_FOLDER whose name contains both "stack" and
# "-demangled.txt".
filenames = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(INPUT_FOLDER)
    for name in names
    if "stack" in name and "-demangled.txt" in name
]

# A ticket is the name of the directory that directly contains a trace file.
# os.path is used (rather than splitting on '/') so this agrees with the
# separator that os.path.join put into the paths above.
tickets = {os.path.basename(os.path.dirname(path)) for path in filenames}
print(len(tickets))

75


In [2]:
# Group the discovered files by ticket, i.e. by the exact name of their parent
# directory. A substring test such as `ticket in path` would also attribute the
# files of ticket "X-123" to ticket "X-12".
files_by_ticket = {}
for path in filenames:
    files_by_ticket.setdefault(os.path.basename(os.path.dirname(path)), []).append(path)

# Build one record (dict) per trace file:
#   * non-blank lines before the first "stack trace" line are parsed as
#     "key: value" header fields;
#   * every other line (blank lines before that marker, the marker line itself
#     and everything after it) is appended, stripped, to record['stack_traces'].
records = []
for ticket, files in files_by_ticket.items():
    for file in files:
        record = {'JIRA': ticket, 'file': os.path.basename(file)}
        flag = False  # flips to True at the first line mentioning a stack trace
        with open(file, encoding="utf8", errors='ignore') as f:
            for line in f:
                if "stack trace" in line or "Stack trace" in line:
                    flag = True
                if not flag and line.strip():
                    # Split on the first ':' only; any further ':'-separated
                    # pieces of the value are stripped and re-joined with ':'.
                    key, _, value = line.partition(':')
                    record[key.strip()] = ':'.join(part.strip() for part in value.split(':'))
                else:
                    record.setdefault('stack_traces', []).append(line.strip())
        records.append(record)

In [3]:

# Columns kept from the parsed records, in the order they should appear.
cols = ['JIRA', 'file', 'PID', 'UID', 'GID', 'Signal', 'Timestamp',
       'Command Line', 'Executable', 'Control Group', 'Unit', 'Slice',
       'Boot ID', 'Machine ID', 'Hostname', 'Coredump', 'Message',
       'stack_traces']

# One row per trace file; rows with fewer than 4 non-null values are discarded,
# then each row is labelled by whether its file name contains 'full'.
df_st = (
    pd.DataFrame(records)[cols]
    .dropna(thresh=4)
    .assign(
        trace_type=lambda frame: frame['file'].map(
            lambda name: 'full' if 'full' in name else 'partial'
        )
    )
)

# str.match anchors at the start of the string, so these subsets hold the files
# whose names *begin* with 'full' / 'partial'.
starts_with_full = df_st['file'].str.match('full')
starts_with_partial = df_st['file'].str.match('partial')
df_st_full = df_st.loc[starts_with_full]
df_st_partial = df_st.loc[starts_with_partial]

# Number of distinct tickets in each subset (partial first, then full).
for subset in (df_st_partial, df_st_full):
    print(subset.JIRA.unique().shape)

(70,)
(39,)


In [4]:
# df_st.groupby(['Executable', 'trace_type']).JIRA.count().sort_values(ascending=False)

In [5]:
def get_first_trace(trace):
    """Return the cleaned lines of the first stack trace in `trace`.

    Lines are stripped and scanned in order. The first line containing
    'Stack trace' is treated as the opening header and skipped; scanning stops
    at the second such line. Every other non-empty line seen before that point
    (including any that precede the first header) is passed through
    `clean_frame` and collected.

    Args:
        trace: iterable of text lines, or None/NaN when the record has no
            stack-trace lines at all.

    Returns:
        list[str]: cleaned lines; empty when `trace` is missing or holds no
        non-empty, non-header lines.
    """
    # A record without a 'stack_traces' entry shows up in a DataFrame as NaN
    # (a float) or None; iterating over that would raise TypeError.
    if trace is None or isinstance(trace, float):
        return []

    res = []
    seen_header = False
    for line in trace:
        line = line.strip()
        if 'Stack trace' in line:
            if seen_header:
                # Second header: the first trace is complete.
                break
            seen_header = True
        elif line != '':
            res.append(clean_frame(line))
    return res

def clean_frame(frame):
    """Strip non-ASCII characters and hexadecimal tokens from one trace line.

    Non-ASCII characters are dropped first. Then every `0x...` token that is
    immediately followed by a whitespace character is removed together with
    that single whitespace character.

    Args:
        frame: one line of a stack trace (str).

    Returns:
        str: the cleaned line.
    """
    ascii_only = frame.encode("ascii", "ignore").decode()
    # Raw string: `\w` and `\s` are regex escapes, not valid Python string
    # escapes, and newer interpreters warn about them in ordinary literals.
    return re.sub(r'0x\w+\s', '', ascii_only)

# Substrings that disqualify a token from being reported by extract_funs.
IGNORE_PATTERNS = ['_M_', 'std::', '__gnu', '::operator', 'util::']
def extract_funs(trace):
    """Extract the '::'-qualified names from selected frames of a trace.

    A line is considered only if it starts with '#<digits>', whitespace and a
    word followed by '::', and it mentions 'lib128T' or '(highway)'. The
    '#<digits>' prefix is removed and the rest is split on whitespace and on
    the characters ( ) < > and comma. A token is kept when it contains '::',
    contains none of IGNORE_PATTERNS, and starts with at least one character
    from the set [word characters, '+', ':', '~', '?']; only that leading run
    of characters is kept.

    Args:
        trace: iterable of cleaned trace lines (str).

    Returns:
        list[str]: one entry per line that yielded at least one name; the
        names of a line are joined with a single space.
    """
    res = []
    for line in trace:
        if not re.match(r'#\d+\s+\w+::', line):
            continue
        if 'lib128T' not in line and '(highway)' not in line:
            continue

        body = re.sub(r'^#\d+', '', line)
        names = []
        for token in re.split(r'\s+|[()<>,]', body):
            if '::' not in token:
                continue
            if any(pat in token for pat in IGNORE_PATTERNS):
                continue
            # Keep only the leading run of allowed characters; tokens that
            # start with anything else (e.g. '&' or '*') are skipped.
            m = re.match(r'[\w+:~?]+', token)
            if m:
                names.append(m.group(0))

        if names:
            res.append(' '.join(names))
    return res

def dedup_traces(traces):
    """Return the distinct traces of `traces`, keeping first-seen order.

    Each trace is first sorted by the `pos` attribute of its frames. Duplicates
    are then removed wherever they occur in the input, not only when they are
    adjacent (which is all `itertools.groupby` on an unsorted list can do).

    Args:
        traces: iterable of traces; each trace is an iterable of frame objects
            that expose `.pos` and support equality comparison.

    Returns:
        list[list]: the unique, position-sorted traces.
    """
    unique = []
    for trace in traces:
        ordered = sorted(trace, key=lambda frame: frame.pos)
        # Equality-based membership keeps this usable with any frame type,
        # hashable or not.
        if ordered not in unique:
            unique.append(ordered)
    return unique

def _truncate_scoped_words(trace, keeps_scanning):
    """Shorten every '::'-qualified word of `trace` using `keeps_scanning`.

    Each line is split on whitespace and each word on '::' (empty components
    are dropped). Components are kept from the left up to and including the
    first one for which `keeps_scanning(component)` is false; if there is no
    such component the whole word is kept. Words are re-joined with a single
    space and lines with a newline.
    """
    shortened_frames = []
    for frame in trace.split('\n'):
        shortened_words = []
        for word in frame.split():
            parts = [part for part in word.split('::') if part]
            keep = 0
            for part in parts:
                keep += 1
                if not keeps_scanning(part):
                    break
            if keep:
                shortened_words.append('::'.join(parts[:keep]))
        shortened_frames.append(' '.join(shortened_words))
    return '\n'.join(shortened_frames)


def first_class_preprocessor(trace):
    """Cut each qualified name after its first component that is not all-lowercase.

    Leading components for which `str.islower()` is true are kept, followed by
    the first component for which it is false (presumably the class name after
    its namespaces, judging by the function name); anything after that is
    dropped.

    Args:
        trace: newline-separated frames (str), each a space-separated list of
            '::'-qualified names.

    Returns:
        str: the trace with every name shortened, same line structure.
    """
    return _truncate_scoped_words(trace, str.islower)


def first_function_preprocessor(trace):
    """Cut each qualified name after its first lower-camel-case style component.

    Scanning continues past components that are all-lowercase, start with an
    uppercase letter, or start with '~'. The first component that is none of
    these (e.g. 'handleRequest') is kept and everything after it is dropped.

    Args:
        trace: newline-separated frames (str), each a space-separated list of
            '::'-qualified names.

    Returns:
        str: the trace with every name shortened, same line structure.
    """
    def keeps_scanning(part):
        return part.islower() or part[0].isupper() or part[0] == '~'

    return _truncate_scoped_words(trace, keeps_scanning)

def add_frame_num(trace):
    """Prefix every line of `trace` with its zero-based index and ': '.

    Args:
        trace: newline-separated frames (str).

    Returns:
        str: the same lines, each rewritten as '<index>: <line>'.
    """
    numbered = []
    for position, frame in enumerate(trace.split('\n')):
        numbered.append(f"{position}: {frame}")
    return '\n'.join(numbered)


################# TESTING ######################
# trace = df_st_highway_funcs.traces.values[10]
# print(trace, '\n')
# print(first_class_preprocessor(trace), '\n')
# print(first_function_preprocessor(trace))

In [6]:
# Restrict the partial traces to rows whose Executable starts with `exe`
# (str.match is a regex match anchored at the start of the string).
exe = '/usr/bin/highway'
# na=False: rows with no Executable value are excluded. Without it the mask
# contains NaN for those rows and boolean indexing raises.
idx = df_st_partial.Executable.str.match(exe, na=False)
df_st_highway = df_st_partial[idx].copy()

# First stack trace of each file, then the qualified names of its frames.
df_st_highway['first_st'] = df_st_highway['stack_traces'].apply(get_first_trace)
df_st_highway['extracted_st'] = df_st_highway['first_st'].apply(extract_funs)

# Tag every extracted frame with its position so traces can be re-sorted later.
Frame = namedtuple('Frame', ['pos', 'val'])
df_st_highway['frames'] = df_st_highway.extracted_st.apply(
    lambda funcs: [Frame(pos, val) for pos, val in enumerate(funcs)]
)

# One row per ticket holding the list of that ticket's traces.
df_st_highway_funcs = df_st_highway.groupby(['JIRA']).frames.apply(list).reset_index()

# Remove duplicate traces within a ticket, render each trace as newline-joined
# text, then expand to one row per (ticket, trace).
df_st_highway_funcs['frames'] = df_st_highway_funcs.frames.apply(dedup_traces)
df_st_highway_funcs['traces'] = df_st_highway_funcs.frames.apply(
    lambda traces: ['\n'.join(frame.val for frame in trace) for trace in traces]
)
df_st_highway_funcs = df_st_highway_funcs.explode(['traces'])

# Shortened and line-numbered variants of every trace.
df_st_highway_funcs['processed_traces'] = df_st_highway_funcs.traces.apply(first_function_preprocessor)
df_st_highway_funcs['processed_traces_numbered'] = df_st_highway_funcs.processed_traces.apply(add_frame_num)

print(len(df_st_highway_funcs))

48


In [8]:
# df_st_highway_funcs.to_csv(os.path.join(OUTPUT_FOLDER, 'highway_funcs_030422.csv'))