In [3]:
import re


# Compiled regular expressions shared by the parsing helpers below, keyed by a
# short name. Group counts and group order are relied on by the callers, so
# only the matched text of a group may be widened, never its position.
patterns = {
    # Record prefix: ISO-like date stamp with a signed 4-digit UTC offset,
    # followed by an uptime value in seconds (any number of integer digits).
    'timestamp': re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}[+-]\d{4}: \d+\.\d{3}:'),
    # Literal Java exception name (dots escaped so they only match dots).
    'oom': re.compile(r'java\.lang\.OutOfMemoryError'),
    # Completion line of the program output (roughly "Execution finished!
    # Number of objects generated:"); group 1 is the count.
    'count': re.compile(r'执行结束!共生成对象次数:(\d+)'),
    # Initial heap size flag; group 1 is the numeric value.
    'mem': re.compile(r'-XX:InitialHeapSize=(\d+)'),
    # Heap summary lines: (name, size in K).
    "level1": re.compile(r'^\s+(.*) total (\d+)[K],'),
    "level2": re.compile(r'^\s+(.*) (\d+)K,'),
    "meta": re.compile(r'^\s+(.*) used \d+K, capacity (\d+)K,'),
    # "[action ..., <secs> secs] ... [Times: user=.. sys=.., real=.. secs":
    # groups are (action, gc secs, user, sys, real).
    "gctime": re.compile(r'^\[([^,]+).*, (\d+\.\d{7}) secs\].*\[Times: user=(\d+\.\d{2}) sys=(\d+\.\d{2}), real=(\d+\.\d{2}) secs'),
    # "[action: <a>/<b> secs] ... [Times: ...": groups are
    # (action, second of the two durations, user, sys, real).
    "cmsAction": re.compile(r'^\[([^:]+)\: \d+\.\d{3}\/(\d+\.\d{3}) secs\].*\[Times: user=(\d+\.\d{2}) sys=(\d+\.\d{2}), real=(\d+\.\d{2}) secs'),
    # Whole string is "[action, <secs> secs]".
    "actionAndTime": re.compile(r'^\[([^,]+), (\d+\.\d{7}) secs\]$'),
    # Whole string is a single bracketed action without commas.
    "onlyAction": re.compile(r'^\[([^,]+)\]$'),
    # Size transitions such as "1K->2K(3K)" and plain "1K(2K)"; both are
    # stripped from a record before the action/time patterns are applied.
    "memdeta": re.compile(r'\d+[KMG]\->\d+[KMG]\(\d+[KMG]\)'),
    "memused": re.compile(r'\d+[KMG]\(\d+[KMG]\)')
}

def match(patternName, string):
    """Apply the compiled pattern stored under ``patternName`` at the start of ``string``.

    Returns the ``re`` match object, or ``None`` when the pattern does not
    match. Raises ``KeyError`` for a name missing from ``patterns``.
    """
    compiled = patterns[patternName]
    return compiled.match(string)

def search(patternName, string):
    """Look for the compiled pattern stored under ``patternName`` anywhere in ``string``.

    Returns the first ``re`` match object, or ``None`` when nothing matches.
    Raises ``KeyError`` for a name missing from ``patterns``.
    """
    compiled = patterns[patternName]
    return compiled.search(string)

def sub(patternName, replace, string):
    """Replace every match of the pattern stored under ``patternName`` in ``string``.

    ``replace`` is handed to ``Pattern.sub`` unchanged; the substituted string
    is returned. Raises ``KeyError`` for a name missing from ``patterns``.
    """
    compiled = patterns[patternName]
    return compiled.sub(replace, string)

def readLog(gc, mem):
    """Read the program output and the GC log for one collector / heap size.

    Files are looked up as ``log/<gc><mem>M.out`` and ``log/<gc><mem>M.log``
    relative to the current working directory.

    Returns a tuple ``(output, cmd, records, heap)``:

    * ``output``  - full text of the ``.out`` file.
    * ``cmd``     - first log line starting with ``CommandLine`` (``None`` if
      there is none); every line before it is ignored.
    * ``records`` - list of log records. A line matching the ``timestamp``
      pattern starts a new record; other lines are appended to the current
      record.
    * ``heap``    - the line starting with ``Heap`` plus everything after it
      (``None`` if there is none).

    Lines keep their trailing newline and are joined with an extra ``'\\n'``,
    so joined text contains blank lines between the original lines.
    """
    # read out file
    # Plain read mode: nothing is written, so read-only files must work too.
    with open('log/%s%dM.out' % (gc, mem), 'r') as f:
        output = f.read()

    # read gc log
    cmd = None
    records = []
    heap = None

    with open('log/%s%dM.log' % (gc, mem), 'r') as f:
        for line in f:
            if not cmd:
                if line.startswith('CommandLine'):
                    cmd = line
            elif heap:
                # Once the heap summary has started, the rest belongs to it.
                heap = heap + '\n' + line
            elif line.startswith('Heap'):
                heap = line
            elif match('timestamp', line) or not records:
                # A continuation line with no record before it becomes its own
                # record instead of raising IndexError on records[-1].
                records.append(line)
            else:
                records[-1] = records[-1] + '\n' + line

    return output, cmd, records, heap

def level1(record):
    """Yield each outermost ``[...]`` group of ``record`` with nested groups removed.

    Only characters sitting directly inside an outermost bracket pair are
    kept; anything inside deeper brackets (including those brackets) and
    anything outside all brackets is dropped. Each yielded string starts with
    ``[`` and ends with ``]``. A group that is never closed is not yielded.
    """
    depth = 0
    pieces = []
    for char in record:
        if char == '[':
            if depth == 0:
                # A new outermost group begins; discard the previous buffer.
                pieces = [char]
            depth += 1
        elif char == ']':
            depth -= 1
            if depth == 0:
                pieces.append(char)
                yield ''.join(pieces)
        elif depth == 1:
            pieces.append(char)

def parseRecords(records):
    """Yield one tuple per record: the action name followed by its timings.

    For each record the timestamp prefixes are removed, the outermost bracket
    groups are joined with a space, and size figures (``memdeta`` /
    ``memused`` patterns) are stripped. The result is tried against four
    patterns in order and the first hit decides the tuple:

    * ``gctime`` / ``cmsAction`` -> ``(action, gc, user, sys, real)``
    * ``actionAndTime``          -> ``(action, gc)``
    * ``onlyAction``             -> ``(action,)``

    The action is whitespace-stripped and every other group is converted to
    ``float``. A record matching none of the patterns yields nothing and
    prints ``passed``.
    """
    # (lookup function, pattern name) pairs, in priority order.
    attempts = (
        (search, 'gctime'),
        (search, 'cmsAction'),
        (match, 'actionAndTime'),
        (match, 'onlyAction'),
    )

    for raw in records:
        summary = ' '.join(level1(sub('timestamp', '', raw)))
        for sizePattern in ('memdeta', 'memused'):
            summary = sub(sizePattern, '', summary)

        for finder, name in attempts:
            hit = finder(name, summary)
            if hit:
                action, *numbers = hit.groups()
                yield (action.strip(), *map(float, numbers))
                break
        else:
            print("passed")

def parseSpace(pattern, line):
    """Extract ``(name, size)`` from one heap summary line.

    ``pattern`` is a key of ``patterns`` whose first group is the space name
    and whose second group is an integer size. The name is returned
    whitespace-stripped and the size as ``int``. A line that does not match
    raises ``AttributeError`` because the search result is ``None``.
    """
    found = search(pattern, line)
    name, size = found.group(1, 2)
    return name.strip(), int(size)

def splitHeap(heap):
    """Group the heap summary text into sections by indentation.

    The first line of ``heap`` is skipped. A line starting with exactly one
    leading space opens a new section; a line starting with two or more
    spaces is added to the section opened last. Lines that do not start with
    a space (including empty ones) are ignored.

    Returns a list of sections, each a list of lines with the section header
    first.
    """
    sections = []
    body = heap.split('\n')[1:]
    for row in body:
        if not row.startswith(' '):
            continue
        if row.startswith('  '):
            sections[-1].append(row)
        else:
            sections.append([row])
    return sections

def parseHeap(lines):
    """Yield ``(name, size)`` pairs for the sections produced by ``splitHeap``.

    ``lines`` is a list of sections, each a list of text lines with the
    section header first. When there are more than two sections, the first
    one is reported in full: its header via the ``level1`` pattern and each
    remaining line via ``level2``. After that the header of the
    second-to-last section is parsed with ``level1`` and the header of the
    last section with ``meta``.
    """
    if len(lines) > 2:
        firstSection = lines[0]
        yield parseSpace('level1', firstSection[0])
        yield from (parseSpace('level2', row) for row in firstSection[1:])

    secondToLast, last = lines[-2], lines[-1]
    yield parseSpace('level1', secondToLast[0])
    yield parseSpace('meta', last[0])


def parseLogFile(gc, mem):
    """Parse the output file and GC log for one collector / heap size.

    Returns ``(count, memSize, gcTimes, spaces)``:

    * ``count``   - integer from the ``count`` pattern in the program output,
      the string ``'oom'`` if only the ``oom`` pattern is found, otherwise
      ``None`` (a notice is shown through ``display``).
    * ``memSize`` - the ``-XX:InitialHeapSize`` value from the command line,
      floor-divided by 1024.
    * ``gcTimes`` - list of tuples produced by ``parseRecords``.
    * ``spaces``  - list of ``(name, size)`` pairs produced by ``parseHeap``.
    """
    output, cmd, records, heap = readLog(gc, mem)

    # parse output
    count = None
    finished = search('count', output)
    if finished:
        count = int(finished.group(1))
    elif search('oom', output):
        count = 'oom'
    else:
        display('none pattern matched.')

    # parse mem size
    heapSize = int(search('mem', cmd).group(1))
    memSize = heapSize // 1024

    # parse records
    gcTimes = [*parseRecords(records)]

    # parse heap
    sections = splitHeap(heap)
    spaces = [*parseHeap(sections)]

    return count, memSize, gcTimes, spaces

# Try the parser on the ConcMarkSweepGC / 128M log pair; as the last expression
# of the notebook cell, the returned tuple is echoed as the cell output.
parseLogFile('ConcMarkSweepGC', 128)

('oom',
 131072,
 [('GC (Allocation Failure)', 0.005012, 0.01, 0.0, 0.0),
  ('GC (Allocation Failure)', 0.0089492, 0.02, 0.01, 0.01),
  ('GC (Allocation Failure)', 0.0077728, 0.02, 0.0, 0.01),
  ('GC (Allocation Failure)', 0.0087592, 0.03, 0.01, 0.01),
  ('GC (Allocation Failure)', 0.0105731, 0.03, 0.01, 0.01),
  ('GC (CMS Initial Mark)', 0.0002193, 0.0, 0.0, 0.0),
  ('CMS-concurrent-mark-start',),
  ('CMS-concurrent-mark', 0.002, 0.0, 0.0, 0.01),
  ('CMS-concurrent-preclean-start',),
  ('CMS-concurrent-preclean', 0.0, 0.0, 0.0, 0.0),
  ('CMS-concurrent-abortable-preclean-start',),
  ('GC (Allocation Failure)', 0.007937, 0.03, 0.0, 0.01),
  ('GC (Allocation Failure)', 0.0101394, 0.04, 0.01, 0.01),
  ('GC (Allocation Failure)', 0.0197839, 0.02, 0.0, 0.02),
  ('GC (Allocation Failure)', 0.0159261, 0.01, 0.0, 0.01),
  ('GC (CMS Initial Mark)', 0.000289, 0.0, 0.0, 0.0),
  ('CMS-concurrent-mark-start',),
  ('CMS-concurrent-mark', 0.001, 0.01, 0.0, 0.01),
  ('CMS-concurrent-preclean-start',)

In [24]:
import pandas
import numpy as np

def outputReporter(gc, writer):
    """Build the report tables for one collector across all heap sizes.

    For each heap size in (128, 512, 1024, 2048, 4096) the matching logs are
    parsed with ``parseLogFile`` and three tables are derived.

    Parameters:
        gc: collector name used to locate the log files and to fill the
            ``gc`` column of the first two tables.
        writer: accepted for the caller's convenience; not used here.

    Returns a tuple ``(dfGcSpace, dfGcKpi, dfAction)``:

    * ``dfGcSpace`` - indexed by space name; per heap size one column with
      the size and one ``...%`` column with the size relative to ``memSize``,
      preceded by a ``gc`` column.
    * ``dfGcKpi``   - indexed by KPI name, one column per heap size, preceded
      by a ``gc`` column.
    * ``dfAction``  - per ``"<mem>M <action>"`` group: a count plus the sum,
      mean, max and median of the gc/user/sys/real timings.
    """
    dfSpaces = []
    dfRecords = []
    dfKpis = []

    for mem in (128, 512, 1024, 2048, 4096):
        count, memSize, gcTimes, spaces = parseLogFile(gc, mem)

        # space dataframe
        colName = '%dM' % (mem, )
        dfSpace = pandas.DataFrame(spaces, columns=['space', colName]).set_index('space')
        dfSpace[colName + '%'] = dfSpace[colName].apply(lambda x: '%.1f%%' % (x / memSize *100))
        dfSpaces.append(dfSpace)

        # record action dataframe
        dfRecord = pandas.DataFrame(gcTimes, columns=['action', 'gc', 'user', 'sys', 'real'])
        dfRecord['action'] = '%dM '%mem + dfRecord['action']
        # Longest single gc duration, taken from the raw rows. Summing the
        # per-action maxima would overstate it, and reading it from the
        # aggregated frame would depend on the label numpy gives np.max
        # ('amax' or 'max' depending on the numpy version).
        maxGcTime = dfRecord['gc'].max()
        grouped = dfRecord.groupby(['action'])
        dfRecord = grouped.agg([np.sum, np.mean, np.max, np.median])
        # Remember the aggregated column order, then put the count in front.
        columns = dfRecord.columns.tolist()
        columns.insert(0, ('gc', 'count'))
        dfRecord[('gc','count')] = grouped.size()
        dfRecord = dfRecord.reindex(columns=columns)
        dfRecords.append(dfRecord)

        # kpi dataframe
        dfKpi = pandas.DataFrame((
            ('malloc count', count),
            ('gc time', dfRecord[('gc', 'sum')].sum()),
            ('max gc time', maxGcTime),
            ('user time', dfRecord[('user', 'sum')].sum()),
            ('sys time', dfRecord[('sys', 'sum')].sum()),
            ('real time', dfRecord[('real', 'sum')].sum())
        ), columns=['kpi', colName]).set_index('kpi')
        dfKpis.append(dfKpi)

    # dfGcSpace
    dfGcSpace = pandas.concat(dfSpaces, axis=1)
    columns = dfGcSpace.columns.tolist()
    dfGcSpace['gc'] = gc
    columns.insert(0, 'gc')
    dfGcSpace = dfGcSpace.reindex(columns=columns)

    # dfGcKpi
    dfGcKpi = pandas.concat(dfKpis, axis=1)
    columns = dfGcKpi.columns.tolist()
    dfGcKpi['gc'] = gc
    columns.insert(0, 'gc')
    dfGcKpi = dfGcKpi.reindex(columns=columns)

    # dfAction
    dfAction = pandas.concat(dfRecords)

    return dfGcSpace, dfGcKpi, dfAction

# Write the workbook: a combined 'space' sheet, a combined 'KPI' sheet, then
# one '<collector> actions' sheet per collector.
with pandas.ExcelWriter('gclog.xlsx') as writer:
    # The first two pages gather one frame per collector before being merged.
    pages = [
        {'sheet_name': title, 'dataframes': []}
        for title in ('space', 'KPI')
    ]
    spaceFrames = pages[0]['dataframes']
    kpiFrames = pages[1]['dataframes']

    for gc in ('SerialGC', 'ParallelGC', 'ConcMarkSweepGC', 'G1GC'):
        dfGcSpace, dfGcKpi, dfAction = outputReporter(gc, writer)
        spaceFrames.append(dfGcSpace)
        kpiFrames.append(dfGcKpi)
        pages.append({'sheet_name': '%s actions' % (gc,), 'dataframe': dfAction})

    # Stack the per-collector frames of the two summary pages.
    for combined in pages[:2]:
        combined['dataframe'] = pandas.concat(combined['dataframes'])

    for page in pages:
        sheet = page['dataframe']
        sheet.to_excel(writer, sheet_name=page['sheet_name'])