Commit 8587040

Merge 08e1280 into f2086a4
scallister committed Apr 10, 2018
2 parents f2086a4 + 08e1280
Showing 3 changed files with 111 additions and 44 deletions.
75 changes: 48 additions & 27 deletions fossor/checks/similar_log_errors.py
@@ -3,6 +3,7 @@

from difflib import SequenceMatcher
from multiprocessing import Pool, cpu_count
from collections import OrderedDict

from fossor.checks.check import Check

@@ -12,8 +13,8 @@ class SimilarLogErrors(Check):

# TODO add table formatting to show diffs over time?

SIMILARITY_RATIO = 0.7
MAX_COMMON_LINES = 20
SIMILARITY_RATIO = 0.7 # Lines must be at least 70% similar in order to be counted.
MAX_COMMON_LINES = 20 # Only track similarity ratios for up to 20 lines.

# Logs matching these strings will not be included
log_name_blacklist = ['_configuration.log',
@@ -42,8 +43,16 @@ def run(self, variables):

for filename, common_lines in pool_results:
lines = []
for line, count in sorted(common_lines.items(), reverse=True, key=lambda x: x[1]): # Sort by count
lines.append(f"Count: {count}. {line.strip()}")
for line, values in common_lines.items(): # OrderedDict preserves the order lines were first seen
count = values['count']
first_seen = values.get('first_seen', None)
first_seen_text = ''
if first_seen:
first_seen_text = f"First seen: {first_seen}, "
count_text = f"Count: {count}, "
line_text = f"{line.strip()}"
text = first_seen_text + count_text + line_text
lines.append(text)
if lines:
result += f"filename: {filename}\n"
result += '\n'.join(lines) + '\n\n'
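The loop above assembles one summary line per grouped error, prefixing an optional first-seen timestamp ahead of the count. A condensed sketch of that assembly, with illustrative values:

values = {'count': 3, 'first_seen': '2017/10/10 00:00:34.251'}  # shape produced by _group_similar_log_lines below
line = 'ERROR [ModelResolverOperator] Some example message 1.'  # illustrative grouped line

first_seen = values.get('first_seen')
first_seen_text = f"First seen: {first_seen}, " if first_seen else ''
text = first_seen_text + f"Count: {values['count']}, " + line.strip()
# text == 'First seen: 2017/10/10 00:00:34.251, Count: 3, ERROR [ModelResolverOperator] Some example message 1.'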
@@ -59,61 +68,73 @@ def _remove_blacklisted_logs(self, log_files):
return log_files

def get_error_pattern_counts(self, filename):
common_lines = {}
common_lines = OrderedDict()
self.log.debug(f"Processing filename: {filename}")
f = open(filename, 'rt')
for line in f.readlines():
line = self._process_line(line)
line, date = self._process_line(line)
if not line:
continue

# Group similar log lines
common_lines = self._group_similar_log_lines(line=line, common_lines=common_lines)
common_lines = self._group_similar_log_lines(line=line, date=date, common_lines=common_lines)

self.log.debug(f"Finished processing filename: {filename}")
return filename, common_lines

def _process_line(self, line):
'''Returns (line, date) parsed from line; line is None if it should be ignored'''
date = None
first_character = line[0]

# Skip certain log lines
# Skip since this is the lower portion of a stacktrace
if first_character.isspace():
return
line = None
# Line did not start with a date
elif first_character.isalpha() and 'Exception' not in line:
return
line = None
elif line.startswith('Caused by'):
return
# Check for an error level
line = None
# Check for date and error level
elif first_character.isdigit():
line_split = line.split()
try:
line_level = line_split[2]
if line_level != 'ERROR':
line = None
else:
line = ' '.join(line_split[2:]) # Remove date portion from line
date = ' '.join(line_split[:2])
# Truncate line
line = f"{line:.200}"
except IndexError as e:
self.log.debug(f"IndexError parsing line: {line}")
return
if line_level != 'ERROR':
return
line = ' '.join(line_split[2:]) # Remove date portion from line
# Truncate line
line = f"{line:.200}"
return line

def _group_similar_log_lines(self, line, common_lines):
line = None
return line, date
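_process_line now returns a (line, date) pair rather than a bare line. A worked example of the digit-prefixed branch, using a line from the test fixture further down:

line = '2017/10/10 00:00:34.251 ERROR [ModelResolverOperator] [stuff] [service-foo] [543245] Some example message 1.'
line_split = line.split()
level = line_split[2]                         # 'ERROR'; any other level yields (None, None)
date = ' '.join(line_split[:2])               # '2017/10/10 00:00:34.251'
message = f"{' '.join(line_split[2:]):.200}"  # date stripped off, truncated to 200 characters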

def _group_similar_log_lines(self, line, date, common_lines):
highest_similarity_ratio = 0
most_similar_line = None
for common_line in common_lines:
for common_line in common_lines.keys():
similarity_ratio = SequenceMatcher(a=common_line, b=line).quick_ratio()
# Find the most similar line
if similarity_ratio > highest_similarity_ratio:
highest_similarity_ratio = similarity_ratio
most_similar_line = common_line

# Count the line if it is similar to another line
if highest_similarity_ratio > self.SIMILARITY_RATIO:
common_lines[most_similar_line] += 1
elif len(common_lines) < self.MAX_COMMON_LINES:
common_lines[line] = 1
else:
common_lines.setdefault('Other Error Lines', 0)
common_lines['Other Error Lines'] += 1
self.log.debug(f"Line had a ratio of {highest_similarity_ratio}, incrementing count. New line: {line}, Old line: {most_similar_line}")
line = most_similar_line
# Do we already have too many unique lines?
elif len(common_lines) >= self.MAX_COMMON_LINES:
line = 'Other Error Lines'
self.log.debug(f"Too many unique lines, {len(common_lines)} of {self.MAX_COMMON_LINES}, adding this to category: {line}")

self.log.debug(f"Line {line} had a ratio of {highest_similarity_ratio} which was under the threshold of {self.SIMILARITY_RATIO}, counting this line as unique.")
values = common_lines.setdefault(line, {})
values['count'] = values.get('count', 0) + 1
if date:
values.setdefault('first_seen', date)
return common_lines
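The grouping above hinges on difflib's quick_ratio(), an inexpensive upper bound on SequenceMatcher.ratio(): each incoming line is scored against every tracked line, and a score above SIMILARITY_RATIO folds it into its closest existing bucket. A self-contained sketch of the same idea (sample lines invented, first_seen tracking omitted):

from collections import OrderedDict
from difflib import SequenceMatcher

SIMILARITY_RATIO = 0.7
buckets = OrderedDict()
for line in ['connection to db-1 failed', 'connection to db-2 failed', 'disk full on /var']:
    closest = max(buckets, key=lambda b: SequenceMatcher(a=b, b=line).quick_ratio(), default=None)
    if closest and SequenceMatcher(a=closest, b=line).quick_ratio() > SIMILARITY_RATIO:
        line = closest  # similar enough: count it against the existing bucket
    buckets.setdefault(line, {'count': 0})['count'] += 1
# The db-1/db-2 lines collapse into one bucket with count 2; the disk line stays separate.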
2 changes: 1 addition & 1 deletion setup.py
@@ -36,7 +36,7 @@ def run_tests(self):

setup(
name='fossor',
version='1.1.1',
version='1.1.2',
description=description,
long_description=description,
url='https://github.com/linkedin/fossor',
78 changes: 62 additions & 16 deletions test/test_check_similar_log_errors.py
@@ -3,38 +3,84 @@

import sys
import logging
import tempfile
import random
import string

from fossor.checks.similar_log_errors import SimilarLogErrors

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
log = logging.getLogger(__name__)

example_file_text = '''
java.util.concurrent.TimeoutException
2017/10/10 00:00:34.251 ERROR [ModelResolverOperator] [stuff] [service-foo] [543245] Some example message 1.
2017/10/10 00:00:34.251 INFO [ModelResolverOperator] [stuff] [service-foo] [145234] Some example message 2.
2017/10/10 00:04:34.251 ERROR [ModelResolverOperator] [stuff] [service-foo] [983453] Some example message 3.
2017/10/10 00:07:34.251 INFO [ModelResolverOperator] [stuff] [service-foo] [095463] Some example message 4.
2017/10/10 00:15:34.251 INFO [ComponentFoobar] [stuff] [service-bar] [242349] A substantially different set of text
'''


def test_process_line():
sle = SimilarLogErrors()
line = 'java.util.concurrent.TimeoutException'
assert sle._process_line(line=line) == line
processed_line, date = sle._process_line(line=line)
assert processed_line == line

line = '2017/10/10 00:00:34.251 ERROR [ModelResolverOperator] [stuff] [ranker-jymbii] [random_treeid] Some message text.'
result = sle._process_line(line=line)
assert result == ' '.join(line.split()[2:]) # Two lines should match with the datetime stripped out
processed_line, date = sle._process_line(line=line)
assert processed_line == ' '.join(line.split()[2:]) # The processed line should match once the datetime is stripped out

line = '2017/10/10 00:00:34.251 INFO [ModelResolverOperator] [stuff] [ranker-jymbii] [random_treeid] Some message text.'
result = sle._process_line(line=line)
assert result is None # Should be None because INFO was passed in
processed_line, date = sle._process_line(line=line)
assert processed_line is None # Should be None because INFO was passed in

line = ' at sun.reflect.GeneratedMethodAccessor67.invoke(Unknown Source) ~[?:?]'
assert sle._process_line(line=line) is None
processed_line, date = sle._process_line(line=line)
assert processed_line is None


def create_tmp_log_file(text, dir_path):
path = tempfile.mktemp(dir=dir_path)
f = open(path, 'w')
f.write(text)
f.close()
return path


def test_grouping_similar_lines():
def test_basic_output():
sle = SimilarLogErrors()
common_lines = {}
common_lines['This is a line that is very distinct.'] = 2
common_lines['The rain in spain falls mainly in the plains'] = 5
common_lines['The quick brown fox jumps over the fence and goes to the grocery store.'] = 4

line = 'The quick RANDOM brown fox jumps TEXT over the fence and goes to the grocery store.'
result = sle._group_similar_log_lines(line=line, common_lines=common_lines)
log.debug(f"result: {result}")
assert result['The quick brown fox jumps over the fence and goes to the grocery store.'] == 5
with tempfile.TemporaryDirectory() as tmpdir:
filepath = create_tmp_log_file(text=example_file_text, dir_path=tmpdir)
variables = {}
variables['LogFiles'] = [filepath]
result = sle.run(variables)
print(f"Result:\n{result}")

assert 'filename' in result
assert 'Count' in result
assert 'ERROR' in result
assert 'First seen: 2017/10/10 00:00:34.251' in result


def test_max_unique_lines():
sle = SimilarLogErrors()

# Create random log lines
max_lines = sle.MAX_COMMON_LINES
log_text = ''
for i in range(max_lines+5):
# random_words = [''.join(random.choices(string.ascii_uppercase + string.digits, k=5)) for i in range(40)] # Generate 40 5 character words
random_text = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5)) * 50
log_text += f'2017/10/10 00:00:34.251 ERROR {random_text}\n'

# Confirm max_lines is honored properly
with tempfile.TemporaryDirectory() as tmpdir:
filepath = create_tmp_log_file(text=log_text, dir_path=tmpdir)
variables = {}
variables['LogFiles'] = [filepath]
result = sle.run(variables)
print(f"Result:\n{result}")
assert 'Other Error Lines' in result
assert 'Count: 5' in result
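As both tests demonstrate, the check is driven entirely by variables['LogFiles']. A minimal direct invocation might look like this (log path hypothetical):

from fossor.checks.similar_log_errors import SimilarLogErrors

check = SimilarLogErrors()
result = check.run({'LogFiles': ['/var/log/service-foo/service.log']})  # hypothetical path
print(result)  # one 'filename:' section per log, one 'First seen/Count' line per grouped error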
