Skip to content
Browse files

Added a naive clf parser.

  • Loading branch information...
1 parent 2f07cd3 commit a66c1941b27e2a94e39846ad9d466b21ab507673 @jbohman committed
View
3 config.yaml
@@ -11,5 +11,4 @@ cassandra_port: 9160
paths:
- name: ~/coding/cassandra/access.log
recursive: False
- - name: /var/log/testing/
- recurisve: True
+ format: "%h %l %u %t %r %s %O %{Referer}i %{User-Agent}i"
View
14 logsandra/monitor/monitor.py
@@ -9,14 +9,17 @@
#except ImportError:
from watchers.standard import StandardWatcher as Watcher
+from parsers.clf import ClfParser
+
class Reader(object):
def __init__(self, tail=False):
self.tail = tail
self.seek_data = {}
+ self.parser = {}
- def callback(self, filename):
+ def callback(self, filename, data):
if os.path.basename(filename).startswith('.'):
return False
@@ -29,7 +32,14 @@ def callback(self, filename):
file_handler.seek(0, os.SEEK_END)
for line in file_handler:
- print line.strip()
+ line = line.strip()
+
+ if filename not in self.parser:
+ self.parser[filename] = ClfParser(data['format'])
+
+ result = self.parser[filename].parse_line(line)
+
+ print result
self.seek_data[filename] = file_handler.tell()
file_handler.close()
View
0 logsandra/monitor/parsers/__init__.py
No changes.
View
51 logsandra/monitor/parsers/clf.py
@@ -0,0 +1,51 @@
+import re
+import dateutil.parser
+
# Maps each supported log-format directive (Apache-style, e.g. the
# "%h %l %u %t %r %s ..." strings used in config.yaml) to the regex fragment
# that matches it. Named groups become keys of the dict returned by
# ClfParser.parse_line; '%l' has no named group, so it is matched but dropped.
# NOTE: '%s', '%>s' and '%<s' all capture into 'status'; using more than one
# of them in a single format makes re.compile fail on the duplicate group.
clf = {
    '%h': r'(?P<host>\S+)',
    '%l': r'\S+',
    '%u': r'(?P<user>\S+)',
    '%t': r'\[(?P<time>.+)\]',
    '%r': r'"(?P<request>.+)"',
    '%s': r'(?P<status>[0-9]+)',
    '%>s': r'(?P<status>[0-9]+)',
    '%<s': r'(?P<status>[0-9]+)',
    '%b': r'(?P<size>\S+)',
    '%O': r'(?P<size_with_headers>\S+)',
    '%{Referer}i': r'"(?P<referer>.*)"',
    '%{User-Agent}i': r'"(?P<user_agent>.*)"',
    '%U': r'(?P<url_path>\S+)',
    '%p': r'(?P<port>[0-9]+)',
    '%v': r'(?P<server>\S+)'
}


class ClfParser(object):
    """Parse log lines according to a whitespace-separated format string.

    The format string is a sequence of directives from the module-level
    ``clf`` table. The directives are turned into a single compiled regular
    expression in which consecutive fields are separated by whitespace.
    """

    # Fields whose literal value '-' is reported as None.
    _DASH_AS_NONE = ('user', 'size', 'size_with_headers', 'referer')

    def __init__(self, format):
        """Compile the regular expression for ``format``.

        Raises KeyError if ``format`` contains a directive that is not
        present in the ``clf`` table.
        """
        self.format = format

        # split() without an argument tolerates repeated, leading and
        # trailing whitespace; split(' ') produced empty tokens for those
        # and failed with KeyError('').
        parts = [clf[element] for element in self.format.split()]

        self.pattern = re.compile(r'\s+'.join(parts) + r'\s*\Z')

    def parse_line(self, line):
        """Return a dict of the captured fields, or None if no match.

        Values are strings, except that:
          * 'user', 'size', 'size_with_headers' and 'referer' are None when
            the logged value is '-';
          * 'time' (only present when the format contains '%t') is the
            result of dateutil.parser.parse(..., fuzzy=True).
        """
        match = self.pattern.match(line)
        if match is None:
            # Blank or malformed line: let the caller decide what to do
            # instead of failing with AttributeError on None.groupdict().
            return None

        res = match.groupdict()

        for key in self._DASH_AS_NONE:
            if res.get(key) == '-':
                res[key] = None

        # Only formats that include '%t' produce a 'time' group.
        if 'time' in res:
            res['time'] = dateutil.parser.parse(res['time'], fuzzy=True)

        return res
View
1 logsandra/monitor/watchers/inotify.py
@@ -22,6 +22,7 @@ def __init__(self, entities, callback, update_freq=0, rescan_freq=20):
def loop(self):
notifier = pyinotify.Notifier(self.wm, EventHandler(callback=self.callback), self.update_freq)
for entity in self.entities:
+ # TODO: proc_fun, to add more information about file
self.wm.add_watch(entity['name'], pyinotify.IN_MODIFY, rec=entity['recursive'])
notifier.loop()
View
24 logsandra/monitor/watchers/standard.py
@@ -13,8 +13,8 @@ def __init__(self, entities, callback, update_freq=10, rescan_freq=20):
self.files = {}
- for filename in self._find_files_generator():
- self.files[filename] = self._mtime(filename)
+ for filename, entity in self._find_files_generator():
+ self.files[filename] = {'mtime': self._mtime(filename), 'format': entity['format']}
self._last_rescan_time = time.time()
@@ -25,11 +25,11 @@ def loop(self):
self._last_rescan_time = self._rescan()
reference_time = time.time()
- for filename, mtime in self.files.iteritems():
+ for filename, data in self.files.iteritems():
new_mtime = self._mtime(filename)
- if new_mtime > mtime:
- self.files[filename] = new_mtime
- self.callback(filename)
+ if new_mtime > data['mtime']:
+ self.files[filename]['mtime'] = new_mtime
+ self.callback(filename, {'format': self.files[filename]['format']})
if self.update_freq > 0:
current_time = time.time()
@@ -46,25 +46,25 @@ def _find_files_generator(self):
if path[2]:
for filename in path[2]:
filename = os.path.join(os.path.abspath(path[0]), filename)
- yield filename
+ yield filename, entity
else:
for filename in os.listdir(entity['name']):
filename = os.path.abspath(entity['name']) + '/' + filename
if os.path.isfile(filename):
- yield filename
+ yield filename, entity
# Is file
else:
if os.path.exists(os.path.expanduser(entity['name'])):
filename = os.path.abspath(os.path.expanduser(entity['name']))
- yield filename
+ yield filename, entity
else:
- raise AttributeError('Invalid path, cannot monitor it')
+ raise Error('Invalid path, cannot monitor it')
def _rescan(self):
tempfiles = {}
- for filename in self._find_files_generator():
+ for filename, entity in self._find_files_generator():
if filename not in self.files:
- self.files[filename] = self._mtime(filename)
+ self.files[filename] = {'mtime': self._mtime(filename), 'format': entity['format']}
tempfiles[filename] = 0
result = set(self.files).difference(set(tempfiles))

0 comments on commit a66c194

Please sign in to comment.
Something went wrong with that request. Please try again.