Skip to content

Commit

Permalink
Added a naive clf parser.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbohman committed Jun 9, 2010
1 parent 2f07cd3 commit a66c194
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 16 deletions.
3 changes: 1 addition & 2 deletions config.yaml
Expand Up @@ -11,5 +11,4 @@ cassandra_port: 9160
paths: paths:
- name: ~/coding/cassandra/access.log - name: ~/coding/cassandra/access.log
recursive: False recursive: False
- name: /var/log/testing/ format: "%h %l %u %t %r %s %O %{Referer}i %{User-Agent}i"
recurisve: True
14 changes: 12 additions & 2 deletions logsandra/monitor/monitor.py
Expand Up @@ -9,14 +9,17 @@
#except ImportError: #except ImportError:
from watchers.standard import StandardWatcher as Watcher from watchers.standard import StandardWatcher as Watcher


from parsers.clf import ClfParser



class Reader(object): class Reader(object):


def __init__(self, tail=False): def __init__(self, tail=False):
self.tail = tail self.tail = tail
self.seek_data = {} self.seek_data = {}
self.parser = {}


def callback(self, filename): def callback(self, filename, data):
if os.path.basename(filename).startswith('.'): if os.path.basename(filename).startswith('.'):
return False return False


Expand All @@ -29,7 +32,14 @@ def callback(self, filename):
file_handler.seek(0, os.SEEK_END) file_handler.seek(0, os.SEEK_END)


for line in file_handler: for line in file_handler:
print line.strip() line = line.strip()

if filename not in self.parser:
self.parser[filename] = ClfParser(data['format'])

result = self.parser[filename].parse_line(line)

print result


self.seek_data[filename] = file_handler.tell() self.seek_data[filename] = file_handler.tell()
file_handler.close() file_handler.close()
Expand Down
Empty file.
51 changes: 51 additions & 0 deletions logsandra/monitor/parsers/clf.py
@@ -0,0 +1,51 @@
import re
import dateutil.parser

# Lookup table: Apache LogFormat directive -> regular-expression fragment that
# recognises the corresponding field in a log line.  Named groups become the
# keys of the dictionary returned by ClfParser.parse_line(); '%l' has no named
# group, so it is matched but not captured.  The request, Referer and
# User-Agent fragments expect the value to be wrapped in double quotes in the
# log line itself.
clf = {
    '%h': r'(?P<host>\S+)',
    '%l': r'\S+',
    '%u': r'(?P<user>\S+)',
    '%t': r'\[(?P<time>.+)\]',
    '%r': r'"(?P<request>.+)"',
    '%s': r'(?P<status>[0-9]+)',
    '%>s': r'(?P<status>[0-9]+)',
    '%<s': r'(?P<status>[0-9]+)',
    '%b': r'(?P<size>\S+)',
    '%O': r'(?P<size_with_headers>\S+)',
    '%{Referer}i': r'"(?P<referer>.*)"',
    '%{User-Agent}i': r'"(?P<user_agent>.*)"',
    '%U': r'(?P<url_path>\S+)',
    '%p': r'(?P<port>[0-9]+)',
    '%v': r'(?P<server>\S+)'
}


class ClfParser(object):
    """Turn access-log lines into dictionaries, driven by a format string.

    The format string is a whitespace-separated list of directives that must
    all be keys of the module-level ``clf`` table, for example
    ``'%h %l %u %t %r %s %b'``.  The fragments for those directives are joined
    with ``\\s+`` and anchored at the end of the line to build one compiled
    pattern, stored in ``self.pattern``.
    """

    # Captured fields in which a lone "-" is replaced by None.
    _DASH_MEANS_MISSING = ('user', 'size', 'size_with_headers', 'referer')

    def __init__(self, format):
        """Compile the line pattern for ``format``.

        Raises:
            KeyError: if ``format`` contains a directive that is not a key of
                ``clf``.
        """
        self.format = format

        parts = []
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so a format such as '%h  %s' does not yield an empty
        # token that would fail the table lookup.
        for element in self.format.split():
            try:
                parts.append(clf[element])
            except KeyError:
                raise KeyError(
                    'Unsupported log format directive: %r' % (element,))

        self.pattern = re.compile(r'\s+'.join(parts) + r'\s*\Z')

    def parse_line(self, line):
        """Parse one log line.

        Returns:
            A dict keyed by the named groups of the compiled pattern, or
            ``None`` when ``line`` does not match the configured format.
            Values are the captured strings, except that a "-" in the user,
            size, size_with_headers or referer field becomes ``None`` and a
            captured ``time`` is replaced by the result of
            ``dateutil.parser.parse(value, fuzzy=True)``.
        """
        match = self.pattern.match(line)
        if match is None:
            # Blank, truncated or otherwise foreign lines are reported to the
            # caller instead of failing on the missing match object.
            return None

        res = match.groupdict()

        for field in self._DASH_MEANS_MISSING:
            if res.get(field) == '-':
                res[field] = None

        # '%t' is optional in a format string; only convert what was captured.
        if 'time' in res:
            res['time'] = dateutil.parser.parse(res['time'], fuzzy=True)

        return res
1 change: 1 addition & 0 deletions logsandra/monitor/watchers/inotify.py
Expand Up @@ -22,6 +22,7 @@ def __init__(self, entities, callback, update_freq=0, rescan_freq=20):
def loop(self): def loop(self):
notifier = pyinotify.Notifier(self.wm, EventHandler(callback=self.callback), self.update_freq) notifier = pyinotify.Notifier(self.wm, EventHandler(callback=self.callback), self.update_freq)
for entity in self.entities: for entity in self.entities:
# TODO: proc_fun, to add more information about file
self.wm.add_watch(entity['name'], pyinotify.IN_MODIFY, rec=entity['recursive']) self.wm.add_watch(entity['name'], pyinotify.IN_MODIFY, rec=entity['recursive'])


notifier.loop() notifier.loop()
24 changes: 12 additions & 12 deletions logsandra/monitor/watchers/standard.py
Expand Up @@ -13,8 +13,8 @@ def __init__(self, entities, callback, update_freq=10, rescan_freq=20):


self.files = {} self.files = {}


for filename in self._find_files_generator(): for filename, entity in self._find_files_generator():
self.files[filename] = self._mtime(filename) self.files[filename] = {'mtime': self._mtime(filename), 'format': entity['format']}


self._last_rescan_time = time.time() self._last_rescan_time = time.time()


Expand All @@ -25,11 +25,11 @@ def loop(self):
self._last_rescan_time = self._rescan() self._last_rescan_time = self._rescan()


reference_time = time.time() reference_time = time.time()
for filename, mtime in self.files.iteritems(): for filename, data in self.files.iteritems():
new_mtime = self._mtime(filename) new_mtime = self._mtime(filename)
if new_mtime > mtime: if new_mtime > data['mtime']:
self.files[filename] = new_mtime self.files[filename]['mtime'] = new_mtime
self.callback(filename) self.callback(filename, {'format': self.files[filename]['format']})


if self.update_freq > 0: if self.update_freq > 0:
current_time = time.time() current_time = time.time()
Expand All @@ -46,25 +46,25 @@ def _find_files_generator(self):
if path[2]: if path[2]:
for filename in path[2]: for filename in path[2]:
filename = os.path.join(os.path.abspath(path[0]), filename) filename = os.path.join(os.path.abspath(path[0]), filename)
yield filename yield filename, entity
else: else:
for filename in os.listdir(entity['name']): for filename in os.listdir(entity['name']):
filename = os.path.abspath(entity['name']) + '/' + filename filename = os.path.abspath(entity['name']) + '/' + filename
if os.path.isfile(filename): if os.path.isfile(filename):
yield filename yield filename, entity
# Is file # Is file
else: else:
if os.path.exists(os.path.expanduser(entity['name'])): if os.path.exists(os.path.expanduser(entity['name'])):
filename = os.path.abspath(os.path.expanduser(entity['name'])) filename = os.path.abspath(os.path.expanduser(entity['name']))
yield filename yield filename, entity
else: else:
raise AttributeError('Invalid path, cannot monitor it') raise Error('Invalid path, cannot monitor it')


def _rescan(self): def _rescan(self):
tempfiles = {} tempfiles = {}
for filename in self._find_files_generator(): for filename, entity in self._find_files_generator():
if filename not in self.files: if filename not in self.files:
self.files[filename] = self._mtime(filename) self.files[filename] = {'mtime': self._mtime(filename), 'format': entity['format']}
tempfiles[filename] = 0 tempfiles[filename] = 0


result = set(self.files).difference(set(tempfiles)) result = set(self.files).difference(set(tempfiles))
Expand Down

0 comments on commit a66c194

Please sign in to comment.