Permalink
Browse files

Reworked the parser system, updated the webservice and created a Cassandra model
  • Loading branch information...
1 parent e0e01b6 commit 3b7f1a7f353c05ca0ad1f6c357e05f3f255a4123 @jbohman committed Jun 24, 2010
View
@@ -5,15 +5,16 @@ ident: 'Logsandra Server 1'
webservice_enabled: True
webservice_address: "0.0.0.0"
webservice_port: 5000
-webservice_config: "development.ini"
+webservice_config: 'development.ini'
# Cassandra cluster to connect to
-cassandra_address: localhost
+cassandra_address: 'localhost'
cassandra_port: 9160
cassandra_timeout: 5
# List of paths (files and directories) to monitor
paths:
- name: ~/coding/cassandra/access.log
recursive: False
- format: "%h %l %u %t %r %s %O %{Referer}i %{User-Agent}i"
+ parser: 'clf'
+ clf_format: '%h %l %u %t %r %s %O %{Referer}i %{User-Agent}i'
@@ -3,16 +3,17 @@
import time
import pycassa
import struct
-from cassandra.ttypes import NotFoundException
from pylons import request, response, session, tmpl_context as c, url
from pylons.controllers.util import abort, redirect
from logsandra.lib.base import BaseController, render
from logsandra import config
+from logsandra.utils.model import Cassandra
log = logging.getLogger(__name__)
+
class LogController(BaseController):
def index(self):
@@ -22,6 +23,15 @@ def view(self):
date_from = request.GET['date_from']
date_to = request.GET['date_to']
status = request.GET['status']
+ search_keyword = request.GET['search_keyword']
+
+ keyword = status
+ if search_keyword:
+ keyword = search_keyword
+
+ current_next = None
+ if 'next' in request.GET:
+ current_next = long(request.GET['next'])
if date_from and date_to:
try:
@@ -34,28 +44,21 @@ def view(self):
date_to = None
# TODO: Move this to a final that could be used by other controllers and make sure it uses the config file
- connect_string = '%s:%s' % ('localhost', 9160)
- client = pycassa.connect([connect_string], timeout=5)
-
- # Column families
- entries = pycassa.ColumnFamily(client, 'logsandra', 'entries')
- by_date = pycassa.ColumnFamily(client, 'logsandra', 'by_date')
- by_date_data = pycassa.ColumnFamily(client, 'logsandra', 'by_date_data')
-
- long_struct = struct.Struct('>q')
-
- c.entries = []
- try:
- if date_from and date_to:
- result = by_date_data.get(str(status), column_start=long_struct.pack(int(time.mktime(date_from.timetuple()))),
- column_finish=long_struct.pack(int(time.mktime(date_to.timetuple()))))
- else:
- result = by_date_data.get(str(status))
-
- for elem in result.itervalues():
- c.entries.append(elem.strip())
- except NotFoundException:
- pass
+ client = Cassandra('', 'localhost', 9160, 5)
+
+ if current_next:
+ entries, next = client.get_entries_by_keyword(keyword, date_from, date_to, action_next=current_next)
+ else:
+ entries, next = client.get_entries_by_keyword(keyword, date_from, date_to)
+
+ c.entries = entries
+
+ if next:
+ c.next_url = url(controller='log', action='view',
+ search_keyword=request.GET['search_keyword'],
+ status=request.GET['status'],
+ date_from=request.GET['date_from'],
+ date_to=request.GET['date_to'], next=next)
return render('/log_view.html')
@@ -9,42 +9,26 @@
# Local imports
from logsandra.monitor.watchers import Watcher
from logsandra.monitor.parsers.clf import ClfParser
+from logsandra.utils.model import Cassandra
class Monitor(object):
def __init__(self, settings, tail=False):
self.logger = logging.getLogger('logsandra.monitord')
self.settings = settings
+ self.client = Cassandra(self.settings['ident'], self.settings['cassandra_address'], self.settings['cassandra_port'], self.settings['cassandra_timeout'])
self.tail = tail
self.seek_data = {}
- self.parser = {}
+ self.parsers = {'clf': ClfParser(self.client)}
def run(self):
- # Connect to cassandra
- connect_string = '%s:%s' % (self.settings['cassandra_address'], self.settings['cassandra_port'])
- self.client = pycassa.connect([connect_string], timeout=self.settings['cassandra_timeout'])
- # Column families
- self.entries = pycassa.ColumnFamily(self.client, 'logsandra', 'entries')
- self.by_date = pycassa.ColumnFamily(self.client, 'logsandra', 'by_date')
- self.by_date_data = pycassa.ColumnFamily(self.client, 'logsandra', 'by_date_data')
-
- # Struct
- self.long_struct = struct.Struct('>q')
-
- # Start watcher (inf loop)
self.logger.debug('Starting watcher')
self.watcher = Watcher(self.settings['paths'], self.callback)
self.watcher.loop()
- def _to_long(self, data):
- return self.long_struct.pack(data)
-
- def _from_long(self, data):
- return self.long_struct.unpack(data)
-
def callback(self, filename, data):
if os.path.basename(filename).startswith('.'):
return False
@@ -62,22 +46,13 @@ def callback(self, filename, data):
for line in file_handler:
line = line.strip()
- if filename not in self.parser:
- self.parser[filename] = ClfParser(data['format'])
-
- result = self.parser[filename].parse_line(line)
-
- # TODO: Should move this to every individual parser
- key = uuid.uuid4()
- self.entries.insert(key.bytes, {'ident': self.settings['ident'], 'entry': line})
-
- if 'status' in result:
- # TODO: is this really how pycassa should be used?
- self.by_date.insert(str(result['status']), {self._to_long(int(time.mktime(result['time'].timetuple()))): str(key)})
- self.by_date_data.insert(str(result['status']), {self._to_long(int(time.mktime(result['time'].timetuple()))): str(line)})
-
- self.logger.debug('Parsed line: %s' % line)
+ result = self.parsers[data['parser']].parse(line, data)
+ if result:
+ self.logger.debug('Parsed line: %s' % line)
+ else:
+ self.logger.error('Failed to parse line: %s' % line)
+ # TODO: Persist seek_data
self.seek_data[filename] = file_handler.tell()
file_handler.close()
@@ -21,31 +21,58 @@
class ClfParser(object):
- def __init__(self, format):
- self.format = format
+ def __init__(self, client):
+ self.client = client
+ def parse(self, line, data):
+ print data
parts = []
- for element in self.format.split(' '):
+ for element in data['clf_format'].split(' '):
parts.append(clf[element])
- self.pattern = re.compile(r'\s+'.join(parts)+r'\s*\Z')
-
- def parse_line(self, line):
- match = self.pattern.match(line)
+ # TODO: optimize by storing compiled regex?
+ pattern = re.compile(r'\s+'.join(parts)+r'\s*\Z')
+ match = pattern.match(line)
result = match.groupdict()
- if 'user' in result and result['user'] == '-':
- result['user'] = None
+ keywords = []
+
+ if 'user' in result and result['user'] != '-':
+ keywords.append('user:%s' % result['user'])
+ keywords.append(result['user'])
+
+ if 'user_agent' in result and result['user_agent'] != '-':
+ keywords.append('user_agent:%s' % result['user_agent'])
+ keywords.append(result['user_agent'])
+
+ if 'referer' in result and result['referer'] != '-':
+ keywords.append('referer:%s' % result['referer'])
+ keywords.append(result['referer'])
+
+ if 'request' in result and result['request'] != '-':
+ request_parts = result['request'].split(' ')
+ keywords.extend(request_parts)
+ keywords.append('request_type:%s' % request_parts[0])
+ keywords.append('request_url:%s' % request_parts[1])
+ keywords.append('request:%s' % result['request'])
+ keywords.append(result['request'])
+
+ if 'status' in result and result['status'] != '-':
+ keywords.append('status_code:%s' % result['status'])
+ keywords.append(result['status'])
- if 'size' in result and result['size'] == '-':
- result['size'] = None
+ if 'host' in result and result['host'] != '-':
+ keywords.append('host:%s' % result['host'])
+ keywords.append(result['host'])
- if 'size_with_headers' in result and result['size_with_headers'] == '-':
- result['size_with_headers'] = None
+ if 'port' in result and result['port'] != '-':
+ keywords.append('port:%s' % result['port'])
+ keywords.append(result['port'])
- if 'referer' in result and result['referer'] == '-':
- result['referer'] = None
+ if 'server' in result and result['server'] != '-':
+ keywords.append('server:%s' % result['server'])
+ keywords.append(result['server'])
- result['time'] = dateutil.parser.parse(result['time'], fuzzy=True)
+ date = dateutil.parser.parse(result['time'], fuzzy=True).replace(tzinfo=None)
- return result
+ return self.client.add_entry(date, line, data['source'], keywords)
@@ -14,7 +14,7 @@ def __init__(self, entities, callback, update_freq=10, rescan_freq=20):
self.files = {}
for filename, entity in self._find_files_generator():
- self.files[filename] = {'mtime': self._mtime(filename), 'format': entity['format']}
+ self.files[filename] = {'mtime': self._mtime(filename), 'data': entity}
self._last_rescan_time = time.time()
@@ -29,7 +29,9 @@ def loop(self):
new_mtime = self._mtime(filename)
if new_mtime > data['mtime']:
self.files[filename]['mtime'] = new_mtime
- self.callback(filename, {'format': self.files[filename]['format']})
+ data = self.files[filename]['data']
+ data['source'] = filename
+ self.callback(filename, data)
if self.update_freq > 0:
current_time = time.time()
@@ -64,7 +66,7 @@ def _rescan(self):
tempfiles = {}
for filename, entity in self._find_files_generator():
if filename not in self.files:
- self.files[filename] = {'mtime': self._mtime(filename), 'format': entity['format']}
+ self.files[filename] = {'mtime': self._mtime(filename), 'data': entity}
tempfiles[filename] = 0
result = set(self.files).difference(set(tempfiles))
@@ -7,6 +7,23 @@
<p>Filter log by status code (required) and date (optional).</p>
+ <p>Searchable keywords:</p>
+ <ul>
+ <li>User</li>
+ <li>User agent</li>
+ <li>Referer</li>
+ <li>Request, including request_type, request_url and http version</li>
+ <li>Status code</li>
+ <li>Host</li>
+ <li>Port</li>
+ <li>Server</li>
+ </ul>
+
+ Search keyword
+ <input type="text" name="search_keyword" /><br />
+
+  or<br />
Status code
<select name="status">
<option value="100">100 Continue</option>
@@ -2,12 +2,16 @@
{% block content %}
-<pre>
+{% if c.next_url %}
+<a href="{{ c.next_url }}">Next</a>
+{% endif %}
+
{% for entry in c.entries %}
- {{ entry }}
+ <pre style="white-space: normal;">
+ {{ entry['entry'] }}<br />
+ <small><span style="color: #bbb;">date </span>{{ entry['date'] }} <span style="color: #bbb;">ident </span>{{ entry['ident'] }} <span style="color: #bbb;">source </span>{{ entry['source'] }}</small></pre>
{% else %}
No entries found.
{% endfor %}
-</pre>
{% endblock %}
Oops, something went wrong.

0 comments on commit 3b7f1a7

Please sign in to comment.