Permalink
Browse files

Initial import of pysolr.

git-svn-id: https://pysolr.googlecode.com/svn/trunk@2 13ae9d4a-4d43-0410-997b-81b7443f7ec1
  • Loading branch information...
jkocherhans committed Jan 9, 2008
1 parent 5999463 commit eadd23b84b872ea30ae8634c10f3ce915de4d984
Showing with 249 additions and 0 deletions.
  1. +249 −0 pysolr.py
View
249 pysolr.py
@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+"""
+All we need to create a Solr connection is an address.
+
+>>> conn = Solr(host='127.0.0.1')
+
+First, completely clear the index.
+
+>>> conn.delete(q='*:*')
+
+For now, we can only index python dictionaries. Each key in the dictionary
+will correspond to a field in Solr.
+
+>>> docs = [
+... {'id': 'testdoc.1', 'order_i': 1, 'name': 'document 1', 'text': u'Paul Verlaine'},
+... {'id': 'testdoc.2', 'order_i': 2, 'name': 'document 2', 'text': u'Владимир Маякoвский'},
+... {'id': 'testdoc.3', 'order_i': 3, 'name': 'document 3', 'text': u'test'},
+... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'test'}
+... ]
+
+
+We can add documents to the index by passing a list of docs to the connection's
+add method.
+
+>>> conn.add(docs)
+
+>>> results = conn.search('Verlaine')
+>>> len(results)
+1
+
+>>> results = conn.search(u'Владимир')
+>>> len(results)
+1
+
+
+Simple tests for searching. We can optionally sort the results using Solr's
+sort syntax, that is, the field name and either asc or desc.
+
+>>> results = conn.search('test', sort='order_i asc')
+>>> for result in results:
+... print result['name']
+document 3
+document 4
+
+>>> results = conn.search('test', sort='order_i desc')
+>>> for result in results:
+... print result['name']
+document 4
+document 3
+
+
+To update documents, we just use the add method.
+
+>>> docs = [
+... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'blah'}
+... ]
+>>> conn.add(docs)
+
+>>> len(conn.search('blah'))
+1
+>>> len(conn.search('test'))
+1
+
+
+We can delete documents from the index by id, or by supplying a query.
+
+>>> conn.delete(id='testdoc.1')
+>>> conn.delete(q='name:"document 2"')
+
+>>> results = conn.search('Verlaine')
+>>> len(results)
+0
+
+"""
+
+# TODO: unicode support is pretty sloppy. define it better.
+
+from httplib import HTTPConnection
+from urllib import urlencode
+from datetime import datetime, date
+from time import strptime, strftime
try:
    # Python 2.5 and later ship ElementTree in the standard library.
    from xml.etree import ElementTree
except ImportError:
    # Older interpreters need the standalone "elementtree" package.
    from elementtree import ElementTree
+
+__all__ = ['Solr']
+
class SolrError(Exception):
    """Raised when the Solr server answers a request with a non-200 status."""
+
class Solr(object):
    """
    Minimal client for a Solr server's XML-over-HTTP interface.

    Searches are sent as GET requests to ``/solr/select/``; index changes are
    sent as XML messages POSTed to ``/solr/update/``.
    """

    def __init__(self, host, port=8983):
        """
        Store the address of the Solr server. No connection is opened until
        a request is made.
        """
        self.host = host
        self.port = port

    # HTTP plumbing ##########################################################

    def _select(self, params):
        """
        Sends a GET request to /solr/select/ with the given query parameters
        and returns the HTTP response object.
        """
        # urlencode cannot handle non-ascii unicode objects, so every unicode
        # value (not just 'q') is encoded as utf-8 first. Byte strings are
        # passed through untouched, and the caller's dict is not modified.
        encoded = {}
        for key, value in params.items():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[key] = value
        conn = HTTPConnection(self.host, self.port)
        conn.request('GET', '/solr/select/?%s' % urlencode(encoded))
        return conn.getresponse()

    def _update(self, message):
        """
        Posts the given xml message to http://<host>:<port>/solr/update/ and
        returns the HTTP response object.
        """
        # The socket only accepts bytes; a unicode message containing
        # non-ascii characters would otherwise fail while being sent.
        if isinstance(message, unicode):
            message = message.encode('utf-8')
        headers = {'Content-type': 'text/xml; charset=utf-8'}
        conn = HTTPConnection(self.host, self.port)
        conn.request('POST', '/solr/update/', message, headers)
        return conn.getresponse()

    def _extract_error(self, response):
        """
        Extract the actual error message from a solr response. Unfortunately,
        this means scraping the html. If the page cannot be parsed, or has no
        <pre> element, the raw response body is returned instead.
        """
        body = response.read()
        try:
            message = ElementTree.fromstring(body).findtext('body/pre')
        except Exception:
            # The parse-error class differs between ElementTree versions, and
            # a malformed error page must not hide the real failure.
            message = None
        return message or body

    def _check_response(self, response):
        """
        Raises SolrError, carrying the server's error message, unless the
        response status is 200.
        """
        if response.status != 200:
            raise SolrError(self._extract_error(response))

    def _post(self, message):
        """
        Posts an update message, raises SolrError on failure, and always
        closes the response.
        """
        response = self._update(message)
        try:
            self._check_response(response)
        finally:
            response.close()

    # Converters #############################################################

    def _from_python(self, value):
        """
        Converts python values to a form suitable for insertion into the xml
        we send to solr.
        """
        # datetime must be tested before date: datetime is a subclass of date.
        if isinstance(value, datetime):
            return value.strftime('%Y-%m-%dT%H:%M:%S.000Z')
        if isinstance(value, date):
            return value.strftime('%Y-%m-%dT00:00:00.000Z')
        if isinstance(value, bool):
            if value:
                return 'true'
            return 'false'
        return unicode(value)

    def bool_to_python(self, value):
        """
        Convert a 'bool' field from solr's xml format to python and return it.
        Any text other than 'true' or 'false' yields None.
        """
        if value == 'true':
            return True
        if value == 'false':
            return False
        return None

    def str_to_python(self, value):
        """
        Convert an 'str' field from solr's xml format to python and return it.
        """
        if value is None:
            # ElementTree reports the text of an empty <str/> element as
            # None, which would otherwise be converted to the text 'None'.
            return u''
        return unicode(value)

    def int_to_python(self, value):
        """
        Convert an 'int' field from solr's xml format to python and return it.
        """
        return int(value)

    def long_to_python(self, value):
        """
        Convert a 'long' field from solr's xml format to python and return it.
        """
        return int(value)

    def float_to_python(self, value):
        """
        Convert a 'float' field from solr's xml format to python and return it.
        """
        return float(value)

    def double_to_python(self, value):
        """
        Convert a 'double' field from solr's xml format to python and return
        it.
        """
        return float(value)

    def date_to_python(self, value):
        """
        Convert a 'date' field from solr's xml format to python and return it.
        """
        # Drop the trailing 'Z' and any fractional seconds, however many
        # digits they have (or none at all). This throws away fractions of a
        # second.
        value = value.rstrip('Z').split('.')[0]
        return datetime(*strptime(value, "%Y-%m-%dT%H:%M:%S")[0:6])

    def _element_to_python(self, element):
        """
        Converts one field element of a result document to a python value,
        using the '<tag>_to_python' converter named after the element's tag.
        """
        if element.tag == 'arr':
            # Multi-valued field: one child element per value.
            return [self._element_to_python(child) for child in element]
        converter = getattr(self, '%s_to_python' % element.tag, None)
        if converter is None:
            # No converter for this type; hand back the raw text rather than
            # failing the whole search.
            return element.text
        return converter(element.text)

    def _doc_to_python(self, doc):
        """
        Converts a <doc> element into a dictionary mapping field names to
        python values.
        """
        fields = {}
        for element in doc:
            fields[element.get('name')] = self._element_to_python(element)
        return fields

    def _build_delete_message(self, id=None, q=None):
        """
        Builds the xml <delete> message for either an id or a query. Raises
        ValueError unless exactly one of the two is given.
        """
        # Imported here so the module's import block does not need to change.
        from xml.sax.saxutils import escape

        if id is None and q is None:
            raise ValueError('You must specify "id" or "q".')
        if id is not None and q is not None:
            raise ValueError('You may only specify "id" OR "q", not both.')
        if id is not None:
            tag, value = 'id', id
        else:
            tag, value = 'query', q
        # Escape &, < and > so that ids and queries containing them still
        # produce well-formed xml.
        return '<delete><%s>%s</%s></delete>' % (tag, escape('%s' % (value,)), tag)

    # API Methods ############################################################

    def search(self, q, sort=None, start=0, rows=20):
        """
        Performs a search and returns the results as a list of dictionaries,
        one per matching document. Raises SolrError if the request fails.
        """
        params = {'q': q, 'start': start, 'rows': rows}
        if sort:
            params['sort'] = sort
        response = self._select(params)
        try:
            self._check_response(response)
            # TODO: make result retrieval lazy and allow custom result objects
            tree = ElementTree.parse(response)
        finally:
            response.close()
        docs = tree.find('result').findall('doc')
        return [self._doc_to_python(doc) for doc in docs]

    def add(self, docs, commit=True):
        """
        Adds or updates documents. For now, docs is a list of dictionaries
        where each key is the field name and each value is the value to index.
        Raises SolrError if the request fails.
        """
        message = ElementTree.Element('add')
        for doc in docs:
            doc_element = ElementTree.SubElement(message, 'doc')
            for key, value in doc.items():
                field = ElementTree.SubElement(doc_element, 'field', name=key)
                field.text = self._from_python(value)
        self._post(ElementTree.tostring(message))
        # TODO: Supposedly, we can put a <commit /> element in the same post
        # body as the add element. That isn't working for some reason, and it
        # would save us an extra trip to the server. This works for now.
        if commit:
            self.commit()

    def delete(self, id=None, q=None, commit=True, fromPending=True, fromCommitted=True):
        """
        Deletes documents, either the one with the given id or all those
        matching the query q. Exactly one of the two must be given, otherwise
        ValueError is raised. Raises SolrError if the request fails.

        fromPending and fromCommitted are accepted but are not included in
        the message sent to the server.
        """
        self._post(self._build_delete_message(id=id, q=q))
        # TODO: Supposedly, we can put a <commit /> element in the same post
        # body as the delete element. That isn't working for some reason, and
        # it would save us an extra trip to the server. This works for now.
        if commit:
            self.commit()

    def commit(self):
        """Sends a <commit /> message. Raises SolrError on failure."""
        self._post('<commit />')

    def optimize(self):
        """Sends an <optimize /> message. Raises SolrError on failure."""
        self._post('<optimize />')
+
if __name__ == "__main__":
    # Run as a script: execute the usage examples in the module docstring as
    # doctests. They talk to a Solr server at 127.0.0.1.
    from doctest import testmod
    testmod()

0 comments on commit eadd23b

Please sign in to comment.