Permalink
Browse files

initial import

  • Loading branch information...
0 parents commit b804552d146c95820a78193886109ef648903e8e Justine Tunney committed Jan 28, 2012
Showing with 612 additions and 0 deletions.
  1. +5 −0 .gitignore
  2. +59 −0 README.rst
  3. +284 −0 ez_setup.py
  4. +230 −0 redisbayes.py
  5. +34 −0 setup.py
5 .gitignore
@@ -0,0 +1,5 @@
+*.pyc
+/build
+/dist
+*.egg-info
+
59 README.rst
@@ -0,0 +1,59 @@
+.. -*-rst-*-
+
+============
+ redisbayes
+============
+
+:name: redisbayes
+:description: Naïve Bayesian Text Classifier on Redis
+:copyright: © 2012 Justine Alexandra Roberts Tunney
+:license: MIT
+
+
+What Is This?
+=============
+
+It's a spam filter. I wrote this to filter spammy comments from a
+high-traffic forum website and it worked pretty well. It can work for you
+too :) It's not tied to any particular format like email; it just deals with
+the raw text.
+
+This is probably the only spam filtering library you'll find for Python
+that's simple (170 lines of code), works (30 lines of test code), and doesn't
+suck.
+
+
+Installation
+============
+
+From folder::
+
+ sudo python setup.py install
+
+From cheeseshop::
+
+ sudo pip install redisbayes
+
+From git::
+
+ sudo pip install git+git://github.com/jart/redisbayes.git
+
+
+Basic Usage
+===========
+
+::
+
+ import redis, redisbayes
+ rb = redisbayes.RedisBayes(redis=redis.Redis())
+
+ rb.train('good', 'sunshine drugs love sex lobster sloth')
+ rb.train('bad', 'fear death horror government zombie god')
+
+ assert rb.classify('sloths are so cute i love them') == 'good'
+ assert rb.classify('i fear god and love the government') == 'bad'
+
+ print rb.score('i fear god and love the government')
+
+ rb.untrain('good', 'sunshine drugs love sex lobster sloth')
+ rb.untrain('bad', 'fear death horror government zombie god')
284 ez_setup.py
@@ -0,0 +1,284 @@
+#!python
+"""Bootstrap setuptools installation
+
+If you want to use setuptools in your package's setup.py, just include this
+file in the same directory with it, and add this to the top of your setup.py::
+
+ from ez_setup import use_setuptools
+ use_setuptools()
+
+If you want to require a specific version of setuptools, set a download
+mirror, or use an alternate download directory, you can do so by supplying
+the appropriate options to ``use_setuptools()``.
+
+This file can also be run as a script to install or upgrade setuptools.
+"""
+import sys
+DEFAULT_VERSION = "0.6c11"
+DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3]
+
+md5_data = {
+ 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
+ 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
+ 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
+ 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
+ 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
+ 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
+ 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
+ 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
+ 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
+ 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
+ 'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090',
+ 'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4',
+ 'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7',
+ 'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5',
+ 'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de',
+ 'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b',
+ 'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2',
+ 'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086',
+ 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
+ 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
+ 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
+ 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
+ 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
+ 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
+ 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
+ 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
+ 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
+ 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
+ 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
+ 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
+ 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
+ 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
+ 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
+ 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
+ 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
+ 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
+ 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
+ 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
+ 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
+ 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
+ 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
+ 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
+}
+
+import sys, os
+try: from hashlib import md5
+except ImportError: from md5 import md5
+
def _validate_md5(egg_name, data):
    """Check *data* against the registered md5 digest for *egg_name*.

    Eggs that have no entry in ``md5_data`` are passed through unchecked.
    On a digest mismatch a message is written to ``sys.stderr`` and the
    process exits with status 2.  Otherwise *data* is returned unchanged.
    """
    if egg_name in md5_data and md5(data).hexdigest() != md5_data[egg_name]:
        message = (
            "md5 validation of %s failed! (Possible download problem?)"
            % egg_name
        )
        sys.stderr.write(message + "\n")
        sys.exit(2)
    return data
+
+def use_setuptools(
+ version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
+ download_delay=15
+):
+ """Automatically find/download setuptools and make it available on sys.path
+
+ `version` should be a valid setuptools version number that is available
+ as an egg for download under the `download_base` URL (which should end with
+ a '/'). `to_dir` is the directory where setuptools will be downloaded, if
+ it is not already available. If `download_delay` is specified, it should
+ be the number of seconds that will be paused before initiating a download,
+ should one be required. If an older version of setuptools is installed,
+ this routine will print a message to ``sys.stderr`` and raise SystemExit in
+ an attempt to abort the calling script.
+ """
+ # Record whether pkg_resources/setuptools were imported before this call;
+ # the version-conflict handler below aborts instead of re-downloading when
+ # this is true.
+ was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules
+ # Helper: download the egg, put it first on sys.path and record on the
+ # setuptools module which egg it was bootstrapped from.
+ def do_download():
+ egg = download_setuptools(version, download_base, to_dir, download_delay)
+ sys.path.insert(0, egg)
+ import setuptools; setuptools.bootstrap_install_from = egg
+ # No pkg_resources at all: nothing is installed, so download right away.
+ try:
+ import pkg_resources
+ except ImportError:
+ return do_download()
+ # Something is installed: return early if it satisfies the requested version.
+ try:
+ pkg_resources.require("setuptools>="+version); return
+ except pkg_resources.VersionConflict, e:
+ if was_imported:
+ print >>sys.stderr, (
+ "The required version of setuptools (>=%s) is not available, and\n"
+ "can't be installed while this script is running. Please install\n"
+ " a more recent version first, using 'easy_install -U setuptools'."
+ "\n\n(Currently using %r)"
+ ) % (version, e.args[0])
+ sys.exit(2)
+ except pkg_resources.DistributionNotFound:
+ pass
+
+ # Reached when the installed copy was unsuitable but can still be replaced:
+ # forget the imported pkg_resources module, then download the requested egg.
+ del pkg_resources, sys.modules['pkg_resources'] # reload ok
+ return do_download()
+
+def download_setuptools(
+ version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
+ delay = 15
+):
+ """Download setuptools from a specified location and return its filename
+
+ `version` should be a valid setuptools version number that is available
+ as an egg for download under the `download_base` URL (which should end
+ with a '/'). `to_dir` is the directory where the egg will be downloaded.
+ `delay` is the number of seconds to pause before an actual download attempt.
+ """
+ # NOTE(review): shutil is imported here but not used anywhere in this function.
+ import urllib2, shutil
+ # The egg file name encodes the requested version and the first three
+ # characters of sys.version (e.g. "2.6").
+ egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3])
+ url = download_base + egg_name
+ saveto = os.path.join(to_dir, egg_name)
+ # Both handles start as None so the finally clause can tell what was opened.
+ src = dst = None
+ if not os.path.exists(saveto): # Avoid repeated downloads
+ try:
+ from distutils import log
+ # A non-zero delay prints the warning banner and sleeps before downloading.
+ if delay:
+ log.warn("""
+---------------------------------------------------------------------------
+This script requires setuptools version %s to run (even to display
+help). I will attempt to download it for you (from
+%s), but
+you may need to enable firewall access for this script first.
+I will start the download in %d seconds.
+
+(Note: if this machine does not have network access, please obtain the file
+
+ %s
+
+and place it in this directory before rerunning this script.)
+---------------------------------------------------------------------------""",
+ version, download_base, delay, url
+ ); from time import sleep; sleep(delay)
+ log.warn("Downloading %s", url)
+ src = urllib2.urlopen(url)
+ # Read/write all in one block, so we don't create a corrupt file
+ # if the download is interrupted.
+ # _validate_md5 exits the process on a digest mismatch, so nothing is
+ # written to disk in that case.
+ data = _validate_md5(egg_name, src.read())
+ dst = open(saveto,"wb"); dst.write(data)
+ finally:
+ if src: src.close()
+ if dst: dst.close()
+ # The resolved path is returned whether the egg was just downloaded or
+ # already existed.
+ return os.path.realpath(saveto)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def main(argv, version=DEFAULT_VERSION):
+ """Install or upgrade setuptools and EasyInstall"""
+ # Case 1: setuptools is not importable. Download the egg, run easy_install
+ # on it with the caller's arguments, and delete the downloaded egg afterwards.
+ try:
+ import setuptools
+ except ImportError:
+ egg = None
+ try:
+ egg = download_setuptools(version, delay=0)
+ sys.path.insert(0,egg)
+ # The imported easy_install entry point shadows this function's own name
+ # from here on.
+ from setuptools.command.easy_install import main
+ return main(list(argv)+[egg]) # we're done here
+ finally:
+ if egg and os.path.exists(egg):
+ os.unlink(egg)
+ # Case 2: some setuptools is importable.
+ else:
+ if setuptools.__version__ == '0.0.1':
+ print >>sys.stderr, (
+ "You have an obsolete version of setuptools installed. Please\n"
+ "remove it from your system entirely before rerunning this script."
+ )
+ sys.exit(2)
+
+ req = "setuptools>="+version
+ import pkg_resources
+ try:
+ pkg_resources.require(req)
+ except pkg_resources.VersionConflict:
+ # Installed copy is too old: locate an easy_install entry point and feed it
+ # a freshly downloaded egg.
+ try:
+ from setuptools.command.easy_install import main
+ except ImportError:
+ from easy_install import main
+ # NOTE(review): this download does not pass `version`, so it fetches
+ # DEFAULT_VERSION even when the caller asked for a different one.
+ main(list(argv)+[download_setuptools(delay=0)])
+ sys.exit(0) # try to force an exit
+ else:
+ # Requirement already satisfied: run easy_install with any arguments given,
+ # otherwise just report that nothing needs to be done.
+ if argv:
+ from setuptools.command.easy_install import main
+ main(argv)
+ else:
+ print "Setuptools version",version,"or greater has been installed."
+ print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)'
+
def update_md5(filenames):
    """Update our built-in md5 registry

    Hash every file named in *filenames*, store each digest in ``md5_data``
    under the file's base name, then rewrite the ``md5_data = {...}`` literal
    inside this module's own source file with the sorted entries.

    If the literal cannot be found in the source, "Internal error!" is
    written to ``sys.stderr`` and the process exits with status 2.
    """
    import re
    import inspect

    for name in filenames:
        base = os.path.basename(name)
        f = open(name, 'rb')
        # try/finally (not ``with``) so the handle is released even when the
        # read fails, while staying usable on the old interpreters whose eggs
        # are listed in md5_data (py2.3/py2.4).
        try:
            md5_data[base] = md5(f.read()).hexdigest()
        finally:
            f.close()

    data = [" %r: %r,\n" % it for it in md5_data.items()]
    data.sort()
    repl = "".join(data)

    srcfile = inspect.getsourcefile(sys.modules[__name__])
    f = open(srcfile, 'rb')
    try:
        src = f.read()
    finally:
        f.close()

    # Group 1 is the body of the dict literal, i.e. everything between the
    # opening line and the closing brace.
    match = re.search("\nmd5_data = {\n([^}]+)}", src)
    if not match:
        sys.stderr.write("Internal error!\n")
        sys.exit(2)

    # Splice the regenerated entries in place of the old literal body.
    src = src[:match.start(1)] + repl + src[match.end(1):]
    f = open(srcfile, 'w')
    try:
        f.write(src)
    finally:
        f.close()
+
+
+# Script entry point: "--md5update FILE [FILE...]" refreshes the md5 registry
+# from the given files; any other arguments are handed to main().
+if __name__=='__main__':
+ if len(sys.argv)>2 and sys.argv[1]=='--md5update':
+ update_md5(sys.argv[2:])
+ else:
+ main(sys.argv[1:])
+
+
+
+
+
+
230 redisbayes.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+#
+# redisbayes - Naïve Bayesian Text Classifier on Redis
+# Copyright (c) 2012 Justine Alexandra Roberts Tunney
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use, copy,
+# modify, merge, publish, distribute, sublicense, and/or sell copies
+# of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+ur"""
+
+ redisbayes
+ ~~~~~~~~~~
+
+ Naïve Bayesian Text Classifier on Redis.
+
+ I wrote this to filter spammy comments from a high traffic forum website
+ and it worked pretty well. It can work for you too :)
+
+ For example::
+
+ >>> import redis
+ >>> import redisbayes
+ >>> rb = redisbayes.RedisBayes(redis.Redis(), prefix='bayes:test:')
+ >>> rb.flush()
+ >>> rb.classify('nothing trained yet') is None
+ True
+ >>> rb.train('good', 'sunshine drugs love sex lobster sloth')
+ >>> rb.train('bad', 'fear death horror government zombie god')
+ >>> rb.classify('sloths are so cute i love them')
+ 'good'
+ >>> rb.classify('i fear god and love the government')
+ 'bad'
+ >>> int(rb.score('i fear god and love the government')['bad'])
+ -9
+ >>> int(rb.score('i fear god and love the government')['good'])
+ -14
+ >>> rb.untrain('good', 'sunshine drugs love sex lobster sloth')
+ >>> rb.untrain('bad', 'fear death horror government zombie god')
+ >>> rb.score('lolcat')
+ {}
+
+ Words are lowercased and unicode is supported::
+
+ >>> print english_tokenizer("Æther")[0]
+ æther
+
+ Common english words and 1-2 character words are ignored::
+
+ >>> english_tokenizer("greetings mary a b aa bb")
+ [u'mary']
+
+ Some characters are removed::
+
+ >>> print english_tokenizer("contraction's")[0]
+ contraction's
+ >>> print english_tokenizer("what|is|goth")[0]
+ goth
+
+"""
+
+import re
+import math
+
+
+__version__ = '0.1'
+
+english_ignore = set("""
+a able about above abroad according accordingly across actually adj after
+afterwards again against ago ahead ain't all allow allows almost alone along
+alongside already also although always am amid amidst among amongst an and
+another any anybody anyhow anyone anything anyway anyways anywhere apart
+appear appreciate appropriate are aren't around as a's aside ask asking
+associated at available away awfully b back backward backwards be became
+because become becomes becoming been before beforehand begin behind being
+believe below beside besides best better between beyond both brief but by c
+came can cannot cant can't caption cause causes certain certainly changes
+clearly c'mon co co. com come comes concerning consequently consider
+considering contain containing contains corresponding could couldn't course
+c's currently d dare daren't definitely described despite did didn't different
+directly do does doesn't doing done don't down downwards during e each edu eg
+eight eighty either else elsewhere end ending enough entirely especially et
+etc even ever evermore every everybody everyone everything everywhere ex
+exactly example except f fairly far farther few fewer fifth first five
+followed following follows for forever former formerly forth forward found
+four from further furthermore g get gets getting given gives go goes going
+gone got gotten greetings h had hadn't half happens hardly has hasn't have
+haven't having he he'd he'll hello help hence her here hereafter hereby herein
+here's hereupon hers herself he's hi him himself his hither hopefully how
+howbeit however hundred i i'd ie if ignored i'll i'm immediate in inasmuch inc
+inc. indeed indicate indicated indicates inner inside insofar instead into
+inward is isn't it it'd it'll its it's itself i've j just k keep keeps kept
+know known knows l last lately later latter latterly least less lest let let's
+like liked likely likewise little look looking looks low lower ltd m made
+mainly make makes many may maybe mayn't me mean meantime meanwhile merely
+might mightn't mine minus miss more moreover most mostly mr mrs much must
+mustn't my myself n name namely nd near nearly necessary need needn't needs
+neither never neverf neverless nevertheless new next nine ninety no nobody non
+none nonetheless noone no-one nor normally not nothing notwithstanding novel
+now nowhere o obviously of off often oh ok okay old on once one ones one's
+only onto opposite or other others otherwise ought oughtn't our ours ourselves
+out outside over overall own p particular particularly past per perhaps placed
+please plus possible presumably probably provided provides q que quite qv r
+rather rd re really reasonably recent recently regarding regardless regards
+relatively respectively right round s said same saw say saying says second
+secondly see seeing seem seemed seeming seems seen self selves sensible sent
+serious seriously seven several shall shan't she she'd she'll she's should
+shouldn't since six so some somebody someday somehow someone something
+sometime sometimes somewhat somewhere soon sorry specified specify specifying
+still sub such sup sure t take taken taking tell tends th than thank thanks
+thanx that that'll thats that's that've the their theirs them themselves then
+thence there thereafter thereby there'd therefore therein there'll there're
+theres there's thereupon there've these they they'd they'll they're they've
+thing things think third thirty this thorough thoroughly those though three
+through throughout thru thus till to together too took toward towards tried
+tries truly try trying t's twice two u un under underneath undoing
+unfortunately unless unlike unlikely until unto up upon upwards us use used
+useful uses using usually v value various versus very via viz vs w want wants
+was wasn't way we we'd welcome well we'll went were we're weren't we've what
+whatever what'll what's what've when whence whenever where whereafter whereas
+whereby wherein where's whereupon wherever whether which whichever while
+whilst whither who who'd whoever whole who'll whom whomever who's whose why
+will willing wish with within without wonder won't would wouldn't x y yes yet
+you you'd you'll your you're yours yourself yourselves you've z zero
+successful greatest began including being all for close but
+""".split())
+
+
def tidy(text):
    """Return *text* lowercased with punctuation and whitespace blanked out.

    Byte strings are decoded as UTF-8 and non-string objects are converted
    with ``str()`` first.  Every character in the punctuation/whitespace
    class below is then replaced by a single space, so the result can be
    split on whitespace.  Apostrophes and hyphens are not in the class and
    are therefore kept.
    """
    # bytes / type(u'') instead of basestring / unicode so the same code runs
    # on Python 2 (where bytes is str) and Python 3.
    if not isinstance(text, (bytes, type(u''))):
        text = str(text)
    if isinstance(text, bytes):
        text = text.decode('utf8')
    text = text.lower()
    # The flag must go to re.compile(): the fourth positional argument of
    # re.sub() is ``count``, so passing re.UNICODE there capped the number of
    # replacements at 32 and never enabled the flag.
    junk = re.compile(r'[\_.,<>:;~+|\[\]?`"!@#$%^&*()\s]', re.UNICODE)
    return junk.sub(' ', text)
+
+
def english_tokenizer(text):
    """Split *text* into the words worth classifying.

    The text is normalized with ``tidy()`` and split on whitespace; words of
    one or two characters and words listed in ``english_ignore`` are dropped.
    The remaining words are returned as a list in their original order.
    """
    kept = []
    for token in tidy(text).split():
        if len(token) <= 2:
            continue
        if token in english_ignore:
            continue
        kept.append(token)
    return kept
+
+
def occurances(words):
    """Return a dict mapping each item of *words* to how often it appears."""
    frequency = {}
    for token in words:
        frequency[token] = frequency.get(token, 0) + 1
    return frequency
+
+
class RedisBayes(object):
    """Naive Bayesian text classifier that keeps its word counts in Redis.

    Storage layout (every key starts with ``prefix``):

    * ``<prefix>categories`` -- a set naming every trained category.
    * ``<prefix><category>`` -- a hash mapping each word to its count.

    :param redis: Redis client to use; when ``None`` a default
        ``redis.Redis()`` client is created.
    :param prefix: string prepended to every key this object touches.
    :param correction: stand-in count used when a word has never been seen
        in a category, so its probability is small but non-zero.
    :param tokenizer: callable turning a text into a list of words;
        defaults to ``english_tokenizer``.
    """

    def __init__(self, redis=None, prefix='bayes:', correction=0.1,
                 tokenizer=None):
        self.redis = redis
        self.prefix = prefix
        self.correction = correction
        self.tokenizer = tokenizer or english_tokenizer
        if self.redis is None:
            # Imported under another name because the ``redis`` parameter
            # shadows the module.  The client must be stored on ``self``;
            # assigning it to a local left ``self.redis`` as None.
            import redis as redis_module
            self.redis = redis_module.Redis()

    @staticmethod
    def _native(value):
        """Return a Redis reply as a native ``str``.

        On Python 3 a client may hand back ``bytes``; those are decoded as
        UTF-8 so they can be concatenated with ``prefix`` and used as
        ordinary dict keys.  On Python 2 ``bytes`` is ``str`` and the value
        is returned untouched.
        """
        if isinstance(value, bytes) and not isinstance(value, str):
            return value.decode('utf8')
        return value

    def _categories(self):
        """Return the names of all registered categories as native strings."""
        members = self.redis.smembers(self.prefix + 'categories')
        return [self._native(member) for member in members]

    def flush(self):
        """Delete every category hash and the category set itself."""
        for cat in self._categories():
            self.redis.delete(self.prefix + cat)
        self.redis.delete(self.prefix + 'categories')

    def train(self, category, text):
        """Register *category* and add the word counts of *text* to it."""
        self.redis.sadd(self.prefix + 'categories', category)
        for word, count in occurances(self.tokenizer(text)).items():
            self.redis.hincrby(self.prefix + category, word, count)

    def untrain(self, category, text):
        """Subtract the word counts of *text* from *category*.

        Words whose count drops to zero or below are removed from the hash.
        When no counts remain, the category hash and its entry in the
        category set are removed as well.
        """
        for word, count in occurances(self.tokenizer(text)).items():
            cur = self.redis.hget(self.prefix + category, word)
            if cur:
                new = int(cur) - count
                if new > 0:
                    self.redis.hset(self.prefix + category, word, new)
                else:
                    self.redis.hdel(self.prefix + category, word)
        if self.tally(category) == 0:
            self.redis.delete(self.prefix + category)
            self.redis.srem(self.prefix + 'categories', category)

    def classify(self, text):
        """Return the highest-scoring category for *text*, or ``None``.

        ``None`` is returned when ``score()`` yields no categories.
        """
        score = self.score(text)
        if not score:
            return None
        return sorted(score.items(), key=lambda v: v[1])[-1][0]

    def score(self, text):
        """Return a dict mapping each category to its log-likelihood score.

        For every distinct word of *text* the natural log of
        ``count / tally`` is added to the category's score, where ``count``
        is the word's stored count (or ``correction`` if the word is unknown
        in that category) and ``tally`` is the category's total word count.
        Each distinct word contributes once, however often it repeats.

        Categories that are registered but hold no words are left out of the
        result; dividing by their zero tally would raise ZeroDivisionError.

        :raises AssertionError: if a stored count is not positive.
        """
        words = set(self.tokenizer(text))
        scores = {}
        for category in self._categories():
            tally = self.tally(category)
            if tally == 0:
                continue
            total = 0.0
            for word in words:
                raw = self.redis.hget(self.prefix + category, word)
                if raw:
                    # Convert before comparing: the reply is a string, so
                    # comparing it to 0 directly never detected corruption.
                    count = float(raw)
                    if count <= 0:
                        raise AssertionError("corrupt bayesian database")
                else:
                    count = self.correction
                total += math.log(count / tally)
            scores[category] = total
        return scores

    def tally(self, category):
        """Return the sum of all word counts stored for *category*.

        :raises AssertionError: if the sum is negative.
        """
        tally = sum(int(x) for x in self.redis.hvals(self.prefix + category))
        # Explicit raise so the check survives ``python -O``.
        if tally < 0:
            raise AssertionError("corrupt bayesian database")
        return tally
+
+
+# Running this module as a script executes the doctests in the module
+# docstring; those examples build a client with redis.Redis(), so they need
+# the redis package and a reachable server.
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
34 setup.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# http://packages.python.org/distribute/setuptools.html
+# http://diveintopython3.org/packaging.html
+# http://wiki.python.org/moin/CheeseShopTutorial
+# http://pypi.python.org/pypi?:action=list_classifiers
+
+from ez_setup import use_setuptools
+use_setuptools(version="0.6c11")
+
+import os
+from setuptools import setup
+
def read(fname):
    """Return the text of *fname*, looked up next to this setup script.

    The file is decoded as UTF-8 explicitly (README.rst contains non-ASCII
    characters, so the platform default encoding is not reliable) and the
    handle is closed before returning.
    """
    # Local import: io.open gives the same encoding-aware open() on both
    # Python 2 and Python 3.
    import io
    path = os.path.join(os.path.dirname(__file__), fname)
    with io.open(path, encoding='utf-8') as handle:
        return handle.read()
+
+# Package metadata. The version is taken from redisbayes.__version__, so the
+# module is imported while this script runs; the long description is the
+# content of README.rst.
+setup(
+ name = "redisbayes",
+ version = __import__("redisbayes").__version__,
+ description = u"Naïve Bayesian Text Classifier on Redis",
+ long_description = read("README.rst"),
+ author = "Justine Tunney",
+ author_email = "jtunney@lobstertech.com",
+ license = "MIT",
+ install_requires = ["redis"],
+ py_modules = ["redisbayes"],
+ classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "License :: OSI Approved :: MIT License",
+ "Intended Audience :: Developers",
+ "Programming Language :: Python",
+ "Topic :: Database",
+ "Topic :: Communications :: Email",
+ ],
+)

0 comments on commit b804552

Please sign in to comment.