From 76d538aa35f6b311b7ce68879b543e0602d1a7cf Mon Sep 17 00:00:00 2001
From: Jacob Perkins <japerk@gmail.com>
Date: Tue, 18 Jan 2011 20:56:52 -0800
Subject: [PATCH] phonetic hashing for classifier tagger features

---
 nltk_trainer/featx/__init__.py  |   0
 nltk_trainer/featx/metaphone.py | 439 +++++++++++++++++++++++
 nltk_trainer/featx/phonetics.py | 599 ++++++++++++++++++++++++++++++++
 nltk_trainer/tagging/taggers.py |  47 +++
 train_tagger.py                 |  68 ++--
 5 files changed, 1133 insertions(+), 20 deletions(-)
 create mode 100644 nltk_trainer/featx/__init__.py
 create mode 100644 nltk_trainer/featx/metaphone.py
 create mode 100644 nltk_trainer/featx/phonetics.py
 create mode 100644 nltk_trainer/tagging/taggers.py

diff --git a/nltk_trainer/featx/__init__.py b/nltk_trainer/featx/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nltk_trainer/featx/metaphone.py b/nltk_trainer/featx/metaphone.py
new file mode 100644
index 0000000..e00e9e3
--- /dev/null
+++ b/nltk_trainer/featx/metaphone.py
@@ -0,0 +1,439 @@
+#!python
+#coding= utf-8
+# This script implements the Double Metaphone algorithm (c) 1998, 1999 by Lawrence Philips
+# it was translated to Python from the C source written by Kevin Atkinson (http://aspell.net/metaphone/)
+# By Andrew Collins - January 12, 2007 who claims no rights to this work
+# http://atomboy.isa-geek.com/plone/Members/acoil/programing/double-metaphone
+# Tested with Python 2.4.3
+# Updated Feb 14, 2007 - Found a typo in the 'gh' section
+# Updated Dec 17, 2007 - Bugs fixed in 'S', 'Z', and 'J' sections. Thanks Chris Leong!
+# Updated 2009-03-05 by Matthew Somerville - Various bug fixes against the reference C++ implementation.
+
+"""
+>>> dm(u'aubrey')
+('APR', '')
+>>> dm(u'richard')
+('RXRT', 'RKRT')
+>>> dm(u'katherine') == dm(u'catherine')
+True
+>>> dm(u'Bartoš'), dm(u'Bartosz'), dm(u'Bartosch'), dm(u'Bartos')
+(('PRT', ''), ('PRTS', 'PRTX'), ('PRTX', ''), ('PRTS', ''))
+"""
+
+import unicodedata
+
+
+def dm(st):
+    """dm(string) -> (primary code, secondary code or '')
+    Return the Double Metaphone codes of a unicode string, always as a 2-tuple; the secondary code is ''
+    when it equals the primary. No checks are done on the input; it should be a single word or name."""
+    vowels = ['A', 'E', 'I', 'O', 'U', 'Y']
+    st = ''.join((c for c in unicodedata.normalize('NFD', st) if unicodedata.category(c) != 'Mn'))
+    st = st.upper()  # st is short for string. I usually prefer descriptive over short, but this var is used a lot!
+    is_slavo_germanic = (st.find('W') > -1 or st.find('K') > -1 or st.find('CZ') > -1 or st.find('WITZ') > -1)
+    length = len(st)
+    first = 2
+    st = '-' * first + st + '------'  # so we can index beyond the beginning and end of the input string
+    last = first + length - 1
+    pos = first     # pos is short for position
+    pri = sec = ''  # primary and secondary metaphone codes
+    # skip these silent letters when at start of word
+    if st[first:first + 2] in ["GN", "KN", "PN", "WR", "PS"]:
+        pos += 1
+    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
+    if st[first] == 'X':
+        pri = sec = 'S'  # 'Z' maps to 'S'
+        pos += 1
+    # main loop through chars in st
+    while pos <= last:
+        #print str(pos) + '\t' + st[pos]
+        ch = st[pos]  # ch is short for character
+        # nxt (short for next characters in metaphone code) is set to  a tuple of the next characters in
+        # the primary and secondary codes and how many characters to move forward in the string.
+        # the secondary code letter is given only when it is different than the primary.
+        # This is just a trick to make the code easier to write and read.
+        nxt = (None, 1)  # default action is to add nothing and move to next char
+        if ch in vowels:
+            nxt = (None, 1)
+            if pos == first:  # all init vowels now map to 'A'
+                nxt = ('A', 1)
+        elif ch == 'B':
+            # "-mb", e.g. "dumb", already skipped over... see 'M' below
+            if st[pos + 1] == 'B':
+                nxt = ('P', 2)
+            else:
+                nxt = ('P', 1)
+        elif ch == 'C':
+            # various germanic
+            if pos > first + 1 and st[pos - 2] not in vowels and st[pos - 1:pos + 2] == 'ACH' and \
+               st[pos + 2] not in ['I'] and (st[pos + 2] not in ['E'] or st[pos - 2:pos + 4] in ['BACHER', 'MACHER']):
+                nxt = ('K', 2)
+            # special case 'CAESAR'
+            elif pos == first and st[first:first + 6] == 'CAESAR':
+                nxt = ('S', 2)
+            elif st[pos:pos + 4] == 'CHIA':  # italian 'chianti'
+                nxt = ('K', 2)
+            elif st[pos:pos + 2] == 'CH':
+                # find 'michael'
+                if pos > first and st[pos:pos + 4] == 'CHAE':
+                    nxt = ('K', 'X', 2)
+                elif pos == first and (st[pos + 1:pos + 6] in ['HARAC', 'HARIS'] or \
+                   st[pos + 1:pos + 4] in ["HOR", "HYM", "HIA", "HEM"]) and st[first:first + 5] != 'CHORE':
+                    nxt = ('K', 2)
+                #germanic, greek, or otherwise 'ch' for 'kh' sound
+                elif st[first:first + 4] in ['VAN ', 'VON '] or st[first:first + 3] == 'SCH' \
+                   or st[pos - 2:pos + 4] in ["ORCHES", "ARCHIT", "ORCHID"] \
+                   or st[pos + 2] in ['T', 'S'] \
+                   or ((st[pos - 1] in ["A", "O", "U", "E"] or pos == first) \
+                   and st[pos + 2] in ["L", "R", "N", "M", "B", "H", "F", "V", "W"]):
+                    nxt = ('K', 2)
+                else:
+                    if pos > first:
+                        if st[first:first + 2] == 'MC':
+                            nxt = ('K', 2)
+                        else:
+                            nxt = ('X', 'K', 2)
+                    else:
+                        nxt = ('X', 2)
+            # e.g, 'czerny'
+            elif st[pos:pos + 2] == 'CZ' and st[pos - 2:pos + 2] != 'WICZ':
+                nxt = ('S', 'X', 2)
+            # e.g., 'focaccia'
+            elif st[pos + 1:pos + 4] == 'CIA':
+                nxt = ('X', 3)
+            # double 'C', but not if e.g. 'McClellan'
+            elif st[pos:pos + 2] == 'CC' and not (pos == (first + 1) and st[first] == 'M'):
+                #'bellocchio' but not 'bacchus'
+                if st[pos + 2] in ["I", "E", "H"] and st[pos + 2:pos + 4] != 'HU':
+                    # 'accident', 'accede' 'succeed'
+                    if (pos == (first + 1) and st[first] == 'A') or \
+                       st[pos - 1:pos + 4] in ['UCCEE', 'UCCES']:
+                        nxt = ('KS', 3)
+                    # 'bacci', 'bertucci', other italian
+                    else:
+                        nxt = ('X', 3)
+                else:
+                    nxt = ('K', 2)
+            elif st[pos:pos + 2] in ["CK", "CG", "CQ"]:
+                nxt = ('K', 2)
+            elif st[pos:pos + 2] in ["CI", "CE", "CY"]:
+                # italian vs. english
+                if st[pos:pos + 3] in ["CIO", "CIE", "CIA"]:
+                    nxt = ('S', 'X', 2)
+                else:
+                    nxt = ('S', 2)
+            else:
+                # name sent in 'mac caffrey', 'mac gregor'
+                if st[pos + 1:pos + 3] in [" C", " Q", " G"]:
+                    nxt = ('K', 3)
+                else:
+                    if st[pos + 1] in ["C", "K", "Q"] and st[pos + 1:pos + 3] not in ["CE", "CI"]:
+                        nxt = ('K', 2)
+                    else:  # default for 'C'
+                        nxt = ('K', 1)
+        elif ch == u'\xc7':  # normally unreachable: the NFD step above already reduces this letter to a plain 'C'
+            # u'\xc7' is the code point U+00C7 (C with cedilla), not a UTF-8 encoding
+            nxt = ('S', 1)
+        elif ch == 'D':
+            if st[pos:pos + 2] == 'DG':
+                if st[pos + 2] in ['I', 'E', 'Y']:  # e.g. 'edge'
+                    nxt = ('J', 3)
+                else:
+                    nxt = ('TK', 2)
+            elif st[pos:pos + 2] in ['DT', 'DD']:
+                nxt = ('T', 2)
+            else:
+                nxt = ('T', 1)
+        elif ch == 'F':
+            if st[pos + 1] == 'F':
+                nxt = ('F', 2)
+            else:
+                nxt = ('F', 1)
+        elif ch == 'G':
+            if st[pos + 1] == 'H':
+                if pos > first and st[pos - 1] not in vowels:
+                    nxt = ('K', 2)
+                elif pos < (first + 3):
+                    if pos == first:  # 'ghislane', ghiradelli
+                        if st[pos + 2] == 'I':
+                            nxt = ('J', 2)
+                        else:
+                            nxt = ('K', 2)
+                # Parker's rule (with refinements): silent 'gh' - 'hugh', 'bough', 'broughton' (B/H four back)
+                elif (pos > (first + 1) and st[pos - 2] in ['B', 'H', 'D']) \
+                   or (pos > (first + 2) and st[pos - 3] in ['B', 'H', 'D']) \
+                   or (pos > (first + 3) and st[pos - 4] in ['B', 'H']):
+                    nxt = (None, 2)
+                else:
+                    # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
+                    if pos > (first + 2) and st[pos - 1] == 'U' \
+                       and st[pos - 3] in ["C", "G", "L", "R", "T"]:
+                        nxt = ('F', 2)
+                    else:
+                        if pos > first and st[pos - 1] != 'I':
+                            nxt = ('K', 2)
+            elif st[pos + 1] == 'N':
+                if pos == (first + 1) and st[first] in vowels and not is_slavo_germanic:
+                    nxt = ('KN', 'N', 2)
+                else:
+                    # not e.g. 'cagney'
+                    if st[pos + 2:pos + 4] != 'EY' and st[pos + 1] != 'Y' and not is_slavo_germanic:
+                        nxt = ('N', 'KN', 2)
+                    else:
+                        nxt = ('KN', 2)
+            # 'tagliaro'
+            elif st[pos + 1:pos + 3] == 'LI' and not is_slavo_germanic:
+                nxt = ('KL', 'L', 2)
+            # -ges-,-gep-,-gel-, -gie- at beginning
+            elif pos == first and (st[pos + 1] == 'Y' \
+               or st[pos + 1:pos + 3] in ["ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"]):
+                nxt = ('K', 'J', 2)
+            # -ger-,  -gy-
+            elif (st[pos + 1:pos + 3] == 'ER' or st[pos + 1] == 'Y') \
+               and st[first:first + 6] not in ["DANGER", "RANGER", "MANGER"] \
+               and st[pos - 1] not in ['E', 'I'] and st[pos - 1:pos + 2] not in ['RGY', 'OGY']:
+                nxt = ('K', 'J', 2)
+            # italian e.g, 'biaggi'
+            elif st[pos + 1] in ['E', 'I', 'Y'] or st[pos - 1:pos + 3] in ["AGGI", "OGGI"]:
+                # obvious germanic
+                if st[first:first + 4] in ['VON ', 'VAN '] or st[first:first + 3] == 'SCH' \
+                   or st[pos + 1:pos + 3] == 'ET':
+                    nxt = ('K', 2)
+                else:
+                    # always soft if french ending
+                    if st[pos + 1:pos + 5] == 'IER ':
+                        nxt = ('J', 2)
+                    else:
+                        nxt = ('J', 'K', 2)
+            elif st[pos + 1] == 'G':
+                nxt = ('K', 2)
+            else:
+                nxt = ('K', 1)
+        elif ch == 'H':
+            # only keep if first & before vowel or btw. 2 vowels
+            if (pos == first or st[pos - 1] in vowels) and st[pos + 1] in vowels:
+                nxt = ('H', 2)
+            else:  # (also takes care of 'HH')
+                nxt = (None, 1)
+        elif ch == 'J':
+            # obvious spanish, 'jose', 'san jacinto'
+            if st[pos:pos + 4] == 'JOSE' or st[first:first + 4] == 'SAN ':
+                if (pos == first and st[pos + 4] == ' ') or st[first:first + 4] == 'SAN ':
+                    nxt = ('H', )
+                else:
+                    nxt = ('J', 'H')
+            elif pos == first and st[pos:pos + 4] != 'JOSE':
+                nxt = ('J', 'A')  # Yankelovich/Jankelowicz
+            else:
+                # spanish pron. of e.g. 'bajador'
+                if st[pos - 1] in vowels and not is_slavo_germanic \
+                   and st[pos + 1] in ['A', 'O']:
+                    nxt = ('J', 'H')
+                else:
+                    if pos == last:
+                        nxt = ('J', ' ')
+                    else:
+                        if st[pos + 1] not in ["L", "T", "K", "S", "N", "M", "B", "Z"] \
+                           and st[pos - 1] not in ["S", "K", "L"]:
+                            nxt = ('J', )
+                        else:
+                            nxt = (None, )
+            if st[pos + 1] == 'J':
+                nxt = nxt + (2, )
+            else:
+                nxt = nxt + (1, )
+        elif ch == 'K':
+            if st[pos + 1] == 'K':
+                nxt = ('K', 2)
+            else:
+                nxt = ('K', 1)
+        elif ch == 'L':
+            if st[pos + 1] == 'L':
+                # spanish e.g. 'cabrillo', 'gallegos'
+                if (pos == (last - 2) and st[pos - 1:pos + 3] in ["ILLO", "ILLA", "ALLE"]) \
+                   or ((st[last - 1:last + 1] in ["AS", "OS"] or st[last] in ["A", "O"]) \
+                   and st[pos - 1:pos + 3] == 'ALLE'):
+                    nxt = ('L', ' ', 2)
+                else:
+                    nxt = ('L', 2)
+            else:
+                nxt = ('L', 1)
+        elif ch == 'M':
+            if (st[pos + 1:pos + 4] == 'UMB' \
+               and (pos + 1 == last or st[pos + 2:pos + 4] == 'ER')) \
+               or st[pos + 1] == 'M':
+                nxt = ('M', 2)
+            else:
+                nxt = ('M', 1)
+        elif ch == 'N':
+            if st[pos + 1] == 'N':
+                nxt = ('N', 2)
+            else:
+                nxt = ('N', 1)
+        elif ch == u'\xd1':  # U+00D1 (N with tilde); the NFD step above normally turns it into 'N' first
+            nxt = ('N', 1)
+        elif ch == 'P':
+            if st[pos + 1] == 'H':
+                nxt = ('F', 2)
+            elif st[pos + 1] in ['P', 'B']:  # also account for "campbell", "raspberry"
+                nxt = ('P', 2)
+            else:
+                nxt = ('P', 1)
+        elif ch == 'Q':
+            if st[pos + 1] == 'Q':
+                nxt = ('K', 2)
+            else:
+                nxt = ('K', 1)
+        elif ch == 'R':
+            # french e.g. 'rogier', but exclude 'hochmeier'
+            if pos == last and not is_slavo_germanic \
+               and st[pos - 2:pos] == 'IE' and st[pos - 4:pos - 2] not in ['ME', 'MA']:
+                nxt = ('', 'R')
+            else:
+                nxt = ('R', )
+            if st[pos + 1] == 'R':
+                nxt = nxt + (2, )
+            else:
+                nxt = nxt + (1, )
+        elif ch == 'S':
+            # special cases 'island', 'isle', 'carlisle', 'carlysle'
+            if st[pos - 1:pos + 2] in ['ISL', 'YSL']:
+                nxt = (None, 1)
+            # special case 'sugar-'
+            elif pos == first and st[first:first + 5] == 'SUGAR':
+                nxt = ('X', 'S', 1)
+            elif st[pos:pos + 2] == 'SH':
+                # germanic
+                if st[pos + 1:pos + 5] in ["HEIM", "HOEK", "HOLM", "HOLZ"]:
+                    nxt = ('S', 2)
+                else:
+                    nxt = ('X', 2)
+            # italian & armenian
+            elif st[pos:pos + 3] in ["SIO", "SIA"] or st[pos:pos + 4] == 'SIAN':
+                if not is_slavo_germanic:
+                    nxt = ('S', 'X', 3)
+                else:
+                    nxt = ('S', 3)
+            # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
+            # also, -sz- in slavic language altho in hungarian it is pronounced 's'
+            elif (pos == first and st[pos + 1] in ["M", "N", "L", "W"]) or st[pos + 1] == 'Z':
+                nxt = ('S', 'X')
+                if st[pos + 1] == 'Z':
+                    nxt = nxt + (2, )
+                else:
+                    nxt = nxt + (1, )
+            elif st[pos:pos + 2] == 'SC':
+                # Schlesinger's rule
+                if st[pos + 2] == 'H':
+                    # dutch origin, e.g. 'school', 'schooner'
+                    if st[pos + 3:pos + 5] in ["OO", "ER", "EN", "UY", "ED", "EM"]:
+                        # 'schermerhorn', 'schenker'
+                        if st[pos + 3:pos + 5] in ['ER', 'EN']:
+                            nxt = ('X', 'SK', 3)
+                        else:
+                            nxt = ('SK', 3)
+                    else:
+                        if pos == first and st[first + 3] not in vowels and st[first + 3] != 'W':
+                            nxt = ('X', 'S', 3)
+                        else:
+                            nxt = ('X', 3)
+                elif st[pos + 2] in ['I', 'E', 'Y']:
+                    nxt = ('S', 3)
+                else:
+                    nxt = ('SK', 3)
+            # french e.g. 'resnais', 'artois'
+            elif pos == last and st[pos - 2:pos] in ['AI', 'OI']:
+                nxt = ('', 'S', 1)
+            else:
+                nxt = ('S', )
+                if st[pos + 1] in ['S', 'Z']:
+                    nxt = nxt + (2, )
+                else:
+                    nxt = nxt + (1, )
+        elif ch == 'T':
+            if st[pos:pos + 4] == 'TION':
+                nxt = ('X', 3)
+            elif st[pos:pos + 3] in ['TIA', 'TCH']:
+                nxt = ('X', 3)
+            elif st[pos:pos + 2] == 'TH' or st[pos:pos + 3] == 'TTH':
+                # special case 'thomas', 'thames' or germanic
+                if st[pos + 2:pos + 4] in ['OM', 'AM'] or st[first:first + 4] in ['VON ', 'VAN '] \
+                   or st[first:first + 3] == 'SCH':
+                    nxt = ('T', 2)
+                else:
+                    nxt = ('0', 'T', 2)
+            elif st[pos + 1] in ['T', 'D']:
+                nxt = ('T', 2)
+            else:
+                nxt = ('T', 1)
+        elif ch == 'V':
+            if st[pos + 1] == 'V':
+                nxt = ('F', 2)
+            else:
+                nxt = ('F', 1)
+        elif ch == 'W':
+            # can also be in middle of word
+            if st[pos:pos + 2] == 'WR':
+                nxt = ('R', 2)
+            elif pos == first and (st[pos + 1] in vowels or st[pos:pos + 2] == 'WH'):
+                # Wasserman should match Vasserman
+                if st[pos + 1] in vowels:
+                    nxt = ('A', 'F', 1)
+                else:
+                    nxt = ('A', 1)
+            # Arnow should match Arnoff
+            elif (pos == last and st[pos - 1] in vowels) \
+               or st[pos - 1:pos + 4] in ["EWSKI", "EWSKY", "OWSKI", "OWSKY"] \
+               or st[first:first + 3] == 'SCH':
+                nxt = ('', 'F', 1)
+            # polish e.g. 'filipowicz'
+            elif st[pos:pos + 4] in ["WICZ", "WITZ"]:
+                nxt = ('TS', 'FX', 4)
+            else:  # default is to skip it
+                nxt = (None, 1)
+        elif ch == 'X':
+            # french e.g. breaux
+            nxt = (None, )
+            if not(pos == last and (st[pos - 3:pos] in ["IAU", "EAU"] \
+               or st[pos - 2:pos] in ['AU', 'OU'])):
+                nxt = ('KS', )
+            if st[pos + 1] in ['C', 'X']:
+                nxt = nxt + (2, )
+            else:
+                nxt = nxt + (1, )
+        elif ch == 'Z':
+            # chinese pinyin e.g. 'zhao'
+            if st[pos + 1] == 'H':
+                nxt = ('J', )
+            elif st[pos + 1:pos + 3] in ["ZO", "ZI", "ZA"] \
+               or (is_slavo_germanic and pos > first and st[pos - 1] != 'T'):
+                nxt = ('S', 'TS')
+            else:
+                nxt = ('S', )
+            if st[pos + 1] == 'Z' or st[pos + 1] == 'H':
+                nxt = nxt + (2, )
+            else:
+                nxt = nxt + (1, )
+        # ----------------------------------
+        # --- end checking letters------
+        # ----------------------------------
+        #print str(nxt)
+        if len(nxt) == 2:
+            if nxt[0]:
+                pri += nxt[0]
+                sec += nxt[0]
+            pos += nxt[1]
+        elif len(nxt) == 3:
+            if nxt[0]:
+                pri += nxt[0]
+            if nxt[1]:
+                sec += nxt[1]
+            pos += nxt[2]
+    if pri == sec:
+        return (pri, '')
+    else:
+        return (pri, sec)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
diff --git a/nltk_trainer/featx/phonetics.py b/nltk_trainer/featx/phonetics.py
new file mode 100644
index 0000000..e7752c5
--- /dev/null
+++ b/nltk_trainer/featx/phonetics.py
@@ -0,0 +1,599 @@
+# ----------------------------------------------------------
+# AdvaS Advanced Search 
+# module for phonetic algorithms
+#
+# (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany
+# email fh@efho.de
+# ----------------------------------------------------------
+
+# changed 2005-01-24
+
+import string
+import re
+
+def soundex (term):
+	"""Return the four-character soundex code of a string.
+
+	term -- the word to encode; an empty value yields "0000".
+	Returns the upper-cased first character of term followed by up to three
+	digits, padded with "0" to exactly four characters.
+	"""
+
+	# Create and compare soundex codes of English words.
+	#
+	# Soundex is an algorithm that hashes English strings into
+	# alpha-numerical value that represents what the word sounds
+	# like. For more information on soundex and some notes on the
+	# differences in implementations visit:
+	# http://www.bluepoof.com/Soundex/info.html
+	#
+	# This version modified by Nathan Heagy at Front Logic Inc., to be
+	# compatible with php's soundexing and much faster.
+	#
+	# eAndroid / Nathan Heagy / Jul 29 2000
+	# changes by Frank Hofmann / Jan 02 2005
+
+	# soundex digit of every letter; "0" marks letters that are dropped
+	table = dict(zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012623010202'))
+
+	# check parameter
+	if not term:
+		return "0000" # could be Z000 for compatibility with other implementations
+	# end if
+
+	# convert into uppercase letters
+	term = term.upper()
+	first_char = term[0]
+
+	# translate the rest of the string into soundex digits; characters that
+	# are not in the table (digits, punctuation, ...) are kept unchanged
+	digits = [table.get(x, x) for x in term[1:]]
+
+	# drop the 0s, then drop every character equal to the one kept before it
+	str2 = first_char
+	for x in digits:
+		if x != "0" and x != str2[-1]:
+			str2 = str2 + x
+		# end if
+	# end for
+
+	# pad with zeros so that short codes still reach four characters
+	# (padding by len(str2) alone would leave e.g. "A" as "A0")
+	str2 = str2 + "000"
+
+	# take the first four characters
+	return str2[:4]
+
+def metaphone (term):
+	"""Return the metaphone code for a given string.
+
+	term -- the word to encode; letters outside a-z are ignored.
+	Returns "" when term contains no such letter.
+	"""
+	# implementation of the original algorithm from Lawrence Philips
+	# extended/rewritten by M. Kuhn
+	# improvements with thanks to John Machin <sjmachin@lexicon.net>
+
+	# define return value
+	code = ""
+
+	if (len(term) == 0):
+		# empty string ?
+		return code
+	# end if
+
+	# extension #1 (added 2005-01-28)
+	# convert to lowercase
+	term = term.lower()
+
+	# extension #2 (added 2005-01-28)
+	# remove all non-english characters, first
+	term = re.sub(r'[^a-z]', '', term)
+	if len(term) == 0:
+		# nothing left
+		return code
+	# end if
+
+	# extension #3 (added 2005-01-24)
+	# conflate repeated letters
+	firstChar = term[0]
+	str2 = firstChar
+	for x in term:
+		if x != str2[-1]:
+			str2 = str2 + x
+		# end if
+	# end for
+
+	# extension #4 (added 2005-01-24)
+	# remove any vowels unless a vowel is the first letter
+	firstChar = str2[0]
+	str3 = firstChar
+	for x in str2[1:]:
+		if (re.search(r'[^aeiou]', x)):
+			str3 = str3 + x
+		# end if
+	# end for
+
+	term = str3
+	term_length = len(term)
+	if term_length == 0:
+		# nothing left
+		return code
+	# end if
+
+	# check for exceptions at the start of the word
+	if (term[0] == "x"):
+		# an initial "x" sounds like "s" (e.g. "xavier"), whatever follows it
+		term = term[1:]
+		code = "s"
+		term_length = len(term)
+	else:
+		# get first two characters (a shorter slice for a one-letter term)
+		first_chars = term[0:2]
+
+		# build translation table
+		table = {
+		    "ae":"e",
+		    "gn":"n",
+		    "kn":"n",
+		    "pn":"n",
+		    "wr":"r",
+		    "wh":"w"
+		}
+
+		if first_chars in table:
+			term = term[2:]
+			code = table[first_chars]
+			term_length = len(term)
+		# end if
+	# end if
+
+	# define standard translation table
+	st_trans = {
+		"b":"b",
+		"c":"k",
+		"d":"t",
+		"g":"k",
+		"h":"h",
+		"k":"k",
+		"p":"p",
+		"q":"k",
+		"s":"s",
+		"t":"t",
+		"v":"f",
+		"w":"w",
+		"x":"ks",
+		"y":"y",
+		"z":"s"
+	}
+
+	i = 0
+	while (i<term_length):
+		# init character to add, init basic patterns
+		add_char = ""
+		part_n_2 = ""
+		part_n_3 = ""
+		part_n_4 = ""
+		part_c_2 = ""
+		part_c_3 = ""
+
+		# extract a number of patterns, if possible
+		if (i>0):
+			# previous+current pair; available even on the last character
+			part_c_2 = term[i-1:i+1]
+		if (i < (term_length - 1)):
+			part_n_2 = term[i:i+2]
+			if (i>0):
+				part_c_3 = term[i-1:i+2]
+		# end if
+
+		if (i < (term_length - 2)):
+			part_n_3 = term[i:i+3]
+		# end if
+
+		if (i < (term_length - 3)):
+			part_n_4 = term[i:i+4]
+		# end if
+
+		# use table with conditions for translations
+		if (term[i] == "b"):
+			add_char = st_trans["b"]
+			if (i == (term_length - 1)):
+				if (i>0):
+					if (term[i-1] == "m"):
+						add_char = ""
+					# end if
+				# end if
+			# end if
+		elif (term[i] == "c"):
+			add_char = st_trans["c"]
+			if (part_n_2 == "ch"):
+				add_char = "x"
+			elif (re.search(r'c[iey]', part_n_2)):
+				add_char = "s"
+			# end if
+
+			if (part_n_3 == "cia"):
+				add_char = "x"
+			# end if
+
+			if (re.search(r'sc[iey]', part_c_3)):
+				add_char = ""
+			# end if
+
+		elif (term[i] == "d"):
+			add_char = st_trans["d"]
+			if (re.search(r'dg[eyi]', part_n_3)):
+				add_char = "j"
+			# end if
+
+		elif (term[i] == "g"):
+			add_char = st_trans["g"]
+
+			if (part_n_2 == "gh"):
+				if (i == (term_length - 2)):
+					add_char = ""
+				# end if
+			elif (re.search(r'gh[aeiouy]', part_n_3)):
+				add_char = ""
+			elif (part_n_2 == "gn"):
+				add_char = ""
+			elif (part_n_4 == "gned"):
+				add_char = ""
+			elif (re.search(r'dg[eyi]',part_c_3)):
+				add_char = ""
+			elif (part_n_2 == "gi"):
+				if (part_c_3 != "ggi"):
+					add_char = "j"
+				# end if
+			elif (part_n_2 == "ge"):
+				if (part_c_3 != "gge"):
+					add_char = "j"
+				# end if
+			elif (part_n_2 == "gy"):
+				if (part_c_3 != "ggy"):
+					add_char = "j"
+				# end if
+			elif (part_n_2 == "gg"):
+				add_char = ""
+			# end if
+		elif (term[i] == "h"):
+			add_char = st_trans["h"]
+			if (re.search(r'[aeiouy]h[^aeiouy]', part_c_3)):
+				add_char = ""
+			elif (re.search(r'[csptg]h', part_c_2)):
+				add_char = ""
+			# end if
+		elif (term[i] == "k"):
+			add_char = st_trans["k"]
+			if (part_c_2 == "ck"):
+				add_char = ""
+			# end if
+		elif (term[i] == "p"):
+			add_char = st_trans["p"]
+			if (part_n_2 == "ph"):
+				add_char = "f"
+			# end if
+		elif (term[i] == "q"):
+			add_char = st_trans["q"]
+		elif (term[i] == "s"):
+			add_char = st_trans["s"]
+			if (part_n_2 == "sh"):
+				add_char = "x"
+			# end if
+
+			if (re.search(r'si[ao]', part_n_3)):
+				add_char = "x"
+			# end if
+		elif (term[i] == "t"):
+			add_char = st_trans["t"]
+			if (part_n_2 == "th"):
+				add_char = "0"
+			# end if
+
+			if (re.search(r'ti[ao]', part_n_3)):
+				add_char = "x"
+			# end if
+		elif (term[i] == "v"):
+			add_char = st_trans["v"]
+		elif (term[i] == "w"):
+			add_char = st_trans["w"]
+			if (re.search(r'w[^aeiouy]', part_n_2)):
+				add_char = ""
+			# end if
+		elif (term[i] == "x"):
+			add_char = st_trans["x"]
+		elif (term[i] == "y"):
+			add_char = st_trans["y"]
+		elif (term[i] == "z"):
+			add_char = st_trans["z"]
+		else:
+			# alternative
+			add_char = term[i]
+		# end if
+
+		code = code + add_char
+		i += 1
+	# end while
+
+	# return metaphone code
+	return code
+
+def nysiis (term):
+	"""Return the New York State Identification and Intelligence Algorithm (NYSIIS) code for the given term.
+
+	term -- the word or name to encode; it is lowercased before the rules are applied.
+	Returns "" for an empty term.
+	"""
+	code = ""
+
+	if (len(term) == 0):
+		# empty string ?
+		return code
+	# end if
+	term = term.lower()	# every rule below is written for lowercase letters
+	# build translation table for the first characters; a tuple (not a dict)
+	table = (
+		("mac", "mcc"),
+		("sch", "sss"),
+		("kn", "nn"),
+		("ph", "ff"),
+		("pf", "ff"),
+		("k", "c"),
+	)
+
+	for table_entry, table_value in table:
+		# longer prefixes are listed first so that "kn" wins over "k"
+		# (plain dict iteration order would make the result unpredictable)
+		if term.startswith(table_entry):
+			# every replacement has the same length as the prefix it replaces
+			term = table_value + term[len(table_entry):]
+			break
+		# end if
+	# end for
+
+	# build translation table for the last characters
+	table = (
+		("ee", "y"),
+		("ie", "y"),
+		("dt", "d"),
+		("rt", "d"),
+		("rd", "d"),
+		("nt", "d"),
+		("nd", "d"),
+	)
+
+	for table_entry, table_value in table:
+		# cut off exactly the matched ending before appending its
+		# replacement; the number of characters to remove is the length
+		# of the ending itself, not of any replacement value
+		# (only the first matching ending is replaced)
+		if term.endswith(table_entry):
+			term = term[:-len(table_entry)] + table_value
+			break
+		# end if
+	# end for
+
+	# initialize code
+	code = term
+
+	# transform ev->af
+	code = re.sub(r'ev', r'af', code)
+
+	# transform a,e,i,o,u->a
+	code = re.sub(r'[aeiouy]', r'a', code)
+
+	# transform q->g
+	code = re.sub(r'q', r'g', code)
+
+	# transform z->s
+	code = re.sub(r'z', r's', code)
+
+	# transform m->n
+	code = re.sub(r'm', r'n', code)
+
+	# transform kn->n
+	code = re.sub(r'kn', r'n', code)
+
+	# transform k->c
+	code = re.sub(r'k', r'c', code)
+
+	# transform sch->sss
+	code = re.sub(r'sch', r'sss', code)
+
+	# transform ph->ff
+	code = re.sub(r'ph', r'ff', code)
+
+	# transform h-> if previous or next is nonvowel -> previous
+	occur = re.findall(r'([a-z]{0,1}?)h([a-z]{0,1}?)', code)
+	# NOTE(review): the lazy second group always captures "", so only the previous-letter test below can succeed
+	for occur_group in occur:
+		occur_item_previous = occur_group[0]
+		occur_item_next = occur_group[1]
+
+		if ((re.match(r'[^aeiouy]', occur_item_previous)) or (re.match(r'[^aeiouy]', occur_item_next))):
+			if (occur_item_previous != ""):
+				# make substitution
+				code = re.sub (occur_item_previous + "h", occur_item_previous * 2, code, 1)
+			# end if
+		# end if
+	# end for
+
+	# transform w-> if previous is vowel -> previous
+	occur = re.findall(r'([aeiouy]{1}?)w', code)
+	# with a single group findall yields plain strings, so [0] is the vowel itself
+	for occur_group in occur:
+		occur_item_previous = occur_group[0]
+		# make substitution
+		code = re.sub (occur_item_previous + "w", occur_item_previous * 2, code, 1)
+	# end for
+
+	# check last character
+	# -s, remove
+	code = re.sub (r's$', r'', code)
+	# -ay, replace by -y
+	code = re.sub (r'ay$', r'y', code)
+	# -a, remove
+	code = re.sub (r'a$', r'', code)
+
+	# return nysiis code
+	return code
+
+def caverphone (term):
+	"""Return the 10-character Caverphone 2.0 key for the given term.
+
+	term -- word or name; letters outside a-z are dropped after lowercasing.
+	Returns "" for an empty term, otherwise a key padded with "1" to length 10.
+	"""
+	# Developed at the University of Otago, New Zealand.
+	# Project: Caversham Project (http://caversham.otago.ac.nz)
+	# Developer: David Hood, University of Otago, New Zealand
+	# Contact: caversham@otago.ac.nz
+	# Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf
+	# Version 2.0 (2004-08-15)
+
+	code = ""
+
+	if (len(term) == 0):
+		# empty string ?
+		return code
+	# end if
+
+	# convert to lowercase
+	code = term.lower()
+
+	# remove anything not in the standard alphabet (a-z)
+	code = re.sub(r'[^a-z]', '', code)
+
+	# remove final e
+	if code.endswith("e"):
+		code = code[:-1]
+
+	# if the name starts with cough, rough, tough, enough or trough -> cou2f (rou2f, tou2f, enou2f, trou2f)
+	code = re.sub(r'^([crt]|(en)|(tr))ough', r'\1ou2f', code)
+
+	# if the name starts with gn -> 2n
+	code = re.sub(r'^gn', r'2n', code)
+
+	# if the name ends with mb -> m2
+	code = re.sub(r'mb$', r'm2', code)
+
+	# replace cq -> 2q
+	code = re.sub(r'cq', r'2q', code)
+
+	# replace c[i,e,y] -> s[i,e,y]
+	code = re.sub(r'c([iey])', r's\1', code)
+
+	# replace tch -> 2ch
+	code = re.sub(r'tch', r'2ch', code)
+
+	# replace c,q,x -> k
+	code = re.sub(r'[cqx]', r'k', code)
+
+	# replace v -> f
+	code = re.sub(r'v', r'f', code)
+
+	# replace dg -> 2g
+	code = re.sub(r'dg', r'2g', code)
+
+	# replace ti[o,a] -> si[o,a]
+	code = re.sub(r'ti([oa])', r'si\1', code)
+
+	# replace d -> t
+	code = re.sub(r'd', r't', code)
+
+	# replace ph -> fh
+	code = re.sub(r'ph', r'fh', code)
+
+	# replace b -> p
+	code = re.sub(r'b', r'p', code)
+
+	# replace sh -> s2
+	code = re.sub(r'sh', r's2', code)
+
+	# replace z -> s
+	code = re.sub(r'z', r's', code)
+
+	# replace initial vowel [aeiou] -> A
+	code = re.sub(r'^[aeiou]', r'A', code)
+
+	# replace all other vowels [aeiou] -> 3
+	code = re.sub(r'[aeiou]', r'3', code)
+
+	# replace j -> y
+	code = re.sub(r'j', r'y', code)
+
+	# replace an initial y3 -> Y3
+	code = re.sub(r'^y3', r'Y3', code)
+
+	# replace an initial y -> A
+	code = re.sub(r'^y', r'A', code)
+
+	# replace y -> 3
+	code = re.sub(r'y', r'3', code)
+
+	# replace 3gh3 -> 3kh3
+	code = re.sub(r'3gh3', r'3kh3', code)
+
+	# replace gh -> 22
+	code = re.sub(r'gh', r'22', code)
+
+	# replace g -> k
+	code = re.sub(r'g', r'k', code)
+
+	# replace groups of s,t,p,k,f,m,n by its single, upper-case equivalent
+	for single_letter in ["s", "t", "p", "k", "f", "m", "n"]:
+		otherParts = re.split(single_letter + "+", code)
+		code = single_letter.upper().join(otherParts)
+
+	# replace w[3,h3] by W[3,h3]
+	code = re.sub(r'w(h?3)', r'W\1', code)
+
+	# replace final w with 3
+	code = re.sub(r'w$', r'3', code)
+
+	# replace w -> 2
+	code = re.sub(r'w', r'2', code)
+
+	# replace h at the beginning with an A
+	code = re.sub(r'^h', r'A', code)
+
+	# replace all other occurrences of h with a 2
+	code = re.sub(r'h', r'2', code)
+
+	# replace r3 with R3
+	code = re.sub(r'r3', r'R3', code)
+
+	# replace final r -> 3
+	code = re.sub(r'r$', r'3', code)
+
+	# replace r with 2
+	code = re.sub(r'r', r'2', code)
+
+	# replace l3 with L3
+	code = re.sub(r'l3', r'L3', code)
+
+	# replace final l -> 3
+	code = re.sub(r'l$', r'3', code)
+
+	# replace l with 2
+	code = re.sub(r'l', r'2', code)
+
+	# remove all 2's
+	code = re.sub(r'2', r'', code)
+
+	# replace the final 3 -> A
+	code = re.sub(r'3$', r'A', code)
+
+	# remove all 3's
+	code = re.sub(r'3', r'', code)
+
+	# extend the code by 10 '1' (one)
+	code += '1' * 10
+
+	# take the first 10 characters
+	caverphoneCode = code[:10]
+
+	# return caverphone code
+	return caverphoneCode
+
diff --git a/nltk_trainer/tagging/taggers.py b/nltk_trainer/tagging/taggers.py
new file mode 100644
index 0000000..844ff2b
--- /dev/null
+++ b/nltk_trainer/tagging/taggers.py
@@ -0,0 +1,47 @@
+from nltk.tag.sequential import SequentialBackoffTagger
+from nltk.probability import FreqDist
+from nltk.tag import ClassifierBasedPOSTagger
+from nltk_trainer.featx import phonetics
+from nltk_trainer.featx.metaphone import dm
+
+class PhoneticClassifierBasedPOSTagger(ClassifierBasedPOSTagger):
+	'''ClassifierBasedPOSTagger that adds phonetic hashes of the current
+	token to the feature set, one feature per algorithm enabled here.'''
+	def __init__(self, double_metaphone=False, metaphone=False, soundex=False, nysiis=False, caverphone=False, *args, **kwargs):
+		self.funs = {} # feature name -> phonetic hash function
+		
+		if double_metaphone:
+			self.funs['double-metaphone'] = lambda s: dm(unicode(s))
+		
+		# the remaining feature names double as function names in phonetics
+		for name, enabled in (('metaphone', metaphone), ('soundex', soundex),
+				('nysiis', nysiis), ('caverphone', caverphone)):
+			if enabled:
+				self.funs[name] = getattr(phonetics, name)
+		
+		# self.funs must be populated before the base constructor runs:
+		# feature_detector() reads it, and the base class presumably calls
+		# feature_detector() during __init__ when train= is given
+		ClassifierBasedPOSTagger.__init__(self, *args, **kwargs)
+	
+	def feature_detector(self, tokens, index, history):
+		'''Return the base tagger's features for tokens[index], extended with
+		one entry per enabled phonetic algorithm (keyed by algorithm name).'''
+		features = ClassifierBasedPOSTagger.feature_detector(self, tokens, index, history)
+		word = tokens[index]
+		# hash the raw token with every enabled algorithm
+		features.update((name, fun(word)) for name, fun in self.funs.iteritems())
+		
+		return features
+
+class MaxVoteBackoffTagger(SequentialBackoffTagger):
+	'''Tags each token with the tag that most of the given taggers choose.'''
+	def __init__(self, *taggers):
+		SequentialBackoffTagger.__init__(self)
+		self._voters = taggers # not _taggers: tag_one() walks that as the backoff chain
+	
+	def choose_tag(self, tokens, index, history):
+		tags = FreqDist()
+		for tagger in self._voters:
+			tags.inc(tagger.choose_tag(tokens, index, history))
+		return tags.max()
\ No newline at end of file
diff --git a/train_tagger.py b/train_tagger.py
index ad7d354..838377f 100644
--- a/train_tagger.py
+++ b/train_tagger.py
@@ -9,6 +9,7 @@
 from nltk.tag.simplify import simplify_wsj_tag
 from nltk_trainer.tagging.readers import NumberedTaggedSentCorpusReader
 from nltk_trainer.tagging.training import train_brill_tagger
+from nltk_trainer.tagging.taggers import PhoneticClassifierBasedPOSTagger
 
 ########################################
 ## command options & argument parsing ##
@@ -54,15 +55,6 @@
 You can use this option multiple times to create multiple AffixTaggers with different affixes.
 The affixes will be used in the order given.''')
 
-classifier_group = parser.add_argument_group('Classifier Based Tagger')
-classifier_group.add_argument('--classifier', default=None,
-	choices=['NaiveBayes', 'DecisionTree', 'Maxent'] + MaxentClassifier.ALGORITHMS,
-	help='''ClassifierBasedPOSTagger algorithm to use, default is %(default)s.
-Maxent uses the default Maxent training algorithm, either CG or iis.''')
-classifier_group.add_argument('--cutoff_prob', default=0, type=float,
-	help='Cutoff probability for classifier tagger to backoff to previous tagger')
-# TODO: phonetic feature options
-
 brill_group = parser.add_argument_group('Brill Tagger Options')
 brill_group.add_argument('--brill', action='store_true', default=False,
 	help='Train a Brill Tagger in front of the other tagger.')
@@ -72,11 +64,25 @@
 brill_group.add_argument('--max_rules', type=int, default=200)
 brill_group.add_argument('--min_score', type=int, default=2)
 
-eval_group = parser.add_argument_group('Tagger Evaluation',
-	'Evaluation metrics for part-of-speech taggers')
-eval_group.add_argument('--no-eval', action='store_true', default=False,
-	help="don't do any evaluation")
-# TODO: are there any metrics other than accuracy?
+classifier_group = parser.add_argument_group('Classifier Based Tagger')
+classifier_group.add_argument('--classifier', default=None,
+	choices=['NaiveBayes', 'DecisionTree', 'Maxent'] + MaxentClassifier.ALGORITHMS,
+	help='''ClassifierBasedPOSTagger algorithm to use, default is %(default)s.
+Maxent uses the default Maxent training algorithm, either CG or iis.''')
+classifier_group.add_argument('--cutoff_prob', default=0, type=float,
+	help='Cutoff probability for classifier tagger to backoff to previous tagger')
+
+phonetic_group = parser.add_argument_group('Phonetic Feature Options for a Classifier Based Tagger') # one on/off flag per phonetic algorithm
+phonetic_group.add_argument('--metaphone', action='store_true',
+	default=False, help='Use metaphone feature')
+phonetic_group.add_argument('--double-metaphone', action='store_true', # stored as args.double_metaphone
+	default=False, help='Use double metaphone feature')
+phonetic_group.add_argument('--soundex', action='store_true',
+	default=False, help='Use soundex feature')
+phonetic_group.add_argument('--nysiis', action='store_true',
+	default=False, help='Use NYSIIS feature')
+phonetic_group.add_argument('--caverphone', action='store_true',
+	default=False, help='Use caverphone feature')
 
 maxent_group = parser.add_argument_group('Maxent Classifier Tagger',
 	'These options only apply when a Maxent classifier is chosen.')
@@ -97,6 +103,12 @@
 decisiontree_group.add_argument('--support_cutoff', default=10, type=int,
 	help='default is 10')
 
+eval_group = parser.add_argument_group('Tagger Evaluation',
+	'Evaluation metrics for part-of-speech taggers')
+eval_group.add_argument('--no-eval', action='store_true', default=False,
+	help="don't do any evaluation")
+# TODO: report word coverage, i.e. how many test words get a tag != '-NONE-'
+
 args = parser.parse_args()
 
 ###################
@@ -242,15 +254,31 @@ def f(train_sents, backoff=None):
 	classifier_train_kwargs['trace'] = args.trace
 
 if args.classifier:
-	if args.trace:
-		print 'training a %s ClassifierBasedPOSTagger' % args.classifier
-	
 	def classifier_builder(train_feats):
 		return classifier_train(train_feats, **classifier_train_kwargs)
 	
-	tagger = ClassifierBasedPOSTagger(train=train_sents, verbose=args.trace,
-		backoff=tagger, cutoff_prob=args.cutoff_prob,
-		classifier_builder=classifier_builder)
+	kwargs = { # constructor arguments passed to whichever tagger class is chosen below
+		'train': train_sents,
+		'verbose': args.trace,
+		'backoff': tagger,
+		'cutoff_prob': args.cutoff_prob,
+		'classifier_builder': classifier_builder
+	}
+	
+	phonetic_keys = ['metaphone', 'double_metaphone', 'soundex', 'nysiis', 'caverphone'] # argparse dests of the phonetic flags
+	
+	if any([getattr(args, key) for key in phonetic_keys]): # at least one phonetic flag was given
+		cls = PhoneticClassifierBasedPOSTagger
+		
+		for key in phonetic_keys: # dests match the phonetic tagger's keyword argument names
+			kwargs[key] = getattr(args, key)
+	else:
+		cls = ClassifierBasedPOSTagger
+	
+	if args.trace:
+		print 'training a %s %s' % (args.classifier, cls.__name__)
+	
+	tagger = cls(**kwargs)
 
 ##################
 ## brill tagger ##