From 76d538aa35f6b311b7ce68879b543e0602d1a7cf Mon Sep 17 00:00:00 2001 From: Jacob Perkins Date: Tue, 18 Jan 2011 20:56:52 -0800 Subject: [PATCH] phonetic hashing for classifier tagger features --- nltk_trainer/featx/__init__.py | 0 nltk_trainer/featx/metaphone.py | 439 +++++++++++++++++++++++ nltk_trainer/featx/phonetics.py | 599 ++++++++++++++++++++++++++++++++ nltk_trainer/tagging/taggers.py | 47 +++ train_tagger.py | 68 ++-- 5 files changed, 1133 insertions(+), 20 deletions(-) create mode 100644 nltk_trainer/featx/__init__.py create mode 100644 nltk_trainer/featx/metaphone.py create mode 100644 nltk_trainer/featx/phonetics.py create mode 100644 nltk_trainer/tagging/taggers.py diff --git a/nltk_trainer/featx/__init__.py b/nltk_trainer/featx/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nltk_trainer/featx/metaphone.py b/nltk_trainer/featx/metaphone.py new file mode 100644 index 0000000..e00e9e3 --- /dev/null +++ b/nltk_trainer/featx/metaphone.py @@ -0,0 +1,439 @@ +#!python +#coding= utf-8 +# This script implements the Double Metaphone algorithm (c) 1998, 1999 by Lawrence Philips +# it was translated to Python from the C source written by Kevin Atkinson (http://aspell.net/metaphone/) +# By Andrew Collins - January 12, 2007 who claims no rights to this work +# http://atomboy.isa-geek.com/plone/Members/acoil/programing/double-metaphone +# Tested with Python 2.4.3 +# Updated Feb 14, 2007 - Found a typo in the 'gh' section +# Updated Dec 17, 2007 - Bugs fixed in 'S', 'Z', and 'J' sections. Thanks Chris Leong! +# Updated 2009-03-05 by Matthew Somerville - Various bug fixes against the reference C++ implementation. + +""" +>>> dm(u'aubrey') +('APR', '') +>>> dm(u'richard') +('RXRT', 'RKRT') +>>> dm(u'katherine') == dm(u'catherine') +True +>>> dm(u'Bartoš'), dm(u'Bartosz'), dm(u'Bartosch'), dm(u'Bartos') +(('PRT', ''), ('PRTS', 'PRTX'), ('PRTX', ''), ('PRTS', '')) +""" + +import unicodedata + + +def dm(st): + """dm(string) -> (string, string or '') + returns the double metaphone codes for given string - always a tuple + there are no checks done on the input string, but it should be a single word or name.""" + vowels = ['A', 'E', 'I', 'O', 'U', 'Y'] + st = ''.join((c for c in unicodedata.normalize('NFD', st) if unicodedata.category(c) != 'Mn')) + st = st.upper() # st is short for string. I usually prefer descriptive over short, but this var is used a lot! + is_slavo_germanic = (st.find('W') > -1 or st.find('K') > -1 or st.find('CZ') > -1 or st.find('WITZ') > -1) + length = len(st) + first = 2 + st = '-' * first + st + '------' # so we can index beyond the begining and end of the input string + last = first + length - 1 + pos = first # pos is short for position + pri = sec = '' # primary and secondary metaphone codes + # skip these silent letters when at start of word + if st[first:first + 2] in ["GN", "KN", "PN", "WR", "PS"]: + pos += 1 + # Initial 'X' is pronounced 'Z' e.g. 'Xavier' + if st[first] == 'X': + pri = sec = 'S' # 'Z' maps to 'S' + pos += 1 + # main loop through chars in st + while pos <= last: + #print str(pos) + '\t' + st[pos] + ch = st[pos] # ch is short for character + # nxt (short for next characters in metaphone code) is set to a tuple of the next characters in + # the primary and secondary codes and how many characters to move forward in the string. + # the secondary code letter is given only when it is different than the primary. + # This is just a trick to make the code easier to write and read. + nxt = (None, 1) # default action is to add nothing and move to next char + if ch in vowels: + nxt = (None, 1) + if pos == first: # all init vowels now map to 'A' + nxt = ('A', 1) + elif ch == 'B': + #"-mb", e.g", "dumb", already skipped over... see 'M' below + if st[pos + 1] == 'B': + nxt = ('P', 2) + else: + nxt = ('P', 1) + elif ch == 'C': + # various germanic + if pos > first + 1 and st[pos - 2] not in vowels and st[pos - 1:pos + 2] == 'ACH' and \ + st[pos + 2] not in ['I'] and (st[pos + 2] not in ['E'] or st[pos - 2:pos + 4] in ['BACHER', 'MACHER']): + nxt = ('K', 2) + # special case 'CAESAR' + elif pos == first and st[first:first + 6] == 'CAESAR': + nxt = ('S', 2) + elif st[pos:pos + 4] == 'CHIA': # italian 'chianti' + nxt = ('K', 2) + elif st[pos:pos + 2] == 'CH': + # find 'michael' + if pos > first and st[pos:pos + 4] == 'CHAE': + nxt = ('K', 'X', 2) + elif pos == first and (st[pos + 1:pos + 6] in ['HARAC', 'HARIS'] or \ + st[pos + 1:pos + 4] in ["HOR", "HYM", "HIA", "HEM"]) and st[first:first + 5] != 'CHORE': + nxt = ('K', 2) + #germanic, greek, or otherwise 'ch' for 'kh' sound + elif st[first:first + 4] in ['VAN ', 'VON '] or st[first:first + 3] == 'SCH' \ + or st[pos - 2:pos + 4] in ["ORCHES", "ARCHIT", "ORCHID"] \ + or st[pos + 2] in ['T', 'S'] \ + or ((st[pos - 1] in ["A", "O", "U", "E"] or pos == first) \ + and st[pos + 2] in ["L", "R", "N", "M", "B", "H", "F", "V", "W"]): + nxt = ('K', 2) + else: + if pos > first: + if st[first:first + 2] == 'MC': + nxt = ('K', 2) + else: + nxt = ('X', 'K', 2) + else: + nxt = ('X', 2) + # e.g, 'czerny' + elif st[pos:pos + 2] == 'CZ' and st[pos - 2:pos + 2] != 'WICZ': + nxt = ('S', 'X', 2) + # e.g., 'focaccia' + elif st[pos + 1:pos + 4] == 'CIA': + nxt = ('X', 3) + # double 'C', but not if e.g. 'McClellan' + elif st[pos:pos + 2] == 'CC' and not (pos == (first + 1) and st[first] == 'M'): + #'bellocchio' but not 'bacchus' + if st[pos + 2] in ["I", "E", "H"] and st[pos + 2:pos + 4] != 'HU': + # 'accident', 'accede' 'succeed' + if (pos == (first + 1) and st[first] == 'A') or \ + st[pos - 1:pos + 4] in ['UCCEE', 'UCCES']: + nxt = ('KS', 3) + # 'bacci', 'bertucci', other italian + else: + nxt = ('X', 3) + else: + nxt = ('K', 2) + elif st[pos:pos + 2] in ["CK", "CG", "CQ"]: + nxt = ('K', 2) + elif st[pos:pos + 2] in ["CI", "CE", "CY"]: + # italian vs. english + if st[pos:pos + 3] in ["CIO", "CIE", "CIA"]: + nxt = ('S', 'X', 2) + else: + nxt = ('S', 2) + else: + # name sent in 'mac caffrey', 'mac gregor + if st[pos + 1:pos + 3] in [" C", " Q", " G"]: + nxt = ('K', 3) + else: + if st[pos + 1] in ["C", "K", "Q"] and st[pos + 1:pos + 3] not in ["CE", "CI"]: + nxt = ('K', 2) + else: # default for 'C' + nxt = ('K', 1) + elif ch == u'\xc7': # will never get here with st.encode('ascii', 'replace') above + # \xc7 is UTF-8 encoding of Ç + nxt = ('S', 1) + elif ch == 'D': + if st[pos:pos + 2] == 'DG': + if st[pos + 2] in ['I', 'E', 'Y']: # e.g. 'edge' + nxt = ('J', 3) + else: + nxt = ('TK', 2) + elif st[pos:pos + 2] in ['DT', 'DD']: + nxt = ('T', 2) + else: + nxt = ('T', 1) + elif ch == 'F': + if st[pos + 1] == 'F': + nxt = ('F', 2) + else: + nxt = ('F', 1) + elif ch == 'G': + if st[pos + 1] == 'H': + if pos > first and st[pos - 1] not in vowels: + nxt = ('K', 2) + elif pos < (first + 3): + if pos == first: # 'ghislane', ghiradelli + if st[pos + 2] == 'I': + nxt = ('J', 2) + else: + nxt = ('K', 2) + # Parker's rule (with some further refinements) - e.g., 'hugh' + elif (pos > (first + 1) and st[pos - 2] in ['B', 'H', 'D']) \ + or (pos > (first + 2) and st[pos - 3] in ['B', 'H', 'D']) \ + or (pos > (first + 3) and st[pos - 3] in ['B', 'H']): + nxt = (None, 2) + else: + # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' + if pos > (first + 2) and st[pos - 1] == 'U' \ + and st[pos - 3] in ["C", "G", "L", "R", "T"]: + nxt = ('F', 2) + else: + if pos > first and st[pos - 1] != 'I': + nxt = ('K', 2) + elif st[pos + 1] == 'N': + if pos == (first + 1) and st[first] in vowels and not is_slavo_germanic: + nxt = ('KN', 'N', 2) + else: + # not e.g. 'cagney' + if st[pos + 2:pos + 4] != 'EY' and st[pos + 1] != 'Y' and not is_slavo_germanic: + nxt = ('N', 'KN', 2) + else: + nxt = ('KN', 2) + # 'tagliaro' + elif st[pos + 1:pos + 3] == 'LI' and not is_slavo_germanic: + nxt = ('KL', 'L', 2) + # -ges-,-gep-,-gel-, -gie- at beginning + elif pos == first and (st[pos + 1] == 'Y' \ + or st[pos + 1:pos + 3] in ["ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"]): + nxt = ('K', 'J', 2) + # -ger-, -gy- + elif (st[pos + 1:pos + 3] == 'ER' or st[pos + 1] == 'Y') \ + and st[first:first + 6] not in ["DANGER", "RANGER", "MANGER"] \ + and st[pos - 1] not in ['E', 'I'] and st[pos - 1:pos + 2] not in ['RGY', 'OGY']: + nxt = ('K', 'J', 2) + # italian e.g, 'biaggi' + elif st[pos + 1] in ['E', 'I', 'Y'] or st[pos - 1:pos + 3] in ["AGGI", "OGGI"]: + # obvious germanic + if st[first:first + 4] in ['VON ', 'VAN '] or st[first:first + 3] == 'SCH' \ + or st[pos + 1:pos + 3] == 'ET': + nxt = ('K', 2) + else: + # always soft if french ending + if st[pos + 1:pos + 5] == 'IER ': + nxt = ('J', 2) + else: + nxt = ('J', 'K', 2) + elif st[pos + 1] == 'G': + nxt = ('K', 2) + else: + nxt = ('K', 1) + elif ch == 'H': + # only keep if first & before vowel or btw. 2 vowels + if (pos == first or st[pos - 1] in vowels) and st[pos + 1] in vowels: + nxt = ('H', 2) + else: # (also takes care of 'HH') + nxt = (None, 1) + elif ch == 'J': + # obvious spanish, 'jose', 'san jacinto' + if st[pos:pos + 4] == 'JOSE' or st[first:first + 4] == 'SAN ': + if (pos == first and st[pos + 4] == ' ') or st[first:first + 4] == 'SAN ': + nxt = ('H', ) + else: + nxt = ('J', 'H') + elif pos == first and st[pos:pos + 4] != 'JOSE': + nxt = ('J', 'A') # Yankelovich/Jankelowicz + else: + # spanish pron. of e.g. 'bajador' + if st[pos - 1] in vowels and not is_slavo_germanic \ + and st[pos + 1] in ['A', 'O']: + nxt = ('J', 'H') + else: + if pos == last: + nxt = ('J', ' ') + else: + if st[pos + 1] not in ["L", "T", "K", "S", "N", "M", "B", "Z"] \ + and st[pos - 1] not in ["S", "K", "L"]: + nxt = ('J', ) + else: + nxt = (None, ) + if st[pos + 1] == 'J': + nxt = nxt + (2, ) + else: + nxt = nxt + (1, ) + elif ch == 'K': + if st[pos + 1] == 'K': + nxt = ('K', 2) + else: + nxt = ('K', 1) + elif ch == 'L': + if st[pos + 1] == 'L': + # spanish e.g. 'cabrillo', 'gallegos' + if (pos == (last - 2) and st[pos - 1:pos + 3] in ["ILLO", "ILLA", "ALLE"]) \ + or ((st[last - 1:last + 1] in ["AS", "OS"] or st[last] in ["A", "O"]) \ + and st[pos - 1:pos + 3] == 'ALLE'): + nxt = ('L', ' ', 2) + else: + nxt = ('L', 2) + else: + nxt = ('L', 1) + elif ch == 'M': + if (st[pos + 1:pos + 4] == 'UMB' \ + and (pos + 1 == last or st[pos + 2:pos + 4] == 'ER')) \ + or st[pos + 1] == 'M': + nxt = ('M', 2) + else: + nxt = ('M', 1) + elif ch == 'N': + if st[pos + 1] == 'N': + nxt = ('N', 2) + else: + nxt = ('N', 1) + elif ch == u'\xd1': # UTF-8 encoding of ト + nxt = ('N', 1) + elif ch == 'P': + if st[pos + 1] == 'H': + nxt = ('F', 2) + elif st[pos + 1] in ['P', 'B']: # also account for "campbell", "raspberry" + nxt = ('P', 2) + else: + nxt = ('P', 1) + elif ch == 'Q': + if st[pos + 1] == 'Q': + nxt = ('K', 2) + else: + nxt = ('K', 1) + elif ch == 'R': + # french e.g. 'rogier', but exclude 'hochmeier' + if pos == last and not is_slavo_germanic \ + and st[pos - 2:pos] == 'IE' and st[pos - 4:pos - 2] not in ['ME', 'MA']: + nxt = ('', 'R') + else: + nxt = ('R', ) + if st[pos + 1] == 'R': + nxt = nxt + (2, ) + else: + nxt = nxt + (1, ) + elif ch == 'S': + # special cases 'island', 'isle', 'carlisle', 'carlysle' + if st[pos - 1:pos + 2] in ['ISL', 'YSL']: + nxt = (None, 1) + # special case 'sugar-' + elif pos == first and st[first:first + 5] == 'SUGAR': + nxt = ('X', 'S', 1) + elif st[pos:pos + 2] == 'SH': + # germanic + if st[pos + 1:pos + 5] in ["HEIM", "HOEK", "HOLM", "HOLZ"]: + nxt = ('S', 2) + else: + nxt = ('X', 2) + # italian & armenian + elif st[pos:pos + 3] in ["SIO", "SIA"] or st[pos:pos + 4] == 'SIAN': + if not is_slavo_germanic: + nxt = ('S', 'X', 3) + else: + nxt = ('S', 3) + # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' + # also, -sz- in slavic language altho in hungarian it is pronounced 's' + elif (pos == first and st[pos + 1] in ["M", "N", "L", "W"]) or st[pos + 1] == 'Z': + nxt = ('S', 'X') + if st[pos + 1] == 'Z': + nxt = nxt + (2, ) + else: + nxt = nxt + (1, ) + elif st[pos:pos + 2] == 'SC': + # Schlesinger's rule + if st[pos + 2] == 'H': + # dutch origin, e.g. 'school', 'schooner' + if st[pos + 3:pos + 5] in ["OO", "ER", "EN", "UY", "ED", "EM"]: + # 'schermerhorn', 'schenker' + if st[pos + 3:pos + 5] in ['ER', 'EN']: + nxt = ('X', 'SK', 3) + else: + nxt = ('SK', 3) + else: + if pos == first and st[first + 3] not in vowels and st[first + 3] != 'W': + nxt = ('X', 'S', 3) + else: + nxt = ('X', 3) + elif st[pos + 2] in ['I', 'E', 'Y']: + nxt = ('S', 3) + else: + nxt = ('SK', 3) + # french e.g. 'resnais', 'artois' + elif pos == last and st[pos - 2:pos] in ['AI', 'OI']: + nxt = ('', 'S', 1) + else: + nxt = ('S', ) + if st[pos + 1] in ['S', 'Z']: + nxt = nxt + (2, ) + else: + nxt = nxt + (1, ) + elif ch == 'T': + if st[pos:pos + 4] == 'TION': + nxt = ('X', 3) + elif st[pos:pos + 3] in ['TIA', 'TCH']: + nxt = ('X', 3) + elif st[pos:pos + 2] == 'TH' or st[pos:pos + 3] == 'TTH': + # special case 'thomas', 'thames' or germanic + if st[pos + 2:pos + 4] in ['OM', 'AM'] or st[first:first + 4] in ['VON ', 'VAN '] \ + or st[first:first + 3] == 'SCH': + nxt = ('T', 2) + else: + nxt = ('0', 'T', 2) + elif st[pos + 1] in ['T', 'D']: + nxt = ('T', 2) + else: + nxt = ('T', 1) + elif ch == 'V': + if st[pos + 1] == 'V': + nxt = ('F', 2) + else: + nxt = ('F', 1) + elif ch == 'W': + # can also be in middle of word + if st[pos:pos + 2] == 'WR': + nxt = ('R', 2) + elif pos == first and (st[pos + 1] in vowels or st[pos:pos + 2] == 'WH'): + # Wasserman should match Vasserman + if st[pos + 1] in vowels: + nxt = ('A', 'F', 1) + else: + nxt = ('A', 1) + # Arnow should match Arnoff + elif (pos == last and st[pos - 1] in vowels) \ + or st[pos - 1:pos + 4] in ["EWSKI", "EWSKY", "OWSKI", "OWSKY"] \ + or st[first:first + 3] == 'SCH': + nxt = ('', 'F', 1) + # polish e.g. 'filipowicz' + elif st[pos:pos + 4] in ["WICZ", "WITZ"]: + nxt = ('TS', 'FX', 4) + else: # default is to skip it + nxt = (None, 1) + elif ch == 'X': + # french e.g. breaux + nxt = (None, ) + if not(pos == last and (st[pos - 3:pos] in ["IAU", "EAU"] \ + or st[pos - 2:pos] in ['AU', 'OU'])): + nxt = ('KS', ) + if st[pos + 1] in ['C', 'X']: + nxt = nxt + (2, ) + else: + nxt = nxt + (1, ) + elif ch == 'Z': + # chinese pinyin e.g. 'zhao' + if st[pos + 1] == 'H': + nxt = ('J', ) + elif st[pos + 1:pos + 3] in ["ZO", "ZI", "ZA"] \ + or (is_slavo_germanic and pos > first and st[pos - 1] != 'T'): + nxt = ('S', 'TS') + else: + nxt = ('S', ) + if st[pos + 1] == 'Z' or st[pos + 1] == 'H': + nxt = nxt + (2, ) + else: + nxt = nxt + (1, ) + # ---------------------------------- + # --- end checking letters------ + # ---------------------------------- + #print str(nxt) + if len(nxt) == 2: + if nxt[0]: + pri += nxt[0] + sec += nxt[0] + pos += nxt[1] + elif len(nxt) == 3: + if nxt[0]: + pri += nxt[0] + if nxt[1]: + sec += nxt[1] + pos += nxt[2] + if pri == sec: + return (pri, '') + else: + return (pri, sec) + +if __name__ == '__main__': + import doctest + doctest.testmod() diff --git a/nltk_trainer/featx/phonetics.py b/nltk_trainer/featx/phonetics.py new file mode 100644 index 0000000..e7752c5 --- /dev/null +++ b/nltk_trainer/featx/phonetics.py @@ -0,0 +1,599 @@ +# ---------------------------------------------------------- +# AdvaS Advanced Search +# module for phonetic algorithms +# +# (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany +# email fh@efho.de +# ---------------------------------------------------------- + +# changed 2005-01-24 + +import string +import re + +def soundex (term): + "Return the soundex value to a string argument." + + # Create and compare soundex codes of English words. + # + # Soundex is an algorithm that hashes English strings into + # alpha-numerical value that represents what the word sounds + # like. For more information on soundex and some notes on the + # differences in implemenations visit: + # http://www.bluepoof.com/Soundex/info.html + # + # This version modified by Nathan Heagy at Front Logic Inc., to be + # compatible with php's soundexing and much faster. + # + # eAndroid / Nathan Heagy / Jul 29 2000 + # changes by Frank Hofmann / Jan 02 2005 + + # generate translation table only once. used to translate into soundex numbers + #table = string.maketrans('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '0123012002245501262301020201230120022455012623010202') + table = string.maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012623010202') + + # check parameter + if not term: + return "0000" # could be Z000 for compatibility with other implementations + # end if + + # convert into uppercase letters + term = string.upper(term) + first_char = term[0] + + # translate the string into soundex code according to the table above + term = string.translate(term[1:], table) + + # remove all 0s + term = string.replace(term, "0", "") + + # remove duplicate numbers in-a-row + str2 = first_char + for x in term: + if x != str2[-1]: + str2 = str2 + x + # end if + # end for + + # pad with zeros + str2 = str2+"0"*len(str2) + + # take the first four letters + return_value = str2[:4] + + # return value + return return_value + +def metaphone (term): + "returns metaphone code for a given string" + + # implementation of the original algorithm from Lawrence Philips + # extended/rewritten by M. Kuhn + # improvements with thanks to John Machin + + # define return value + code = "" + + i = 0 + term_length = len(term) + + if (term_length == 0): + # empty string ? + return code + # end if + + # extension #1 (added 2005-01-28) + # convert to lowercase + term = string.lower(term) + + # extension #2 (added 2005-01-28) + # remove all non-english characters, first + term = re.sub(r'[^a-z]', '', term) + if len(term) == 0: + # nothing left + return code + # end if + + # extension #3 (added 2005-01-24) + # conflate repeated letters + firstChar = term[0] + str2 = firstChar + for x in term: + if x != str2[-1]: + str2 = str2 + x + # end if + # end for + + # extension #4 (added 2005-01-24) + # remove any vowels unless a vowel is the first letter + firstChar = str2[0] + str3 = firstChar + for x in str2[1:]: + if (re.search(r'[^aeiou]', x)): + str3 = str3 + x + # end if + # end for + + term = str3 + term_length = len(term) + if term_length == 0: + # nothing left + return code + # end if + + # check for exceptions + if (term_length > 1): + # get first two characters + first_chars = term[0:2] + + # build translation table + table = { + "ae":"e", + "gn":"n", + "kn":"n", + "pn":"n", + "wr":"n", + "wh":"w" + } + + if first_chars in table.keys(): + term = term[2:] + code = table[first_chars] + term_length = len(term) + # end if + + elif (term[0] == "x"): + term = "" + code = "s" + term_length = 0 + # end if + + # define standard translation table + st_trans = { + "b":"b", + "c":"k", + "d":"t", + "g":"k", + "h":"h", + "k":"k", + "p":"p", + "q":"k", + "s":"s", + "t":"t", + "v":"f", + "w":"w", + "x":"ks", + "y":"y", + "z":"s" + } + + i = 0 + while (i0): + part_c_2 = term[i-1:i+1] + part_c_3 = term[i-1:i+2] + # end if + # end if + + if (i < (term_length - 2)): + part_n_3 = term[i:i+3] + # end if + + if (i < (term_length - 3)): + part_n_4 = term[i:i+4] + # end if + + # use table with conditions for translations + if (term[i] == "b"): + add_char = st_trans["b"] + if (i == (term_length - 1)): + if (i>0): + if (term[i-1] == "m"): + add_char = "" + # end if + # end if + # end if + elif (term[i] == "c"): + add_char = st_trans["c"] + if (part_n_2 == "ch"): + add_char = "x" + elif (re.search(r'c[iey]', part_n_2)): + add_char = "s" + # end if + + if (part_n_3 == "cia"): + add_char = "x" + # end if + + if (re.search(r'sc[iey]', part_c_3)): + add_char = "" + # end if + + elif (term[i] == "d"): + add_char = st_trans["d"] + if (re.search(r'dg[eyi]', part_n_3)): + add_char = "j" + # end if + + elif (term[i] == "g"): + add_char = st_trans["g"] + + if (part_n_2 == "gh"): + if (i == (term_length - 2)): + add_char = "" + # end if + elif (re.search(r'gh[aeiouy]', part_n_3)): + add_char = "" + elif (part_n_2 == "gn"): + add_char = "" + elif (part_n_4 == "gned"): + add_char = "" + elif (re.search(r'dg[eyi]',part_c_3)): + add_char = "" + elif (part_n_2 == "gi"): + if (part_c_3 != "ggi"): + add_char = "j" + # end if + elif (part_n_2 == "ge"): + if (part_c_3 != "gge"): + add_char = "j" + # end if + elif (part_n_2 == "gy"): + if (part_c_3 != "ggy"): + add_char = "j" + # end if + elif (part_n_2 == "gg"): + add_char = "" + # end if + elif (term[i] == "h"): + add_char = st_trans["h"] + if (re.search(r'[aeiouy]h[^aeiouy]', part_c_3)): + add_char = "" + elif (re.search(r'[csptg]h', part_c_2)): + add_char = "" + # end if + elif (term[i] == "k"): + add_char = st_trans["k"] + if (part_c_2 == "ck"): + add_char = "" + # end if + elif (term[i] == "p"): + add_char = st_trans["p"] + if (part_n_2 == "ph"): + add_char = "f" + # end if + elif (term[i] == "q"): + add_char = st_trans["q"] + elif (term[i] == "s"): + add_char = st_trans["s"] + if (part_n_2 == "sh"): + add_char = "x" + # end if + + if (re.search(r'si[ao]', part_n_3)): + add_char = "x" + # end if + elif (term[i] == "t"): + add_char = st_trans["t"] + if (part_n_2 == "th"): + add_char = "0" + # end if + + if (re.search(r'ti[ao]', part_n_3)): + add_char = "x" + # end if + elif (term[i] == "v"): + add_char = st_trans["v"] + elif (term[i] == "w"): + add_char = st_trans["w"] + if (re.search(r'w[^aeiouy]', part_n_2)): + add_char = "" + # end if + elif (term[i] == "x"): + add_char = st_trans["x"] + elif (term[i] == "y"): + add_char = st_trans["y"] + elif (term[i] == "z"): + add_char = st_trans["z"] + else: + # alternative + add_char = term[i] + # end if + + code = code + add_char + i += 1 + # end while + + # return metaphone code + return code + +def nysiis (term): + "returns New York State Identification and Intelligence Algorithm (NYSIIS) code for the given term" + + code = "" + + i = 0 + term_length = len(term) + + if (term_length == 0): + # empty string ? + return code + # end if + + # build translation table for the first characters + table = { + "mac":"mcc", + "ph":"ff", + "kn":"nn", + "pf":"ff", + "k":"c", + "sch":"sss" + } + + for table_entry in table.keys(): + table_value = table[table_entry] # get table value + table_value_len = len(table_value) # calculate its length + first_chars = term[0:table_value_len] + if (first_chars == table_entry): + term = table_value + term[table_value_len:] + break + # end if + # end for + + # build translation table for the last characters + table = { + "ee":"y", + "ie":"y", + "dt":"d", + "rt":"d", + "rd":"d", + "nt":"d", + "nd":"d", + } + + for table_entry in table.keys(): + table_value = table[table_entry] # get table value + table_entry_len = len(table_entry) # calculate its length + last_chars = term[(0 - table_entry_len):] + #print last_chars, ", ", table_entry, ", ", table_value + if (last_chars == table_entry): + term = term[:(0 - table_value_len + 1)] + table_value + break + # end if + # end for + + # initialize code + code = term + + # transform ev->af + code = re.sub(r'ev', r'af', code) + + # transform a,e,i,o,u->a + code = re.sub(r'[aeiouy]', r'a', code) + + # transform q->g + code = re.sub(r'q', r'g', code) + + # transform z->s + code = re.sub(r'z', r's', code) + + # transform m->n + code = re.sub(r'm', r'n', code) + + # transform kn->n + code = re.sub(r'kn', r'n', code) + + # transform k->c + code = re.sub(r'k', r'c', code) + + # transform sch->sss + code = re.sub(r'sch', r'sss', code) + + # transform ph->ff + code = re.sub(r'ph', r'ff', code) + + # transform h-> if previous or next is nonvowel -> previous + occur = re.findall(r'([a-z]{0,1}?)h([a-z]{0,1}?)', code) + #print occur + for occur_group in occur: + occur_item_previous = occur_group[0] + occur_item_next = occur_group[1] + + if ((re.match(r'[^aeiouy]', occur_item_previous)) or (re.match(r'[^aeiouy]', occur_item_next))): + if (occur_item_previous != ""): + # make substitution + code = re.sub (occur_item_previous + "h", occur_item_previous * 2, code, 1) + # end if + # end if + # end for + + # transform w-> if previous is vowel -> previous + occur = re.findall(r'([aeiouy]{1}?)w', code) + #print occur + for occur_group in occur: + occur_item_previous = occur_group[0] + # make substitution + code = re.sub (occur_item_previous + "w", occur_item_previous * 2, code, 1) + # end for + + # check last character + # -s, remove + code = re.sub (r's$', r'', code) + # -ay, replace by -y + code = re.sub (r'ay$', r'y', code) + # -a, remove + code = re.sub (r'a$', r'', code) + + # return nysiis code + return code + +def caverphone (term): + "returns the language key using the caverphone algorithm 2.0" + + # Developed at the University of Otago, New Zealand. + # Project: Caversham Project (http://caversham.otago.ac.nz) + # Developer: David Hood, University of Otago, New Zealand + # Contact: caversham@otago.ac.nz + # Project Technical Paper: http://caversham.otago.ac.nz/files/working/ctp150804.pdf + # Version 2.0 (2004-08-15) + + code = "" + + i = 0 + term_length = len(term) + + if (term_length == 0): + # empty string ? + return code + # end if + + # convert to lowercase + code = string.lower(term) + + # remove anything not in the standard alphabet (a-z) + code = re.sub(r'[^a-z]', '', code) + + # remove final e + if code.endswith("e"): + code = code[:-1] + + # if the name starts with cough, rough, tough, enough or trough -> cou2f (rou2f, tou2f, enou2f, trough) + code = re.sub(r'^([crt]|(en)|(tr))ough', r'\1ou2f', code) + + # if the name starts with gn -> 2n + code = re.sub(r'^gn', r'2n', code) + + # if the name ends with mb -> m2 + code = re.sub(r'mb$', r'm2', code) + + # replace cq -> 2q + code = re.sub(r'cq', r'2q', code) + + # replace c[i,e,y] -> s[i,e,y] + code = re.sub(r'c([iey])', r's\1', code) + + # replace tch -> 2ch + code = re.sub(r'tch', r'2ch', code) + + # replace c,q,x -> k + code = re.sub(r'[cqx]', r'k', code) + + # replace v -> f + code = re.sub(r'v', r'f', code) + + # replace dg -> 2g + code = re.sub(r'dg', r'2g', code) + + # replace ti[o,a] -> si[o,a] + code = re.sub(r'ti([oa])', r'si\1', code) + + # replace d -> t + code = re.sub(r'd', r't', code) + + # replace ph -> fh + code = re.sub(r'ph', r'fh', code) + + # replace b -> p + code = re.sub(r'b', r'p', code) + + # replace sh -> s2 + code = re.sub(r'sh', r's2', code) + + # replace z -> s + code = re.sub(r'z', r's', code) + + # replace initial vowel [aeiou] -> A + code = re.sub(r'^[aeiou]', r'A', code) + + # replace all other vowels [aeiou] -> 3 + code = re.sub(r'[aeiou]', r'3', code) + + # replace j -> y + code = re.sub(r'j', r'y', code) + + # replace an initial y3 -> Y3 + code = re.sub(r'^y3', r'Y3', code) + + # replace an initial y -> A + code = re.sub(r'^y', r'A', code) + + # replace y -> 3 + code = re.sub(r'y', r'3', code) + + # replace 3gh3 -> 3kh3 + code = re.sub(r'3gh3', r'3kh3', code) + + # replace gh -> 22 + code = re.sub(r'gh', r'22', code) + + # replace g -> k + code = re.sub(r'g', r'k', code) + + # replace groups of s,t,p,k,f,m,n by its single, upper-case equivalent + for single_letter in ["s", "t", "p", "k", "f", "m", "n"]: + otherParts = re.split(single_letter + "+", code) + code = string.join(otherParts, string.upper(single_letter)) + + # replace w[3,h3] by W[3,h3] + code = re.sub(r'w(h?3)', r'W\1', code) + + # replace final w with 3 + code = re.sub(r'w$', r'3', code) + + # replace w -> 2 + code = re.sub(r'w', r'2', code) + + # replace h at the beginning with an A + code = re.sub(r'^h', r'A', code) + + # replace all other occurrences of h with a 2 + code = re.sub(r'h', r'2', code) + + # replace r3 with R3 + code = re.sub(r'r3', r'R3', code) + + # replace final r -> 3 + code = re.sub(r'r$', r'3', code) + + # replace r with 2 + code = re.sub(r'r', r'2', code) + + # replace l3 with L3 + code = re.sub(r'l3', r'L3', code) + + # replace final l -> 3 + code = re.sub(r'l$', r'3', code) + + # replace l with 2 + code = re.sub(r'l', r'2', code) + + # remove all 2's + code = re.sub(r'2', r'', code) + + # replace the final 3 -> A + code = re.sub(r'3$', r'A', code) + + # remove all 3's + code = re.sub(r'3', r'', code) + + # extend the code by 10 '1' (one) + code += '1' * 10 + + # take the first 10 characters + caverphoneCode = code[:10] + + # return caverphone code + return caverphoneCode + diff --git a/nltk_trainer/tagging/taggers.py b/nltk_trainer/tagging/taggers.py new file mode 100644 index 0000000..844ff2b --- /dev/null +++ b/nltk_trainer/tagging/taggers.py @@ -0,0 +1,47 @@ +from nltk.tag.sequential import SequentialBackoffTagger +from nltk.probability import FreqDist +from nltk.tag import ClassifierBasedPOSTagger +from nltk_trainer.featx import phonetics +from nltk_trainer.featx.metaphone import dm + +class PhoneticClassifierBasedPOSTagger(ClassifierBasedPOSTagger): + def __init__(self, double_metaphone=False, metaphone=False, soundex=False, nysiis=False, caverphone=False, *args, **kwargs): + self.funs = {} + + if double_metaphone: + self.funs['double-metaphone'] = lambda s: dm(unicode(s)) + + if metaphone: + self.funs['metaphone'] = phonetics.metaphone + + if soundex: + self.funs['soundex'] = phonetics.soundex + + if nysiis: + self.funs['nysiis'] = phonetics.nysiis + + if caverphone: + self.funs['caverphone'] = phonetics.caverphone + # for some reason don't get self.funs if this is done first, but works if done last + ClassifierBasedPOSTagger.__init__(self, *args, **kwargs) + + def feature_detector(self, tokens, index, history): + feats = ClassifierBasedPOSTagger.feature_detector(self, tokens, index, history) + s = tokens[index] + + for key, fun in self.funs.iteritems(): + feats[key] = fun(s) + + return feats + +class MaxVoteBackoffTagger(SequentialBackoffTagger): + def __init__(self, *taggers): + self._taggers = taggers + + def choose_tag(self, tokens, index, history): + tags = FreqDist() + + for tagger in self._taggers: + tags.inc(tagger.choose_tag(tokens, index, history)) + + return tags.max() \ No newline at end of file diff --git a/train_tagger.py b/train_tagger.py index ad7d354..838377f 100644 --- a/train_tagger.py +++ b/train_tagger.py @@ -9,6 +9,7 @@ from nltk.tag.simplify import simplify_wsj_tag from nltk_trainer.tagging.readers import NumberedTaggedSentCorpusReader from nltk_trainer.tagging.training import train_brill_tagger +from nltk_trainer.tagging.taggers import PhoneticClassifierBasedPOSTagger ######################################## ## command options & argument parsing ## @@ -54,15 +55,6 @@ You can use this option multiple times to create multiple AffixTaggers with different affixes. The affixes will be used in the order given.''') -classifier_group = parser.add_argument_group('Classifier Based Tagger') -classifier_group.add_argument('--classifier', default=None, - choices=['NaiveBayes', 'DecisionTree', 'Maxent'] + MaxentClassifier.ALGORITHMS, - help='''ClassifierBasedPOSTagger algorithm to use, default is %(default)s. -Maxent uses the default Maxent training algorithm, either CG or iis.''') -classifier_group.add_argument('--cutoff_prob', default=0, type=float, - help='Cutoff probability for classifier tagger to backoff to previous tagger') -# TODO: phonetic feature options - brill_group = parser.add_argument_group('Brill Tagger Options') brill_group.add_argument('--brill', action='store_true', default=False, help='Train a Brill Tagger in front of the other tagger.') @@ -72,11 +64,25 @@ brill_group.add_argument('--max_rules', type=int, default=200) brill_group.add_argument('--min_score', type=int, default=2) -eval_group = parser.add_argument_group('Tagger Evaluation', - 'Evaluation metrics for part-of-speech taggers') -eval_group.add_argument('--no-eval', action='store_true', default=False, - help="don't do any evaluation") -# TODO: are there any metrics other than accuracy? +classifier_group = parser.add_argument_group('Classifier Based Tagger') +classifier_group.add_argument('--classifier', default=None, + choices=['NaiveBayes', 'DecisionTree', 'Maxent'] + MaxentClassifier.ALGORITHMS, + help='''ClassifierBasedPOSTagger algorithm to use, default is %(default)s. +Maxent uses the default Maxent training algorithm, either CG or iis.''') +classifier_group.add_argument('--cutoff_prob', default=0, type=float, + help='Cutoff probability for classifier tagger to backoff to previous tagger') + +phonetic_group = parser.add_argument_group('Phonetic Feature Options for a Classifier Based Tagger') +phonetic_group.add_argument('--metaphone', action='store_true', + default=False, help='Use metaphone feature') +phonetic_group.add_argument('--double-metaphone', action='store_true', + default=False, help='Use double metaphone feature') +phonetic_group.add_argument('--soundex', action='store_true', + default=False, help='Use soundex feature') +phonetic_group.add_argument('--nysiis', action='store_true', + default=False, help='Use NYSIIS feature') +phonetic_group.add_argument('--caverphone', action='store_true', + default=False, help='Use caverphone feature') maxent_group = parser.add_argument_group('Maxent Classifier Tagger', 'These options only apply when a Maxent classifier is chosen.') @@ -97,6 +103,12 @@ decisiontree_group.add_argument('--support_cutoff', default=10, type=int, help='default is 10') +eval_group = parser.add_argument_group('Tagger Evaluation', + 'Evaluation metrics for part-of-speech taggers') +eval_group.add_argument('--no-eval', action='store_true', default=False, + help="don't do any evaluation") +# TODO: word coverage of test words, how many get a tag != '-NONE-' + args = parser.parse_args() ################### @@ -242,15 +254,31 @@ def f(train_sents, backoff=None): classifier_train_kwargs['trace'] = args.trace if args.classifier: - if args.trace: - print 'training a %s ClassifierBasedPOSTagger' % args.classifier - def classifier_builder(train_feats): return classifier_train(train_feats, **classifier_train_kwargs) - tagger = ClassifierBasedPOSTagger(train=train_sents, verbose=args.trace, - backoff=tagger, cutoff_prob=args.cutoff_prob, - classifier_builder=classifier_builder) + kwargs = { + 'train': train_sents, + 'verbose': args.trace, + 'backoff': tagger, + 'cutoff_prob': args.cutoff_prob, + 'classifier_builder': classifier_builder + } + + phonetic_keys = ['metaphone', 'double_metaphone', 'soundex', 'nysiis', 'caverphone'] + + if any([getattr(args, key) for key in phonetic_keys]): + cls = PhoneticClassifierBasedPOSTagger + + for key in phonetic_keys: + kwargs[key] = getattr(args, key) + else: + cls = ClassifierBasedPOSTagger + + if args.trace: + print 'training a %s %s' % (args.classifier, cls.__name__) + + tagger = cls(**kwargs) ################## ## brill tagger ##