Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 599 lines (482 sloc) 13.172 kb
76d538a @japerk phonetic hashing for classifier tagger features
authored
1 # ----------------------------------------------------------
2 # AdvaS Advanced Search
3 # module for phonetic algorithms
4 #
5 # (C) 2002 - 2005 Frank Hofmann, Chemnitz, Germany
6 # email fh@efho.de
7 # ----------------------------------------------------------
8
9 # changed 2005-01-24
10
11 import string
12 import re
13
def soundex(term):
    """Return the soundex value of a string argument.

    Soundex hashes English strings into a short alpha-numerical value that
    represents what the word sounds like.  For more information on soundex
    and some notes on the differences between implementations visit:
    http://www.bluepoof.com/Soundex/info.html

    This version was modified by Nathan Heagy at Front Logic Inc.
    (eAndroid / Nathan Heagy / Jul 29 2000), with changes by
    Frank Hofmann / Jan 02 2005.

    The result is always four characters long: the upper-cased first
    character of ``term`` followed by up to three digits, padded with "0".
    An empty (or otherwise false) ``term`` yields "0000".

    Note that zero-coded letters are dropped *before* repeated digits are
    collapsed, so equal codes separated only by such letters are merged.
    """
    # letter -> soundex digit; A, E, H, I, O, U, W and Y are coded "0"
    digit_for = dict(zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                         '01230120022455012623010202'))

    # check parameter
    if not term:
        return "0000"  # could be Z000 for compatibility with other implementations

    # the first character is kept verbatim (upper-cased), the rest is coded
    term = term.upper()
    code = term[0]

    for char in term[1:]:
        # characters outside A-Z have no code and pass through unchanged
        digit = digit_for.get(char, char)

        # skip the zeros, and skip a symbol equal to the one just emitted
        if digit != "0" and digit != code[-1]:
            code = code + digit

    # pad with zeros up to four characters, then cut to exactly four
    return (code + "000")[:4]
65
def metaphone(term):
    """Return the metaphone code for a given string.

    Implementation of the original algorithm from Lawrence Philips,
    extended/rewritten by M. Kuhn, with improvements thanks to
    John Machin <sjmachin@lexicon.net>.

    Before the translation rules run, the term is lower-cased, stripped of
    every character outside a-z, runs of identical letters are conflated,
    and all vowels (a, e, i, o, u) except a leading one are removed.  As a
    consequence the rule patterns below that look for a vowel after a
    consonant can, in practice, only be triggered by "y".

    The returned code is lower-case; "0" is emitted for "th".  An empty
    string is returned when the term contains no letters a-z.
    """
    # define return value
    code = ""

    if not term:
        # empty string ?
        return code

    # extension #1 (added 2005-01-28)
    # convert to lowercase
    term = term.lower()

    # extension #2 (added 2005-01-28)
    # remove all non-english characters, first
    term = re.sub(r'[^a-z]', '', term)
    if not term:
        # nothing left
        return code

    # extension #3 (added 2005-01-24)
    # conflate repeated letters
    conflated = term[0]
    for char in term[1:]:
        if char != conflated[-1]:
            conflated = conflated + char

    # extension #4 (added 2005-01-24)
    # remove any vowels unless a vowel is the first letter
    term = conflated[0] + "".join(
        [char for char in conflated[1:] if char not in "aeiou"])
    term_length = len(term)

    # check for exceptions at the start of the word
    if term_length > 1:
        initial_exceptions = {
            "ae": "e",
            "gn": "n",
            "kn": "n",
            "pn": "n",
            "wr": "r",  # silent "w": the word starts with the "r" sound
            "wh": "w",
        }

        first_chars = term[0:2]
        if first_chars in initial_exceptions:
            code = initial_exceptions[first_chars]
            term = term[2:]
            term_length = len(term)
    elif term[0] == "x":
        # the whole term is the single letter "x"
        term = ""
        code = "s"
        term_length = 0

    # define standard translation table
    st_trans = {
        "b": "b",
        "c": "k",
        "d": "t",
        "g": "k",
        "h": "h",
        "k": "k",
        "p": "p",
        "q": "k",
        "s": "s",
        "t": "t",
        "v": "f",
        "w": "w",
        "x": "ks",
        "y": "y",
        "z": "s",
    }

    i = 0
    while i < term_length:
        char = term[i]

        # init character to add, init basic patterns
        # part_n_*: the next 2/3/4 characters starting at position i
        # part_c_*: 2/3 characters starting one position before i
        add_char = ""
        part_n_2 = ""
        part_n_3 = ""
        part_n_4 = ""
        part_c_2 = ""
        part_c_3 = ""

        # extract a number of patterns, if possible
        if i < (term_length - 1):
            part_n_2 = term[i:i + 2]

        if i > 0:
            # the look-behind pair only needs a preceding character, so it
            # must also be available on the last letter of the term
            part_c_2 = term[i - 1:i + 1]
            if i < (term_length - 1):
                part_c_3 = term[i - 1:i + 2]

        if i < (term_length - 2):
            part_n_3 = term[i:i + 3]

        if i < (term_length - 3):
            part_n_4 = term[i:i + 4]

        # use table with conditions for translations
        if char == "b":
            add_char = st_trans["b"]
            # a final "b" after "m" is silent
            if i == (term_length - 1) and i > 0 and term[i - 1] == "m":
                add_char = ""
        elif char == "c":
            add_char = st_trans["c"]
            if part_n_2 == "ch":
                add_char = "x"
            elif re.search(r'c[iey]', part_n_2):
                add_char = "s"

            if part_n_3 == "cia":
                add_char = "x"

            if re.search(r'sc[iey]', part_c_3):
                add_char = ""
        elif char == "d":
            add_char = st_trans["d"]
            if re.search(r'dg[eyi]', part_n_3):
                add_char = "j"
        elif char == "g":
            add_char = st_trans["g"]

            if part_n_2 == "gh":
                if i == (term_length - 2):
                    add_char = ""
            elif re.search(r'gh[aeiouy]', part_n_3):
                add_char = ""
            elif part_n_2 == "gn":
                add_char = ""
            elif part_n_4 == "gned":
                add_char = ""
            elif re.search(r'dg[eyi]', part_c_3):
                add_char = ""
            elif part_n_2 == "gi":
                if part_c_3 != "ggi":
                    add_char = "j"
            elif part_n_2 == "ge":
                if part_c_3 != "gge":
                    add_char = "j"
            elif part_n_2 == "gy":
                if part_c_3 != "ggy":
                    add_char = "j"
            elif part_n_2 == "gg":
                add_char = ""
        elif char == "h":
            add_char = st_trans["h"]
            if re.search(r'[aeiouy]h[^aeiouy]', part_c_3):
                add_char = ""
            elif re.search(r'[csptg]h', part_c_2):
                add_char = ""
        elif char == "k":
            add_char = st_trans["k"]
            if part_c_2 == "ck":
                add_char = ""
        elif char == "p":
            add_char = st_trans["p"]
            if part_n_2 == "ph":
                add_char = "f"
        elif char == "s":
            add_char = st_trans["s"]
            if part_n_2 == "sh":
                add_char = "x"

            if re.search(r'si[ao]', part_n_3):
                add_char = "x"
        elif char == "t":
            add_char = st_trans["t"]
            if part_n_2 == "th":
                add_char = "0"

            if re.search(r'ti[ao]', part_n_3):
                add_char = "x"
        elif char == "w":
            add_char = st_trans["w"]
            if re.search(r'w[^aeiouy]', part_n_2):
                add_char = ""
        elif char in ("q", "v", "x", "y", "z"):
            # unconditional translations
            add_char = st_trans[char]
        else:
            # alternative: every other letter stands for itself
            add_char = char

        code = code + add_char
        i += 1

    # return metaphone code
    return code
320
def nysiis(term):
    """Return the New York State Identification and Intelligence Algorithm
    (NYSIIS) code for the given term.

    The term is lower-cased first, because every rule below is written for
    lower-case letters.  Then one prefix rule and one suffix rule are
    applied (the first that matches, in the order listed), followed by a
    fixed sequence of letter transformations and a clean-up of the last
    character.  An empty term yields an empty string.
    """
    if not term:
        # empty string ?
        return ""

    term = term.lower()

    # translation table for the first characters; kept as an ordered
    # sequence so that "kn" is always tried before the shorter "k"
    prefix_rules = (
        ("mac", "mcc"),
        ("ph", "ff"),
        ("kn", "nn"),
        ("pf", "ff"),
        ("k", "c"),
        ("sch", "sss"),
    )
    for prefix, replacement in prefix_rules:
        if term.startswith(prefix):
            term = replacement + term[len(prefix):]
            break

    # translation table for the last characters
    suffix_rules = (
        ("ee", "y"),
        ("ie", "y"),
        ("dt", "d"),
        ("rt", "d"),
        ("rd", "d"),
        ("nt", "d"),
        ("nd", "d"),
    )
    for suffix, replacement in suffix_rules:
        if term.endswith(suffix):
            # cut off exactly the matched suffix before appending
            term = term[:-len(suffix)] + replacement
            break

    # initialize code
    code = term

    # transform ev->af
    code = re.sub(r'ev', r'af', code)

    # transform a,e,i,o,u,y->a
    code = re.sub(r'[aeiouy]', r'a', code)

    # transform q->g
    code = re.sub(r'q', r'g', code)

    # transform z->s
    code = re.sub(r'z', r's', code)

    # transform m->n
    code = re.sub(r'm', r'n', code)

    # transform kn->n
    code = re.sub(r'kn', r'n', code)

    # transform k->c
    code = re.sub(r'k', r'c', code)

    # transform sch->sss
    code = re.sub(r'sch', r'sss', code)

    # transform ph->ff
    code = re.sub(r'ph', r'ff', code)

    # transform h-> if previous or next is nonvowel -> previous
    # NOTE: both groups are lazy, so the trailing group always captures the
    # empty string; effectively only the preceding character is inspected.
    occur = re.findall(r'([a-z]{0,1}?)h([a-z]{0,1}?)', code)
    for occur_group in occur:
        occur_item_previous = occur_group[0]
        occur_item_next = occur_group[1]

        if ((re.match(r'[^aeiouy]', occur_item_previous))
                or (re.match(r'[^aeiouy]', occur_item_next))):
            if occur_item_previous != "":
                # make substitution (first remaining occurrence only)
                code = re.sub(occur_item_previous + "h",
                              occur_item_previous * 2, code, count=1)

    # transform w-> if previous is vowel -> previous
    occur = re.findall(r'([aeiouy]{1}?)w', code)
    for occur_group in occur:
        occur_item_previous = occur_group[0]
        # make substitution (first remaining occurrence only)
        code = re.sub(occur_item_previous + "w",
                      occur_item_previous * 2, code, count=1)

    # check last character
    # -s, remove
    code = re.sub(r's$', r'', code)
    # -ay, replace by -y
    code = re.sub(r'ay$', r'y', code)
    # -a, remove
    code = re.sub(r'a$', r'', code)

    # return nysiis code
    return code
440
def caverphone(term):
    """Return the language key using the caverphone algorithm 2.0.

    Developed at the University of Otago, New Zealand.
    Project: Caversham Project (http://caversham.otago.ac.nz)
    Developer: David Hood, University of Otago, New Zealand
    Contact: caversham@otago.ac.nz
    Project Technical Paper:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf
    Version 2.0 (2004-08-15)

    The result is always ten characters long (padded with "1"), except for
    an empty ``term``, which yields an empty string.
    """
    if not term:
        # empty string ?
        return ""

    # convert to lowercase
    code = term.lower()

    # remove anything not in the standard alphabet (a-z)
    code = re.sub(r'[^a-z]', '', code)

    # remove final e
    if code.endswith("e"):
        code = code[:-1]

    # (pattern, replacement) pairs; they are applied strictly in this
    # order, each one on the result of the previous one
    rules = (
        # name starts with cough, rough, tough, enough or trough
        (r'^([crt]|(en)|(tr))ough', r'\1ou2f'),
        # name starts with gn
        (r'^gn', r'2n'),
        # name ends with mb
        (r'mb$', r'm2'),
        (r'cq', r'2q'),
        (r'c([iey])', r's\1'),
        (r'tch', r'2ch'),
        (r'[cqx]', r'k'),
        (r'v', r'f'),
        (r'dg', r'2g'),
        (r'ti([oa])', r'si\1'),
        (r'd', r't'),
        (r'ph', r'fh'),
        (r'b', r'p'),
        (r'sh', r's2'),
        (r'z', r's'),
        # initial vowel -> A, all other vowels -> 3
        (r'^[aeiou]', r'A'),
        (r'[aeiou]', r'3'),
        (r'j', r'y'),
        (r'^y3', r'Y3'),
        (r'^y', r'A'),
        (r'y', r'3'),
        (r'3gh3', r'3kh3'),
        (r'gh', r'22'),
        (r'g', r'k'),
        # groups of s,t,p,k,f,m,n -> single, upper-case equivalent
        (r's+', r'S'),
        (r't+', r'T'),
        (r'p+', r'P'),
        (r'k+', r'K'),
        (r'f+', r'F'),
        (r'm+', r'M'),
        (r'n+', r'N'),
        # w[3,h3] -> W[3,h3], final w -> 3, any other w -> 2
        (r'w(h?3)', r'W\1'),
        (r'w$', r'3'),
        (r'w', r'2'),
        # h at the beginning -> A, any other h -> 2
        (r'^h', r'A'),
        (r'h', r'2'),
        # r3 -> R3, final r -> 3, any other r -> 2
        (r'r3', r'R3'),
        (r'r$', r'3'),
        (r'r', r'2'),
        # l3 -> L3, final l -> 3, any other l -> 2
        (r'l3', r'L3'),
        (r'l$', r'3'),
        (r'l', r'2'),
        # remove all 2's
        (r'2', r''),
        # final 3 -> A, then remove all remaining 3's
        (r'3$', r'A'),
        (r'3', r''),
    )
    for pattern, replacement in rules:
        code = re.sub(pattern, replacement, code)

    # extend the code by 10 '1' (one) and take the first 10 characters
    return (code + '1' * 10)[:10]
598
Something went wrong with that request. Please try again.