
* updated changelog

* fixed lots of doctest-related issues


svn/trunk@8784
commit 4b802fc7a1093aa6f412fa39525ac90acda127f0 (parent: 392f099)
Steven Bird (stevenbird) authored
ChangeLog (14 changes)
@@ -1,4 +1,4 @@
-Version 2.0.1 2011-04-??
+Version 2.0.1 (rc1) 2011-04-11
 
 NLTK:
 * added interface to the Stanford POS Tagger
@@ -12,7 +12,7 @@ NLTK:
 * fixed issue with NLTK's tokenize module colliding with the Python tokenize module
 * fixed issue with stemming Unicode strings
 * changed ViterbiParser.nbest_parse to parse
-* KNBC Japanese corpus reader
+* ChaSen and KNBC Japanese corpus readers
 * preserve case in concordance display
 * fixed bug in simplification of Brown tags
 * a version of IBM Model 1 as described in Koehn 2010
@@ -28,9 +28,15 @@ NLTK:
 * simplifications and corrections of Earley Chart Parser rules
 * several changes to the feature chart parsers for correct unification
 * bugfixes: FreqDist.plot, FreqDist.max, NgramModel.entropy, CategorizedCorpusReader, DecisionTreeClassifier
+* removal of Python >2.4 language features for 2.4 compatibility
+* removal of deprecated functions and associated warnings
+* added semantic domains to wordnet corpus reader
+* changed wordnet similarity functions to include instance hyponyms
+* updated to use latest version of Boxer
 
 Data:
-* Japanese corpora...
+* JEITA Public Morphologically Tagged Corpus (in ChaSen format)
+* KNB Annotated corpus of Japanese blog posts
 * Fixed some minor bugs in alvey.fcfg, and added number of parse trees in alvey_sentences.txt
 * added more comtrans data
 
@@ -39,7 +45,7 @@ Documentation:
 * NLTK Japanese book (chapter 12) by Masato Hagiwara
 
 NLTK-Contrib:
-* Contribute a version of the Viethen and Dale referring expression algorithms
+* Viethen and Dale referring expression algorithms
 
 Thanks to the following contributors to 2.0.1 (since 2.0b9, July 2010)
 Yonatan Becker, Steven Bethard, David Coles, Dan Garrette,
nltk/corpus/reader/bracket_parse.py (7 changes)
@@ -8,7 +8,7 @@
 
 import sys
 
-from nltk.tree import bracket_parse, Tree
+from nltk.tree import Tree
 
 from util import *
 from api import *
@@ -75,14 +75,15 @@ def _normalize(self, t):
 
     def _parse(self, t):
         try:
-            return bracket_parse(self._normalize(t))
+            return Tree.parse(self._normalize(t))
+
         except ValueError, e:
             sys.stderr.write("Bad tree detected; trying to recover...\n")
             # Try to recover, if we can:
             if e.args == ('mismatched parens',):
                 for n in range(1, 5):
                     try:
-                        v = bracket_parse(self._normalize(t+')'*n))
+                        v = Tree.parse(self._normalize(t+')'*n))
                         sys.stderr.write(" Recovered by adding %d close "
                                          "paren(s)\n" % n)
                         return v
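The change above replaces the deprecated top-level bracket_parse() helper with the Tree.parse() class method. A minimal sketch of the replacement call (NLTK 2.x, Python 2):

    # Sketch only: Tree.parse() reads a bracketed (Penn Treebank-style)
    # string and returns a Tree, just as bracket_parse() used to.
    from nltk.tree import Tree

    t = Tree.parse("(S (NP (DT the) (NN dog)) (VP (VBD barked)))")
    print t.node      # 'S' -- in NLTK 2.x the label is the .node attribute
    print t.leaves()  # ['the', 'dog', 'barked']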
nltk/sem/chat80.py (6 changes)
@@ -403,7 +403,7 @@ def cities2table(filename, rel_name, dbname, verbose=False, setup=False):
         cur.close()
     except ImportError:
         import warnings
-        warnings.warn("To run this function, first install pysqlite.")
+        warnings.warn("To run this function, first install pysqlite, or else use Python 2.5 or later.")
 
 def sql_query(dbname, query):
     """
@@ -423,7 +423,7 @@ def sql_query(dbname, query):
         return cur.execute(query)
     except ImportError:
         import warnings
-        warnings.warn("To run this function, first install pysqlite.")
+        warnings.warn("To run this function, first install pysqlite, or else use Python 2.5 or later.")
         raise
 
 def _str2records(filename, rel):
@@ -780,7 +780,7 @@ def sql_demo():
         print row
     except ImportError:
         import warnings
-        warnings.warn("To run the SQL demo, first install pysqlite.")
+        warnings.warn("To run the SQL demo, first install pysqlite, or else use Python 2.5 or later.")
 
 
 if __name__ == '__main__':
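The new warning text points at the stdlib alternative: sqlite3 ships with Python 2.5 and later, while older interpreters need the external pysqlite package. A hedged sketch of that import-fallback pattern (illustrative only, not the exact chat80.py code):

    # Try the stdlib module first, then fall back to the external package.
    try:
        import sqlite3                                 # stdlib, Python 2.5+
    except ImportError:
        try:
            from pysqlite2 import dbapi2 as sqlite3    # external pysqlite
        except ImportError:
            import warnings
            warnings.warn("To run this function, first install pysqlite, "
                          "or else use Python 2.5 or later.")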
nltk/test/ccg.doctest (8 changes)
@@ -196,12 +196,12 @@ Note that while the two derivations are different, they are semantically equivalent.
 (((S\NP)/NP)\.,((S\NP)/NP))
 -----------------------------------------------------------------------<
 ((S\NP)/NP)
+------------------------------------------------------------------------------->B
+((S\NP)/N)
 ------------------------------------->
 (N\.,N)
 ------------------------------------------------<
 N
--------------------------------------------------------->
-NP
 ------------------------------------------------------------------------------------------------------------------------------->
 (S\NP)
 -----------------------------------------------------------------------------------------------------------------------------------<
@@ -216,12 +216,12 @@ Note that while the two derivations are different, they are semantically equivalent.
 (((S\NP)/NP)\.,((S\NP)/NP))
 -----------------------------------------------------------------------<
 ((S\NP)/NP)
-------------------------------------------------------------------------------->B
-((S\NP)/N)
 ------------------------------------->
 (N\.,N)
 ------------------------------------------------<
 N
+-------------------------------------------------------->
+NP
 ------------------------------------------------------------------------------------------------------------------------------->
 (S\NP)
 -----------------------------------------------------------------------------------------------------------------------------------<
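The corrected expected output above comes from NLTK's CCG chart parser, which can print several derivations for one sentence. A hedged sketch of how such derivations are produced (NLTK 2.x API; the toy lexicon here is illustrative, not the one ccg.doctest uses):

    from nltk.ccg import chart, lexicon

    # Declare primitive categories, then one lexical entry per word.
    lex = lexicon.parseLexicon('''
        :- S, NP, N
        the => NP/N
        dog => N
        sleeps => S\\NP
        ''')
    parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
    for parse in parser.nbest_parse('the dog sleeps'.split()):
        chart.printCCGDerivation(parse)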
nltk/test/chat80.doctest (3 changes)
@@ -199,9 +199,8 @@ to SQL:
 
 Given this grammar, we can express, and then execute, queries in English.
 
-    >>> from nltk.parse import load_earley
     >>> from string import join
-    >>> cp = load_earley('grammars/book_grammars/sql0.fcfg')
+    >>> cp = nltk.data.load('grammars/book_grammars/sql0.fcfg')
     >>> query = 'What cities are in China'
     >>> trees = cp.nbest_parse(query.split())
    >>> answer = trees[0].node['SEM']
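The removed load_earley() helper is superseded by the generic nltk.data.load(), which reads the .fcfg resource and returns an object whose nbest_parse() the doctest then calls. A condensed sketch of the updated idiom, using the same resource and calls as the diff above:

    import nltk

    cp = nltk.data.load('grammars/book_grammars/sql0.fcfg')
    trees = cp.nbest_parse('What cities are in China'.split())
    answer = trees[0].node['SEM']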
nltk/test/probability.doctest (2 changes)
@@ -65,7 +65,7 @@ from the whole corpus, not just the training corpus
     >>> symbols = list(set([word for sent in corpus for (word,tag) in sent]))
     >>> print len(symbols)
     1464
-    >>> trainer = nltk.HiddenMarkovModelTrainer(tag_set, symbols)
+    >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
 
 We divide the corpus into 90% training and 10% testing
 
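The fix above reflects that the HMM trainer is addressed through the nltk.tag package rather than the top-level nltk namespace. A hedged sketch of the equivalent direct import (tag_set and symbols as defined in the doctest; the training call is only indicative):

    from nltk.tag import HiddenMarkovModelTrainer

    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    # hmm = trainer.train_supervised(train_sents)  # train_sents: the 90% split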
nltk/test/tree.doctest (22 changes)
@@ -158,26 +158,26 @@ then it simply delegates to `Tree.parse()`.
 
 Trees can be compared for equality:
 
-    >>> tree == bracket_parse(str(tree))
+    >>> tree == Tree.parse(str(tree))
     True
-    >>> tree2 == bracket_parse(str(tree2))
+    >>> tree2 == Tree.parse(str(tree2))
     True
     >>> tree == tree2
     False
-    >>> tree == bracket_parse(str(tree2))
+    >>> tree == Tree.parse(str(tree2))
     False
-    >>> tree2 == bracket_parse(str(tree))
+    >>> tree2 == Tree.parse(str(tree))
     False
 
-    >>> tree != bracket_parse(str(tree))
+    >>> tree != Tree.parse(str(tree))
     False
-    >>> tree2 != bracket_parse(str(tree2))
+    >>> tree2 != Tree.parse(str(tree2))
     False
     >>> tree != tree2
     True
-    >>> tree != bracket_parse(str(tree2))
+    >>> tree != Tree.parse(str(tree2))
     True
-    >>> tree2 != bracket_parse(str(tree))
+    >>> tree2 != Tree.parse(str(tree))
     True
 
     >>> tree < tree2 or tree > tree2
@@ -567,7 +567,7 @@ variable:
 Define a helper function to create new parented trees:
 
     >>> def make_ptree(s):
-    ...     ptree = ParentedTree.convert(bracket_parse(s))
+    ...     ptree = ParentedTree.convert(Tree.parse(s))
     ...     all_ptrees.extend(t for t in ptree.subtrees()
     ...                       if isinstance(t, Tree))
     ...     return ptree
@@ -838,7 +838,7 @@ variable:
 Define a helper function to create new parented trees:
 
     >>> def make_mptree(s):
-    ...     mptree = MultiParentedTree.convert(bracket_parse(s))
+    ...     mptree = MultiParentedTree.convert(Tree.parse(s))
     ...     all_mptrees.extend(t for t in mptree.subtrees()
     ...                        if isinstance(t, Tree))
     ...     return mptree
@@ -1126,6 +1126,6 @@ This used to cause an infinite loop (fixed in svn 6269):
 
 This used to discard the ``(B b)`` subtree (fixed in svn 6270):
 
-    >>> print bracket_parse('((A a) (B b))')
+    >>> print Tree.parse('((A a) (B b))')
     ( (A a) (B b))
 
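For context on the make_ptree/make_mptree helpers updated above: ParentedTree.convert() (and its MultiParentedTree counterpart) copies a plain Tree into a subclass that maintains parent pointers. A hedged sketch, assuming NLTK 2.x's property-based API:

    from nltk.tree import Tree, ParentedTree

    ptree = ParentedTree.convert(Tree.parse('(S (NP (D the) (N dog)) (VP (V barked)))'))
    np = ptree[0]
    print np.parent        # the S node (a property in NLTK 2.x)
    print np.parent_index  # 0: NP is S's first child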
nltk/test/treetransforms.doctest (2 changes)
@@ -11,7 +11,7 @@ Unit tests for the TreeTransformation class
 
     >>> sentence = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"
 
-    >>> tree = bracket_parse(sentence)
+    >>> tree = Tree.parse(sentence)
     >>> print tree
     (TOP
       (S
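This doctest exercises NLTK's in-place tree transformations on the parsed sentence. A hedged sketch of the kind of calls involved (NLTK 2.x; the small tree is illustrative):

    from nltk.tree import Tree

    tree = Tree.parse('(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
    tree.collapse_unary(collapsePOS=False)  # merge unary chains in place
    tree.chomsky_normal_form(horzMarkov=2)  # binarize with horizontal Markovization
    print tree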
nltk/test/wordnet.doctest (46 changes)
@@ -171,13 +171,13 @@ The old behavior can be achieved by setting simulate_root to be False.
 A score of 1 represents identity, i.e. comparing a sense with itself
 will return 1.
 
-    >>> dog.path_similarity(cat)
+    >>> dog.path_similarity(cat) # doctest: +ELLIPSIS
     0.2...
 
-    >>> hit.path_similarity(slap)
+    >>> hit.path_similarity(slap) # doctest: +ELLIPSIS
     0.142...
 
-    >>> wn.path_similarity(hit, slap)
+    >>> wn.path_similarity(hit, slap) # doctest: +ELLIPSIS
     0.142...
 
     >>> print hit.path_similarity(slap, simulate_root=False)
@@ -194,13 +194,13 @@ of the taxonomy in which the senses occur. The relationship is given
 as -log(p/2d) where p is the shortest path length and d the taxonomy
 depth.
 
-    >>> dog.lch_similarity(cat)
+    >>> dog.lch_similarity(cat) # doctest: +ELLIPSIS
     2.028...
 
-    >>> hit.lch_similarity(slap)
+    >>> hit.lch_similarity(slap) # doctest: +ELLIPSIS
     1.312...
 
-    >>> wn.lch_similarity(hit, slap)
+    >>> wn.lch_similarity(hit, slap) # doctest: +ELLIPSIS
     1.312...
 
     >>> print hit.lch_similarity(slap, simulate_root=False)
@@ -225,7 +225,7 @@ shortest path to the root node is the longest will be selected. Where
 the LCS has multiple paths to the root, the longer path is used for
 the purposes of the calculation.
 
-    >>> dog.wup_similarity(cat)
+    >>> dog.wup_similarity(cat) # doctest: +ELLIPSIS
     0.857...
 
     >>> hit.wup_similarity(slap)
@@ -263,9 +263,9 @@ information content, the result is dependent on the corpus used to
 generate the information content and the specifics of how the
 information content was created.
 
-    >>> dog.res_similarity(cat, brown_ic)
+    >>> dog.res_similarity(cat, brown_ic) # doctest: +ELLIPSIS
     7.911...
-    >>> dog.res_similarity(cat, genesis_ic)
+    >>> dog.res_similarity(cat, genesis_ic) # doctest: +ELLIPSIS
     7.204...
 
 ``synset1.jcn_similarity(synset2, ic):``
@@ -275,9 +275,9 @@ Information Content (IC) of the Least Common Subsumer (most specific
 ancestor node) and that of the two input Synsets. The relationship is
 given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).
 
-    >>> dog.jcn_similarity(cat, brown_ic)
+    >>> dog.jcn_similarity(cat, brown_ic) # doctest: +ELLIPSIS
     0.449...
-    >>> dog.jcn_similarity(cat, genesis_ic)
+    >>> dog.jcn_similarity(cat, genesis_ic) # doctest: +ELLIPSIS
     0.285...
 
 ``synset1.lin_similarity(synset2, ic):``
@@ -287,7 +287,7 @@ Information Content (IC) of the Least Common Subsumer (most specific
 ancestor node) and that of the two input Synsets. The relationship is
 given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
 
-    >>> dog.lin_similarity(cat, semcor_ic)
+    >>> dog.lin_similarity(cat, semcor_ic) # doctest: +ELLIPSIS
     0.886...
 
 
@@ -405,7 +405,7 @@ Bug 160: wup_similarity breaks when the two synsets have no common hypernym
 
     >>> t = wn.synsets('picasso')[0]
     >>> m = wn.synsets('male')[1]
-    >>> t.wup_similarity(m)
+    >>> t.wup_similarity(m) # doctest: +ELLIPSIS
     0.631...
 
     >>> t = wn.synsets('titan')[1]
@@ -418,14 +418,14 @@ Bug 21: "instance of" not included in LCS (very similar to bug 160)
     >>> a = wn.synsets("writings")[0]
     >>> b = wn.synsets("scripture")[0]
     >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
-    >>> a.jcn_similarity(b, brown_ic)
+    >>> a.jcn_similarity(b, brown_ic) # doctest: +ELLIPSIS
     0.175...
 
 Bug 221: Verb root IC is zero
 
     >>> from nltk.corpus.reader.wordnet import information_content
     >>> s = wn.synsets('say', wn.VERB)[0]
-    >>> information_content(s, brown_ic)
+    >>> information_content(s, brown_ic) # doctest: +ELLIPSIS
     4.623...
 
 Bug 161: Comparison between WN keys/lemmas should not be case sensitive
@@ -451,7 +451,7 @@ Bug 382: JCN Division by zero error
     >>> shlep = wn.synset('shlep.v.02')
     >>> from nltk.corpus import wordnet_ic
     >>> brown_ic = wordnet_ic.ic('ic-brown.dat')
-    >>> tow.jcn_similarity(shlep, brown_ic)
+    >>> tow.jcn_similarity(shlep, brown_ic) # doctest: +ELLIPSIS
     1...e+300
 
 Bug 428: Depth is zero for instance nouns
@@ -473,7 +473,7 @@ Bug 470: shortest_path_distance ignored instance hypernyms
 
     >>> google = wordnet.synsets("google")[0]
     >>> earth = wordnet.synsets("earth")[0]
-    >>> google.wup_similarity(earth)
+    >>> google.wup_similarity(earth) # doctest: +ELLIPSIS
     0.1...
 
 Bug 484: similarity metrics returned -1 instead of None for no LCS
@@ -505,17 +505,17 @@ Bug 482: Some nouns not being lemmatised by WordNetLemmatizer().lemmatize
 
 Bug 284: instance hypernyms not used in similarity calculations
 
-    >>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01'))
+    >>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01')) # doctest: +ELLIPSIS
     1.335...
-    >>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01'))
+    >>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01')) # doctest: +ELLIPSIS
     0.571...
-    >>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic)
+    >>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS
     2.224...
-    >>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic)
+    >>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS
     0.075...
-    >>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic)
+    >>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS
     0.252...
-    >>> wn.synset('john.n.02').hypernym_paths()
+    >>> wn.synset('john.n.02').hypernym_paths() # doctest: +ELLIPSIS
     [[Synset('entity.n.01'), ..., Synset('john.n.02')]]
 
 Issue 541: add domains to wordnet
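Most of the edits in this file simply add the doctest ELLIPSIS directive: with it enabled, "..." in an expected value matches any trailing text, so floating-point similarity scores compare stably across platforms. A minimal self-contained illustration (hypothetical function, not from NLTK):

    def third(x):
        """
        >>> third(0.428571428571)  # doctest: +ELLIPSIS
        0.142...
        """
        return x / 3

    if __name__ == '__main__':
        import doctest
        doctest.testmod()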
