
tag_phrases basically working with ChunkedCorpusWriter

1 parent e2cea15 commit 6e8ae185d19ee4f7924581edd560ee7adb13a8b1 @japerk committed Feb 12, 2012
6 classify_corpus.py 100644 → 100755
@@ -68,6 +68,12 @@
source_corpus = load_corpus_reader(args.source_corpus, args.reader)
+if not source_corpus:
+    raise ValueError('%s is an unknown corpus' % args.source_corpus)
+
+if args.trace:
+    print 'loaded %s' % args.source_corpus
+
########################
## text normalization ##
########################
8 nltk_trainer/__init__.py
@@ -27,7 +27,7 @@ def import_attr(path):
    mod = __import__(basepath, globals(), locals(), [name])
    return getattr(mod, name)
-def load_corpus_reader(corpus, reader=None, fileids=None, **kwargs):
+def load_corpus_reader(corpus, reader=None, fileids=None, sent_tokenizer=None, word_tokenizer=None, **kwargs):
    if corpus == 'timit':
        return LazyCorpusLoader('timit', NumberedTaggedSentCorpusReader,
            '.+\.tags', tag_mapping_function=simplify_wsj_tag)
@@ -54,6 +54,12 @@ def load_corpus_reader(corpus, reader=None, fileids=None, **kwargs):
    except LookupError:
        raise ValueError('cannot find corpus path for %s' % corpus)
+
+    if sent_tokenizer and isinstance(sent_tokenizer, basestring):
+        kwargs['sent_tokenizer'] = nltk.data.load(sent_tokenizer)
+
+    if word_tokenizer and isinstance(word_tokenizer, basestring):
+        kwargs['word_tokenizer'] = import_attr(word_tokenizer)()
+
    reader_cls = import_attr(reader)
    real_corpus = reader_cls(root, fileids, **kwargs)
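With this change, load_corpus_reader accepts tokenizer specs as plain strings: a sent_tokenizer pickle path is resolved through nltk.data.load, and a word_tokenizer module path is imported and instantiated. A minimal call-site sketch ('my_corpus' is a made-up name; any plaintext corpus under an nltk_data directory would do):

from nltk_trainer import load_corpus_reader

corpus = load_corpus_reader('my_corpus',
    reader='nltk.corpus.reader.plaintext.PlaintextCorpusReader',
    sent_tokenizer='tokenizers/punkt/english.pickle',
    word_tokenizer='nltk.tokenize.WordPunctTokenizer')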
57 nltk_trainer/writer/__init__.py
@@ -0,0 +1,57 @@
+import codecs, collections, os, os.path
+
+class CorpusWriter(object):
+    def __init__(self, fileids, path='~/nltk_data/corpora', mode='a', encoding='utf-8', trace=1):
+        assert fileids and path and mode
+        self.mode = mode
+        self.encoding = encoding
+        self.trace = trace or 0
+        self.full_path = os.path.expanduser(path)
+
+        # create any missing directories for the given fileids
+        for dirname in set([os.path.dirname(fileid) for fileid in fileids]):
+            dirpath = os.path.join(self.full_path, dirname)
+
+            if not os.path.exists(dirpath):
+                if trace:
+                    print 'making directory %s' % dirpath
+
+                os.makedirs(dirpath)
+
+        self.fileids = [os.path.join(self.full_path, fileid) for fileid in fileids]
+        self.files = {}
+
+    def get_file(self, fileid):
+        if not fileid.startswith(self.full_path):
+            fileid = os.path.join(self.full_path, fileid)
+
+        # open lazily and cache the handle
+        if fileid not in self.files:
+            self.files[fileid] = codecs.open(fileid, self.mode, self.encoding)
+
+        return self.files[fileid]
+
+    def open(self):
+        for fileid in self.fileids:
+            if self.trace:
+                print 'opening %s' % fileid
+
+            self.get_file(fileid)
+
+        return self
+
+    def close(self, *args, **kwargs):
+        # items() returns a list copy in Python 2, so deleting keys
+        # while iterating is safe here
+        for fileid, f in self.files.items():
+            if self.trace:
+                print 'closing %s' % fileid
+
+            f.close()
+            del self.files[fileid]
+
+    __enter__ = open
+    __exit__ = close
+    __del__ = close
+
+    def write(self, s, fileid=None):
+        if not fileid:
+            fileid = self.fileids[0]
+
+        self.get_file(fileid).write(s)
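CorpusWriter expands the base path, creates any missing fileid directories, keeps one codecs-wrapped handle per file, and doubles as a context manager through the __enter__/__exit__ aliases. A minimal usage sketch, assuming ~/nltk_data/corpora is writable (the fileids are made up):

from nltk_trainer.writer import CorpusWriter

with CorpusWriter(fileids=['demo/one.txt', 'demo/two.txt']) as writer:
    writer.write(u'goes to the first fileid by default\n')
    writer.write(u'explicit fileid\n', fileid='demo/two.txt')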
44 nltk_trainer/writer/chunked.py
@@ -0,0 +1,44 @@
+from nltk.tag.util import tuple2str
+from nltk_trainer.writer import CorpusWriter
+
+class ChunkedCorpusWriter(CorpusWriter):
+    def chunked_sent_string(self, sent):
+        parts = []
+
+        for word, tag in sent:
+            try:
+                brack = word in u'[]'
+            except TypeError:
+                brack = False
+
+            if brack:
+                # brackets don't get a tag
+                parts.append(word)
+            else:
+                # make sure no brackets or slashes in tag
+                tag = tag.replace(u'[', u'(').replace(u']', u')').replace(u'/', u'|')
+                parts.append(tuple2str((word, tag)))
+
+        return ' '.join(parts)
+
+    def write_sents(self, sents, *args, **kwargs):
+        first = True
+
+        for sent in sents:
+            if not first:
+                self.write(' ', *args, **kwargs)
+            else:
+                first = False
+
+            self.write(self.chunked_sent_string(sent), *args, **kwargs)
+
+    def write_paras(self, paras, *args, **kwargs):
+        first = True
+
+        for para in paras:
+            if not first:
+                self.write('\n\n', *args, **kwargs)
+            else:
+                first = False
+
+            self.write_sents(para, *args, **kwargs)
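chunked_sent_string renders a tagged sentence with tuple2str, letting bare square brackets through untagged so they can mark chunk boundaries. A quick sketch of the expected output (the word/tag pairs are invented; constructing the writer touches the default ~/nltk_data/corpora path):

writer = ChunkedCorpusWriter(fileids=['demo/tagged.txt'])
sent = [(u'[', None), (u'the', u'DT'), (u'dog', u'NN'), (u']', None), (u'ran', u'VBD')]
print writer.chunked_sent_string(sent)
# prints: [ the/DT dog/NN ] ran/VBD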
15 nltk_trainer/writer/classified.py
@@ -0,0 +1,15 @@
+import codecs, os.path
+from nltk_trainer.writer import CorpusWriter
+
+class ClassifiedCorpusWriter(CorpusWriter):
+    def __init__(self, path, labels):
+        self.path = path
+        self.labels = labels
+    # TODO: make sure works with with keyword
+    def __enter__(self):
+        # one append-mode file per label; CorpusWriter.open() takes no
+        # arguments, so open the files directly with codecs
+        self._files = dict([(l, codecs.open(os.path.join(self.path, l), 'a', 'utf-8')) for l in self.labels])
+        return self
+
+    def __exit__(self, *args):
+        for f in self._files.values():
+            f.close()
+
+    def write(self, text, label):
+        self._files[label].write(text + u'\n\n')
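A sketch of the with-statement usage flagged in the TODO (the path and labels are invented; the target directory must already exist):

with ClassifiedCorpusWriter('/tmp/movie_demo', ['pos', 'neg']) as writer:
    writer.write(u'a wonderful film', 'pos')
    writer.write(u'a terrible film', 'neg')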
77 tag_phrases.py
@@ -0,0 +1,77 @@
+import argparse, os.path
+import cPickle as pickle
+import nltk.data, nltk.tag
+from nltk_trainer import load_corpus_reader
+from nltk_trainer.writer.chunked import ChunkedCorpusWriter
+
+########################################
+## command options & argument parsing ##
+########################################
+
+# TODO: many of the args are shared with analyze_classifier_coverage, so abstract
+
+parser = argparse.ArgumentParser(description='Tag a plaintext corpus and save the result as a chunked corpus')
+# TODO: make sure source_corpus can be a single file
+parser.add_argument('source_corpus', help='corpus name/path relative to an nltk_data directory')
+parser.add_argument('target_corpus', help='corpus name/path relative to an nltk_data directory')
+parser.add_argument('--trace', default=1, type=int,
+    help='How much trace output you want, defaults to 1. 0 is no trace output.')
+parser.add_argument('--tagger', default=nltk.tag._POS_TAGGER,
+    help='''pickled tagger filename/path relative to an nltk_data directory
+default is NLTK's default tagger''')
+
+# TODO: from analyze_tagged_corpus.py
+corpus_group = parser.add_argument_group('Corpus Reader Options')
+corpus_group.add_argument('--reader',
+    default='nltk.corpus.reader.plaintext.PlaintextCorpusReader',
+    help='Full module path to a corpus reader class, defaults to %(default)s.')
+corpus_group.add_argument('--fileids', default=None,
+    help='Specify fileids to load from corpus')
+corpus_group.add_argument('--sent-tokenizer', default='tokenizers/punkt/english.pickle',
+    help='Path to pickled sentence tokenizer')
+corpus_group.add_argument('--word-tokenizer', default='nltk.tokenize.WordPunctTokenizer',
+    help='Full module path to a tokenizer class, defaults to %(default)s.')
+
+args = parser.parse_args()
+
+###################
+## corpus reader ##
+###################
+
+source_corpus = load_corpus_reader(args.source_corpus, reader=args.reader,
+    fileids=args.fileids, encoding='utf-8', sent_tokenizer=args.sent_tokenizer,
+    word_tokenizer=args.word_tokenizer)
+
+if not source_corpus:
+    raise ValueError('%s is an unknown corpus' % args.source_corpus)
+
+if args.trace:
+    print 'loaded %s' % args.source_corpus
+
+############
+## tagger ##
+############
+
+# TODO: from analyze_tagger_coverage.py
+if args.trace:
+    print 'loading tagger %s' % args.tagger
+
+try:
+    tagger = nltk.data.load(args.tagger)
+except LookupError:
+    # not a registered nltk_data resource, assume a pickle file path
+    tagger = pickle.load(open(os.path.expanduser(args.tagger), 'rb'))
+
+#############
+## tagging ##
+#############
+
+with ChunkedCorpusWriter(fileids=source_corpus.fileids(), path=args.target_corpus) as writer:
+    for fileid in source_corpus.fileids():
+        paras = source_corpus.paras(fileids=[fileid])
+        tagged_paras = ((tagger.tag(sent) for sent in para) for para in paras)
+        writer.write_paras(tagged_paras, fileid=fileid)
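An illustrative invocation, with made-up corpus names: `python tag_phrases.py abc_corpus ~/nltk_data/corpora/abc_chunked` loads abc_corpus with the default plaintext reader, tags every sentence with NLTK's default tagger, and mirrors each source fileid as a chunked-corpus file under the expanded target path.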
0 translate_corpus.py 100644 → 100755
File mode changed.
