In [55]:
"""
Averaged perceptron classifier. Implementation geared for simplicity rather than
efficiency.
"""
from collections import defaultdict
import pickle
import random


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features, dont_allow):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        
        for feat, value in features.items():
            
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        sort_by_score = lambda d: (d[1], d)
        
        first_found=False
        maxClass = "None"
        maxScore = 0
    
        secondMaxClass = "None"
        secondMaxScore = 0
        
        for label, score in sorted(scores.iteritems(), key=sort_by_score, reverse=True):
            if(label != dont_allow and not first_found):
                maxClass = label
                maxScore = score
                first_found=True
            elif(label != dont_allow and first_found):
                secondMaxClass = label
                secondMaxScore = score
                break
      
        return maxClass, maxScore-secondMaxScore

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        return pickle.dump(dict(self.weights), open(path, 'w'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path))
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            scores = model.predict(features)
            guess, score = max(scores.items(), key=lambda i: i[1])
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model

In [56]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import random
from collections import defaultdict
import pickle
import logging

from textblob.tokenizers import WordTokenizer, SentenceTokenizer
from textblob.exceptions import MissingCorpusError


PICKLE = "trontagger-0.1.0.pickle"


class PerceptronTagger(AveragedPerceptron):

    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    #AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=False):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            self.load(self.AP_MODEL_LOC)

    def tag(self, corpus, tokenize=True, dont_allow=None):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
        w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()
        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            sentence_tags = []
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = None#self.tagdict.get(word)
                confidence = 30
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag, confidence = self.model.predict(features, dont_allow)
                sentence_tags.append((word, tag, confidence))
                prev2 = prev
                prev = tag
            tokens.append(sentence_tags)
        return tokens

    def train(self, sentences, save_loc=None, nr_iter=5, dont_allow=None):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.
        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        "Hi train"
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        prev, prev2 = self.START
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                context = self.START + [self._normalize(w) for w in words] \
                                                                    + self.END
                for i, word in enumerate(words):
                    guess = None # self.tagdict.get(word)
                    confidence = 30
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess, confidence = self.model.predict(feats, dont_allow)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = ("Missing trontagger.pickle file.")
            raise MissingCorpusError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.
        - All words are lower cased
        - Digits in the range 1800-2100 are represented as !YEAR;
        - Other digits are represented as !DIGITS
        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features

    def _make_tagdict(self, sentences):
        
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag


def _pc(n, d):
    return (float(n) / d) * 100

In [24]:
def convert_corpus_to_sentence_list(corpus):
    sentence_list=[]
    for sentence in corpus.split("\n"):
        sentence_list.append(sentence.split(" "))
    return sentence_list

def convert_sentence_list_no_tags_to_corpus(sentence_list):
    return "".join(" ".join(x) for x in sentence_list)
    
def convert_tagged_to_train_format(tagged_sent_list):
    train_list = []
    for sent in tagged_sent_list:
        words=[]
        tags=[]
        for tup in sent:
            words.append(tup[0])
            tags.append(tup[1])
        train_list.append((words,tags))
    return train_list
    

In [25]:
#### get training set from UD
def load_tagged_sentences(file_name):
    sentences_w_tags = []
    count = 0
    words=[]
    tags=[]
    for line in open(trainFile):
    
        vals = line.split('\t')
        if (len(vals) > 1):
            words.append(vals[1])
            tags.append(vals[3])
        else:
            sentences_w_tags.append((words, tags))
            words=[]
            tags=[]
    return sentences_w_tags # [ (["word", "word", "word"], ["tag", "tag", "tag"]), next sentece...]


In [26]:
#args sentences_with_tags = [ (["word", "word", "word"], ["tag", "tag", "tag"]), next sentece...]
def train_tagger(tagger, sentences_with_tags, num_iters=5):
    tagger.train(sentences_with_tags, nr_iter=num_iters)

In [27]:
# return arg1 sentences with word/tokens seperated by a " " and sentences seperated by "\n" 
# return arg2 word with tag tuple list
def get_test_corpus(file_name):
    corpus=""
    words=[]
    test_correct_tags=[]
    sentence_tags = []
    for line in open(file_name):

        vals = line.split('\t')
        if (len(vals) > 1):
            words.append(vals[1])
            sentence_tags.append((vals[1],vals[3]))
        else:
            words.append("\n")
            test_correct_tags.append(sentence_tags)
            sentence_tags = []


    corpus = " ".join(words)
    return corpus, test_correct_tags


In [28]:
#expects corpus in the same form as get test corpus returns as arg1
# returns list ["word", "tag", float_confidence]
def tag_tagger(tagger, corpus, dont_allow=None):
    return tagger.tag(corpus, False, dont_allow)

In [29]:
import statistics as s
import copy

#todo get accuracy of tags above certain min_confidence_threshold
def analyze_tags(guess_tags, correct_tags, show_full=False, sort_key=lambda ((key_right,key_wrong), value): value):
    correct_tag_type ={}
    wrong_tag_type = {}
    
    conf_right = []
    conf_wrong = []
    
    total_tags = 0
    total_wrong_tags = 0
    
    total_sentences = len(guess_tags)
    total_wrong_sentences = 0
    
    for sent_num, correct_sentece in enumerate(correct_tags):
        perfect_sentece = True
        
        for word_idx, word_tag_tuple in enumerate(correct_sentece):
            test_tag_output_guess_tuple = guess_tags[sent_num][word_idx]
            word = test_tag_output_guess_tuple[0]
            tag_guess = test_tag_output_guess_tuple[1]
            guess_confidence = test_tag_output_guess_tuple[2]
            total_tags +=1
            
            if(word_tag_tuple[1] != tag_guess):
                total_wrong_tags +=1
                conf_wrong.append(guess_confidence)
                perfect_sentece = False
                error_tuple = (word_tag_tuple[1], tag_guess)
                wrong_tag_type[error_tuple] = wrong_tag_type.get(error_tuple, 0) + 1
            else:
                correct_tag_type[tag_guess] = correct_tag_type.get(tag_guess, 0) + 1
                conf_right.append(guess_confidence)
                
        if not perfect_sentece:
            total_wrong_sentences+= 1
    
    #todo need to redo statistical outputs        
    print "average confidence of right = " + str(s.mean(conf_right))
    print "average confidence of wrong = " + str(s.mean(conf_wrong))
    print "stdev confidence of right = " + str(s.stdev(conf_right))
    print "stdev confidence of wrong = " + str(s.stdev(conf_wrong))
    word_acc = (100.00*(total_tags-total_wrong_tags))/total_tags
    
    if(show_full):
        for tag_tup, count in sorted(wrong_tag_type.iteritems(),key=sort_key):
            print "correct:\t"+tag_tup[0]+"\tincorrect:\t"+tag_tup[1]+"\tcount:\t"+str(count)
    sentence_acc = (100.00*(total_sentences-total_wrong_sentences))/total_sentences

    
    print "token accuracy: " + str(word_acc) + "%"
    print "sentence accuracy: " + str(sentence_acc) + "%"



In [30]:
def get_alignment_info(source_file, tgt_file, align_file, num_matches=1000):
    sentence_word_mappings =[]
    orig_sentences = []
    target_sentences= []
    total=0
    matches=0

    from itertools import izip

    with open(align_file) as align, open(source_file) as orig, open(tgt_file) as tgt: 
        for x, y, z in izip(align, orig, tgt):
        
            pairings = []
            for pair in x.split(" "):
                indexs = pair.split("-")
                if(indexs[0] == "" or indexs[1] == ""):
                    continue
                pairings.append((int(indexs[0]), int(indexs[1])))
            src_tokens = y.split(" ")
            tgt_tokens = z.split(" ")
            
            if (not filter_alignments(src_tokens, tgt_tokens, pairings)):
                sentence_word_mappings.append(pairings)
                orig_sentences.append(src_tokens)
                target_sentences.append(tgt_tokens)
                matches+=1
         
            total +=1
            if matches>num_matches:
                break
    print "match percentage: " + str((100.0*matches)/total)
    return orig_sentences, target_sentences, sentence_word_mappings

In [31]:
#some sort of check to see if the alignment is "good" enough
def filter_alignments(src_sent_list, tgt_sent_list, align_pairing_list):
    #dont filter any sentences
    return False
    #filter if length of the target and source are different or if the source and pairings lengths dont match
    #return not (len(src_sent_list) == len(tgt_sent_list) or len(src_sent_list) == len(align_pairing_list))

In [32]:
untagged_tag_str = "NOTAG"
def map_tags(tagged_src, untagged_tgt, alignment_list):
    tagged_tgt =[]
    for sentence in untagged_tgt:
        sent_tag_tuple_list = []
        for word in sentence:
            sent_tag_tuple_list.append((word, untagged_tag_str))
        tagged_tgt.append(sent_tag_tuple_list)
            
    for sent_num, pairings in enumerate(alignment_list):
        for pair in pairings:
            src_tag_idx = pair[0]
            tgt_tag_idx = pair[1]
            
            
            tagged_tgt[sent_num][tgt_tag_idx] = tagged_src[sent_num][src_tag_idx]
            
    return tagged_tgt

In [33]:
untagged_tag_str = "NOTAG"

#english
en_train_file='../Data/UD_English/en-ud-train.conllu'
en_test_file='../Data/UD_English/en-ud-test.conllu'

#spanish
es_train_file='../Data/UD_Spanish/es-ud-train.conllu'
es_test_file='../Data/UD_Spanish/es-ud-test.conllu'

#arabic...

trainFile=en_train_file
testFile=en_test_file

In [34]:
src_text_file = "../Data/UN/c.true.en"
tgt_text_file = "../Data/UN/c.true.es"
align_file = "../Data/UN/aligned.intersect"
num_sents = 1000

In [35]:
src_language_train_data = load_tagged_sentences(trainFile)

In [44]:
src_language_tagger = PerceptronTagger()
train_tagger(src_language_tagger, src_language_train_data)

AVERAGE PTRON INIT


In [45]:
src_language_init_test_data, src_test_sentence_w_correct_tags = get_test_corpus(testFile)

In [46]:
src_guess_test_tags = tag_tagger(src_language_tagger, src_language_init_test_data)


In [47]:
analyze_tags(src_guess_test_tags, src_test_sentence_w_correct_tags)

average confidence of right = 20.8818171942
average confidence of wrong = 5.97765372766
stdev confidence of right = 9.14832106627
stdev confidence of wrong = 5.29390932818
token accuracy: 93.5328339178%
sentence accuracy: 58.6140519731%


In [48]:
src_sent_list, tgt_sent_list, alignments_list = get_alignment_info(src_text_file, tgt_text_file, align_file, num_sents)

match percentage: 100.0


In [53]:

tagged_source = tag_tagger(src_language_tagger, convert_sentence_list_no_tags_to_corpus(src_sent_list))
untagged_target = tgt_sent_list
tagged_target_data = map_tags(tagged_source, untagged_target, alignments_list)

target_language_tagger = PerceptronTagger()

AVERAGE PTRON INIT


In [None]:
train_tagger(target_language_tagger, convert_tagged_to_train_format(tagged_target_data))

WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF
WTF


ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.


Unfortunately, your original traceback can not be constructed.

ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.


Unfortunately, your original traceback can not be constructed.



TypeError: 'NoneType' object is not iterable

ERROR:tornado.general:Uncaught exception, closing connection.
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 407, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 252, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 213, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 371, in execute_request
    time.sleep(self._execute_sleep)
KeyboardInterrupt
ERROR:tornado.general:Uncaught exception, closing connection.
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 433, in _handle_events
    sel

In [200]:
tgt_language_test_data, tgt_test_sentence_w_correct_tags = get_test_corpus(es_test_file)
tgt_guess_test_tags = tag_tagger(target_language_tagger, tgt_language_test_data, untagged_tag_str)
sort_by_right = lambda ((key_right,key_wrong), value): key_right
sort_by_wrong = lambda ((key_right,key_wrong), value): key_wrong
sort_by_count = lambda ((key_right,key_wrong), value): value
analyze_tags(tgt_guess_test_tags, tgt_test_sentence_w_correct_tags,True, sort_by_count)

average confidence of right = 0.0
average confidence of wrong = 0.0
stdev confidence of right = 0.0
stdev confidence of wrong = 0.0
correct:	NUM	incorrect:	PRON	count:	1
correct:	ADV	incorrect:	NUM	count:	1
correct:	SCONJ	incorrect:	SYM	count:	1
correct:	SYM	incorrect:	NOTAG	count:	1
correct:	X	incorrect:	NUM	count:	1
correct:	PRON	incorrect:	CONJ	count:	1
correct:	CONJ	incorrect:	NUM	count:	1
correct:	CONJ	incorrect:	DET	count:	1
correct:	NUM	incorrect:	X	count:	1
correct:	AUX	incorrect:	ADP	count:	1
correct:	_	incorrect:	CONJ	count:	1
correct:	X	incorrect:	SCONJ	count:	1
correct:	SYM	incorrect:	ADV	count:	1
correct:	SCONJ	incorrect:	PART	count:	1
correct:	_	incorrect:	NUM	count:	1
correct:	VERB	incorrect:	X	count:	1
correct:	ADV	incorrect:	SCONJ	count:	1
correct:	DET	incorrect:	PART	count:	1
correct:	CONJ	incorrect:	PART	count:	1
correct:	VERB	incorrect:	PUNCT	count:	1
correct:	CONJ	incorrect:	NOTAG	count:	1
correct:	PRON	incorrect:	NUM	count:	1
correct:	AUX	incorrect:	NUM	count:	1
c