In [None]:
class Token:
    """One tab-separated treebank row, addressable by column name.

    The fields of the input line are paired positionally with ``labels``.
    If the line has fewer fields than there are labels, the trailing
    labels are simply absent and looking them up raises ``KeyError``.
    """

    # Column names, in the order the tab-separated fields appear on a line.
    labels = ('index','token','lemma','upos','xpos','morph','head','deprel','enh','other')

    def __init__(self, line=None):
        """Parse ``line`` into fields.

        Args:
            line: a tab-separated row with its newline already removed.
                When omitted, an artificial root row is built: index ``0``,
                token ``ROOT`` and ``_`` in the eight remaining columns.
        """
        # Identity test: ``None`` is a singleton, and ``==`` could be
        # hijacked by an argument type with a custom ``__eq__``.
        if line is None:
            line = "0\tROOT" + "\t_" * 8
        self._data = dict(zip(Token.labels, line.split('\t')))

    def __getitem__(self, arg):
        """Return the field stored under column name ``arg``.

        Raises:
            ValueError: if ``arg`` is not one of ``Token.labels``.
            KeyError: if ``arg`` is a valid label but the source line was
                too short to contain that column.
        """
        if arg not in Token.labels:
            # str() keeps the intended ValueError for non-string keys;
            # plain concatenation would raise TypeError instead.
            raise ValueError('unknown token key: ' + str(arg))
        return self._data[arg]

    def __str__(self):
        """Render the token as ``(index,token)``."""
        return '(' + self['index'] + ',' + self['token'] + ')'

    __repr__ = __str__

In [None]:
class Dependency:
    """A directed arc pairing a head with one of its dependents.

    Both ends are stored as given; the head must support lookup by
    ``'index'`` for ``head_index`` to work.
    """

    def __init__(self, head, dep):
        """Remember the two ends of the arc."""
        self._head, self._dep = head, dep

    def head_index(self):
        """Return the ``'index'`` field of the head."""
        governor = self._head
        return governor['index']

    def __str__(self):
        """Render the arc as ``<head>→<dependent>``."""
        ends = (str(self._head), str(self._dep))
        return '→'.join(ends)

    __repr__ = __str__

In [None]:
class Configuration:
    """Parser state for a shift/reduce pass over one sentence.

    The state consists of:
      * ``_tokens``    - input tokens not yet shifted, in sentence order;
      * ``_stack``     - partially processed tokens, with an artificial
                         root ``Token()`` at the bottom;
      * ``_deps``      - the ``Dependency`` arcs created so far;
      * ``_depcounts`` - for each ``'head'`` value found in the input, how
                         many tokens name it as their head.
    """

    def __init__(self, tokens):
        """Set up the initial state for ``tokens``.

        Args:
            tokens: iterable of token-like objects supporting lookup by
                ``'head'`` and ``'index'``.
        """
        # Work on a private copy: shift() consumes the buffer, and the
        # caller's list should not be emptied as a side effect.
        self._tokens = list(tokens)
        self._depcounts = dict()
        for t in self._tokens:
            self._depcounts[t['head']] = self._depcounts.get(t['head'], 0) + 1
        self._stack = [Token()]
        self._deps = []

    def leftarc(self):
        """Attach the second-from-top stack item to the top item and remove it.

        Raises:
            IndexError: if the stack holds fewer than two items.
        """
        if len(self._stack) < 2:
            raise IndexError('Trying to leftarc from configuration:' + str(self))
        dep = self._stack.pop(-2)
        self._deps.append(Dependency(self._stack[-1], dep))

    def shift(self):
        """Move the next input token onto the stack.

        Raises:
            IndexError: if no input tokens remain.
        """
        if not self._tokens:
            raise IndexError('Trying to shift from configuration:'+str(self))
        self._stack.append(self._tokens.pop(0))

    def rightarc(self):
        """Attach the top stack item to the item below it and remove it.

        Raises:
            IndexError: if the stack holds fewer than two items.
        """
        # Check before popping: popping first would discard the last stack
        # item and only then fail, leaving the configuration corrupted.
        if len(self._stack) < 2:
            raise IndexError('Trying to rightarc from configuration:' + str(self))
        dep = self._stack.pop(-1)
        self._deps.append(Dependency(self._stack[-1], dep))

    def _dependents_found(self, headindex):
        """Count the arcs created so far whose head has index ``headindex``."""
        return sum(1 for d in self._deps if d.head_index() == headindex)

    def done_p(self):
        """Return True when the input is exhausted and only one stack item is left."""
        return not self._tokens and len(self._stack) == 1

    def training_oracle(self):
        """Choose the next transition from the tokens' ``'head'`` fields and apply it.

        Preference order:
          1. ``'LeftArc'``  - more than two items on the stack and the
             second-from-top names the top item as its head;
          2. ``'RightArc'`` - the top item names the item below it as its
             head and has already been given all of its own dependents;
          3. ``'Shift'``    - input tokens remain;
          4. ``'Fail'``     - none of the above applies; the state is left
             unchanged.

        Returns:
            The name of the transition applied, or ``'Fail'``.
        """
        stack = self._stack
        if len(stack) > 2 and stack[-2]['head'] == stack[-1]['index']:
            self.leftarc()
            return 'LeftArc'
        if len(stack) > 1:
            top = stack[-1]
            # Only reduce the top item once every token that names it as
            # head has been attached; afterwards it is no longer reachable.
            if (top['head'] == stack[-2]['index']
                    and self._dependents_found(top['index'])
                    == self._depcounts.get(top['index'], 0)):
                self.rightarc()
                return 'RightArc'
        if self._tokens:
            self.shift()
            return 'Shift'
        # Fail; usually because of non-projectivity, or sometimes bug in treebank!
        return 'Fail'

    def __str__(self):
        """Render the stack, the remaining input and the arcs found so far."""
        return 'stack='+str(self._stack)+',tokens='+str(self._tokens)+',deps='+str(self._deps)
        

In [None]:
def process_sentence(s,verbose=False):
    """Run the training oracle over one sentence until it finishes or fails.

    Args:
        s: the sentence's tokens, handed unchanged to ``Configuration``.
        verbose: when true, print the configuration before the first
            step and after every applied operation.

    Returns:
        True iff the oracle never answered ``'Fail'``.
    """
    state = Configuration(s)
    if verbose:
        print('\n\nStarting new parse:')
        print(state)
    outcome = ''
    while True:
        # Stop on the first failure, or as soon as the parse is complete.
        if outcome == 'Fail' or state.done_p():
            break
        outcome = state.training_oracle()
        if verbose:
            print('Applied '+outcome+' operation...')
            print(state)
    succeeded = outcome != 'Fail'
    return succeeded

In [None]:
import re
# Run the training oracle over every sentence of a CoNLL-U treebank and
# report how many sentences it could parse completely.
corpus = 'ca_ancora-ud-train.conllu'
#corpus = 'test.conllu'
verbose = False
total = 0
ok = 0
# Rows that are not syntactic words of the basic tree: an ID range such as
# "3-4" (multiword token) or a decimal ID such as "8.1" (empty node).
# Compiled once instead of being looked up on every line.
non_word_row = re.compile(r'[0-9]+[-.]')
# CoNLL-U files are UTF-8; do not depend on the platform's default encoding.
with open(corpus, 'r', encoding='utf-8') as f:
    sentence = []
    for line in f:
        line = line.rstrip('\n')
        if line == '':
            # A blank line ends a sentence. Ignore it when nothing has been
            # collected, so repeated blank lines are not counted as parses.
            if sentence:
                total += 1
                if process_sentence(sentence, verbose):
                    ok += 1
                sentence = []
        elif line[0] == '#' or non_word_row.match(line):
            # skip comments, multiword tokens and empty nodes
            pass
        else:
            sentence.append(Token(line))
    if sentence:
        # The file did not end with a blank line: still parse what is pending.
        total += 1
        if process_sentence(sentence, verbose):
            ok += 1
print('Successfully parsed',ok,'of',total,'sentences')