In [12]:
# imports
from conllu import parse
from collections import defaultdict
from pprint import pprint

In [2]:
# read data
with open("en_ewt-ud-train.conllu", "r", encoding="utf-8") as f:
    data = f.read()

In [3]:
# parse data
sentences = parse(data)

In [7]:
# extract POS transition counts

transitions_count = defaultdict(lambda: defaultdict(int))

for sentence in sentences:
    prev_tag = None
    for token in sentence:
        if prev_tag is not None:
            transitions_count[prev_tag][token['upos']] += 1
        prev_tag = token['upos']

# calculate probabilities based on the counts

transition_probabilities = defaultdict(lambda: defaultdict(float))

for prev_tag, next_tags in transitions_count.items():
    total_count = sum(next_tags.values())
    for next_tag, count in next_tags.items():
        transition_probabilities[prev_tag][next_tag] = count / total_count


In [13]:
# print probabilities
pprint(transition_probabilities)

defaultdict(<function <lambda> at 0x000001BEC58E9240>,
            {'ADJ': defaultdict(<class 'float'>,
                                {'ADJ': 0.05449549687070676,
                                 'ADP': 0.07678217066096779,
                                 'ADV': 0.013814684780949474,
                                 'AUX': 0.003358265913600977,
                                 'CCONJ': 0.04289421462372157,
                                 'DET': 0.0050373988704014655,
                                 'INTJ': 0.0003052969012364525,
                                 'NOUN': 0.5176308960464051,
                                 'NUM': 0.00831934055869333,
                                 'PART': 0.03159822927797283,
                                 'PRON': 0.011830254922912533,
                                 'PROPN': 0.06563883376583728,
                                 'PUNCT': 0.1299038314761105,
                                 'SCONJ': 0.022897267592733934,
                        

From these values we can clearly see that our intuitions hold. DET is followed by a NOUN with a probability of about 58.8% where as by a VERB with a probability of only about 1.9%. We can also find other interesting values: Adjectives are followed by nouns with a possibility of about 51.8% and PART -> VERB (particle to verb) and SYM -> NUM (symbol to number) are probable transitions. Punctuations are popular followers in many classes.