In [1]:
import os
import pathlib
import functools
import json
from typing import Tuple, List, Union
import pandas
from io import StringIO
import nltk

In [2]:
def parser_conllu_to_dict(document: str,
                          meta_description: dict = {}) -> List[dict]:
    """CoNLL-U file parser.
    
    See format details here: https://universaldependencies.org/format.html
    
    Args:
      document: Sentences string to parse.
      meta_description: Metadata tags.
                      If provided, it must contain 'tags' and 'delimiter', i.e.:
                      meta_description = {
                          "tags": ["tag1", "tag2"],
                          "delimiter": "=",
                      }
    
    Returns:
      Dict with sentences, UD tokens.
    """
    _comment_line = '#'
    _sentence_delimiter = '\n\n'
    _col_delimiter = '\t'
    _col_names = ["id", "form", "lemma", 
                  "upos", "xpos", 
                  "feats", "head", "deprel", 
                  "deps", "misc"]
    _na_char = '_'
    _feat_delimiter = '='
    _feats_delimiter = '|'
    
    
    if meta_description:
        def _meta_extractor(string: str) -> str:
            """Helper function to extract meta data string."""
            return string.split(meta_description['delimiter'])[1].strip()


    def _feat_parser(feats: str) -> dict:
        """Feature parser from string into dict."""
        if feats == _na_char:
            return _na_char
        return {
            i[0]: i[1] 
            for i in [feat.split(_feat_delimiter)[:2]
                      for feat in feats.split(_feats_delimiter)]
        }
    
    
    output = []
    
    for sentence in document.split(_sentence_delimiter):
        if sentence == "":
            break
        data = []
        meta = {}
        for line in sentence.split('\n'):
            if not line.startswith(_comment_line):
                data.append(line)
            else:
                if not meta_description:
                    continue
                else:
                    comment_tags = meta_description['tags'].copy()
                    for comment_tag in comment_tags:
                        if comment_tag in line:
                            meta[comment_tag] = _meta_extractor(line)
                            comment_tags.remove(comment_tag)
        
        data_str = '\n'.join(data)
        del data

        with StringIO(data_str.replace('"', '\\"')) as data:
            df = pandas.read_csv(data, 
                                 delimiter=_col_delimiter, 
                                 header=None,
                                 quotechar='"',
                                 names=_col_names)
        
        tokens = df.to_dict('records')
        for token in tokens:
            try:
                token['feats'] = _feat_parser(token['feats'])
            except Exception as ex:
                print(ex)
                continue
        
        out = {
            "tokens": tokens,
        }
        if meta:
            out['meta'] = meta
        
        output.append(out)
    
    return output  


parser = functools.partial(parser_conllu_to_dict, 
                           meta_description={
                               "tags": ["sent_id", "text", "s_tape"],
                               "delimiter": "=",
                           })


def corpus_parser(path: str) -> Union[Tuple[List[dict], None],
                                      Tuple[list, str]]:
    """Function to read corpus from conllu corpus file.
    
    Args:
      path: File path.
    
    Returns:
      List of token trees with error string in case of any.
    """
    if not os.path.isfile(path):
        return [], f"File {path} doesn't exist."
    try:
        with open(path, "r", encoding="utf-8") as f:
            document = f.read()
        return parser(document), None
    except Exception as ex:
        return [], ex

In [3]:
BUCKET_DATA = "/transfer/data/UD_English-GUM"

In [4]:
FILE_TRAIN = f"{BUCKET_DATA}/en_gum-ud-train.conllu"
FILE_TEST = f"{BUCKET_DATA}/en_gum-ud-test.conllu"
FILE_DEV = f"{BUCKET_DATA}/en_gum-ud-test.conllu"

In [5]:
tokens_tree_train, err = corpus_parser(FILE_TRAIN)
if err:
    print(err)

In [6]:
tokens_tree_test, err = corpus_parser(FILE_TEST)
if err:
    print(err)

In [7]:
tokens_tree_dev, err = corpus_parser(FILE_DEV)
if err:
    print(err)

In [8]:
# prepare tokens to be consumed by NLTK
def conllu_dict_to_nltk(tokens_list: List[dict]) -> List[List[Tuple[str]]]:
    """Function to convert conllu dict to the NLTK requred sturucture."""
    return [[(token['form'], token['upos']) 
             for token in tokens['tokens']]
            for tokens in tokens_list]

In [9]:
georgetown_tagged_sents_train = conllu_dict_to_nltk(tokens_tree_train)
georgetown_tagged_sents_dev = conllu_dict_to_nltk(tokens_tree_dev)
georgetown_tagged_sents_test = conllu_dict_to_nltk(tokens_tree_test)

## Rule-base tagger

In [10]:
rules = [
    (r"^(an|a|the)$", "DET"),
    (r"^(of|in|to|for|on|with|at|from|by|inside|outside)$", "ADP"),
    (r"^(also|so|then|just|more|as|very|well|even|most)$", "ADV"),
    (r"^(and|or|but|\&|both|either|nor|so|though|although|however)$", "CCONJ"),
    (f"^(yes|jup|yeah|yey|well|no|neh|meh|oh|yeah|hey|okay|yep|OK)$", "INTJ"),
    (f"^(that|if|when|as|how|where|because|while|after)$", "SCONJ"),
    (r"^(\.|\;|\:|\,|\'|\"|\"\"|\''|\]|\[|\(|\)|\?|\!)$", "PUNCT"),
    (r"^(\\|``|`|#|@|%|\$)$", "SYM"),
    (r"^-?[0-9]+(\.[0-9]+)?$", "NUM"),
    (r"^[a-zA-Z0-9\.\-]+@[a-zA-Z0-9\.\-]+\.[a-zA-Z]+$", "PRON"),
    (r"(.*ing|.*ish)$", "ADJ"),
    (f"^(.*es|.*ed)$", "VERB"),
]

In [11]:
t0 = nltk.DefaultTagger('NOUN')
t_rules = nltk.RegexpTagger(rules, backoff=t0)

In [12]:
# the most simplistic approach is, to assume that all words are nouns

t0.evaluate(georgetown_tagged_sents_test)

0.19831700577744285

This assumption simply reflets dataset composition.

In [13]:
t_rules.evaluate(georgetown_tagged_sents_test)

0.5362346144184879

A rule-based approach gives the accuracy higher than **50%** which is better than random guessing, or than assuming all words being *nouns*.

This approach doesn't include any machine learning though, so let's turn it on and improve PoS tagger accuracy.

## 1-gram tagger

1-gram tagger scans through the data set and defines the tag which is being assigned to a given token in most of the cases. This model can be assigned to the *lexical based* tagging method.

In [14]:
# scan the train sample and train the tagger
# let's also add the t0 (NOUN by default) tagger as the fallback solution

t_1gram = nltk.UnigramTagger(georgetown_tagged_sents_train, backoff=t0)

In [15]:
t_1gram.evaluate(georgetown_tagged_sents_test)

0.8401155488570711

## 2-gram tagger

None of the above illustrated models include sentese context into account. Let's fix that by looking up the PoS of the previous token for every token.

In [16]:
# train 2-gram tagger with the t_1gram as the fallback

t_2gram = nltk.BigramTagger(georgetown_tagged_sents_train, backoff=t_1gram)

In [17]:
t_2gram.evaluate(georgetown_tagged_sents_test)

0.8468977643808089

It doesn't seem to give much gain compared to 1-gram tagger. Let's include one more token to be looked back during training.

## 3-gram tagger

In [18]:
t_3gram = nltk.BigramTagger(georgetown_tagged_sents_train, backoff=t_2gram)

In [19]:
t_3gram.evaluate(georgetown_tagged_sents_test)

0.8468977643808089

It seems that we hit the wall and increase of the lookback window won't make any positive impact on the model accuracy.