# Malay POS tagger benchmarking

This notebook benchmarks multiple Part Of Speech tagger models trained on the PAN Localization Project's [POS tagged corpus](http://www.panl10n.net/english/outputs/Indonesia/UI/0802/UI-1M-tagged.zip).

In [2]:
import warnings; warnings.simplefilter('ignore')
import pickle
import nltk
import re
sents_tagged = pickle.load(open('../data/tagged/malay_pos_tagged.p', 'rb'))

In [9]:
#Cross validation evaluation function
#from sklearn.cross_validation import KFold
from sklearn.cross_validation import KFold
from sklearn import metrics
import numpy as np 

def cross_val(tagger, train_set, n_folds=10):    
    
    cv = KFold(len(train_set), n_folds, shuffle=True)
    scores = {"Accuracy":[], "Precision":[], "Recall":[], "F1-Score": []}

    for train_idx, test_idx in cv:
        train = [train_set[i] for i in train_idx]
        test = [train_set[i] for i in test_idx]

        #Train
        tagger.train(train)
        X_test = [[token for token,_ in test_sent] for test_sent in test]
        y_test = [tag for test_sent in test for _,tag in test_sent]

        #Predict
        tagged_pred = tagger.tag_sents(X_test)
        y_pred = [tag for pred_sent in tagged_pred for _,tag in pred_sent]

        scores["Accuracy"].append(metrics.accuracy_score(y_test, y_pred))
        scores["Precision"].append(metrics.precision_score(y_test, y_pred, average='weighted'))
        scores["Recall"].append(metrics.recall_score(y_test, y_pred, average='weighted'))
        scores["F1-Score"].append(metrics.f1_score(y_test, y_pred, average='weighted'))

    
    ave_scores = {k+'_mean':np.mean(v) for k,v in scores.items()}
    for metric, score in ave_scores.items():
        print('{}: {}'.format(metric, score))
    return ave_scores

def evaluateTagger(tagger, test_sents):
    scores = {}
    X_test = [[token for token,_ in test_sent] for test_sent in test_sents]
    y_test = [tag for test_sent in test_sents for _,tag in test_sent]

    
    tagged_pred = tagger.tag_sents(X_test)
    y_pred = [tag for pred_sent in tagged_pred for _,tag in pred_sent]
    y_pred = ['' if tag is None else tag for tag in y_pred]
    
    scores["Accuracy"] = metrics.accuracy_score(y_test, y_pred)
    scores["Precision"] = metrics.precision_score(y_test, y_pred, average='weighted')
    scores["Recall"] = metrics.recall_score(y_test, y_pred, average='weighted')
    scores["F1-Score"] = metrics.f1_score(y_test, y_pred, average='weighted')
    
    for metric, score in scores.items():
        print('{}: {}'.format(metric, score))
    return scores

## Benchmarking Sequential Backoff Taggers

In [133]:
def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
    '''
    Convenience function to build sequential backoff taggers
    '''
    if not backoff:
        backoff = tagger_classes[0](tagged_sents)
        del tagger_classes[0]
 
    for cls in tagger_classes:
        tagger = cls(tagged_sents, backoff=backoff)
        backoff = tagger
 
    return backoff

In [134]:
#Train test split
from sklearn.cross_validation import train_test_split
train_sents, test_sents = train_test_split(sents_tagged)

tagger_scores = {}
#Unigram, bigram and trigram seq. backoff
ubt_tagger = backoff_tagger(train_sents, [nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger])
print('\nUBT tagger:')
tagger_scores['ubt'] = evaluateTagger(ubt_tagger, test_sents)

#UBT, affix tagger suffix=3
suffix3_tagger = nltk.tag.AffixTagger(train_sents)
s3ubt_tagger = nltk.tag.DefaultTagger('NN')
s3ubt_tagger._taggers = ubt_tagger._taggers + suffix3_tagger._taggers
print('\nSuffix3-UBT:')
tagger_scores['s3ubt'] = evaluateTagger(s3ubt_tagger, test_sents)

#UBT, affix tagger prefix=3
prefix3_tagger = nltk.tag.AffixTagger(train_sents, affix_length=3)
p3ubt_tagger = nltk.tag.DefaultTagger('NN')
p3ubt_tagger._taggers = ubt_tagger._taggers + prefix3_tagger._taggers
print('\nPrefix3-UBT:')
tagger_scores['p3ubt'] = evaluateTagger(p3ubt_tagger, test_sents)

#UBT, prefix-suffix tagger
s3p3ubt_tagger =  nltk.tag.DefaultTagger('NN')
s3p3ubt_tagger._taggers = p3ubt_tagger._taggers + suffix3_tagger._taggers
print('\nSuffix3-Prefix3-UBT:')
tagger_scores['s3p3ubt'] = evaluateTagger(s3p3ubt_tagger, test_sents)

#Regex tagger
word_patterns = [
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
    (r'(ter|se|bi)[a-z]*', 'JJ'),
    (r'ke[a-z]*an', 'JJ'),

    (r'me(ny|ng|r|l|w|y|p|t|k|s)[a-z]*', 'VBI'),
    (r'mem(b|f|p|v)[a-z]*(kan|i)?', 'VBI'),
    (r'men(d|c|j|sy|z|t|s)[a-z]*(kan|i)?', 'VBI'),
    (r'meng(g|gh|kh|h|k|a|e|i|o|u)[a-z]*', 'VBI'),
    (r'menge[a-z]*(an)?', 'VBI'),
    (r'(mem|di)per[a-z]*(kan)?', 'VBI'),
    (r'ber[a-z]*(kan|an)?', 'VBI'),
    (r'ter[a-z]*', 'VBI'),
    (r'ke[a-z]*(an)?', 'VBI'),
    (r'di(per)?[a-z]*(kan|i)?', 'VBI'),
    
    (r'.*nya$', 'NNG')
    
]

regex_tagger = nltk.tag.RegexpTagger(word_patterns)
rs3p3ubt_tagger =  nltk.tag.DefaultTagger('NN')
rs3p3ubt_tagger._taggers = s3p3ubt_tagger._taggers + regex_tagger._taggers
print('\nRegex-Suffix3-Prefix3-UBT:')
tagger_scores['rs3p3ubt'] = evaluateTagger(rs3p3ubt_tagger, test_sents)



UBT tagger:
F1-Score: 0.9799887930062702
Recall: 0.9646927828573805
Accuracy: 0.9646927828573805
Precision: 0.9965080818984777

Suffix3-UBT:
F1-Score: 0.9904810864166493
Recall: 0.9875579251812255
Accuracy: 0.9875579251812255
Precision: 0.9934852661892651

Prefix3-UBT:
F1-Score: 0.9875094555375102
Recall: 0.9844069801287175
Accuracy: 0.9844069801287175
Precision: 0.9908071922231153

Suffix3-Prefix3-UBT:
F1-Score: 0.9882445002838782
Recall: 0.9859916929630433
Accuracy: 0.9859916929630433
Precision: 0.9906529492113243

Regex-Suffix3-Prefix3-UBT:
F1-Score: 0.9882268358886409
Recall: 0.9859916929630433
Accuracy: 0.9859916929630433
Precision: 0.9906177142676877


In [None]:
#Use data viz here
for taggerName, tagger in seq_taggers.items():
    print("\n"+ taggerName)
    evaluateTagger(tagger, test_sents)

## Hidden Markov Model POS tagger

In [135]:
hmm_trainer = nltk.tag.hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(train_sents)

In [136]:
tagger_scores['hmm'] = evaluateTagger(hmm_tagger, test_sents)

F1-Score: 0.8155819281944399
Recall: 0.710136155939439
Accuracy: 0.710136155939439
Precision: 0.9833169560956014


## Naive Bayes POS tagger

In [172]:
#Custom Naive bayes classifier based tagger

class custom_naiveBayesPOSTagger(nltk.tag.sequential.ClassifierBasedTagger):
    """
    A naive Bayes classifier based part of speech tagger.
    """

    def feature_detector(self, tokens, index, history):
        word = tokens[index]
        if index == 0:
            prevword = prevprevword = ""
            prevtag = prevprevtag = ""
        elif index == 1:
            prevword = tokens[index-1].lower()
            prevprevword = ""
            prevtag = history[index-1]
            prevprevtag = ""
        else:
            prevword = tokens[index-1].lower()
            prevprevword = tokens[index-2].lower()
            prevtag = history[index-1]
            prevprevtag = history[index-2]

        if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
            shape = 'number'
        elif re.match('\W+$', word):
            shape = 'punct'
        elif re.match('[A-Z][a-z]+$', word):
            shape = 'upcase'
        elif re.match('[a-z]+$', word):
            shape = 'downcase'
        elif re.match('\w+$', word):
            shape = 'mixedcase'
        else:
            shape = 'other'

        features = {
            'prevtag': prevtag,
            'prevprevtag': prevprevtag,
            'word': word,
            'word.lower': word.lower(),
            'suffix3': word.lower()[-3:],
            'suffix2': word.lower()[-2:],
            'suffix1': word.lower()[-1:],
            'prefix3': word.lower()[:3],
            'prefix2': word.lower()[:2],
            'prefix1': word.lower()[:1],
            'prevprevword': prevprevword,
            'prevword': prevword,
            'prevword+suffix3': prevword[-3:],
            'prevword+prefix3': prevword[:3],
            'prevtag+word': '%s+%s' % (prevtag, word.lower()),
            'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
            'prevword+word': '%s+%s' % (prevword, word.lower()),
            'shape': shape,
            }
        return features

In [173]:
naiveBayes_tagger = custom_naiveBayesPOSTagger(train=train_sents)
tagger_scores['naive_bayes'] = evaluateTagger(naiveBayes_tagger, test_sents)

F1-Score: 0.926375523133134
Recall: 0.9241001094976506
Accuracy: 0.9241001094976506
Precision: 0.9420780777498295


## Averaged Perceptron Tagger

In [4]:
from collections import defaultdict

class PerceptronTagger_custom(nltk.tag.perceptron.PerceptronTagger):
    
    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: int} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref3', word[:3])
        add('i pref2', word[:2])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-1 prefix', context[i-1][:3])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+1 prefix', context[i-1][:3])
        add('i+2 word', context[i+2])

        return features

In [138]:
perceptron_custom = PerceptronTagger_custom(load = False)
scores = cross_val(perceptron_custom, sents_tagged, n_folds=10)

Precision_mean: 0.9988696352059483
F1-Score_mean: 0.9988670324399808
Accuracy_mean: 0.9988692689529073
Recall_mean: 0.9988692689529073


The averaged perceptron tagger has the best performance, with an F-score and average 10-fold cross validation accuracy of 99.89%, beating the benchmark of [97.57%](http://www.panl10n.net/english/outputs/Indonesia/UI/0901/UI-POSTAG.pdf) averaged accuracy trained on the same PANL10N corpus. Let's try making more changes to our model that will allow it to generalise better to unseen data.

## Improving our model

### Reduce morphosyntatic distinctions

The tagged corpus was tagged with a tagset of 37 tags built for the Indonesian language. Let's have a look at the tag distribution of our training corpus:


In [82]:
#Replace morphosyntatic distinction-- don't need that much complexity.
#VBI, VBT ==> VB. Reduce number of classes.
#Tagging unknown words to better generalise to out of vocabulary words
def genTagDist(trainSet):
    '''
    Generates distribution of tags within training set
    '''
    tags = genTagset(trainSet)
    tagset = dict(zip(tags, [0]*len(tags)))
    for i in range(len(trainSet)):
        for j in range(len(trainSet[i])):
            tagset[trainSet[i][j][1]] += 1
    
    return tagset

tagDist = genTagDist(train_sents)
print(tagDist)

{'UH': 9, 'CC': 17894, 'NNG': 2655, 'CDO': 1455, 'PRN': 124, 'CDC': 46, 'NEG': 4965, ':': 426, 'RB': 16705, 'VB': 90, 'MD': 11330, 'NNC': 46856, 'SYM': 1285, 'RP': 288, 'CDI': 3286, '--': 539, 'WRB': 648, 'PRP': 8760, 'FW': 983, 'SC': 30669, '.': 29237, 'VBI': 18784, 'NNP': 8428, 'JJ': 23359, 'NN': 266937, 'VBT': 36291, 'DT': 15632, 'NNS': 852, 'PRL': 198, 'WP': 264, 'IN': 56048, ',': 35549, 'CDP': 15378}


Looking at the tags for verbs, there's a clear spread between VBI (intransitive verbs), VBT (transitive verbs), and VB (which is not in the official Indonesian tagset and must have resulted from mistaggings).
By grouping these similar POS distinctions into a single POS tag, we can reduce number of classes/tags that our tagger will have to learn, and increase the overall classification accuracy as the errors from misclassified samples amongst these similar groupings would be reduced.
This will also allow our model to generalise better to unseen data.

A separate script was written to do so, with the following changes:

- Group all cardinal distinctions into one:
        CDC, CDI, CDO, CDP ==> CD
- Group all verb distinctions into one:
        VB, VBI, VBT ==> VB
- Group NN(common nouns) with NNC (countable common nouns) and NNP (uncountable common nouns). Retain distinction between NNP (proper nouns) and NNG (genitive common nouns) as these would be useful for Named Entity Recognition for a future project:
        NN, NNC, NNP ==> NN
- Remove tags such as UH (interjections, eg. 'Wah', 'Aduh') which don't have enough samples to properly learn from. 


In [14]:
sents_pos_reduced = pickle.load(open('../data/tagged/malay_pos_reduced.p', 'rb'))
perceptron_reduced = PerceptronTagger_custom(load = False)
perceptron_reduced.train(sents_pos_reduced)

In [140]:
perceptron_red_scores = cross_val(perceptron_reduced, sents_pos_reduced, n_folds=10)

Precision_mean: 0.9986768201147038
F1-Score_mean: 0.9986760702516395
Accuracy_mean: 0.9986762742309307
Recall_mean: 0.9986762742309307


Although the average F1-score of 99.867% for our model with reduced tags is slightly lower than our previous model trained on all 37 tags, it's likely to generalise better to out of sample data which contain different vocabularies unseen from our training and test sets.

### Improving our training corpus

A look through our training corpus reveals some misclassifications, such as words that are clearly verbs being tagged as nouns. This was a parallel tagged Indonesian-English corpus, and it appears that some english sentences seem to have made their way into the dataset as well. A separate script was run to make the following changes to our corpus:

- Fix verb mistags
- Fix misclassified symbols
- Fix misclassified numeric tokens
- Remove English sentences

In [5]:
sents_pos_improved = pickle.load(open('../data/tagged/malay_pos_improved.p', 'rb'))
perceptron_improved = PerceptronTagger_custom(load = False)
perceptron_improved.train(sents_pos_improved)

In [10]:
improved_score = cross_val(perceptron_improved, sents_pos_improved, n_folds=10)

Accuracy_mean: 0.9986826182539877
Precision_mean: 0.9986830746117807
Recall_mean: 0.9986826182539877
F1-Score_mean: 0.9986824350854537


The F1 score of our model built on the improved training corpus is slightly higher than our previous model, which shows we're on the right track. 
Before we get too excited about our positive scores so far which have outperformed our benchmark, we need to assess how well our model generalises to out of sample data from different corpuses.

## Evaluating on out of sample data of different corpuses

In [11]:
OOS = open('../data/idn-tagged-corpus/Indonesian_Manually_Tagged_Corpus_ID.tsv').read() #From https://github.com/famrashel/idn-tagged-corpus
sents = re.findall(r'<kalimat id=[0-9]+>((.|\n)*?)</kalimat>', OOS, re.DOTALL)
#NLTK formatted
OOS_sents = [[tuple(pair.split('\t')) for pair in sent[0].strip().split('\n')] for sent in sents]

Before we evaluate our model on this new corpus, let's take a look at some tags in the manually tagged corpus:

In [144]:
def genTagset(trainSet):
    tagset = set([trainSet[i][j][1] for i in range(len(trainSet)) for j in range(len(trainSet[i]))])
    return tagset

print(genTagset(OOS_sents))

{'CC', 'RB', 'Z', 'MD', 'SYM', 'RP', 'SC', 'NNP', 'JJ', 'IN', 'DT', 'CD', 'UH', 'X', 'FW', 'NEG', 'VB', 'OD', 'PRP', 'NND', 'NN', 'PR', 'WH'}


We see some new tags such as RP and PR which our tagger doesn't recognise. We'll have to do some preprocessing on this new tagged corpus to be able to properly evaluate the performance of our model, and avoid deceitfully low scores from tagset incompatibility, as follows:

- **RP**: particles, eg. 'lah', 'kah', 'pun'.
        Our tagger doesn't recognise RP. Concatenate with previous word.
- **PR**: eg. 'itu', 'ini'.
        Replace this with the equivalent determiner tag(DT) that our tagger recognises.
- **NND**: eg. sebuah, barel, kiloliter.
        Appears to capture units of measurement. Replace with NN.
- **UH**: interjections, eg. 'ya', 'wah', 'aduh', 'oh'.
        Only a miniscule part of test set, remove.
- **Z**: symbols.
        Replace to match our tagset of symbols.

In [12]:
def combineParticles(sents):
    for i in range(len(sents)):
        sent = sents[i]
        popList = []
        for j in range(len(sent)-1):
            token = sent[j]
            tokenNext = sent[j+1]
            if tokenNext[1]=='RP':
                sents[i][j] = (token[0]+tokenNext[0], token[1])
                popList.append(j+1)
        for p in popList:
            sents[i].pop(p)
            
combineParticles(OOS_sents)

def fixSymbol(token, tag):
    if tag not in [',', '.', ':', '--']:
        if token in ['?', '.', '!']: #sentence terminators
            return (token, '.')
        elif token in ['"', '(', ')']:
            return (token, token)
        else:
            return (token, 'SYM')
    else: 
        return (token, tag)

#OOS_sents_fixed = [[fixSymbol(pos_pair[0], pos_pair[1]) if pos_pair[1]=='Z' else (pos_pair[0], pos_pair[1]) for pos_pair in sent] for sent in OOS_sents]
OOS_sents_fixed = [[(pos_pair[0], pos_pair[0]) if pos_pair[1]=='Z' else (pos_pair[0], pos_pair[1]) for pos_pair in sent] for sent in OOS_sents]
OOS_sents_fixed = [[(pos_pair[0], 'DT') if pos_pair[1]=='PR' else (pos_pair[0], pos_pair[1]) for pos_pair in sent] for sent in OOS_sents_fixed]
OOS_sents_fixed = [[(pos_pair[0], pos_pair[1]) for pos_pair in sent if pos_pair[1]!='UH' or pos_pair[1]!='X' or pos_pair[1]!='RP'] for sent in OOS_sents_fixed]
OOS_sents_fixed = [[(pos_pair[0], 'NN') if pos_pair[1]=='NND' else (pos_pair[0], pos_pair[1]) for pos_pair in sent] for sent in OOS_sents_fixed]


In [15]:
print("Perceptron trained on reduced tags: ")
perceptron_red_scores = evaluateTagger(perceptron_reduced, OOS_sents_fixed)
print("\nPerceptron trained on reduced tags and improved corpus: ")
perceptron_improved_scores = evaluateTagger(perceptron_improved, OOS_sents_fixed)

Perceptron trained on reduced tags: 
Accuracy: 0.7582485340957738
Precision: 0.7740084055846063
F1-Score: 0.7433275835319221
Recall: 0.7582485340957738

Perceptron trained on reduced tags and improved corpus: 
Accuracy: 0.7670498505622081
Precision: 0.7655742792011256
F1-Score: 0.7530183093037409
Recall: 0.7670498505622081


## Further improvements

The actual Malay POS tagger in this project was trained on a version of the PANL1ON and manually tagged corpuses adapted to the Malay language, in addition to some semi-manually tagged parliamentary documents from Sinar's Pardocs for a final accuracy of .

Further improvements that could be made:
- Train additionally on words from the Malay Wordnet.
- Train model on corpus that splits on genitive common nouns (-nya, -punya), and particles (-lah, -kah). Would have to write corresponding preprocessor.