In [2]:
import pandas as pd
import numpy as np
import re, winsound, time, joblib
from datetime import datetime
from collections import Counter
from sklearn_crfsuite import CRF, metrics
from nltk.tag.util import untag
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) 

### Loading the manually corrected lexicon-based dataset

The model should be trained on a manually corrected, lexicon-based annotated dataset. The model here is trained on only 2000 sentences that are not manually corrected. The pre-trained model is trained on around 30 sentences that were manually corrected after lexicon-based tagging.

In [3]:
# Load the lexicon-based annotated sample corpus. The CSV has no header row:
# column 0 is read as the sentence id and column 1 as a '|'-separated list of
# "word_TAG" pairs.
df = pd.read_csv('data/lexicon_based_annotated_sample.csv', header=None)

# Tag clean-up rules, applied to every tag in exactly this order.
TAG_REPLACEMENTS = (
    ('.', ''),
    ('NNC1M', 'NNM'),
    ('NNC2', 'NNS'),
    ('NNC1F', 'NNF'),
)

tagged_sentences = []
words_count = 0
for i, row in df.iterrows():
    sent_id = str(row[0])  # read for reference only; not used further in this cell
    tagged_sent = []
    for pair in row[1].split('|'):
        words_count += 1
        pieces = pair.split('_')
        word, tag = pieces[0], pieces[1]
        for old, new in TAG_REPLACEMENTS:
            tag = tag.replace(old, new)
        tagged_sent.append((word, tag))
    tagged_sentences.append(tagged_sent)

# Sequential 80/20 split: the first 80% of sentences train, the rest test.
cutoff = int(.80 * len(tagged_sentences))
train, test = tagged_sentences[:cutoff], tagged_sentences[cutoff:]

print('Total: ', len(tagged_sentences))
print('Train: ', len(train))
print('Test: ', len(test))
print('__________\nTotal Words: ', words_count)

Total:  2000
Train:  1600
Test:  400
__________
Total Words:  37694


### Feature Extraction

In [4]:
def _edge_affixes(token):
    """Return ``(prefix_1, prefix_2, suffix_1, suffix_2)`` for ``token``.

    A one-character affix is only produced for tokens longer than one character
    and a two-character affix only for tokens longer than two characters;
    otherwise the corresponding entry is the empty string.
    """
    size = len(token)
    prefix_1 = token[0] if size > 1 else ''
    suffix_1 = token[-1] if size > 1 else ''
    prefix_2 = token[:2] if size > 2 else ''
    suffix_2 = token[-2:] if size > 2 else ''
    return prefix_1, prefix_2, suffix_1, suffix_2


def features(s, i):
    """Build the feature dict for the word at index ``i`` of sentence ``s``.

    Parameters
    ----------
    s : sequence of str
        The untagged sentence (one string per word).
    i : int
        Index of the word to describe.

    Returns
    -------
    dict
        Features of the word itself (text, length, position flags, numeric flag,
        1- and 2-character prefixes/suffixes), of the immediately preceding and
        following word (text, length, 1- and 2-character prefixes/suffixes), and
        the text of the words up to three positions away on each side (with the
        length of the second neighbour). Neighbours that fall outside the
        sentence are represented by '' with length 0.

    Raises
    ------
    IndexError
        If ``i`` is not a valid index into ``s``.
    """
    word = s[i]
    n_words = len(s)

    # Context words; '' stands in for positions outside the sentence.
    prev_1 = s[i - 1] if i > 0 else ''
    prev_2 = s[i - 2] if i > 1 else ''
    prev_3 = s[i - 3] if i > 2 else ''
    next_1 = s[i + 1] if n_words > i + 1 else ''
    next_2 = s[i + 2] if n_words > i + 2 else ''
    next_3 = s[i + 3] if n_words > i + 3 else ''

    prefix_1, prefix_2, suffix_1, suffix_2 = _edge_affixes(word)
    prev_1_prefix_1, prev_1_prefix_2, prev_1_suffix_1, prev_1_suffix_2 = _edge_affixes(prev_1)
    next_1_prefix_1, next_1_prefix_2, next_1_suffix_1, next_1_suffix_2 = _edge_affixes(next_1)

    return {
        # word attributes
        'word': word,
        'length': len(word),
        'is_first': i == 0,
        'is_last': i == n_words - 1,
        'is_numeric': word.isdigit(),
        'prefix_1': prefix_1,
        'prefix_2': prefix_2,
        'suffix_1': suffix_1,
        'suffix_2': suffix_2,

        # previous words attributes
        'prev_1': prev_1,
        'len_prev_1': len(prev_1),
        'prev_1_prefix_1': prev_1_prefix_1,
        'prev_1_prefix_2': prev_1_prefix_2,
        'prev_1_suffix_1': prev_1_suffix_1,
        'prev_1_suffix_2': prev_1_suffix_2,
        'prev_2': prev_2,
        'len_prev_2': len(prev_2),
        'prev_3': prev_3,

        # next words attributes
        'next_1': next_1,
        'len_next_1': len(next_1),
        'next_1_prefix_1': next_1_prefix_1,
        'next_1_prefix_2': next_1_prefix_2,
        'next_1_suffix_1': next_1_suffix_1,
        'next_1_suffix_2': next_1_suffix_2,
        'next_2': next_2,
        'len_next_2': len(next_2),
        'next_3': next_3,
    }

def transform_to_dataset(tagged_sentences):
    """Convert tagged sentences into parallel feature and label sequences.

    Parameters
    ----------
    tagged_sentences : iterable of list of (word, tag) tuples
        One list of ``(word, tag)`` pairs per sentence.

    Returns
    -------
    (X, y) : tuple of lists
        ``X[k]`` is the list of per-word feature dicts for sentence ``k`` and
        ``y[k]`` is the list of its tags, in the same order.
    """
    X, y = [], []
    for tagged in tagged_sentences:
        # Strip the tags once per sentence instead of once per word.
        words = untag(tagged)
        X.append([features(words, index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])

    return X, y

# Build per-word feature dicts (X) and tag sequences (y) for each split.
X_train, y_train = transform_to_dataset(train)
X_test, y_test = transform_to_dataset(test)

### Model Training

In [5]:
# Configure the CRF (L-BFGS optimiser, c1/c2 regularisation coefficients,
# capped at 100 iterations) and fit it on the training split.
model = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
model.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [19]:
# Save the trained model to disk with joblib.
# NOTE(review): the file name here ends with an underscore, whereas the recorded
# output of this cell shows 'Pashto_POS_Tagger.sav' — confirm which name is intended.
joblib.dump(model, 'Pashto_POS_Tagger_.sav')

['Pashto_POS_Tagger.sav']

### Testing

In [6]:
# Predict a tag sequence for every sentence of the test split.
y_pred = model.predict(X_test)

# Scores computed over the flattened (all-sentences) tag sequences.
acc = metrics.flat_accuracy_score(y_test, y_pred)
f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted')

for metric_name, metric_value in (('acc = ', acc), ('f1 = ', f1)):
    print(metric_name, metric_value)

acc =  0.9386836180638326
f1 =  0.9382305670299027


In [7]:
def _tail_then_head(name):
    """Sort key: everything after the first character, then the first character."""
    return name[1:], name[0]


labels = list(model.classes_)
print(len(labels))

sorted_labels = sorted(labels, key=_tail_then_head)
print(sorted_labels)

# Per-tag precision / recall / F1 on the test split, three decimals.
report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print('Report: \n\n{}'.format(report))

37
['BA', 'NB', 'RB', 'VBD', 'VBDC', 'VBDX', 'VBG', 'VBH', 'VBIMP', 'VBINF', 'VBN', 'VBP', 'VBPC', 'VBPX', 'CC', 'NG', 'UH', 'JJ', 'IN', 'NNF', 'NNM', 'NNP', 'NNS', 'RP', 'PRC', 'PRDEM', 'PRDIS', 'PRP$', 'PRPi', 'PRPii', 'PRPiii', 'PRQ', 'DT', 'PT', 'PU', 'FW', 'FX']
Report: 

              precision    recall  f1-score   support

          BA      1.000     1.000     1.000        31
          NB      0.963     0.917     0.939       169
          RB      0.942     0.940     0.941       448
         VBD      0.833     0.840     0.837       125
        VBDC      1.000     1.000     1.000        36
        VBDX      0.974     0.995     0.984       191
         VBG      0.932     0.965     0.948        57
         VBH      0.910     0.947     0.928        75
       VBIMP      0.857     0.857     0.857         7
       VBINF      0.882     0.957     0.918        94
         VBN      0.875     0.583     0.700        12
         VBP      0.974     0.908     0.940       163
        VBPC      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
def print_transitions(transitions):
    """Print one tab-separated line per ``((from_tag, to_tag), weight)`` entry.

    Each line has the form ``from_tag<TAB>-><TAB>to_tag<TAB>weight`` with the
    weight rounded to two decimals.
    """
    for tag_pair, weight in transitions:
        source_tag, target_tag = tag_pair
        print('\t'.join((source_tag, '->', target_tag, str(round(weight, 2)))))
    
# Rank all learned tag-to-tag transition weights once (highest first) and take
# both ends of the ranking, instead of building and sorting the Counter twice.
ranked_transitions = Counter(model.transition_features_).most_common()
likly = ranked_transitions[:10]
unlikly = ranked_transitions[-10:]

print('Top-10 most likely transitions')
print_transitions(likly)
print('\nTop-10 most unlikely transitions')
print_transitions(unlikly)

Top-10 most likely transitions
NB	->	FX	2.15
VBH	->	VBH	1.57
NB	->	NB	1.46
NG	->	FX	1.31
FW	->	UH	1.22
NB	->	NNS	1.2
NG	->	NNP	1.17
NNP	->	NNP	1.03
JJ	->	NNS	1.03
FX	->	VBIMP	1.02

Top-10 most unlikely transitions
IN	->	IN	-1.06
NB	->	VBG	-1.06
VBG	->	NNS	-1.06
VBH	->	IN	-1.07
VBH	->	NNF	-1.12
IN	->	VBD	-1.12
VBINF	->	PT	-1.18
NB	->	VBD	-1.19
NNP	->	VBDX	-1.26
IN	->	CC	-1.67
