In [1]:
import nltk
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers,metrics
import scipy
from sklearn.metrics import make_scorer,accuracy_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from collections import Counter

In [2]:
train_corpus_file = open('hi-ud-train.conllu','r')
test_file = open('hi-ud-test.conllu','r')

def preprocess(file,file_type):
    corpus = []
    sent = []
    for line in file:
        if file_type == 'tr':
            if 'POS_TAG' in line:
                continue
            elif line == ',,\n':
                corpus.append(sent)
                sent = []            
            else:
                tup = tuple(line[:-1].split(','))
                sent.append(tup)
        else:
            if 'TAG' in line:
                continue
            elif line.split('	')[2] =='\n':
                corpus.append(sent)
                sent = []            
            else:
                tup = tuple(line.split())
                sent.append(tup)
    return corpus

train = preprocess(train_corpus_file,'tr')
test = preprocess(test_file,'te')

In [3]:
def word2features(sent, i):
    word = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()        
    }
    if i > 0:
        word1 = sent[i-1][1]
        postag1 = sent[i-1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][1]
        postag1 = sent[i+1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
                
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [postag for num, token, postag in sent]

def sent2tokens(sent):
    return [postag for num, token, postag in sent]

In the above done feature extraction, I used word shape, word suffix, word identity and also some information about the nearby words as features.

In [4]:
sent2features(train[0])[0]

{'bias': 1.0,
 'word.lower()': 'yaha',
 'word[-3:]': 'aha',
 'word[-2:]': 'ha',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'BOS': True,
 '+1:word.lower()': 'esiya',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False}

In [5]:
X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_test = [sent2features(s) for s in test]
y_test = [sent2labels(s) for s in test]

In [6]:
#The following values of c1,c2 are obtained from hyperparameter optimization for 150 iterations
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.005,
    c2=0.0267,
    max_iterations=150,
    all_possible_transitions=True
    
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.005, c2=0.0267, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=150,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [7]:
labels = list(crf.classes_)
y_pred = crf.predict(X_test)
y_train_pred = crf.predict(X_train)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
print('Train Accuracy - ')
print(metrics.flat_accuracy_score(y_train, y_train_pred))
print('Test Accuracy - ')
print(metrics.flat_accuracy_score(y_test, y_pred))

0.8375695028148106
Train Accuracy - 
0.9993419320873914
Test Accuracy - 
0.8407202216066482


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [8]:
#Top 10 most common and least common transition features along with transition weights
def print_transitions(features):
    for (from_label, to_label), wgt in features:
        print("%-6s -> %-6s %0.6f" % (from_label, to_label, wgt))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print('\n')
print("Top unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Top likely transitions:
VERB   -> AUX    4.465070
PROPN  -> PROPN  2.738170
AUX    -> AUX    2.661140
PRON   -> ADP    2.628760
ADJ    -> NOUN   2.579719
PROPN  -> ADP    2.523272
AUX    -> SCONJ  2.225268
NOUN   -> ADP    2.135462
NUM    -> NOUN   2.082605
DET    -> NOUN   1.979730


Top unlikely transitions:
AUX    -> ADJ    -1.552327
PROPN  -> PART   -1.603829
DET    -> CCONJ  -1.641650
PROPN  -> DET    -1.656438
NUM    -> PRON   -1.766365
PROPN  -> AUX    -1.950686
CCONJ  -> AUX    -1.973740
ADJ    -> ADP    -2.226438
ADJ    -> PRON   -2.299370
DET    -> ADP    -2.824333


In [9]:
#precison, recall and f1-score per unique POS tag
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

           X      0.000     0.000     0.000         0
        PART      0.968     0.909     0.937        33
       CCONJ      1.000     1.000     1.000        25
       SCONJ      0.750     1.000     0.857         3
         ADJ      0.667     0.766     0.713        94
         ADP      0.946     0.980     0.963       303
         ADV      0.600     0.429     0.500        21
        VERB      0.859     0.798     0.827        99
         DET      0.795     0.861     0.827        36
       COMMA      0.000     0.000     0.000         0
        NOUN      0.767     0.864     0.813       324
        PRON      0.794     0.831     0.812        65
       PROPN      0.648     0.486     0.556       144
         NUM      0.920     0.920     0.920        25
       PUNCT      1.000     0.828     0.906       134
         AUX      0.935     0.942     0.939       138

   micro avg      0.841     0.841     0.841      1444
   macro avg      0.728   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
