In [3]:
import json
data_files = ['train_orchid.pos.json', 'dev_orchid.pos.json', 'test_orchid.pos.json']
train_data, dev_data, test_data = tuple(map(lambda x: json.load(open(x)), data_files))

In [4]:
import pandas as pd
pd.DataFrame({'sentences': [len(train_data), len(dev_data), len(test_data) ],
              'words': [sum(map(len, train_data)), sum(map(len, dev_data)),sum(map(len, test_data)) ]
             }, 
             index=['train', 'dev', 'test'])

Unnamed: 0,sentences,words
train,18500,272620
dev,2312,33371
test,2313,36651


# Experiment 1: Most likely tag baseline

In [5]:
def flatten(l):
    return [x for sublist in l for x in sublist]

from sklearn.feature_extraction import DictVectorizer

class Exp1Featurizer():
    
    def __init__(self):
        self.dv = DictVectorizer(sparse=True)
        
    def featurize(self, data_list, train=False):
        features = [{x:1} for x in data_list]
        if train:
            return self.dv.fit_transform(features)
        else:
            return self.dv.transform(features)
        

In [6]:
train_X, train_y = zip(*flatten(train_data))
exp1_featurizer = Exp1Featurizer()
train_X = exp1_featurizer.featurize(train_X, train=True)

In [5]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()

In [6]:
lr_model.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
dev_X, dev_y = zip(*flatten(dev_data))
dev_X = exp1_featurizer.featurize(dev_X, train=False)

In [127]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
predictions = lr_model.predict(dev_X)

In [9]:
print(classification_report(dev_y, predictions))

              precision    recall  f1-score   support

         ADP       0.93      0.89      0.91      2457
         ADV       0.84      0.58      0.69       730
         AUX       0.92      0.96      0.94      1863
       CCONJ       0.90      0.98      0.94        46
         DET       0.95      0.94      0.95       963
        NOUN       0.88      0.98      0.93     10714
         NUM       0.81      0.42      0.55       658
        PART       1.00      0.62      0.76        13
        PRON       0.92      0.87      0.90       858
       PROPN       0.61      0.37      0.46       443
       PUNCT       1.00      0.99      0.99      6247
       SCONJ       0.89      0.90      0.89      2046
        VERB       0.95      0.90      0.92      6333

    accuracy                           0.92     33371
   macro avg       0.89      0.80      0.83     33371
weighted avg       0.92      0.92      0.92     33371



## Evaluate only unknown words (words not in the training set)

In [59]:
unk_data_files = ['dev_orchid_noknown.json', 'test_orchid_noknown.json']
dev_unk, test_unk= tuple(map(lambda x: json.load(open(x)), unk_data_files))

In [14]:
pd.DataFrame({'sentences': [ len(dev_unk), len(test_unk)],
              'words': [sum(map(len, dev_unk)),sum(map(len, test_unk)) ]
             }, 
             index=['dev_unk', 'test_unk'])

Unnamed: 0,sentences,words
dev_unk,1271,2469
test_unk,1426,2821


In [64]:
dev_X_unk, dev_y_unk = zip(*flatten(dev_unk))
dev_X_unk = exp1_featurizer.featurize(dev_X_unk, train=False)
predictions_unk = lr_model.predict(dev_X_unk)

In [12]:
print(classification_report(dev_y_unk, predictions_unk))

              precision    recall  f1-score   support

         ADV       0.00      0.00      0.00        34
         DET       0.00      0.00      0.00         5
        NOUN       0.76      1.00      0.87      1885
         NUM       0.00      0.00      0.00       113
        PRON       0.00      0.00      0.00         1
       PROPN       0.00      0.00      0.00       215
       PUNCT       0.00      0.00      0.00        39
       SCONJ       0.00      0.00      0.00         9
        VERB       0.00      0.00      0.00       168

    accuracy                           0.76      2469
   macro avg       0.08      0.11      0.10      2469
weighted avg       0.58      0.76      0.66      2469



  'precision', 'predicted', average, warn_for)


# Experiment 2 : Evaluate PythaiNLP POS tagger

In [1]:
from pythainlp import pos_tag

In [44]:
dev_x = [word for word, pos in flatten(dev_data)]
dev_y = [pos for word, pos in flatten(dev_data)]

In [56]:
mapping = pd.read_csv('tag_mapping.csv')
orch_to_ud = {x:y for x, y in zip(mapping.Tag.to_list(), mapping.UDTag.to_list())}
pythai_pred = [orch_to_ud[pos] for word, pos in pos_tag(dev_x)]

In [58]:
print(classification_report(dev_y, pythai_pred))

              precision    recall  f1-score   support

         ADP       0.95      0.98      0.96      2457
         ADV       0.95      0.88      0.91       730
         AUX       0.98      0.99      0.99      1863
       CCONJ       0.96      0.98      0.97        46
         DET       0.98      0.96      0.97       963
        NOUN       0.98      0.98      0.98     10714
         NUM       0.76      0.98      0.86       658
        PART       1.00      1.00      1.00        13
        PRON       0.93      0.96      0.94       858
       PROPN       0.94      0.85      0.89       443
       PUNCT       1.00      1.00      1.00      6247
       SCONJ       0.97      0.91      0.94      2046
        VERB       0.98      0.99      0.99      6333

    accuracy                           0.97     33371
   macro avg       0.95      0.96      0.95     33371
weighted avg       0.98      0.97      0.97     33371



# Experiment 3 : CRF with only current word features

In [121]:
import pycrfsuite
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

In [112]:
train_X = [[word for word, freq in seq] for seq in train_data]
train_y = [[freq for word, freq in seq] for seq in train_data]
dev_X = [[word for word, freq in seq] for seq in dev_data]
dev_y = [[freq for word, freq in seq] for seq in dev_data]

In [113]:
trainer = pycrfsuite.Trainer()
for feature_sequence, label_sequence in zip(train_X, train_y):
    trainer.append(feature_sequence, label_sequence)
trainer.train('vanilla.crf')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1134
Seconds required: 0.234

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 554402.247592
Feature norm: 1.000000
Error norm: 114919.706858
Active features: 1134
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 0.875

***** Iteration #2 *****
Loss: 400169.982128
Feature norm: 3.702283
Error norm: 46357.547737
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.469

***** Iteration #3 *****
Loss: 356153.024325
Feature norm: 4.668525
Error norm: 31431.089187
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds requi

***** Iteration #40 *****
Loss: 154864.789664
Feature norm: 43.339500
Error norm: 3051.733891
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.500

***** Iteration #41 *****
Loss: 154627.370058
Feature norm: 43.510782
Error norm: 1253.846636
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.500

***** Iteration #42 *****
Loss: 154532.672134
Feature norm: 43.560674
Error norm: 1229.965744
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.484

***** Iteration #43 *****
Loss: 154274.861254
Feature norm: 44.066522
Error norm: 1288.382927
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.516

***** Iteration #44 *****
Loss: 154143.150083
Feature norm: 44.455835
Error norm: 1729.719165
Active features: 1134
Line search trials: 2
Line search step: 0.501321

***** Iteration #81 *****
Loss: 151695.703480
Feature norm: 53.048849
Error norm: 820.654128
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.469

***** Iteration #82 *****
Loss: 151664.887977
Feature norm: 53.134621
Error norm: 391.630204
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.438

***** Iteration #83 *****
Loss: 151648.622623
Feature norm: 53.189688
Error norm: 348.403093
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.422

***** Iteration #84 *****
Loss: 151631.679699
Feature norm: 53.319027
Error norm: 510.199582
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.453

***** Iteration #85 *****
Loss: 151616.008412
Feature norm: 53.555462
Error norm: 766.488172
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seco

***** Iteration #121 *****
Loss: 151229.897570
Feature norm: 56.460344
Error norm: 189.383649
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.406

***** Iteration #122 *****
Loss: 151226.784696
Feature norm: 56.483370
Error norm: 165.893400
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.422

***** Iteration #123 *****
Loss: 151222.911695
Feature norm: 56.527171
Error norm: 202.670954
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.438

***** Iteration #124 *****
Loss: 151218.365330
Feature norm: 56.646032
Error norm: 417.392726
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.422

***** Iteration #125 *****
Loss: 151212.841649
Feature norm: 56.716999
Error norm: 288.361971
Active features: 1134
Line search trials: 1
Line search step: 1.000000

***** Iteration #165 *****
Loss: 151118.688909
Feature norm: 57.344661
Error norm: 121.723047
Active features: 1134
Line search trials: 2
Line search step: 0.398780
Seconds required for this iteration: 0.844

***** Iteration #166 *****
Loss: 151117.187007
Feature norm: 57.358667
Error norm: 90.447516
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.438

***** Iteration #167 *****
Loss: 151116.661148
Feature norm: 57.377845
Error norm: 222.523842
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.438

***** Iteration #168 *****
Loss: 151115.684002
Feature norm: 57.372260
Error norm: 83.844539
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.422

***** Iteration #169 *****
Loss: 151115.165808
Feature norm: 57.369057
Error norm: 64.079937
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Se

***** Iteration #205 *****
Loss: 151095.533008
Feature norm: 57.628265
Error norm: 65.558809
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.500

***** Iteration #206 *****
Loss: 151095.178320
Feature norm: 57.627319
Error norm: 33.724594
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.406

***** Iteration #207 *****
Loss: 151094.983182
Feature norm: 57.626969
Error norm: 35.986335
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.406

***** Iteration #208 *****
Loss: 151094.676880
Feature norm: 57.626443
Error norm: 45.428112
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.422

***** Iteration #209 *****
Loss: 151094.585312
Feature norm: 57.625651
Error norm: 115.971329
Active features: 1134
Line search trials: 1
Line search step: 1.000000
Sec

In [114]:
tagger = pycrfsuite.Tagger()
tagger.open('vanilla.crf')
y_pred = [tagger.tag(xseq) for xseq in dev_X]

In [130]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) 
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

print(bio_classification_report(dev_y, y_pred))

              precision    recall  f1-score   support

         ADP       0.69      0.76      0.72      2457
         ADV       0.62      0.39      0.48       730
         AUX       0.80      0.82      0.81      1863
       CCONJ       0.77      0.80      0.79        46
         DET       0.74      0.61      0.67       963
        NOUN       0.79      0.80      0.80     10714
         NUM       0.94      0.94      0.94       658
        PART       0.75      0.23      0.35        13
        PRON       0.80      0.84      0.82       858
       PROPN       0.52      0.39      0.44       443
       PUNCT       1.00      1.00      1.00      6247
       SCONJ       0.76      0.76      0.76      2046
        VERB       0.65      0.67      0.66      6333

   micro avg       0.79      0.79      0.79     33371
   macro avg       0.76      0.69      0.71     33371
weighted avg       0.79      0.79      0.79     33371
 samples avg       0.79      0.79      0.79     33371



# Experiment 4 : Adding more features for CRF? or use Bi-LSTM of sort