In [9]:
import json
data_files = ['train_orchid.pos.json', 'dev_orchid.pos.json', 'test_orchid.pos.json']


def load_json(path):
    """Parse the JSON document stored at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes, and it is decoded as UTF-8 explicitly so the result
    does not depend on the platform's default locale encoding.
    """
    # NOTE(review): assumes the corpus files are UTF-8 (the JSON interchange
    # encoding) -- confirm if they were exported with a legacy Thai code page.
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# One decoded corpus per entry of data_files, in train / dev / test order.
train_data, dev_data, test_data = tuple(load_json(path) for path in data_files)

In [10]:
import pandas as pd
# Size of each split: one row per split, holding the number of sentences and
# the total number of items summed over those sentences.
pd.DataFrame(
    [
        (len(corpus), sum(len(sentence) for sentence in corpus))
        for corpus in (train_data, dev_data, test_data)
    ],
    columns=['sentences', 'words'],
    index=['train', 'dev', 'test'],
)

Unnamed: 0,sentences,words
train,18500,272620
dev,2312,33371
test,2313,36651


# Experiment 1: Most likely tag baseline

In [45]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list.

    Only a single level of nesting is removed; items that are themselves
    containers are kept as they are.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

from sklearn.feature_extraction import DictVectorizer

class Exp1Featurizer():
    """Featurizer for experiment 1: each item becomes a single indicator feature.

    Wraps a sparse ``DictVectorizer`` whose vocabulary is learned when
    ``featurize`` is called with ``train=True``.
    """

    def __init__(self):
        self.dv = DictVectorizer(sparse=True)

    def featurize(self, data_list, train=False):
        """Encode every item of ``data_list`` as the one-entry dict ``{item: 1}``.

        Args:
            data_list: iterable of hashable items (used directly as dict keys).
            train: when true, fit the vectorizer on these items before
                transforming them; otherwise reuse the already fitted one.

        Returns:
            Whatever the vectorizer's ``fit_transform`` / ``transform`` returns.
        """
        token_dicts = [{token: 1} for token in data_list]
        vectorize = self.dv.fit_transform if train else self.dv.transform
        return vectorize(token_dicts)
        

In [46]:
# Pool every sentence of the training split and separate the two components
# of each item (presumably word and tag -- confirm against the corpus files).
train_words, train_y = zip(*flatten(train_data))

# The vectorizer's vocabulary is fitted here, on the training items only.
exp1_featurizer = Exp1Featurizer()
train_X = exp1_featurizer.featurize(train_words, train=True)

In [47]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with every hyperparameter left at its default.
# NOTE(review): the recorded repr below shows solver='warn' and
# multi_class='warn', i.e. defaults that depend on the installed version --
# consider pinning them (and random_state) so a re-run is reproducible.
lr_model = LogisticRegression()

In [48]:
# Fit on the encoded training items; as the cell's last expression, the
# estimator's repr is what gets displayed underneath.
lr_model.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
# Same preparation as for the training split, except that the vectorizer
# fitted on the training items is reused (train=False) rather than refitted.
dev_words, dev_y = zip(*flatten(dev_data))
dev_X = exp1_featurizer.featurize(dev_words, train=False)

# Predicted label for every dev item, in the same order as dev_y.
predictions = lr_model.predict(dev_X)

In [54]:
from sklearn.metrics import classification_report, confusion_matrix

In [53]:
# Per-label precision / recall / F1 on the dev split; printed so the
# multi-line report is rendered as a table rather than as an escaped string.
print(classification_report(dev_y, predictions))

              precision    recall  f1-score   support

         ADP       0.93      0.89      0.91      2457
         ADV       0.84      0.58      0.69       730
         AUX       0.92      0.96      0.94      1863
       CCONJ       0.90      0.98      0.94        46
         DET       0.95      0.94      0.95       963
        NOUN       0.88      0.98      0.93     10714
         NUM       0.81      0.42      0.55       658
        PART       1.00      0.62      0.76        13
        PRON       0.92      0.87      0.90       858
       PROPN       0.61      0.37      0.46       443
       PUNCT       1.00      0.99      0.99      6247
       SCONJ       0.89      0.90      0.89      2046
        VERB       0.95      0.90      0.92      6333

    accuracy                           0.92     33371
   macro avg       0.89      0.80      0.83     33371
weighted avg       0.92      0.92      0.92     33371



TODO: Evaluate only on unknown words (words that do not appear in the training set).

# Experiment 2: Evaluate the PyThaiNLP POS tagger

In [57]:
import pythainlp.tag

# Experiment 3: CRF with only current-word features

# Experiment 4: Add more features to the CRF, or use a Bi-LSTM of some sort?