In [1]:
!pip install flair

Collecting flair
  Downloading flair-0.13.0-py3-none-any.whl (387 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m387.2/387.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.29.1-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bpemb>=0.3.2 (from flair)
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting conllu>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting deprecated>=1.2.13 (from flair)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.10.0 (from flair)
  Downloading huggingface_hub-0.19.3-py3-n

In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.datasets import ColumnCorpus
from flair.datasets import SentenceDataset
from flair.data import Token
from tqdm import tqdm
import gensim
from gensim.models.word2vec import Word2Vec
import pandas as pd
# Do not truncate long cell contents when displaying DataFrames. Use the full
# option name instead of relying on set_option's partial regex matching, and
# None, which is the documented "no limit" value for this option.
pd.set_option('display.max_colwidth', None)

# Prepare Dataset

In [3]:
# Mount Google Drive at /content/drive; later cells copy the dataset and the
# word2vec model from /content/drive/MyDrive and move the trained tagger back there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Copy the tagged corpus file and the saved word2vec model from Drive into the
# current working directory, where later cells refer to them by bare filename.
!cp /content/drive/MyDrive/mypos-ver.3.0-flair.txt .
!cp /content/drive/MyDrive/word2vec.model .

In [5]:
import flair

# Only a train file is supplied below, so flair samples the dev and test splits
# from it at random. Seeding first is intended to make those splits (and the
# subsequent model initialisation) the same on every Restart & Run All.
flair.set_seed(42)

dataset_file = "mypos-ver.3.0-flair.txt"

# Column layout of the dataset file: column 0 is the token text, column 1 its POS tag
columns = {0: 'text', 1: 'pos'}

# Read the corpus from the current directory; no dev/test files are given
corpus = ColumnCorpus(data_folder='.', column_format=columns, train_file=dataset_file)

2023-11-16 09:19:56,059 Reading data from .
2023-11-16 09:19:56,067 Train: mypos-ver.3.0-flair.txt
2023-11-16 09:19:56,068 Dev: None
2023-11-16 09:19:56,070 Test: None
2023-11-16 09:20:11,239 No test split found. Using 0% (i.e. 4320 samples) of the train split as test data
2023-11-16 09:20:11,267 No dev split found. Using 0% (i.e. 3888 samples) of the train split as dev data


# Custom Word Embeddings (word2vec)

In [6]:
from flair.embeddings import TokenEmbeddings
from gensim.models import Word2Vec
from flair.data import Sentence
from typing import List, Dict, Any
import numpy as np
import torch

class CustomWordEmbeddings(TokenEmbeddings):
    """Static flair token embeddings backed by a gensim Word2Vec model.

    Every token is looked up by its text in the loaded model's vocabulary.
    Tokens that are not in the vocabulary receive an all-zero vector of the
    same length, so every token always ends up with an embedding.
    """

    embeddings_name = "custom_word_embeddings"

    def __init__(self, model_path):
        """Load the Word2Vec model stored at ``model_path``.

        Args:
            model_path: Path to a model file readable by ``Word2Vec.load``.
                The path is kept so that ``to_params`` can return it.
        """
        super().__init__()
        self.name = "custom_word_embeddings"
        self.static_embeddings = True
        self.model_path = model_path
        self.model = Word2Vec.load(model_path)

    @property
    def embedding_length(self):
        """Length of each token vector, i.e. the Word2Vec ``vector_size``."""
        return self.model.vector_size

    def embed(self, sentences: List[Sentence]) -> List[Sentence]:
        """Attach a word2vec vector to every token of the given sentences.

        Args:
            sentences: A list of sentences, or a single ``Sentence``.

        Returns:
            The sentences as a list, with an embedding stored under
            ``self.name`` on each token.
        """
        # A bare Sentence would otherwise be iterated element by element and
        # `.tokens` would be read on each of its items instead of on the sentence.
        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # Loop-invariant lookups
        keyed_vectors = self.model.wv
        vector_size = self.model.vector_size

        for sentence in sentences:
            for token in sentence.tokens:
                if token.text in keyed_vectors:
                    # Copy the stored vector into a fresh array before building the tensor
                    embedding = np.array(keyed_vectors[token.text])
                    token.set_embedding(self.name, torch.tensor(embedding))
                else:
                    # Out-of-vocabulary word: fall back to a zero vector
                    token.set_embedding(self.name, torch.zeros(vector_size))
        return sentences

    @classmethod
    def from_params(cls, params: Dict[str, Any]) -> "CustomWordEmbeddings":
        """Rebuild the embeddings from the dict produced by ``to_params``.

        Args:
            params: Mapping with a ``"model_path"`` entry.

        Returns:
            A new instance that loads the Word2Vec model from that path.
        """
        model_path = params.get("model_path")
        return cls(model_path)

    def to_params(self) -> Dict[str, Any]:
        """Return the constructor arguments needed to recreate this object.

        Only the model path is returned, not the vectors themselves, so the
        Word2Vec file must still exist at that path when ``from_params`` runs.
        """
        return {"model_path": self.model_path}


In [7]:
from flair.embeddings.base import EMBEDDING_CLASSES
model_path = "word2vec.model"

# Build the embeddings instance that the tagger will use
custom_word_embeddings = CustomWordEmbeddings(model_path)

# Register the embeddings *class* (not the instance created above) under its
# embeddings_name. NOTE(review): flair's registry is expected to map names to
# classes, which it uses to rebuild embeddings when a saved model is loaded.
EMBEDDING_CLASSES.update({
    CustomWordEmbeddings.embeddings_name: CustomWordEmbeddings
})

# Sequence Tagging

In [8]:
# Name of the annotation layer to learn (matches the 'pos' column of the corpus)
label_type = 'pos'

# Collect every POS tag that occurs in the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)

# Sequence tagger on top of the custom word2vec embeddings
model = SequenceTagger(
    embeddings=custom_word_embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    hidden_size=256,
)

# Train; checkpoints and logs are written to the 'pos_tagger_w2v' directory
trainer = ModelTrainer(model, corpus)
trainer.train(
    'pos_tagger_w2v',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10,
)

2023-11-16 09:20:11,879 Computing label dictionary. Progress:


0it [00:00, ?it/s]
34989it [00:01, 28574.53it/s]

2023-11-16 09:20:13,165 Dictionary created for label 'pos' with 16 values: part (seen 108378 times), n (seen 85028 times), ppm (seen 70057 times), v (seen 62463 times), punc (seen 43823 times), pron (seen 16539 times), conj (seen 14497 times), adj (seen 12821 times), adv (seen 8663 times), num (seen 4867 times), tn (seen 4734 times), fw (seen 2590 times), int (seen 545 times), abb (seen 283 times), sb (seen 224 times), O (seen 1 times)
2023-11-16 09:20:13,169 SequenceTagger predicts: Dictionary with 16 tags: part, n, ppm, v, punc, pron, conj, adj, adv, num, tn, fw, int, abb, sb, O





2023-11-16 09:20:26,519 ----------------------------------------------------------------------------------------------------
2023-11-16 09:20:26,521 Model: "SequenceTagger(
  (embeddings): CustomWordEmbeddings()
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=18, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)"
2023-11-16 09:20:26,525 ----------------------------------------------------------------------------------------------------
2023-11-16 09:20:26,527 Corpus: 34989 train + 3888 dev + 4320 test sentences
2023-11-16 09:20:26,532 ----------------------------------------------------------------------------------------------------
2023-11-16 09:20:26,532 Train:  34989 sentences
2023-11-16 09:20:26,536         (train_with_dev=False, train_with_test=False)
2023-11-

100%|██████████| 61/61 [00:10<00:00,  6.00it/s]


2023-11-16 09:22:02,973 DEV : loss 0.26060208678245544 - f1-score (micro avg)  0.9192
2023-11-16 09:22:03,238  - 0 epochs without improvement
2023-11-16 09:22:03,243 saving best model
2023-11-16 09:22:03,257 ----------------------------------------------------------------------------------------------------
2023-11-16 09:22:11,944 epoch 2 - iter 109/1094 - loss 0.42815902 - time (sec): 8.68 - samples/sec: 4972.78 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:22:19,491 epoch 2 - iter 218/1094 - loss 0.42612937 - time (sec): 16.23 - samples/sec: 5343.09 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:22:28,558 epoch 2 - iter 327/1094 - loss 0.42069821 - time (sec): 25.30 - samples/sec: 5119.10 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:22:35,848 epoch 2 - iter 436/1094 - loss 0.41786099 - time (sec): 32.59 - samples/sec: 5315.95 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:22:45,289 epoch 2 - iter 545/1094 - loss 0.41520458 - time (sec): 42.03 - samples/sec: 5156.42 - l

100%|██████████| 61/61 [00:16<00:00,  3.65it/s]


2023-11-16 09:23:44,213 DEV : loss 0.22997337579727173 - f1-score (micro avg)  0.929
2023-11-16 09:23:44,370  - 0 epochs without improvement
2023-11-16 09:23:44,372 saving best model
2023-11-16 09:23:44,384 ----------------------------------------------------------------------------------------------------
2023-11-16 09:23:53,729 epoch 3 - iter 109/1094 - loss 0.38229644 - time (sec): 9.34 - samples/sec: 4555.44 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:24:01,330 epoch 3 - iter 218/1094 - loss 0.38675175 - time (sec): 16.94 - samples/sec: 5057.32 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:24:10,231 epoch 3 - iter 327/1094 - loss 0.38206419 - time (sec): 25.84 - samples/sec: 5012.62 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:24:18,666 epoch 3 - iter 436/1094 - loss 0.38285586 - time (sec): 34.28 - samples/sec: 5053.55 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:24:27,394 epoch 3 - iter 545/1094 - loss 0.38165484 - time (sec): 43.01 - samples/sec: 5052.47 - lr

100%|██████████| 61/61 [00:14<00:00,  4.14it/s]


2023-11-16 09:25:25,132 DEV : loss 0.21252262592315674 - f1-score (micro avg)  0.9349
2023-11-16 09:25:25,283  - 0 epochs without improvement
2023-11-16 09:25:25,285 saving best model
2023-11-16 09:25:25,295 ----------------------------------------------------------------------------------------------------
2023-11-16 09:25:32,480 epoch 4 - iter 109/1094 - loss 0.35279117 - time (sec): 7.18 - samples/sec: 5992.53 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:25:41,801 epoch 4 - iter 218/1094 - loss 0.35311795 - time (sec): 16.50 - samples/sec: 5221.74 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:25:49,325 epoch 4 - iter 327/1094 - loss 0.35470020 - time (sec): 24.03 - samples/sec: 5390.21 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:25:57,779 epoch 4 - iter 436/1094 - loss 0.35446758 - time (sec): 32.48 - samples/sec: 5314.28 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:26:05,531 epoch 4 - iter 545/1094 - loss 0.35549979 - time (sec): 40.23 - samples/sec: 5357.12 - l

100%|██████████| 61/61 [00:15<00:00,  4.01it/s]


2023-11-16 09:27:07,521 DEV : loss 0.2078721523284912 - f1-score (micro avg)  0.9365
2023-11-16 09:27:07,778  - 0 epochs without improvement
2023-11-16 09:27:07,783 saving best model
2023-11-16 09:27:07,799 ----------------------------------------------------------------------------------------------------
2023-11-16 09:27:17,536 epoch 5 - iter 109/1094 - loss 0.35136357 - time (sec): 9.73 - samples/sec: 4540.26 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:27:24,505 epoch 5 - iter 218/1094 - loss 0.35462983 - time (sec): 16.70 - samples/sec: 5219.61 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:27:34,032 epoch 5 - iter 327/1094 - loss 0.35525298 - time (sec): 26.23 - samples/sec: 4964.96 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:27:41,052 epoch 5 - iter 436/1094 - loss 0.35326932 - time (sec): 33.25 - samples/sec: 5204.35 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:27:50,541 epoch 5 - iter 545/1094 - loss 0.35302567 - time (sec): 42.74 - samples/sec: 5073.88 - lr

100%|██████████| 61/61 [00:14<00:00,  4.15it/s]


2023-11-16 09:28:48,156 DEV : loss 0.19975405931472778 - f1-score (micro avg)  0.9399
2023-11-16 09:28:48,313  - 0 epochs without improvement
2023-11-16 09:28:48,314 saving best model
2023-11-16 09:28:48,327 ----------------------------------------------------------------------------------------------------
2023-11-16 09:28:55,902 epoch 6 - iter 109/1094 - loss 0.34820990 - time (sec): 7.57 - samples/sec: 5760.70 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:29:05,246 epoch 6 - iter 218/1094 - loss 0.34250507 - time (sec): 16.92 - samples/sec: 5126.20 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:29:12,694 epoch 6 - iter 327/1094 - loss 0.34224762 - time (sec): 24.36 - samples/sec: 5359.07 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:29:22,063 epoch 6 - iter 436/1094 - loss 0.34071287 - time (sec): 33.73 - samples/sec: 5140.17 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:29:29,044 epoch 6 - iter 545/1094 - loss 0.34059740 - time (sec): 40.71 - samples/sec: 5313.59 - l

100%|██████████| 61/61 [00:15<00:00,  3.82it/s]


2023-11-16 09:30:31,549 DEV : loss 0.19904547929763794 - f1-score (micro avg)  0.9408
2023-11-16 09:30:31,812  - 0 epochs without improvement
2023-11-16 09:30:31,814 saving best model
2023-11-16 09:30:31,828 ----------------------------------------------------------------------------------------------------
2023-11-16 09:30:39,112 epoch 7 - iter 109/1094 - loss 0.33127178 - time (sec): 7.28 - samples/sec: 5871.15 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:30:47,748 epoch 7 - iter 218/1094 - loss 0.33657902 - time (sec): 15.92 - samples/sec: 5459.21 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:30:55,722 epoch 7 - iter 327/1094 - loss 0.33615405 - time (sec): 23.89 - samples/sec: 5418.61 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:31:03,950 epoch 7 - iter 436/1094 - loss 0.33694499 - time (sec): 32.12 - samples/sec: 5395.48 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:31:12,899 epoch 7 - iter 545/1094 - loss 0.33482176 - time (sec): 41.07 - samples/sec: 5257.96 - l

100%|██████████| 61/61 [00:13<00:00,  4.45it/s]


2023-11-16 09:32:09,585 DEV : loss 0.19517748057842255 - f1-score (micro avg)  0.941
2023-11-16 09:32:09,743  - 0 epochs without improvement
2023-11-16 09:32:09,744 saving best model
2023-11-16 09:32:09,756 ----------------------------------------------------------------------------------------------------
2023-11-16 09:32:18,939 epoch 8 - iter 109/1094 - loss 0.32555982 - time (sec): 9.18 - samples/sec: 4626.27 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:32:26,673 epoch 8 - iter 218/1094 - loss 0.33085183 - time (sec): 16.91 - samples/sec: 5091.98 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:32:35,224 epoch 8 - iter 327/1094 - loss 0.32803467 - time (sec): 25.47 - samples/sec: 5085.33 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:32:45,147 epoch 8 - iter 436/1094 - loss 0.32905347 - time (sec): 35.39 - samples/sec: 4885.89 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:32:55,257 epoch 8 - iter 545/1094 - loss 0.32854627 - time (sec): 45.50 - samples/sec: 4759.38 - lr

100%|██████████| 61/61 [00:17<00:00,  3.58it/s]


2023-11-16 09:33:55,739 DEV : loss 0.19180220365524292 - f1-score (micro avg)  0.9416
2023-11-16 09:33:55,896  - 0 epochs without improvement
2023-11-16 09:33:55,898 saving best model
2023-11-16 09:33:55,908 ----------------------------------------------------------------------------------------------------
2023-11-16 09:34:04,398 epoch 9 - iter 109/1094 - loss 0.31988500 - time (sec): 8.49 - samples/sec: 5087.53 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:34:13,052 epoch 9 - iter 218/1094 - loss 0.32251950 - time (sec): 17.14 - samples/sec: 5028.37 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:34:21,403 epoch 9 - iter 327/1094 - loss 0.32382144 - time (sec): 25.49 - samples/sec: 5091.02 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:34:29,748 epoch 9 - iter 436/1094 - loss 0.32530121 - time (sec): 33.84 - samples/sec: 5109.07 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:34:37,171 epoch 9 - iter 545/1094 - loss 0.32506888 - time (sec): 41.26 - samples/sec: 5232.29 - l

100%|██████████| 61/61 [00:12<00:00,  4.75it/s]


2023-11-16 09:35:34,515 DEV : loss 0.19097286462783813 - f1-score (micro avg)  0.9417
2023-11-16 09:35:34,781  - 0 epochs without improvement
2023-11-16 09:35:34,786 saving best model
2023-11-16 09:35:34,796 ----------------------------------------------------------------------------------------------------
2023-11-16 09:35:42,645 epoch 10 - iter 109/1094 - loss 0.32932808 - time (sec): 7.85 - samples/sec: 5478.52 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:35:51,809 epoch 10 - iter 218/1094 - loss 0.33152024 - time (sec): 17.01 - samples/sec: 5098.71 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:36:00,544 epoch 10 - iter 327/1094 - loss 0.32824419 - time (sec): 25.74 - samples/sec: 5088.37 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:36:09,149 epoch 10 - iter 436/1094 - loss 0.32806077 - time (sec): 34.35 - samples/sec: 5074.84 - lr: 0.100000 - momentum: 0.000000
2023-11-16 09:36:17,614 epoch 10 - iter 545/1094 - loss 0.32719729 - time (sec): 42.81 - samples/sec: 5096.7

100%|██████████| 61/61 [00:16<00:00,  3.68it/s]


2023-11-16 09:37:14,888 DEV : loss 0.19604137539863586 - f1-score (micro avg)  0.9418
2023-11-16 09:37:15,046  - 0 epochs without improvement
2023-11-16 09:37:15,048 saving best model
2023-11-16 09:37:15,064 ----------------------------------------------------------------------------------------------------
2023-11-16 09:37:15,066 Loading model from best epoch ...
2023-11-16 09:37:15,421 SequenceTagger predicts: Dictionary with 18 tags: part, n, ppm, v, punc, pron, conj, adj, adv, num, tn, fw, int, abb, sb, O, <START>, <STOP>


100%|██████████| 68/68 [00:12<00:00,  5.66it/s]


2023-11-16 09:37:27,968 
Results:
- F-score (micro) 0.9381
- F-score (macro) 0.8474
- Accuracy 0.9381

By class:
              precision    recall  f1-score   support

        part     0.9383    0.9477    0.9430     13207
           n     0.9157    0.9196    0.9176     10494
         ppm     0.9630    0.9875    0.9751      8638
           v     0.9364    0.9398    0.9381      7588
        punc     0.9958    0.9959    0.9958      5421
        pron     0.9600    0.9711    0.9655      2004
        conj     0.8836    0.9196    0.9012      1741
         adj     0.8172    0.7541    0.7843      1541
         adv     0.8883    0.7629    0.8209      1063
         num     0.9444    0.9413    0.9429       596
          tn     0.9463    0.9511    0.9487       593
          fw     0.8389    0.3463    0.4902       361
         int     0.8000    0.6471    0.7154        68
         abb     1.0000    0.5000    0.6667        38
          sb     0.8571    0.6000    0.7059        20

    accuracy         

{'test_score': 0.9381335131995578}

# Predict

In [9]:
# Reload the tagger from the final checkpoint written by the training cell.
# This rebinds `model`, replacing the in-memory tagger trained above.
model = SequenceTagger.load('pos_tagger_w2v/final-model.pt')

# Example sentence whose words are already separated by spaces
sentence = Sentence('ကျွန်တော့် အတွက် သိပ် အဆင်မပြေ လှ ပါ ဘူး ။')

# Tag the sentence in place; predictions are stored on the sentence object
model.predict(sentence)

# Print the sentence with its predicted tags
print(sentence.to_tagged_string())

2023-11-16 09:37:28,321 SequenceTagger predicts: Dictionary with 18 tags: part, n, ppm, v, punc, pron, conj, adj, adv, num, tn, fw, int, abb, sb, O, <START>, <STOP>
Sentence[8]: "ကျွန်တော့် အတွက် သိပ် အဆင်မပြေ လှ ပါ ဘူး ။" → ["ကျွန်တော့်"/pron, "အတွက်"/ppm, "သိပ်"/adv, "အဆင်မပြေ"/v, "လှ"/part, "ပါ"/part, "ဘူး"/part, "။"/punc]


In [10]:
# Move the training output directory to Drive. After this, the relative path
# 'pos_tagger_w2v/...' used by the prediction cell no longer exists locally.
!mv pos_tagger_w2v /content/drive/MyDrive/