<a href="https://colab.research.google.com/github/marzinouri/AzeriPipeline/blob/main/Notebooks/POSTagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

In [None]:
# Mount Google Drive; every dataset and model path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install datasets
!pip install tokenizers
!pip install transformers
!pip install sentencepiece
!pip install seqeval

In [None]:
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import math
from seqeval.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
import warnings
warnings.filterwarnings("ignore")

# Kappa Score Calculation

In [None]:
# Collect annotator 1's tags for parts 3 and 4 as one flat list, in file order,
# so they can be compared position-by-position with annotator 2's tags.
annotations1 = []

for csv_path in (
    "/content/drive/MyDrive/Azari/Datasets/POS/POS_part3_person1.csv",
    "/content/drive/MyDrive/Azari/Datasets/POS/POS_part4_person1.csv",
):
    # The tokens are Arabic-script text, so read the file as UTF-8 explicitly
    # instead of relying on the platform default (assumes the CSVs are UTF-8).
    with open(csv_path, "r", encoding="utf-8") as f:
        azb_postags = f.readlines()

    # Tags of the sentence currently being read; reset on every "<start>" row.
    tags = []
    for pos_tag in azb_postags:
        fields = pos_tag.strip().split(",")
        if len(fields) != 3:
            # Report the malformed row (blank line, extra comma, ...) and skip
            # it, so a bad row is never counted with another row's token/tag.
            print(pos_tag)
            continue
        _, token, tag = fields
        # NEGP is folded into VERB before the annotators are compared.
        if tag == "NEGP":
            tag = "VERB"
        if token == "<start>":
            tags = []
        elif token == "<end>":
            annotations1 += tags
        else:
            tags.append(tag)

In [None]:
# Collect annotator 2's tags for parts 3 and 4 as one flat list, in file order,
# so they can be compared position-by-position with annotator 1's tags.
annotations2 = []

for csv_path in (
    "/content/drive/MyDrive/Azari/Datasets/POS/POS_part3_person2.csv",
    "/content/drive/MyDrive/Azari/Datasets/POS/POS_part4_person2.csv",
):
    # The tokens are Arabic-script text, so read the file as UTF-8 explicitly
    # instead of relying on the platform default (assumes the CSVs are UTF-8).
    with open(csv_path, "r", encoding="utf-8") as f:
        azb_postags = f.readlines()

    # Tags of the sentence currently being read; reset on every "<start>" row.
    tags = []
    for pos_tag in azb_postags:
        fields = pos_tag.strip().split(",")
        if len(fields) != 3:
            # Report the malformed row (blank line, extra comma, ...) and skip
            # it, so a bad row is never counted with another row's token/tag.
            print(pos_tag)
            continue
        _, token, tag = fields
        # NEGP is folded into VERB before the annotators are compared.
        if tag == "NEGP":
            tag = "VERB"
        if token == "<start>":
            tags = []
        elif token == "<end>":
            annotations2 += tags
        else:
            tags.append(tag)

In [None]:
# Inter-annotator agreement (Cohen's kappa) between the two flat tag lists.
kappa = cohen_kappa_score(y1=annotations1, y2=annotations2)
kappa

0.9264001853186451

# Preparing Data

In [None]:
# Raw CSV rows of the training corpus: part 3 from annotator 1 followed by
# part 4 from annotator 2.
# NOTE(review): the two parts come from different annotators - confirm this mix is intentional.
azb_postags = []

for csv_path in (
    "/content/drive/MyDrive/Azari/Datasets/POS/POS_part3_person1.csv",
    "/content/drive/MyDrive/Azari/Datasets/POS/POS_part4_person2.csv",
):
    # Explicit UTF-8 keeps the Arabic-script tokens intact regardless of the
    # platform default encoding (assumes the CSVs are UTF-8).
    with open(csv_path, "r", encoding="utf-8") as f:
        azb_postags += f.readlines()

In [None]:
# POS tag inventory. A tag's position in this tuple is its integer class id,
# so the order must not change.
TAG_NAMES = (
    'NOUN', 'PUNC', 'VERB', 'PRON', 'ADV', 'CONJ',
    'DET', 'NUM', 'ADJ', 'QUES', 'POSTP', 'INTERJ',
)

# tag name -> class id
tagss = {name: idx for idx, name in enumerate(TAG_NAMES)}
# class id -> tag name
itagss = dict(enumerate(TAG_NAMES))

In [None]:
# Group the flat CSV rows into sentences: one list of (token, tag) pairs per
# "<start>" ... "<end>" span.
tagged_sents = []
# Buffers for the sentence currently being read; reset on every "<start>" row.
tokens, tags = [], []

for pos_tag in azb_postags:
    fields = pos_tag.strip().split(",")
    if len(fields) != 3:
        # Report the malformed row (blank line, extra comma, ...) and skip it,
        # so a bad row is never counted with another row's token/tag.
        print(pos_tag)
        continue
    _, token, tag = fields
    # NEGP has no entry in the tag inventory; it is trained as VERB.
    if tag == "NEGP":
        tag = "VERB"
    if token == "<start>":
        tokens, tags = [], []
    elif token == "<end>":
        tagged_sents.append(list(zip(tokens, tags)))
    else:
        tokens.append(token)
        tags.append(tag)

In [None]:
# One row per non-empty sentence: the token strings and the matching integer
# class ids. An unknown tag raises KeyError here rather than being dropped.
df = []

for tagged_sent in tagged_sents:
    if not tagged_sent:
        # Empty "<start>"/"<end>" spans carry no training signal.
        continue
    row_tokens = [token for token, _ in tagged_sent]
    row_tags = [tagss[pos_tag] for _, pos_tag in tagged_sent]
    df.append([row_tokens, row_tags])

# Naming the columns in the constructor also works when no sentences were
# parsed, where assigning `.columns` afterwards would raise.
df_train = pd.DataFrame(df, columns=["tokens", "pos_tags"])

In [None]:
# Peek at the first five sentences as a sanity check.
df_train.head(n=5)

Unnamed: 0,tokens,pos_tags
0,"[بیرینین, دامی, آلچاق, گؤرونمه, دوروموندا, اول...","[6, 0, 4, 0, 0, 2, 1, 0, 0, 5, 2, 0, 8, 2, 1]"
1,"[گونده‌لیک, گونده‌کی, خبرلری, چاتدیرماق, اوچون...","[4, 8, 0, 0, 10, 0, 10, 1, 2, 0, 2, 1]"
2,"[بیلدی, یینه, گؤره, ده, واقتی, اولانا, دک, بیر...","[0, 0, 10, 5, 0, 8, 4, 6, 8, 0, 10, 2, 1]"
3,"[بیرسیرا, میللت‌لر, ده‌ییشیک‌لیک‌لرین, قارشی‌س...","[6, 0, 0, 0, 2, 1, 6, 0, 0, 2, 1]"
4,"[بیزیم, فیکیرلریمیز, باشقا, دیللرده, ده, یازیل...","[3, 0, 8, 0, 5, 2, 1, 4, 3, 0, 10, 1, 3, 0, 0,..."


In [None]:
# Schema: a list of token strings and a parallel list of class labels whose
# names follow the order of `tagss`.
pos_features = Features({
    "tokens": Sequence(feature=Value(dtype="string")),
    "pos_tags": Sequence(feature=ClassLabel(num_classes=len(tagss), names=list(tagss))),
})

# 80% of the sentences go to training; the remaining 20% is split in half
# into validation and test. Both splits use a fixed seed.
ds_train = Dataset.from_pandas(df_train, features=pos_features).train_test_split(
    train_size=0.8, seed=42
)
ds_test = ds_train["test"].train_test_split(train_size=0.5, seed=42)

dataset = DatasetDict({
    "train": ds_train["train"],
    "validation": ds_test["train"],
    "test": ds_test["test"],
})
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags'],
        num_rows: 20
    })
    test: Dataset({
        features: ['tokens', 'pos_tags'],
        num_rows: 20
    })
})

In [None]:
# When True every sub-token of a word carries the word's tag; when False only
# the first sub-token does and the rest get -100.
label_all_tokens = True


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per sentence)
            and "pos_tags" (one list of class ids per sentence).

    Returns:
        The output of the global `tokenizer` (padded/truncated to 64 positions)
        with an added "labels" entry holding one label list per sentence.
        Positions whose word id is None get -100, the value this notebook
        treats as the ignored index.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=64,
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(examples["pos_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # No source word behind this position.
                row_labels.append(-100)
            elif word == last_word and not label_all_tokens:
                # Continuation piece of a word that has already been labelled.
                row_labels.append(-100)
            else:
                row_labels.append(word_tags[word])
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Finetuning

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

# Pre-trained language-model directory (also holds the tokenizer files) and
# the output directory for the fine-tuned POS tagger.
model_path = "/content/drive/MyDrive/Azari/Models/AzerBert_v2"
pos_model_path = "/content/drive/MyDrive/Azari/Models/POS_TAGGER_v1"

# Register the tag names on the model config so that saved checkpoints and the
# inference pipeline report "NOUN", "VERB", ... instead of "LABEL_0", "LABEL_1", ...
model = AutoModelForTokenClassification.from_pretrained(
    os.path.join(model_path, "checkpoint-11630"),
    num_labels=len(tagss),
    id2label=itagss,
    label2id=tagss,
)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Tokenize every split and attach the aligned "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

loading configuration file /content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-11630/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-11630",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
 

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
args = TrainingArguments(
    # Checkpoint location.
    # NOTE(review): trainer.train() is later called without a checkpoint
    # argument - confirm `resume_from_checkpoint` has the intended effect here.
    output_dir=pos_model_path,
    overwrite_output_dir=True,
    resume_from_checkpoint=pos_model_path,
    # Evaluate, log and save once per epoch; keep a single checkpoint on disk
    # and reload the best one when training finishes.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# seqeval scorer used by compute_metrics and by the test-set evaluation.
# NOTE(review): `load_metric` comes from the `datasets` star import - confirm
# it is still provided by the installed version.
metric = load_metric(path="seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Compute token-level precision/recall/F1/accuracy for the Trainer.

    Args:
        p: pair `(predictions, labels)`; `predictions` holds per-position class
            scores (argmax is taken over axis 2) and `labels` holds class ids,
            with -100 marking positions to ignore.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        seqeval `overall_*` results.
    """
    predictions, labels = p
    pred_ids = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, labels):
        # Drop ignored positions (special tokens / padding).
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        # seqeval reads the first character of a tag as its IOB prefix. A bare
        # "NOUN" is parsed as prefix "N" + type "OUN", and neighbouring tokens
        # with the same tag are merged into one chunk. Prefixing with "B-"
        # makes every token its own unit, so the scores are per token and
        # keep the full tag name.
        true_predictions.append(["B-" + itagss[pred] for pred, _ in kept])
        true_labels.append(["B-" + itagss[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Wire model, arguments, data and metrics together; evaluation during
# training runs on the validation split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the settings in `args`; the returned TrainOutput is shown as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, pos_tags. If tokens, pos_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 160
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 60
  Number of trainable parameters = 92,795,916
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.0841,1.180312,0.454545,0.209924,0.287206,0.695255
2,1.1453,0.930381,0.480583,0.377863,0.423077,0.762774
3,0.8983,0.762076,0.564103,0.503817,0.532258,0.804745
4,0.7406,0.665376,0.637131,0.576336,0.60521,0.837591
5,0.6451,0.627103,0.618677,0.60687,0.612717,0.837591
6,0.5668,0.589029,0.624506,0.603053,0.613592,0.837591
7,0.5085,0.56734,0.65873,0.633588,0.645914,0.85219
8,0.4746,0.544993,0.65748,0.637405,0.647287,0.854015
9,0.4312,0.543777,0.637405,0.637405,0.637405,0.844891
10,0.3862,0.525656,0.661417,0.641221,0.651163,0.854015


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, pos_tags. If tokens, pos_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/Azari/Models/POS_TAGGER_v1/checkpoint-3
Configuration saved in /content/drive/MyDrive/Azari/Models/POS_TAGGER_v1/checkpoint-3/config.json
Model weights saved in /content/drive/MyDrive/Azari/Models/POS_TAGGER_v1/checkpoint-3/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Azari/Models/POS_TAGGER_v1/checkpoint-3/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Azari/Models/POS_TAGGER_v1/checkpoint-3/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have be

TrainOutput(global_step=60, training_loss=0.5357698023319244, metrics={'train_runtime': 174.9764, 'train_samples_per_second': 18.288, 'train_steps_per_second': 0.343, 'total_flos': 104528152166400.0, 'train_loss': 0.5357698023319244, 'epoch': 20.0})

In [None]:
# Evaluate on the held-out test split using our tag names.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens).
# seqeval reads the first character of a tag as its IOB prefix, so bare POS
# tags would be reported as 'OUN', 'ERB', ... and scored as merged chunks.
# Prefixing with "B-" makes every token its own unit, giving per-token scores
# under the full tag names.
true_predictions = [
    ["B-" + itagss[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    ["B-" + itagss[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
print(results)
print(classification_report(true_labels, true_predictions))

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, pos_tags. If tokens, pos_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 20
  Batch size = 64


{'DJ': {'precision': 0.5714285714285714, 'recall': 0.4444444444444444, 'f1': 0.5, 'number': 18}, 'DV': {'precision': 0.6666666666666666, 'recall': 0.6666666666666666, 'f1': 0.6666666666666666, 'number': 9}, 'ERB': {'precision': 0.6341463414634146, 'recall': 0.7878787878787878, 'f1': 0.7027027027027027, 'number': 33}, 'ET': {'precision': 0.9, 'recall': 0.75, 'f1': 0.8181818181818182, 'number': 12}, 'ONJ': {'precision': 0.8888888888888888, 'recall': 0.8888888888888888, 'f1': 0.8888888888888888, 'number': 9}, 'OSTP': {'precision': 1.0, 'recall': 0.8333333333333334, 'f1': 0.9090909090909091, 'number': 6}, 'OUN': {'precision': 0.5972222222222222, 'recall': 0.589041095890411, 'f1': 0.593103448275862, 'number': 73}, 'RON': {'precision': 0.6, 'recall': 0.5454545454545454, 'f1': 0.5714285714285713, 'number': 11}, 'UM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}, 'UNC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 47}, 'overall_precision': 0.728110599078341, 'overall_rec

# Inference

In [None]:
# Build a token-classification pipeline around the fine-tuned model, with the
# model moved to the CPU first.
modelc = model.to("cpu")
pos_tagger = pipeline(
    task="token-classification",
    model=modelc,
    tokenizer=tokenizer,
)

In [None]:
# Smoke test: tag one short sample sentence with the pipeline.
pos_tagger("حالین نجوردی؟")