In [2]:
!pip install -q transformers datasets sentence-transformers

In [3]:
# import everything that is needed
from sentence_transformers import SentenceTransformer, models
from datasets import load_dataset, load_from_disk, load_metric
from transformers import Trainer, TrainingArguments

from hazm import *

from torch import nn
import torch

import numpy as np
import pandas as pd

import os
from typing import Optional

2022-01-28 07:20:05.596964: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [4]:
# Hugging Face Hub checkpoint that provides the token-level encoder of the pipeline.
PRETRAINED_CHECKPOINT = 'HooshvareLab/bert-fa-base-uncased-clf-digimag'
word_embedding_model = models.Transformer(PRETRAINED_CHECKPOINT)

Downloading: 100%|██████████| 1.56k/1.56k [00:00<00:00, 1.26MB/s]
Downloading: 100%|██████████| 621M/621M [00:54<00:00, 12.0MB/s]  
Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 62.0/62.0 [00:00<00:00, 35.3kB/s]
Downloading: 100%|██████████| 1.14M/1.14M [00:00<00:00, 2.83MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 98.5kB/s]


In [5]:
# Hazm helpers used by clean_doc.
lemmatizer = Lemmatizer()
normalizer = Normalizer()

# Punctuation characters that clean_doc strips out of every token.
puncs = [
    '،',  # Arabic/Persian comma (U+060C)
    '.',
    ',',
    ':',
    ';',
    '"',
]

def clean_doc(doc):
    """Turn a raw document into a cleaned, space-separated string of lemmas.

    Steps: normalize with the Hazm normalizer, tokenize, strip every character
    listed in `puncs` from each token, drop tokens that are at most one
    character long or purely numeric, lemmatize what remains and join the
    result with single spaces. Stop words are kept.
    """
    normalized_text = normalizer.normalize(doc)

    cleaned_words = []
    for raw_token in word_tokenize(normalized_text):
        stripped = raw_token
        for mark in puncs:
            stripped = stripped.replace(mark, '')
        # Skip leftovers that carry no information: empty/one-char strings and plain numbers.
        if len(stripped) <= 1 or stripped.isdigit():
            continue
        cleaned_words.append(lemmatizer.lemmatize(stripped))

    return ' '.join(cleaned_words)

In [6]:
# Every CSV read here is header-less with two columns: the text and its sentiment label.
CSV_COLUMNS = ["text", "labels"]
# Added to every label so class ids become non-negative. The classifier head has 5 outputs,
# so the shifted labels must lie in 0..4 — presumably the raw labels span -2..2; confirm
# against the CSV files.
LABEL_SHIFT = 2


def read_split(csv_path):
    """Read one header-less `text,labels` CSV file into a DataFrame."""
    return pd.read_csv(csv_path, names=CSV_COLUMNS, index_col=None, header=None, encoding="utf-8")


def prepare_split(frame, parquet_path):
    """Clean the text column, shift the labels and save `frame` as parquet.

    `frame` is modified in place and also returned. The parquet file is written
    without the index so it can be loaded with `datasets.load_dataset`.
    """
    # Series.apply calls clean_doc once per text value; a row-wise DataFrame.apply is not needed.
    frame['text'] = frame['text'].apply(clean_doc)
    frame['labels'] = frame['labels'] + LABEL_SHIFT
    frame.to_parquet(parquet_path, index=False)
    return frame


# prepare train data
# NOTE(review): `original` and `balanced` are loaded but not used in the cells visible here.
original = read_split('Dataset/original.csv')
balanced = read_split('Dataset/balanced.csv')
translation = read_split('Dataset/translation.csv')
# Only the translation data is used for training, so the results can be compared fairly with previous work.
train = prepare_split(translation, 'Dataset/train.parquet')
# prepare test data for reporting evaluation on it
test = prepare_split(read_split('Dataset/test.csv'), 'Dataset/test.parquet')

In [7]:
# Parquet files to load, keyed by split name.
parquet_splits = {
    'train': 'Dataset/train.parquet',
    'test': 'Dataset/test.parquet',
}
# load dataset
dataset = load_dataset("parquet", data_files=parquet_splits)

def tokenize_function(data_point):
    """Tokenize the "text" field of `data_point` with the encoder's tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens.
    """
    tokenizer = word_embedding_model.tokenizer
    encoded = tokenizer(
        data_point["text"],
        max_length=64,
        truncation=True,
        padding='max_length',
    )
    return encoded

# Tokenize both splits in batches of 8 rows and drop the raw "text" column.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=8,
    remove_columns=["text"],
)

# Pick the train and test splits and shuffle each with a fixed seed, so that each batch
# is an unbiased sample of the entire dataset.
SHUFFLE_SEED = 43
full_train_dataset = tokenized_datasets["train"].shuffle(seed=SHUFFLE_SEED)
full_eval_dataset = tokenized_datasets["test"].shuffle(seed=SHUFFLE_SEED)

[W 2022-01-28 07:25:44,954.954 datasets.builder] Using custom data configuration default-d8be219597c4581e


Downloading and preparing dataset parquet/default to /home/jovyan/.cache/huggingface/datasets/parquet/default-d8be219597c4581e/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...


100%|██████████| 2/2 [00:00<00:00, 4019.46it/s]
100%|██████████| 2/2 [00:00<00:00, 1290.36it/s]


Dataset parquet downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/parquet/default-d8be219597c4581e/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 933.31it/s]
100%|██████████| 1756/1756 [00:02<00:00, 656.69ba/s]
100%|██████████| 232/232 [00:00<00:00, 669.86ba/s]


In [8]:
# Width of the token embeddings produced by the transformer module.
token_embedding_dim = word_embedding_model.get_word_embedding_dimension()
# Pooling layer that reduces the per-token embeddings to one sentence embedding.
# It is built with the library defaults, which are intended to be average pooling —
# confirm against the sentence-transformers `models.Pooling` documentation.
pooling_model = models.Pooling(token_embedding_dim)
# Stack the pooling layer on top of the ParsBERT encoder to get one embedding per sentence.
sent_bert_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [9]:
# Classifier: sentence embedding (sent_bert_model) -> dropout -> linear layer.
class SentimentModel(torch.nn.Module):
    """Sentence-level sentiment classifier built on top of a sentence encoder.

    Args:
        sent_bert_model: module that maps a dict of tokenized features to a dict
            containing a ``'sentence_embedding'`` tensor.
        num_labels: number of output classes (default 5).
        dropout: dropout probability applied before the linear layer (default 0.2).
        embedding_dim: width of the sentence embedding. When ``None`` it is read from
            ``sent_bert_model.get_sentence_embedding_dimension()`` if the encoder offers
            that method; otherwise 768 is used.
    """

    def __init__(self, sent_bert_model, num_labels=5, dropout=0.2, embedding_dim: Optional[int] = None):
        super().__init__()

        # Derive the linear layer's input width from the encoder instead of assuming 768,
        # so that swapping the base checkpoint does not silently produce a shape mismatch.
        if embedding_dim is None:
            dim_getter = getattr(sent_bert_model, 'get_sentence_embedding_dimension', None)
            embedding_dim = dim_getter() if callable(dim_getter) else None
        if embedding_dim is None:
            embedding_dim = 768

        self.sent_bert_model = sent_bert_model
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(embedding_dim, num_labels)
        )

    def forward(self, input_token):
        """Return ``{'logits': tensor}`` for a batch of tokenized features.

        `input_token` is passed unchanged to the sentence encoder; the
        ``'sentence_embedding'`` entry of its output feeds the classifier.
        """
        output = self.sent_bert_model(input_token)
        logits = self.classifier(output['sentence_embedding'])
        return {'logits': logits}

In [10]:
# Wrap the sentence encoder in the classification head defined above.
sent_model = SentimentModel(sent_bert_model=sent_bert_model)

In [11]:
# F1 is the metric used along this sentiment-analysis line of research.
metric = load_metric("f1")


def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class of each
    sample is the index of its largest logit.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=gold_labels,
        average="weighted",
    )

Downloading: 5.29kB [00:00, 3.65MB/s]                   


In [12]:
# Trainer subclass for the classification problem: cross-entropy loss in compute_loss
# plus custom PyTorch data loaders.
class SentimentTrainer(Trainer):
    """Trainer for a model that takes the whole feature dict and returns ``{'logits': ...}``.

    NOTE(review): the data-loader overrides presumably exist so that the batch dict reaches
    `SentimentModel.forward` with all of its columns — confirm against the Trainer's
    `remove_unused_columns` behavior.
    NOTE(review): the loaders are built without a sampler or `shuffle=True`, so batches
    follow the dataset order in every epoch.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Cross-entropy between the model's logits and ``inputs['labels']``.

        `num_items_in_batch` is accepted (and ignored) because newer `transformers`
        releases pass it to `compute_loss`; without it those releases raise a TypeError.
        Returns ``(loss, outputs)`` when `return_outputs` is true, otherwise only the loss.
        """
        labels = inputs.get('labels')
        # The model receives the complete batch dict as its single positional argument.
        outputs = model(inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def _sentiment_dataloader(self, dataset, batch_size):
        """Build the DataLoader shared by the train, eval and test loaders."""
        return torch.utils.data.DataLoader(
                dataset,
                batch_size=batch_size,
                collate_fn=self.data_collator,
                num_workers=self.args.dataloader_num_workers,
                pin_memory=self.args.dataloader_pin_memory,
            )

    def get_train_dataloader(self):
        """DataLoader over `self.train_dataset` using the per-device train batch size."""
        return self._sentiment_dataloader(self.train_dataset, self.args.per_device_train_batch_size)

    def get_eval_dataloader(self, eval_dataset: Optional[torch.utils.data.Dataset] = None):
        """DataLoader over `eval_dataset`, falling back to `self.eval_dataset` when it is None."""
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        return self._sentiment_dataloader(eval_dataset, self.args.eval_batch_size)

    def get_test_dataloader(self, test_dataset: torch.utils.data.Dataset):
        """DataLoader over `test_dataset` using the evaluation batch size."""
        return self._sentiment_dataloader(test_dataset, self.args.eval_batch_size)
        

In [13]:
# trainer args
training_args = TrainingArguments(
    output_dir="transformers-sentiment",
    num_train_epochs=3,
    # 2 samples per step x 64 accumulated steps = 128 samples per optimizer update (per device)
    per_device_train_batch_size=2,
    gradient_accumulation_steps=64,
    per_device_eval_batch_size=64,
    eval_accumulation_steps=1,
    # run an evaluation pass at the end of every epoch
    evaluation_strategy="epoch",
    dataloader_num_workers=8,
    disable_tqdm=True,
)

In [14]:
# Build the trainer from the pre-defined arguments, model, metric function and dataset splits.
trainer = SentimentTrainer(
    args=training_args,
    model=sent_model,
    compute_metrics=compute_metrics,
    eval_dataset=full_eval_dataset,
    train_dataset=full_train_dataset,
)

In [15]:
# Training phase: run fine-tuning. As the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()
# The F1 score can be read from the logs printed below, where it appears under the key "eval_f1".

***** Running training *****
  Num examples = 14046
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 64
  Total optimization steps = 327


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

***** Running Evaluation *****
  Num examples = 1854
  Batch size = 64


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

***** Running Evaluation *****
  Num examples = 1854
  Batch size = 64


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

***** Running Evaluation *****
  Num examples = 1854
  Batch size = 64


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.7240970134735107, 'eval_f1': 0.7354990965370065, 'eval_runtime': 2.8263, 'eval_samples_per_second': 655.977, 'eval_steps_per_second': 10.261, 'epoch': 2.99}
{'train_runtime': 621.3771, 'train_samples_per_second': 67.814, 'train_steps_per_second': 0.526, 'train_loss': 0.736384913826572, 'epoch': 2.99}


TrainOutput(global_step=327, training_loss=0.736384913826572, metrics={'train_runtime': 621.3771, 'train_samples_per_second': 67.814, 'train_steps_per_second': 0.526, 'train_loss': 0.736384913826572, 'epoch': 2.99})