# Sentiment Analysis - PhoBERT 

In [1]:
import pandas as pd
from collections import Counter
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from underthesea import word_tokenize, sent_tokenize, text_normalize
from evaluate import load
import torch
import gc


class Colors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

## 1. Data

In [2]:
df = pd.read_feather('data/facebook_comments.ftr')

# labels
labels = df['sentiment'].unique().tolist()
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}
print(f'label: {id2label}')

df['label'] = df['sentiment'].map({'positive': 0, 'negative': 1})
df.drop(columns=['sentiment'], inplace=True)

label: {0: 'positive', 1: 'negative'}


### 1.1 Tokenizer

In [3]:
def apply_word_tokenize(sen):
    sen = " ".join(sen.split())
    sens = sent_tokenize(sen)
    tokenized_sen = []
    for sen in sens:
        tokenized_sen += word_tokenize(text_normalize(sen))
    return ' '.join(['_'.join(words.split(' ')) for words in tokenized_sen])


df['token'] = df['content'].map(lambda x: apply_word_tokenize(x.lower()))
df.drop(columns=['content'], inplace=True)
df.head()

Unnamed: 0,label,token
0,0,mình cần mua xúc_xích cho chó nên mình đặt và ...
1,1,"mệt_mỏi quá mọi người ơi . j & t , ghn dừng nh..."
2,0,mấy ac nào mà giờ con ham gửi hàng thì chuẩn_b...
3,0,tình_hình kho pi exress - bưu_cục chi_nhánh ch...
4,1,🛑 tất_cả đơn_vị vận_chuyển shopee đã được bật ...


In [4]:
splits = Dataset.from_pandas(df).train_test_split(test_size=0.3)
dataset_test_valid = splits['test'].train_test_split(test_size=0.5)
train_data, val_data, test_data = splits['train'], dataset_test_valid['train'], dataset_test_valid['test']

raw_dataset = DatasetDict({
    'train': splits['train'].shuffle(seed=42),
    'test': dataset_test_valid['train'],
    'valid': dataset_test_valid['test'],
})
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'token'],
        num_rows: 7845
    })
    test: Dataset({
        features: ['label', 'token'],
        num_rows: 1681
    })
    valid: Dataset({
        features: ['label', 'token'],
        num_rows: 1682
    })
})

In [5]:
for i in raw_dataset:
    print(i, Counter(raw_dataset[i]['label']))

train Counter({0: 4722, 1: 3123})
test Counter({0: 1021, 1: 660})
valid Counter({0: 989, 1: 693})


### 1.2 Tokenizer BERT

In [6]:
pretrain_name = "vinai/phobert-base"
folder = 'category_save_model/phobert'
tokenizer = AutoTokenizer.from_pretrained(pretrain_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
num = 25
token = tokenizer(raw_dataset['train'][num]['token'], padding='max_length', max_length=15, truncation=True)

print(f"{Colors.OKGREEN}Text:{Colors.ENDC} {raw_dataset['train'][num]['token']}")
print(f"{Colors.OKGREEN}Len of token:{Colors.ENDC} {len(token['input_ids'])}")
print(f"{Colors.OKGREEN}Input_ids:{Colors.ENDC} {token['input_ids']}")

[92mText:[0m mã săn phụ_kiện nữ cho ce nào cần ạ ._shopee còn đang có mã giảm 20 k yentamonha kết_hợp voucher_shop đơn còn có 15 k nhé ce :))) quá bá cháy https://shopee.vn/meky.house?smtt=0.0.9
[92mLen of token:[0m 15
[92mInput_ids:[0m [0, 1624, 2587, 4386, 401, 13, 5526, 142, 115, 3628, 2586, 11269, 3040, 36653, 2]


In [8]:
def preprocess_function(examples):
    return tokenizer(examples['token'], padding="max_length", truncation=True, max_length=128)

tokenized_train = raw_dataset['train'].map(preprocess_function, batched=True)
tokenized_valid = raw_dataset['valid'].map(preprocess_function, batched=True)
tokenized_test = raw_dataset['test'].map(preprocess_function, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
example = tokenized_train[26]
print(example.keys())
print(tokenizer.decode(example['input_ids']))
print(example['label'])

dict_keys(['label', 'token', 'input_ids', 'token_type_ids', 'attention_mask'])
<s> có nên bật cod lên ko anh </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
1


## 2. Models

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrain_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    acc = load('accuracy').compute(predictions=predictions, references=labels)
    f1_macro = load('f1').compute(predictions=predictions, references=labels, average='macro')
    f1_weight = load('f1').compute(predictions=predictions, references=labels, average='weighted')
    return {'accuracy': acc, 'F1 macro': f1_macro, 'F1 weighted': f1_weight}

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

In [11]:
torch.cuda.empty_cache()
gc.collect()

training_args = TrainingArguments(
    output_dir=folder,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    weight_decay=0.001,
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    num_train_epochs=10,
    fp16=True,
    logging_strategy='epoch',
    save_strategy='epoch',
    evaluation_strategy='epoch',
    # evaluation_strategy='steps',
    # save_steps=200,
    # eval_steps=200,
    # logging_steps=20,
    save_total_limit=2,
    push_to_hub=False,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [12]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: token. If token are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7845
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2460
  Number of trainable parameters = 134999810
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mkevinkhang[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 macro,F1 weighted
1,0.4587,0.393195,{'accuracy': 0.8304580606781677},{'f1': 0.8268090990429194},{'f1': 0.8322077676917908}
2,0.3113,0.465759,{'accuracy': 0.8459250446162998},{'f1': 0.8411988207148877},{'f1': 0.8470821546059557}
3,0.229,0.393221,{'accuracy': 0.8619869125520524},{'f1': 0.8507630374554194},{'f1': 0.8595522324877067}
4,0.1973,0.380729,{'accuracy': 0.8655562165377751},{'f1': 0.8560019588579191},{'f1': 0.8639675178011708}
5,0.1555,0.482947,{'accuracy': 0.851279000594884},{'f1': 0.8479426578284656},{'f1': 0.8527796848914256}
6,0.1121,0.539218,{'accuracy': 0.8536585365853658},{'f1': 0.8493898761396117},{'f1': 0.8548350578389659}
7,0.0824,0.570615,{'accuracy': 0.8607971445568114},{'f1': 0.8539094741908744},{'f1': 0.8607216632377327}
8,0.058,0.561375,{'accuracy': 0.8667459845330161},{'f1': 0.8612349347861978},{'f1': 0.8671737077969484}
9,0.0417,0.667401,{'accuracy': 0.8578227245687091},{'f1': 0.8533364971297673},{'f1': 0.8588450961279236}
10,0.0368,0.668061,{'accuracy': 0.8619869125520524},{'f1': 0.8571604377572997},{'f1': 0.8627991348476095}


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: token. If token are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1681
  Batch size = 8
Trainer is attempting to log a value of "{'accuracy': 0.8304580606781677}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.8268090990429194}" of type <class 'dict'> for key "eval/F1 macro" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.8322077676917908}" of type <class 'dict'> for key "eval/F1 weighted" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we drop

***** train metrics *****
  epoch                    =       10.0
  total_flos               =  4805871GF
  train_loss               =     0.1683
  train_runtime            = 0:10:17.13
  train_samples_per_second =     127.12
  train_steps_per_second   =      3.986
