# Sentiment Analysis

In [1]:
import pandas as pd
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from underthesea import word_tokenize, sent_tokenize, text_normalize
import torch
import gc
from lightning import seed_everything
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np


class Colors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    
    
seed_everything(42)

  warn(f"Failed to load image Python extension: {e}")
Global seed set to 42


42

## 1. Data

In [2]:
df = pd.read_feather('data/facebook_comments.ftr')

# labels
labels = df['sentiment'].unique().tolist()
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}
print(f'label: {id2label}')

df['label'] = df['sentiment'].map({'positive': 0, 'negative': 1})
df.drop(columns=['sentiment'], inplace=True)

label: {0: 'positive', 1: 'negative'}


### 1.1 Tokenizer

In [3]:
def apply_word_tokenize(sen):
    sen = " ".join(sen.split())
    sens = sent_tokenize(sen)
    tokenized_sen = []
    for sen in sens:
        tokenized_sen += word_tokenize(text_normalize(sen))
    return ' '.join(['_'.join(words.split(' ')) for words in tokenized_sen])


df['token'] = df['content'].map(lambda x: apply_word_tokenize(x.lower()))
df.drop(columns=['content'], inplace=True)
df.head()

Unnamed: 0,label,token
0,0,mình cần mua xúc_xích cho chó nên mình đặt và ...
1,1,"mệt_mỏi quá mọi người ơi . j & t , ghn dừng nh..."
2,0,mấy ac nào mà giờ con ham gửi hàng thì chuẩn_b...
3,0,tình_hình kho pi exress - bưu_cục chi_nhánh ch...
4,1,🛑 tất_cả đơn_vị vận_chuyển shopee đã được bật ...


In [4]:
texts = df['token'].to_numpy().tolist()
labels = df['label'].to_numpy().tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=.2, random_state=42)

### 1.2 Tokenizer BERT

In [5]:
# pretrain_name = "vinai/phobert-base"
# folder = 'category_save_model/phobert'

pretrain_name = 'FPTAI/vibert-base-cased'
folder = 'category_save_model/electra'
tokenizer = AutoTokenizer.from_pretrained(pretrain_name)

In [6]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=64,
                            return_attention_mask = False, return_tensors='pt', return_token_type_ids=False)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=64,
                          return_attention_mask = False, return_tensors='pt', return_token_type_ids=False)

In [7]:
class CategoryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = CategoryDataset(train_encodings, train_labels)
val_dataset = CategoryDataset(val_encodings, val_labels)

In [8]:
example = train_dataset[0]
print(example.keys())
print(tokenizer.decode(example['input_ids']))
print(example['labels'])

dict_keys(['input_ids', 'labels'])
[CLS] 811037737940 nho ad bao phat gap đon hang giup minh sao buu [UNK] ta chua lien [UNK] he khach ma bao sai thong [UNK] tin nguoi nhan la sao [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tensor(1)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


## 2. Models

In [9]:
id2label = {0: 'positive', 1: 'negative'}
label2id = {'positive': 0, 'negative': 1}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrain_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id)

Some weights of the model checkpoint at FPTAI/vibert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not in

In [10]:
def custom_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    f1 = f1_score(labels, predictions, average="micro")
    accuracy = accuracy_score(labels, predictions)
    return {"f1": f1, "accuracy": accuracy}


torch.cuda.empty_cache()
gc.collect()


training_args = TrainingArguments(
    output_dir=folder,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    weight_decay=0.001,
    learning_rate=1e-4,
    per_device_train_batch_size=128,
    num_train_epochs=2,
    fp16=True,
    logging_strategy='epoch',
    save_strategy='epoch',
    evaluation_strategy='epoch',
    save_total_limit=2,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=custom_metrics,
)

Using cuda_amp half precision backend


In [11]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** Running training *****
  Num examples = 8966
  Num Epochs = 2
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 142
  Number of trainable parameters = 115355906
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4414,0.385393,0.805977,0.805977
2,0.2692,0.314172,0.866191,0.866191


***** Running Evaluation *****
  Num examples = 2242
  Batch size = 8
Saving model checkpoint to category_save_model/electra\checkpoint-71
Configuration saved in category_save_model/electra\checkpoint-71\config.json
Model weights saved in category_save_model/electra\checkpoint-71\pytorch_model.bin
tokenizer config file saved in category_save_model/electra\checkpoint-71\tokenizer_config.json
Special tokens file saved in category_save_model/electra\checkpoint-71\special_tokens_map.json
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 2242
  Batch size = 8
Saving model checkpoint to category_save_model/electra\checkpoint-142
Configuration saved in category_save_model/electra\checkpoint-142\config.json
Model weights saved in category_save_model/electra\checkpoint-142\pytorch_model.bin
tokenizer config file saved in category_save_model/electra\checkpoint-142\tokenizer_config.json
Special tokens file saved in catego

***** train metrics *****
  epoch                    =        2.0
  total_flos               =   549259GF
  train_loss               =     0.3553
  train_runtime            = 0:00:55.13
  train_samples_per_second =     325.21
  train_steps_per_second   =      2.575
