In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
df = pd.read_csv('../data/processed/facebook_comments_processed.csv')
df.head(10)

Unnamed: 0,url,comment_text,comment_text_remove_emojis
0,https://www.facebook.com/100064878600431/posts...,Em xin chúc anh mạnh khỏe vui vẻ thành công,Em xin chúc anh mạnh khỏe vui vẻ thành công
1,https://www.facebook.com/100064878600431/posts...,Em xin anh chị bỏ qua giúp em,Em xin anh chị bỏ qua giúp em
2,https://www.facebook.com/100064878600431/posts...,Em xin cảm ơn mến Đức,Em xin cảm ơn mến Đức
3,https://www.facebook.com/100064878600431/posts...,36 16 56 96. Dem xac mai moi san . DATA ? xac....,36 16 56 96. Dem xac mai moi san . DATA ? xac....
4,https://www.facebook.com/100064878600431/posts...,❤️,
5,https://www.facebook.com/100064878600431/posts...,Rất tuyệt vời,Rất tuyệt vời
6,https://www.facebook.com/100064878600431/posts...,Iphone 14prm,Iphone 14prm
7,https://www.facebook.com/100064878600431/posts...,Hi,Hi
8,https://www.facebook.com/100064878600431/posts...,Mua laptop,Mua laptop
9,https://www.facebook.com/100064878600431/posts...,Ko thêm chữ còn sau chữ “mà” đc à,Ko thêm chữ còn sau chữ “mà” đc à


In [3]:
model_name = "hieudinhpro/BERT_Sentiment_Vietnamese"
save_path = "../models/bert_sentiment_vietnamese"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [4]:
labels = ['negative', 'neutral', 'positive']

class CommentDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def collate_batch(batch_texts):
    return tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True)

In [5]:
texts = df['comment_text_remove_emojis'].fillna("").tolist()
dataset = CommentDataset(texts)
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_batch)

all_preds = []

with torch.no_grad():
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        probs = F.softmax(outputs.logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)
        all_preds.extend(preds.cpu().tolist())

In [6]:
df['sentiment'] = [labels[p] for p in all_preds]
df[['comment_text_remove_emojis', 'sentiment']]

Unnamed: 0,comment_text_remove_emojis,sentiment
0,Em xin chúc anh mạnh khỏe vui vẻ thành công,positive
1,Em xin anh chị bỏ qua giúp em,neutral
2,Em xin cảm ơn mến Đức,neutral
3,36 16 56 96. Dem xac mai moi san . DATA ? xac....,negative
4,,neutral
...,...,...
413,Hết ý tưởng r à ad,positive
414,"À nay Tgdđ không bán laptop nữa, ae chuyển qua...",negative
415,,neutral
416,,neutral


In [7]:
sentiment_counts = df['sentiment'].value_counts()
sentiment_percent = df['sentiment'].value_counts(normalize=True) * 100

result = pd.DataFrame({
    'Count': sentiment_counts,
    'Percentage (%)': sentiment_percent.round(2)
})

result

Unnamed: 0_level_0,Count,Percentage (%)
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,212,50.72
neutral,174,41.63
positive,32,7.66


In [8]:
positive_df = df[df['sentiment'] == 'positive']
positive_df

Unnamed: 0,url,comment_text,comment_text_remove_emojis,sentiment
0,https://www.facebook.com/100064878600431/posts...,Em xin chúc anh mạnh khỏe vui vẻ thành công,Em xin chúc anh mạnh khỏe vui vẻ thành công,positive
5,https://www.facebook.com/100064878600431/posts...,Rất tuyệt vời,Rất tuyệt vời,positive
7,https://www.facebook.com/100064878600431/posts...,Hi,Hi,positive
11,https://www.facebook.com/100064878600431/posts...,"Tự hào về bạn quá, cố gắng phát huy nhé!","Tự hào về bạn quá, cố gắng phát huy nhé!",positive
21,https://www.facebook.com/100064878600431/posts...,"""Must-have item"" được săn lùng nhiều nhất! Chí...","""Must-have item"" được săn lùng nhiều nhất! Chí...",positive
23,https://www.facebook.com/100064878600431/posts...,Rất tuyệt vời,Rất tuyệt vời,positive
24,https://www.facebook.com/100064878600431/posts...,Rất tuyệt,Rất tuyệt,positive
25,https://www.facebook.com/100064878600431/posts...,Rây tuyệt đẹp,Rây tuyệt đẹp,positive
38,https://www.facebook.com/100064878600431/posts...,10 đỉm cho team vì biết nghe lời 🤣,10 đỉm cho team vì biết nghe lời,positive
41,https://www.facebook.com/100064878600431/posts...,Rất tuyệt vời,Rất tuyệt vời,positive


In [9]:
df.to_csv('../data/processed/facebook_comments_processed_with_sentiment.csv', index=False)