In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Load the cleaned Facebook comment export; every later cell works on this frame.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/facebook_comments_cleaned.csv',
)
# Preview the first ten rows as the cell output.
df.head(n=10)

Unnamed: 0,url,comment_text,comment_text_remove_emojis
0,https://www.facebook.com/100064878600431/posts...,‚ù§Ô∏è,
1,https://www.facebook.com/100064878600431/posts...,R·∫•t tuy·ªát v·ªùi,R·∫•t tuy·ªát v·ªùi
2,https://www.facebook.com/100064878600431/posts...,Em xin ch√∫c anh m·∫°nh kh·ªèe vui v·∫ª th√†nh c√¥ng,Em xin ch√∫c anh m·∫°nh kh·ªèe vui v·∫ª th√†nh c√¥ng
3,https://www.facebook.com/100064878600431/posts...,Em xin anh ch·ªã b·ªè qua gi√∫p em,Em xin anh ch·ªã b·ªè qua gi√∫p em
4,https://www.facebook.com/100064878600431/posts...,Em xin c·∫£m ∆°n m·∫øn ƒê·ª©c,Em xin c·∫£m ∆°n m·∫øn ƒê·ª©c
5,https://www.facebook.com/100064878600431/posts...,36 16 56 96. Dem xac mai moi san . DATA ? xac....,36 16 56 96. Dem xac mai moi san . DATA ? xac....
6,https://www.facebook.com/100064878600431/posts...,Iphone 14prm,Iphone 14prm
7,https://www.facebook.com/100064878600431/posts...,Hi,Hi
8,https://www.facebook.com/100064878600431/posts...,Mua laptop,Mua laptop
9,https://www.facebook.com/100064878600431/posts...,Ko th√™m ch·ªØ c√≤n sau ch·ªØ ‚Äúm√†‚Äù ƒëc √†,Ko th√™m ch·ªØ c√≤n sau ch·ªØ ‚Äúm√†‚Äù ƒëc √†


In [13]:
# Hub checkpoint to load, and the local directory that receives a copy of it.
model_name = "hieudinhpro/BERT_Sentiment_Vietnamese"
save_path = "../models/bert_sentiment_vietnamese"

# Load both artefacts first, then write them to save_path (tokenizer, then model).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Module.to() and Module.eval() both return the module, so this single chained
# expression places the model on `device`, switches it to evaluation mode, and
# still ends the cell with the model repr.
model.to(device).eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [4]:
# Sentiment names, looked up later by predicted class index (argmax of the logits).
# NOTE(review): the order is assumed to match the checkpoint's class indices —
# confirm against model.config.id2label.
labels = [
    'negative',  # position 0
    'neutral',   # position 1
    'positive',  # position 2
]

class CommentDataset(Dataset):
    """Map-style dataset that serves raw comment strings by position.

    No tokenization happens here; each item is returned exactly as it was
    given, so batching code can decide how to encode it.
    """

    def __init__(self, texts):
        """Store the comments to serve.

        Args:
            texts: Iterable of comments (list, tuple, pandas Series, ...).
                It is copied into a list so that integer lookups are always
                positional. Indexing a pandas Series directly would look the
                integer up as an index *label*, which raises KeyError or
                returns the wrong row whenever the index is not 0..n-1
                (for example after filtering rows).
        """
        self.texts = list(texts)

    def __len__(self):
        """Return the number of stored comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the comment stored at position ``idx``."""
        return self.texts[idx]

def collate_batch(batch_texts, max_length=256):
    """Tokenize a list of raw comments into one padded batch of tensors.

    Uses the notebook-level ``tokenizer``.

    Args:
        batch_texts: List of comment strings forming one batch.
        max_length: Maximum number of tokens kept per comment. The default
            of 256 comes from the loaded model's printed architecture:
            ``position_embeddings`` is ``Embedding(258, 768, padding_idx=1)``,
            and RoBERTa-style position ids start after the padding index,
            leaving 256 usable positions.

    Returns:
        The tokenizer output with PyTorch tensors (``return_tensors="pt"``),
        padded to the longest sequence in the batch.
    """
    # truncation=True on its own falls back to the tokenizer's configured
    # model_max_length. That value is not visible in this notebook, so the
    # limit is passed explicitly to keep long comments from overrunning the
    # model's position-embedding table.
    return tokenizer(
        batch_texts,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,
    )

In [5]:
# Pull the emoji-stripped comments into a plain list; missing values become "".
texts = df['comment_text_remove_emojis'].fillna("").tolist()
dataset = CommentDataset(texts)
# Batches of 16 comments, tokenized per batch by collate_batch.
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_batch)

# One predicted class index per comment, in iteration order.
all_preds = []

# Inference only: gradient tracking is disabled for the whole pass.
with torch.no_grad():
    for batch in dataloader:
        # Move every tensor produced by the tokenizer onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        probs = torch.softmax(logits, dim=-1)
        # Most probable class per comment, collected as Python ints on the CPU.
        preds = probs.argmax(dim=-1)
        all_preds += preds.cpu().tolist()

In [6]:
# Translate each predicted class index into its label name and attach it to df.
df['sentiment'] = [labels[class_idx] for class_idx in all_preds]
# Show each comment next to its predicted sentiment.
df.loc[:, ['comment_text_remove_emojis', 'sentiment']]

Unnamed: 0,comment_text_remove_emojis,sentiment
0,,neutral
1,R·∫•t tuy·ªát v·ªùi,positive
2,Em xin ch√∫c anh m·∫°nh kh·ªèe vui v·∫ª th√†nh c√¥ng,positive
3,Em xin anh ch·ªã b·ªè qua gi√∫p em,neutral
4,Em xin c·∫£m ∆°n m·∫øn ƒê·ª©c,neutral
...,...,...
401,H·∫øt √Ω t∆∞·ªüng r √† ad,positive
402,"√Ä nay Tgdƒë kh√¥ng b√°n laptop n·ªØa, ae chuy·ªÉn qua...",negative
403,,neutral
404,,neutral


In [7]:
# Absolute frequency of each predicted sentiment.
sentiment_counts = df['sentiment'].value_counts()
# Relative frequency, scaled from a fraction to a percentage.
sentiment_percent = df['sentiment'].value_counts(normalize=True).mul(100)

# Side-by-side summary; percentages are rounded to two decimals for display.
result = pd.DataFrame(
    {
        'Count': sentiment_counts,
        'Percentage (%)': sentiment_percent.round(2),
    }
)

result

Unnamed: 0_level_0,Count,Percentage (%)
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,211,51.97
neutral,165,40.64
positive,30,7.39


In [8]:
# Keep only the rows whose predicted sentiment is 'positive' for manual inspection.
positive_df = df.loc[df['sentiment'].eq('positive')]
positive_df

Unnamed: 0,url,comment_text,comment_text_remove_emojis,sentiment
1,https://www.facebook.com/100064878600431/posts...,R·∫•t tuy·ªát v·ªùi,R·∫•t tuy·ªát v·ªùi,positive
2,https://www.facebook.com/100064878600431/posts...,Em xin ch√∫c anh m·∫°nh kh·ªèe vui v·∫ª th√†nh c√¥ng,Em xin ch√∫c anh m·∫°nh kh·ªèe vui v·∫ª th√†nh c√¥ng,positive
7,https://www.facebook.com/100064878600431/posts...,Hi,Hi,positive
10,https://www.facebook.com/100064878600431/posts...,"T·ª± h√†o v·ªÅ b·∫°n qu√°, c·ªë g·∫Øng ph√°t huy nh√©!","T·ª± h√†o v·ªÅ b·∫°n qu√°, c·ªë g·∫Øng ph√°t huy nh√©!",positive
21,https://www.facebook.com/100064878600431/posts...,"""Must-have item"" ƒë∆∞·ª£c sƒÉn l√πng nhi·ªÅu nh·∫•t! Ch√≠...","""Must-have item"" ƒë∆∞·ª£c sƒÉn l√πng nhi·ªÅu nh·∫•t! Ch√≠...",positive
23,https://www.facebook.com/100064878600431/posts...,R·∫•t tuy·ªát v·ªùi,R·∫•t tuy·ªát v·ªùi,positive
30,https://www.facebook.com/100064878600431/posts...,10 ƒë·ªâm cho team v√¨ bi·∫øt nghe l·ªùi ü§£,10 ƒë·ªâm cho team v√¨ bi·∫øt nghe l·ªùi,positive
32,https://www.facebook.com/100064878600431/posts...,R·∫•t tuy·ªát v·ªùi,R·∫•t tuy·ªát v·ªùi,positive
50,https://www.facebook.com/100064878600431/posts...,Content n√†y c√≤n ƒë∆∞·ª£c n√†y add:),Content n√†y c√≤n ƒë∆∞·ª£c n√†y add:),positive
52,https://www.facebook.com/100064878600431/posts...,R·∫•t tuy·ªát v·ªùi,R·∫•t tuy·ªát v·ªùi,positive


In [9]:
# Persist the comments together with their predicted sentiment; the row index is not written.
df.to_csv(
    path_or_buf='../data/processed/facebook_comments_cleaned_with_sentiment.csv',
    index=False,
)