In [1]:
import pandas as pd
import emoji
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Read the pre-processed Apify comment export and preview its first rows.
comments_csv = '../../data/apify/comments_apify_processed.csv'
df = pd.read_csv(comments_csv)
df.head(10)

Unnamed: 0,cmtDate,postId,userId,cmtText,cmtLikes
0,2025-06-02 07:05:22+00:00,1147436614095615,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0
1,2025-06-02 07:04:14+00:00,1147436614095615,100082365036655,Mua laptop,0
2,2025-06-02 06:33:15+00:00,1147436614095615,100010438388165,hello,0
3,2025-06-01 10:42:34+00:00,1146548730851070,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0
4,2025-06-01 09:24:03+00:00,1146548730851070,100082365036655,Mua 2 c√°i laptop,0
5,2025-06-01 03:43:02+00:00,1146504504188826,pfbid02vrerdeMR2eCGuRDBSkngtWyL5eoMRZkgLn11m5C...,"""Kh√¥ng ch·ªâ... M√† c√≤n...""\nGi·ªù gi√°o vi√™n ti·∫øng ...",32
6,2025-06-01 02:29:56+00:00,1146504504188826,100005658369921,"Laptop l√† laptop, ch·ª´ng n√†o b√°n c·∫∑p s√°ch th√¨ n...",39
7,2025-06-01 01:04:13+00:00,1146504504188826,pfbid0Y3aDzJzsRfJtZbJCigEeGGAD7LENS4DHTizR1K2p...,Content c·ªßa th·∫ø h·ªá c·ª£t nh·∫£ ƒë√¢y sao,39
8,2025-06-01 02:54:30+00:00,1146504504188826,pfbid0BKiWDx96iWSYTerMP7zS7LbciGFJeFoKqbUuCskw...,Content nh·∫°t h∆°n n∆∞·ªõc c·∫•t ü§¶‚Äç‚ôÄÔ∏è,22
9,2025-06-01 01:46:20+00:00,1146504504188826,pfbid02TWA7CFgJmJbEgbZCHsprRLCDsCjqMKL53KRf6f6...,L√†m content ch√°n qu√° shop ∆°i,23


In [3]:
def remove_emoji(text):
    """Strip every emoji from a single comment.

    Parameters
    ----------
    text : object
        One raw comment value. Strings are cleaned directly; other
        non-missing values are converted with ``str`` first.

    Returns
    -------
    object
        The comment with all emoji removed. Missing scalars (``None``,
        ``NaN``, ``pd.NA``) are returned unchanged.
    """
    if not isinstance(text, str):
        # str(NaN) is the literal word "nan". Converting a missing comment
        # would defeat any later fillna("") and feed a fake word to the
        # classifier, so missing values are passed through as they are.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return text
        text = str(text)
    return emoji.replace_emoji(text, replace='')

# Keep the raw text and store the emoji-free version next to it, then
# preview both columns side by side.
df['cmtText_clean'] = df['cmtText'].map(remove_emoji)
df.loc[:, ['cmtText', 'cmtText_clean']].head(10)

Unnamed: 0,cmtText,cmtText_clean
0,R·∫•t tuy·ªát v·ªùi,R·∫•t tuy·ªát v·ªùi
1,Mua laptop,Mua laptop
2,hello,hello
3,R·∫•t tuy·ªát v·ªùi,R·∫•t tuy·ªát v·ªùi
4,Mua 2 c√°i laptop,Mua 2 c√°i laptop
5,"""Kh√¥ng ch·ªâ... M√† c√≤n...""\nGi·ªù gi√°o vi√™n ti·∫øng ...","""Kh√¥ng ch·ªâ... M√† c√≤n...""\nGi·ªù gi√°o vi√™n ti·∫øng ..."
6,"Laptop l√† laptop, ch·ª´ng n√†o b√°n c·∫∑p s√°ch th√¨ n...","Laptop l√† laptop, ch·ª´ng n√†o b√°n c·∫∑p s√°ch th√¨ n..."
7,Content c·ªßa th·∫ø h·ªá c·ª£t nh·∫£ ƒë√¢y sao,Content c·ªßa th·∫ø h·ªá c·ª£t nh·∫£ ƒë√¢y sao
8,Content nh·∫°t h∆°n n∆∞·ªõc c·∫•t ü§¶‚Äç‚ôÄÔ∏è,Content nh·∫°t h∆°n n∆∞·ªõc c·∫•t
9,L√†m content ch√°n qu√° shop ∆°i,L√†m content ch√°n qu√° shop ∆°i


In [4]:
# Hugging Face Hub checkpoint used for both the tokenizer and the classifier.
model_name = "hieudinhpro/BERT_Sentiment_Vietnamese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the weights to the chosen device and switch to evaluation mode.
# Both calls return the module, so the cell still displays the model.
model.to(device).eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [5]:
# Class names indexed by the predicted class id (labels[p] is looked up for
# every prediction later in the notebook).
# NOTE(review): the negative/neutral/positive order is assumed to match the
# checkpoint's own id-to-label mapping - TODO confirm against
# model.config.id2label.
labels = ['negative', 'neutral', 'positive']

# Fail fast if the classifier head does not have exactly one output per name;
# otherwise the lookup would raise IndexError or silently mislabel rows.
if model.config.num_labels != len(labels):
    raise ValueError(
        f"Model predicts {model.config.num_labels} classes but "
        f"{len(labels)} label names are defined: {labels}"
    )

class CommentDataset(Dataset):
    """Map-style dataset that serves raw comment strings by position.

    Parameters
    ----------
    texts : iterable
        The comments, in the order they should be served. Any iterable is
        accepted (list, tuple, pandas Series, generator).
    """

    def __init__(self, texts):
        # Copy into a plain list so integer positions 0..len-1 always work.
        # A pandas Series with a non-default index would otherwise be
        # indexed by label, and a generator has no __len__/__getitem__.
        self.texts = list(texts)

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the comment stored at position ``idx``."""
        return self.texts[idx]

def collate_batch(batch_texts, max_length=256):
    """Tokenize a batch of comments into padded PyTorch tensors.

    Uses the notebook-level ``tokenizer``.

    Parameters
    ----------
    batch_texts : list of str
        The comments that make up one batch.
    max_length : int, optional
        Upper bound on tokens per comment; longer comments are truncated.
        The default of 256 matches the loaded model: its position embedding
        table has 258 rows with ``padding_idx=1``, which leaves 256 usable
        positions.

    Returns
    -------
    The tokenizer output for the batch, with tensors padded to the longest
    comment in the batch.
    """
    # Pass max_length explicitly: truncation=True alone relies on the
    # tokenizer's own configured limit, which SOURCE does not show. Without
    # a usable limit, an over-long comment would exceed the position table.
    return tokenizer(
        batch_texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )

In [6]:
# Comments in frame order; missing entries are replaced by empty strings.
texts = df['cmtText_clean'].fillna("").tolist()
dataset = CommentDataset(texts)
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_batch)

# Predicted class index per comment, in the same order as `texts`.
all_preds = []

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for encoded in dataloader:
        # Move every tensor of the batch to the device the model lives on.
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        logits = model(**inputs).logits
        class_ids = F.softmax(logits, dim=-1).argmax(dim=-1)
        all_preds += class_ids.cpu().tolist()

In [7]:
# Translate each predicted class index into its label name, row for row,
# then show the cleaned text next to the assigned sentiment.
df['sentiment'] = list(map(labels.__getitem__, all_preds))
df.loc[:, ['cmtText_clean', 'sentiment']]

Unnamed: 0,cmtText_clean,sentiment
0,R·∫•t tuy·ªát v·ªùi,positive
1,Mua laptop,neutral
2,hello,neutral
3,R·∫•t tuy·ªát v·ªùi,positive
4,Mua 2 c√°i laptop,neutral
...,...,...
398,Trung L√™,positive
399,Minh Thu·ª∑ =)),neutral
400,"Kh√¥ng b√°n th√¨ th√¥i, thi·∫øu g√¨ ch·ªó mua",negative
401,Tuy·∫øt Ng·ªçc l√† sao,neutral


In [8]:
# Absolute number of comments per sentiment class.
sentiment_counts = df['sentiment'].value_counts()
# Share of each class, expressed in percent.
sentiment_percent = df['sentiment'].value_counts(normalize=True).mul(100)

# Summary table: one row per class, percentages rounded to two decimals.
summary_columns = {
    'Count': sentiment_counts,
    'Percentage (%)': sentiment_percent.round(2),
}
result = pd.DataFrame(summary_columns)

result

Unnamed: 0_level_0,Count,Percentage (%)
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,214,53.1
neutral,155,38.46
positive,34,8.44


In [9]:
# Keep only the rows whose predicted sentiment is 'positive'.
is_positive = df['sentiment'].eq('positive')
positive_df = df.loc[is_positive]
positive_df

Unnamed: 0,cmtDate,postId,userId,cmtText,cmtLikes,cmtText_clean,sentiment
0,2025-06-02 07:05:22+00:00,1147436614095615,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
3,2025-06-01 10:42:34+00:00,1146548730851070,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
47,2025-06-01 08:35:59+00:00,1146529790852964,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
57,2025-06-01 02:08:26+00:00,1146504504188826,pfbid036J87CPCX7AGfDMEev1yF1ffikf4LrKX6FCzMYsj...,T·∫∑ng office tr·ªçn ƒë·ªùi hay sao ·∫° üòÜ,2,T·∫∑ng office tr·ªçn ƒë·ªùi hay sao ·∫°,positive
60,2025-06-02 04:26:10+00:00,1147413360764607,100003151770365,Content n√†y c√≤n ƒë∆∞·ª£c n√†y add:),4,Content n√†y c√≤n ƒë∆∞·ª£c n√†y add:),positive
76,2025-06-01 02:45:13+00:00,1146504504188826,100030966722781,Office vƒ©nh vi·ªÖn h·∫£ ta,0,Office vƒ©nh vi·ªÖn h·∫£ ta,positive
89,2025-05-30 15:53:43+00:00,1145328240973119,100002315641810,Ai duy·ªát camp n√†y ƒë∆∞·ª£c c≈©ng hay ü•≤,0,Ai duy·ªát camp n√†y ƒë∆∞·ª£c c≈©ng hay,positive
91,2025-05-31 04:47:58+00:00,1145328240973119,pfbid02pAjpumXwThoq3VXRV3GUhKNe9oS5jxZYLczgbzV...,Continued,0,Continued,positive
150,2025-06-01 03:13:23+00:00,1146504504188826,pfbid02JDeG28iipGv6sonh3FxcXfSakH1SyuZJ2BZsqEC...,C∆∞·ªùi v ü§£,4,C∆∞·ªùi v,positive
165,2025-05-30 13:26:56+00:00,1145328240973119,100008200831345,Chak ban may tinh sach tay,0,Chak ban may tinh sach tay,positive


In [10]:
# Write the frame, including the new columns, next to the input file.
# The row index is left out of the CSV.
output_csv = '../../data/apify/comments_apify_processed_with_sentiment.csv'
df.to_csv(output_csv, index=False)