In [3]:
import re
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, RobertaForSequenceClassification

In [4]:
df = pd.read_csv('../processing_data/comment_data_processed.csv')
df

Unnamed: 0,cmtDate,postId,userId,cmtText,cmtLikes
0,2025-06-02 07:05:22+00:00,1147436614095615,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0
1,2025-06-02 07:04:14+00:00,1147436614095615,100082365036655,Mua laptop,0
2,2025-06-02 06:33:15+00:00,1147436614095615,100010438388165,hello,0
3,2025-06-01 10:42:34+00:00,1146548730851070,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0
4,2025-06-01 09:24:03+00:00,1146548730851070,100082365036655,Mua 2 c√°i laptop,0
...,...,...,...,...,...
398,2025-05-30 12:42:12+00:00,1145328240973119,pfbid02BEajmCmXeonadqLrSeGzerdkWsy7jHBKL95oRbM...,Trung L√™,0
399,2025-05-30 19:22:30+00:00,1145328240973119,pfbid02B4RgbbDb9uC7z5UweLdB1WYKbES6ch9WMXeeNJn...,Minh Thu·ª∑ =)),0
400,2025-05-30 14:39:11+00:00,1145328240973119,pfbid02r9C46AVkPbc6wBicTGD1ZBQWz8dUpAP81L5zEUa...,"Kh√¥ng b√°n th√¨ th√¥i, thi·∫øu g√¨ ch·ªó mua ü§®",2
401,2025-05-30 12:25:48+00:00,1145328240973119,pfbid0ZrR4v7BcVnZyerN5RdYBWAt6wyuy3kWeqdShbS18...,Tuy·∫øt Ng·ªçc l√† sao,0


In [5]:
def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # m·∫∑t c∆∞·ªùi
        u"\U0001F300-\U0001F5FF"  # bi·ªÉu t∆∞·ª£ng kh√°c
        u"\U0001F680-\U0001F6FF"  # ph∆∞∆°ng ti·ªán
        u"\U0001F1E0-\U0001F1FF"  # qu·ªëc k·ª≥
        u"\U00002702-\U000027B0"  # k√Ω hi·ªáu ƒë·∫∑c bi·ªát
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

df['cmtText_clean'] = df['cmtText'].apply(remove_emoji)

In [6]:
model_name = "hieudinhpro/BERT_Sentiment_Vietnamese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [7]:
labels = ['negative', 'neutral', 'positive']

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        return labels[pred]

In [8]:
df['sentiment'] = df['cmtText_clean'].apply(predict_sentiment)
df

Unnamed: 0,cmtDate,postId,userId,cmtText,cmtLikes,cmtText_clean,sentiment
0,2025-06-02 07:05:22+00:00,1147436614095615,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
1,2025-06-02 07:04:14+00:00,1147436614095615,100082365036655,Mua laptop,0,Mua laptop,neutral
2,2025-06-02 06:33:15+00:00,1147436614095615,100010438388165,hello,0,hello,neutral
3,2025-06-01 10:42:34+00:00,1146548730851070,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
4,2025-06-01 09:24:03+00:00,1146548730851070,100082365036655,Mua 2 c√°i laptop,0,Mua 2 c√°i laptop,neutral
...,...,...,...,...,...,...,...
398,2025-05-30 12:42:12+00:00,1145328240973119,pfbid02BEajmCmXeonadqLrSeGzerdkWsy7jHBKL95oRbM...,Trung L√™,0,Trung L√™,positive
399,2025-05-30 19:22:30+00:00,1145328240973119,pfbid02B4RgbbDb9uC7z5UweLdB1WYKbES6ch9WMXeeNJn...,Minh Thu·ª∑ =)),0,Minh Thu·ª∑ =)),neutral
400,2025-05-30 14:39:11+00:00,1145328240973119,pfbid02r9C46AVkPbc6wBicTGD1ZBQWz8dUpAP81L5zEUa...,"Kh√¥ng b√°n th√¨ th√¥i, thi·∫øu g√¨ ch·ªó mua ü§®",2,"Kh√¥ng b√°n th√¨ th√¥i, thi·∫øu g√¨ ch·ªó mua ü§®",negative
401,2025-05-30 12:25:48+00:00,1145328240973119,pfbid0ZrR4v7BcVnZyerN5RdYBWAt6wyuy3kWeqdShbS18...,Tuy·∫øt Ng·ªçc l√† sao,0,Tuy·∫øt Ng·ªçc l√† sao,neutral


In [9]:
df['sentiment'].value_counts()

sentiment
negative    211
neutral     158
positive     34
Name: count, dtype: int64

In [10]:
positive_df = df[df['sentiment'] == 'positive']
positive_df

Unnamed: 0,cmtDate,postId,userId,cmtText,cmtLikes,cmtText_clean,sentiment
0,2025-06-02 07:05:22+00:00,1147436614095615,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
3,2025-06-01 10:42:34+00:00,1146548730851070,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
47,2025-06-01 08:35:59+00:00,1146529790852964,pfbid036432FUUhBiAzqFUkjSn8ZR3JqpSz9qfJyZ9qFwA...,R·∫•t tuy·ªát v·ªùi,0,R·∫•t tuy·ªát v·ªùi,positive
57,2025-06-01 02:08:26+00:00,1146504504188826,pfbid036J87CPCX7AGfDMEev1yF1ffikf4LrKX6FCzMYsj...,T·∫∑ng office tr·ªçn ƒë·ªùi hay sao ·∫° üòÜ,2,T·∫∑ng office tr·ªçn ƒë·ªùi hay sao ·∫°,positive
60,2025-06-02 04:26:10+00:00,1147413360764607,100003151770365,Content n√†y c√≤n ƒë∆∞·ª£c n√†y add:),4,Content n√†y c√≤n ƒë∆∞·ª£c n√†y add:),positive
76,2025-06-01 02:45:13+00:00,1146504504188826,100030966722781,Office vƒ©nh vi·ªÖn h·∫£ ta,0,Office vƒ©nh vi·ªÖn h·∫£ ta,positive
89,2025-05-30 15:53:43+00:00,1145328240973119,100002315641810,Ai duy·ªát camp n√†y ƒë∆∞·ª£c c≈©ng hay ü•≤,0,Ai duy·ªát camp n√†y ƒë∆∞·ª£c c≈©ng hay ü•≤,positive
91,2025-05-31 04:47:58+00:00,1145328240973119,pfbid02pAjpumXwThoq3VXRV3GUhKNe9oS5jxZYLczgbzV...,Continued,0,Continued,positive
125,2025-06-01 04:56:57+00:00,1146504504188826,pfbid028gpapzcTZrVUrLtzbEVmEuuvTHgwEwRc68ocPdp...,Co ten t√†o b√≠ ƒëao ü§£,0,Co ten t√†o b√≠ ƒëao ü§£,positive
150,2025-06-01 03:13:23+00:00,1146504504188826,pfbid02JDeG28iipGv6sonh3FxcXfSakH1SyuZJ2BZsqEC...,C∆∞·ªùi v ü§£,4,C∆∞·ªùi v ü§£,positive


In [11]:
df.to_csv('../processing_data/comment_data_processed_with_sentiment.csv', index=False)