In [None]:
!pip install transformers accelerate
!pip install vncorenlp

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Down

In [None]:
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

--2023-07-04 08:47:59--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2023-07-04 08:47:59 (175 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2023-07-04 08:47:59--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-voc

In [None]:
import os
import re

import numpy as np
import pandas as pd
import torch
from google.colab import drive
from transformers import BertConfig, AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer
from vncorenlp import VnCoreNLP

In [None]:
# Human-readable class names; the position in each list is the class id used
# when printing predictions and gold labels.
labels_task_1 = [
    "no-spam",
    "spam",
]
labels_task_2 = [
    "no-spam",
    "spam-1",
    "spam-2",
    "spam-3",
]

In [None]:
# Mount Google Drive; all project paths below are rooted at DIR_ROOT.
drive.mount('/gdrive')
DIR_ROOT = '/gdrive/MyDrive/SpamReviewDetect'

# Stop-word list consumed by the pre-processing cells.
STOPWORDS_PATH = os.path.join(DIR_ROOT, 'vietnamese-stopwords-dash.txt')

# Directory the model checkpoints are loaded from.
# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race of an explicit os.path.exists() test.
MODEL_DIR = os.path.join(DIR_ROOT, 'transformer_model')
os.makedirs(MODEL_DIR, exist_ok=True)

Mounted at /gdrive


In [None]:
# NOTE(review): PATH_TRAIN, PATH_DEV and PATH_TEST are not assigned in any cell
# visible in this notebook — confirm they are defined before this cell runs.
train_data = pd.read_csv(PATH_TRAIN)
dev_data = pd.read_csv(PATH_DEV)
test_data = pd.read_csv(PATH_TEST)

# Positional split of every frame: columns 0-1 are the features,
# columns 2-3 are the labels.
X_train, y_train = train_data.iloc[:, 0:2], train_data.iloc[:, 2:4]
X_dev, y_dev = dev_data.iloc[:, 0:2], dev_data.iloc[:, 2:4]
X_test, y_test = test_data.iloc[:, 0:2], test_data.iloc[:, 2:4]

In [None]:
# Start VnCoreNLP from the downloaded jar with only the word-segmentation
# ("wseg") annotator, passing -Xmx500m as the Java heap setting.
vncorenlp = VnCoreNLP(
    "vncorenlp/VnCoreNLP-1.1.1.jar",
    annotators="wseg",
    max_heap_size='-Xmx500m',
)

In [None]:
# Load the stop-word list (one entry per line) into a set for O(1) lookups.
# - The file contains Vietnamese text, so decode it explicitly as UTF-8 rather
#   than relying on the platform default encoding.
# - strip() (instead of strip('\n')) also removes '\r' and stray padding, which
#   would otherwise make an entry impossible to match against a split() token.
# - Blank lines are skipped so the set never contains the empty string.
with open(STOPWORDS_PATH, "r", encoding="utf-8") as ins:
    stopwords = {line.strip() for line in ins if line.strip()}

In [None]:
def filter_stop_words(train_sentences, stop_words):
    """Remove every whitespace-separated token that appears in ``stop_words``.

    Args:
        train_sentences: The text to filter, as a single string.
        stop_words: Container of tokens to drop; membership is tested with
            ``in`` and the comparison is exact (case-sensitive).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for token in train_sentences.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


def deEmojify(text):
    """Delete characters that fall in four emoji code-point ranges.

    Only the ranges listed in the character class below are removed; any
    character outside them is left in place.

    Args:
        text: Input string.

    Returns:
        ``text`` with every run of matching characters removed.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_class = "[" + emoji_ranges + "]+"
    return re.sub(emoji_class, r'', text, flags=re.UNICODE)


def preprocess(text, tokenized=True, lowercased=True):
    """Clean a single review text.

    Steps, in order: stop-word removal, emoji stripping, optional lowercasing,
    optional word segmentation.

    Args:
        text: Raw review text.
        tokenized: When true, segment the text with the module-level
            ``vncorenlp`` instance and re-join the tokens with single spaces.
        lowercased: When true, lowercase the text.

    Returns:
        The cleaned text as one string.
    """
    # NOTE(review): stop words are filtered before lowercasing and before
    # segmentation, so only tokens that already match an entry exactly are
    # dropped — confirm this order is intended.
    text = filter_stop_words(text, stopwords)
    text = deEmojify(text)
    if lowercased:
        text = text.lower()
    if tokenized:
        # tokenize() is iterated as one token list per sentence. Sentences are
        # now joined with a space too: plain concatenation glued the last token
        # of one sentence to the first token of the next (e.g. ".Hàng").
        # NOTE(review): if the checkpoints were fine-tuned on the glued form,
        # re-check the metrics after this change.
        sentences = vncorenlp.tokenize(text)
        text = " ".join(" ".join(sentence) for sentence in sentences)
    return text


def pre_process_features(X, y1, y2, tokenized=True, lowercased=True):
    """Pre-process every text in ``X`` and drop samples that end up empty.

    Args:
        X: Sequence of raw texts; each element is converted with ``str``.
        y1: First label sequence, aligned with ``X``.
        y2: Second label sequence, aligned with ``X``.
        tokenized: Forwarded to ``preprocess``.
        lowercased: Forwarded to ``preprocess``.

    Returns:
        A tuple ``(texts, y1, y2)``: the cleaned texts as a list of strings and
        the two label sequences as NumPy arrays, all restricted to the samples
        whose cleaned text is non-empty.
    """
    y1 = np.array(y1)
    y2 = np.array(y2)
    texts = [preprocess(str(p), tokenized=tokenized, lowercased=lowercased)
             for p in np.array(X)]

    # np.delete() returns a new array; the earlier code discarded that result,
    # so empty samples were never actually removed. Build one boolean mask and
    # apply it to the texts and both label arrays so they stay aligned.
    keep = np.array([bool(t) for t in texts], dtype=bool)
    texts = [t for t, kept in zip(texts, keep) if kept]
    return texts, y1[keep], y2[keep]

In [None]:
# Task-1 checkpoint directory, built from MODEL_DIR instead of repeating the
# absolute Drive path (resolves to the same location).
model_task_1_name = os.path.join(MODEL_DIR, "phobert", "task_1")
# The tokenizer comes from the base "vinai/phobert-base" model; use_fast=False
# requests the slow (pure-Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
model_task_1 = AutoModelForSequenceClassification.from_pretrained(model_task_1_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Pre-process the test split: segment the comments but keep their original
# casing. Y holds the 'Label' column, Y_spam the 'SpamLabel' column.
X, Y, Y_spam = pre_process_features(
    X_test['Comment'],
    y_test['Label'],
    y_test['SpamLabel'],
    tokenized=True,
    lowercased=False,
)

In [None]:
# Spot-check: show the 'Label' value of the second pre-processed test sample.
print(Y[1])

0


In [None]:
# Smoke-test the task-1 model on the first sample, then run it over the whole
# pre-processed test set one sample at a time, printing every misclassified
# review with its predicted and gold label.
# Inference only: torch.no_grad() stops autograd from recording a graph for
# each forward pass, which would otherwise waste memory and time.
with torch.no_grad():
    tokenized_input = tokenizer(str(X[0]), truncation=True, padding=True, max_length=100, return_tensors="pt")
    outputs = model_task_1(**tokenized_input)
    predicted_class_id = outputs.logits.argmax().item()
    print(predicted_class_id)

    # result[c] counts the samples predicted as class c; sized from the label
    # list rather than a hard-coded four slots.
    result = [0] * len(labels_task_1)
    sum_false = 0
    for index in range(len(X)):
        tokenized_input = tokenizer(str(X[index]), truncation=True, padding=True, max_length=100, return_tensors="pt")
        outputs = model_task_1(**tokenized_input)
        predicted_class_id = outputs.logits.argmax().item()
        result[predicted_class_id] += 1
        if predicted_class_id != Y[index]:
            sum_false += 1
            print("--------",sum_false,"--------data number------",index)
            print(X[index])
            # Printed as: predicted label, then gold label.
            print(labels_task_1[predicted_class_id],labels_task_1[Y[index]])
print(result)
print(sum_false)

1
-------- 1 --------data number------ 12
❤❤Khuyến mai gói v 120 ❤️❤️ Giá lẻ 120 k / Mua 6 80 k / ố lượt gọi amm ại mạng / Tặng 2 gb / ngày .Hàng đóng_gói cẩn_thận .Hàng đẹp , shop tư_vấn nhiệt_tình .Hàng đóng_gói cẩn_thận .Hàng đẹp , shop tư_vấn nhiệt_tình .Hàng đóng_gói cẩn_thận .Hàng đẹp , shop tư_vấn nhiệt_tình .
no-spam spam
-------- 2 --------data number------ 20
Giá chăng , tiết_kiệm , tiki giao hàng hẹn , giặt chất ưng
no-spam spam
-------- 3 --------data number------ 38
Mua bnh túi đựng k bao h ấy , túi shop rẻ chất chất sờ
no-spam spam
-------- 4 --------data number------ 44
Ok mặt .Quá ok 🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🤪🤪🤪🤪🤪🧐🥳🥳🥸🥸🤪🧐🧐🥳🤩🤨🤩🧐🥳🥳🧐
spam no-spam
-------- 5 --------data number------ 46
Shopeemall gian_dối lừa_đảo à .. đặt ensure giao pediasure nhắn shop 3 trời ko 1 hỗ_trợ phản_hồi đơn .. gọi đt cty cty bảo chờ liên_hệ hỗ_trợ hỗ_trợ lại .. thế đợi jo chẳng đâu .. lừa_đảo
spam no-spam
-------- 6 --------data number------ 51
Ngôi trọng đâu , ngủ giường .Ô_tô đâu , tốc_độ phạt tiền .Túi_xách

In [None]:
# Task-2 checkpoint directory, built from MODEL_DIR instead of repeating the
# absolute Drive path (resolves to the same location).
model_task_2_name = os.path.join(MODEL_DIR, "phobert", "task_2")
model_task_2 = AutoModelForSequenceClassification.from_pretrained(model_task_2_name)

In [None]:
# Run the task-2 model over the whole pre-processed test set one sample at a
# time, printing every misclassified review with its predicted and gold label.
# result[c] counts the samples predicted as class c.
result = [0] * len(labels_task_2)
sum_false = 0
# Inference only: torch.no_grad() stops autograd from recording a graph for
# each forward pass, which would otherwise waste memory and time.
with torch.no_grad():
    for index in range(len(X)):
        tokenized_input = tokenizer(str(X[index]), truncation=True, padding=True, max_length=100, return_tensors="pt")
        outputs = model_task_2(**tokenized_input)
        predicted_class_id = outputs.logits.argmax().item()
        result[predicted_class_id] += 1
        if predicted_class_id != Y_spam[index]:
            sum_false += 1
            print("--------",sum_false,"--------data number------",index)
            print(X[index])
            # Printed as: predicted label, then gold label.
            print(labels_task_2[predicted_class_id],labels_task_2[Y_spam[index]])
print(result)
print(sum_false)

-------- 1 --------data number------ 12
❤❤Khuyến mai gói v 120 ❤️❤️ Giá lẻ 120 k / Mua 6 80 k / ố lượt gọi amm ại mạng / Tặng 2 gb / ngày .Hàng đóng_gói cẩn_thận .Hàng đẹp , shop tư_vấn nhiệt_tình .Hàng đóng_gói cẩn_thận .Hàng đẹp , shop tư_vấn nhiệt_tình .Hàng đóng_gói cẩn_thận .Hàng đẹp , shop tư_vấn nhiệt_tình .
no-spam spam-3
-------- 2 --------data number------ 20
Giá chăng , tiết_kiệm , tiki giao hàng hẹn , giặt chất ưng
no-spam spam-2
-------- 3 --------data number------ 38
Mua bnh túi đựng k bao h ấy , túi shop rẻ chất chất sờ
no-spam spam-2
-------- 4 --------data number------ 41
Kcn anessa k bàn rồi .Mua 1 đôi 1 lọ 60ml + 1 lọ 20ml giá 500k .Đc tặng 1 lọ mini 12ml vs 1 tuýp gel chống nắng 15gr sample nữa .Quá hời
spam-3 no-spam
-------- 5 --------data number------ 44
Ok mặt .Quá ok 🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🤪🤪🤪🤪🤪🧐🥳🥳🥸🥸🤪🧐🧐🥳🤩🤨🤩🧐🥳🥳🧐
spam-3 no-spam
-------- 6 --------data number------ 46
Shopeemall gian_dối lừa_đảo à .. đặt ensure giao pediasure nhắn shop 3 trời ko 1 hỗ_trợ phản_hồi đơn .. gọi đ