In [1]:
!pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsui

In [2]:
import pandas as pd
import numpy as np
import pickle
import re
import torch
from pyvi import ViTokenizer
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import load_model
import warnings
warnings.filterwarnings('ignore')
from joblib import dump, load

## Load training data and models

In [3]:
# Read the training table and coerce the 'processed_text' column to str in a
# single chained expression; the column keeps its original position.
data_train = pd.read_csv("/kaggle/input/dataset/data_train.csv").assign(
    processed_text=lambda frame: frame['processed_text'].astype(str)
)

In [4]:
def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it.

    NOTE: pickle.load can execute arbitrary code, so only point this at
    model files from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pickled objects; later cells index them by aspect name and call .predict().
rf_clf = _read_pickle("/kaggle/input/vietnamese-aspect-based-sentiment-analysis/keras/default/1/random_forest_ad_model.pkl")
svm_clf = _read_pickle("/kaggle/input/vietnamese-aspect-based-sentiment-analysis/keras/default/1/SVM_se.pkl")
# Keras model stored in HDF5 format.
bigru_conv = load_model("/kaggle/input/vietnamese-aspect-based-sentiment-analysis/keras/default/1/bigru_conv1D_model.h5")

## Preprocess input text

In [5]:
# preprocess text
def preprocess_text(input_text):
    """Normalise a raw string and return it tokenized by ViTokenizer.

    Steps, in order: lowercase, remove every character that is neither a
    word character nor whitespace, collapse whitespace runs into single
    spaces (also trimming both ends), then call ViTokenizer.tokenize.
    """
    lowered = input_text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = ' '.join(without_punctuation.split())
    return ViTokenizer.tokenize(single_spaced)

## Embedding

In [7]:
# tfidf embedding
# The vectorizer is fitted on the training corpus at notebook run time;
# fit() returns the estimator itself, so construction and fitting are chained.
emb = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    max_features=3000,
    sublinear_tf=True,
).fit(data_train['processed_text'])
# Alternative kept from the original: load a previously dumped vectorizer.
# emb = load("/kaggle/working/tfidf_embedding.joblib")

In [8]:
# phoBert embedding
# Tokenizer and encoder are loaded from the same Hugging Face checkpoint, so
# the identifier is named once and reused.
PHOBERT_CHECKPOINT = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(PHOBERT_CHECKPOINT)

def get_embedding(sentence):
    """Encode `sentence` with the module-level tokenizer/model pair.

    The text is tokenized to PyTorch tensors, truncated and padded to
    exactly 256 positions, and passed through `model` with gradient
    tracking disabled. The first element of the model output is sliced at
    sequence position 0 and returned as a NumPy array.

    NOTE(review): outputs[0] is presumably the last hidden state, making the
    result the start-token vector with shape (batch, hidden) — confirm.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    with torch.no_grad():
        first_output = model(**encoded)[0]
        # Keep every batch row, only sequence position 0, all features.
        return first_output[:, 0, :].numpy()

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Predict

In [10]:
# Every column of data_train except the last one is treated as an aspect name.
# NOTE(review): assumes 'processed_text' is the final column of the CSV and
# that all preceding columns are aspect labels — confirm against the dataset.
aspect_columns = data_train.columns[:-1]

In [11]:
# machine learning predict
# Maps a numeric class label to its sentiment name; 0 means "no sentiment".
replacements = {0: None, 3: 'positive', 1: 'negative', 2: 'neutral'}
def ml_predict(ml_embedding_text, aspects=None, detectors=None, classifiers=None):
    """Two-stage aspect/sentiment prediction for a single embedded sample.

    For each aspect, the detector decides whether the aspect is present
    (label 1). If it is, the matching classifier predicts a numeric
    sentiment label that is translated through `replacements`. The
    'others' aspect has no classifier and is always reported as 'neutral'.

    Parameters
    ----------
    ml_embedding_text
        Feature representation of one sample, passed unchanged to every
        model's ``predict``. Only the first prediction of each model is
        used, so additional rows are ignored.
    aspects : iterable of str, optional
        Aspect names to evaluate. Defaults to the notebook-level
        ``aspect_columns``.
    detectors : mapping, optional
        ``aspect -> model`` whose ``predict`` yields 1 when the aspect is
        present. Defaults to the notebook-level ``rf_clf``.
    classifiers : mapping, optional
        ``aspect -> model`` whose ``predict`` yields a key of
        ``replacements``. Defaults to the notebook-level ``svm_clf``.

    Returns
    -------
    dict
        ``{aspect: sentiment}`` for detected aspects only. Aspects whose
        sentiment label maps to ``None`` are omitted, matching what
        ``dl_predict`` does.

    Raises
    ------
    KeyError
        If a classifier returns a label that is not in ``replacements``.
    """
    # Resolve the defaults lazily so the globals are only needed when the
    # caller does not inject its own models.
    if aspects is None:
        aspects = aspect_columns
    if detectors is None:
        detectors = rf_clf
    if classifiers is None:
        classifiers = svm_clf

    ml_results = {}
    for aspect in aspects:
        # predict() returns one label per input row; read the first label
        # explicitly instead of relying on the truthiness of `array == 1`.
        if detectors[aspect].predict(ml_embedding_text)[0] != 1:
            continue
        if aspect == 'others':
            ml_results['others'] = 'neutral'
            continue
        svm_pred = classifiers[aspect].predict(ml_embedding_text)
        sentiment = replacements[svm_pred[0]]
        # Label 0 maps to None ("no sentiment"); leave such aspects out
        # rather than reporting "aspect : None".
        if sentiment is not None:
            ml_results[aspect] = sentiment
    return ml_results

In [17]:
# deep learning predict
def dl_predict(dl_embedding_text):
    """Predict a sentiment per aspect with the `bigru_conv` model.

    The model output is reduced with argmax over its last axis, and the
    labels of the first sample are paired positionally with
    `aspect_columns` and translated through `replacements`. Aspects whose
    translated sentiment is falsy (label 0 -> None) are left out.

    NOTE(review): assumes the model output is (batch, n_aspects, n_classes)
    with aspects in the same order as `aspect_columns` — confirm.
    """
    class_scores = bigru_conv.predict(dl_embedding_text)
    # Only the first sample of the batch is used.
    first_sample_labels = np.argmax(class_scores, axis=-1).tolist()[0]
    named_sentiments = (
        (aspect, replacements[label])
        for aspect, label in zip(aspect_columns, first_sample_labels)
    )
    return {aspect: sentiment for aspect, sentiment in named_sentiments if sentiment}

In [20]:
# Read one line of text from the user and normalise/tokenize it.
input_text = input()
processed_text = preprocess_text(input_text)

# Build both feature representations: row 0 of the TF-IDF matrix for the
# classical models and the PhoBERT vector for the neural model.
ml_embedding_text = emb.transform([processed_text])[0]
dl_embedding_text = get_embedding(processed_text)

ml_results = ml_predict(ml_embedding_text)
dl_results = dl_predict(dl_embedding_text)

# Print both result dictionaries in the same "==> aspect : sentiment" layout.
for heading, results in (("ml_predict:", ml_results), ("dl_predict:", dl_results)):
    print(heading)
    for key, value in results.items():
        print(f"==> {key} : {value}")

 nhân viên hỗ trợ nhiệt tình, shipper giao hàng nhanh, chất liệu tạm được, nói chung là bình thường


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
ml_predict:
==> delivery : positive
==> fabric_quality : neutral
dl_predict:
==> delivery : positive
