# Predict

In [10]:
import requests
from bs4 import BeautifulSoup
import json
import csv

def get_reviews(product_id, page=1):
    url = f"https://tiki.vn/api/v2/reviews?product_id={product_id}&limit=10&page={page}&include=comments"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "application/json",
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        reviews = data.get('data', [])
        return reviews
    else:
        print(f"Failed to fetch reviews: {response.status_code}")
        return []

def save_reviews_to_file(reviews, filename):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(reviews, f, ensure_ascii=False, indent=4)

def remove_duplicates(reviews):
    # Dùng set để loại bỏ trùng lặp dựa trên nội dung bình luận
    seen = set()
    unique_reviews = []
    for review in reviews:
        content = review.get('content', '').strip()
        if content not in seen:
            seen.add(content)
            unique_reviews.append(review)
    return unique_reviews

def extract_reviews_to_csv(json_file, csv_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    filtered_reviews = []
    
    # Lọc bình luận và nhãn
    for review in data:
        content = review.get("content", "").strip()
        
        if content:
            filtered_reviews.append({"content": content})
    
    # Ghi dữ liệu vào file CSV
    with open(csv_file, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["content"])
        writer.writeheader()
        writer.writerows(filtered_reviews)
    
    print(f"Extracted {len(filtered_reviews)} reviews and saved to {csv_file}")

def main():
    product_id = 154298071  # ID của sản phẩm từ URL
    all_reviews = []
    for page in range(1, 20):  # Crawl 9 trang
        print(f"Fetching page {page}...")
        reviews = get_reviews(product_id, page)
        if not reviews:
            break
        all_reviews.extend(reviews)
    
    print(f"Total reviews fetched: {len(all_reviews)}")

    # Loại bỏ bình luận trùng lặp
    unique_reviews = remove_duplicates(all_reviews)
    print(f"Total unique reviews: {len(unique_reviews)}")

    # Lưu các bình luận đã loại bỏ trùng lặp
    save_reviews_to_file(unique_reviews, "data_test/test_1.json")

    input_file = "data_test/test_1.json"  # Tệp JSON đầu vào
    output_csv = "data_test/test_1.csv"  # Tệp CSV đầu ra
    extract_reviews_to_csv(input_file, output_csv)

if __name__ == "__main__":
    main()


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Total reviews fetched: 190
Total unique reviews: 98
Extracted 97 reviews and saved to data_test/test_1.csv


In [11]:
import pandas as pd

# Đọc tệp stopword_vietnamese.txt và tạo danh sách từ dừng
def load_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stopwords = file.readlines()
    stopwords = [word.strip() for word in stopwords]  # Loại bỏ khoảng trắng thừa
    return stopwords

# Tải stopwords
stop_words = load_stopwords("stopwords_vietnamese.txt")

# Loại bỏ từ dừng
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

# Chuấn hóa dữ liệu
def standardize_data(row):
    # Convert to lowercase
    row = row.lower()
    
    # Remove stopwords (if you have a list of stopwords, you can use it here)
    row = row.replace(",", " ").replace(".", " ") \
        .replace(";", " ").replace("“", " ") \
        .replace(":", " ").replace("”", " ") \
        .replace('"', " ").replace("'", " ") \
        .replace("!", " ").replace("?", " ") \
        .replace("-", " ").replace("?", " ") \
        .replace("(", " ").replace(")", " ")

    row = row.strip()
    return row
# Chưa được xử lý dữ liệu
df = pd.read_csv("data_test/test_1.csv")
print("Dữ liệu chưa được làm sạch: \n",df.head(), end="\n\n")
# Đã được xử lý làm sạch dữ liệu
df['content'] = df['content'].apply(standardize_data).apply(remove_stopwords)
print("Dữ liệu đã được làm sạch: \n",df.head())

# Lưu dữ liệu đã xử lý vào file mới
output_file = "data_test/processed_test_1.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")


Dữ liệu chưa được làm sạch: 
                                              content
0  Giá cả hợp lý. Quần may kỹ, chắc chắn. Cắt chỉ...
1  Sản phẩm quá tuyệt vời, giao hàng nhanh đóng g...
2  Tiki giao 2h Siêu nhanh, kịp nhận quần để tham...
3  hàng cực kì chất lượng,kiểu dáng màu sắc đẹp. ...
4  Tiki giao hàng đúng hạn. Đóng gói đẹp, cẩn thậ...

Dữ liệu đã được làm sạch: 
                                              content
0  giá hợp lý quần may kỹ chắn cắt thừa vệ sinh s...
1  sản phẩm tuyệt vời giao hàng đóng gói siêu cẩn...
2  tiki giao 2h siêu kịp quần tham gia giải chạy ...
3  hàng cực kì chất kiểu dáng màu sắc đẹp hộp gói...
4  tiki giao hàng hạn đóng gói đẹp cẩn thận hàng ...


In [12]:
import pandas as pd
import torch
from transformers import BertTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Đường dẫn file
input_file = "data_test/processed_test_1.csv"  # File đầu vào
output_file = "data_test/output_test_labeled_1.csv"  # File đầu ra

# Khởi tạo model và tokenizer
MODEL_NAME = 'bert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Đưa model về GPU nếu có
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Dataset tùy chỉnh cho batch processing
class CommentDataset(Dataset):
    def __init__(self, comments, tokenizer, max_len=128):
        self.comments = comments
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        comment = str(self.comments[idx])
        encoding = self.tokenizer(
            comment,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

# Hàm dự đoán nhãn
def predict_labels(comments, batch_size=32):
    dataset = CommentDataset(comments, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    all_preds = []
    model.eval()

    for batch in tqdm(dataloader, desc="Processing Batches"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)
            preds = torch.argmax(probs, dim=1).cpu().numpy()
            all_preds.extend(preds)

    return all_preds

# Đọc file đầu vào
try:
    df = pd.read_csv(input_file)
    if "content" not in df.columns:
        raise KeyError("Cột 'content' không tồn tại trong file CSV.")
    print(f"Đã tải dữ liệu từ {input_file} với {len(df)} bình luận.")
except Exception as e:
    raise ValueError(f"Không thể đọc file CSV: {e}")

# Dự đoán nhãn
try:
    comments = df["content"].fillna("").tolist()  # Thay thế giá trị null bằng chuỗi rỗng
    print("Bắt đầu gán nhãn...")
    labels = predict_labels(comments)
    df["label"] = labels
except Exception as e:
    raise RuntimeError(f"Lỗi khi dự đoán nhãn: {e}")

# Lưu kết quả ra file
try:
    df.to_csv(output_file, index=False)
    print(f"Đã lưu file với nhãn tại: {output_file}")
except Exception as e:
    raise IOError(f"Lỗi khi lưu file: {e}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Đã tải dữ liệu từ data_test/processed_test_1.csv với 97 bình luận.
Bắt đầu gán nhãn...


Processing Batches: 100%|██████████| 4/4 [00:11<00:00,  3.00s/it]

Đã lưu file với nhãn tại: data_test/output_test_labeled_1.csv





In [17]:
import pandas as pd
import torch
from transformers import BertTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
def load_pretrainModel(data):
    
    '''
    Load pretrain model/ tokenizers
    Return : features
    '''
    MODEL_NAME = 'bert-base-uncased'
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

    #encode lines
    tokenized = data.apply((lambda x: tokenizer.encode(x, add_special_tokens = True)))

    # get lenght max of tokenized
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    print('max len:', max_len)

    # if lenght of tokenized not equal max_len , so padding value 0
    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    print('padded:', padded[1])
    print('len padded:', padded.shape)

    #get attention mask ( 0: not has word, 1: has word)
    attention_mask = np.where(padded ==0, 0,1)
    print('attention mask:', attention_mask[1])

    # Convert input to tensor
    padded = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)


    # Load model
    with torch.no_grad():
        last_hidden_states = model(padded, attention_mask =attention_mask)
    #     print('last hidden states:', last_hidden_states)

    features = last_hidden_states[0][:,0,:].numpy()
    print('features:', features)
    
    return features


In [22]:
import pandas as pd
import joblib
import numpy as np

def predict(file_path):
    data = pd.read_csv(file_path)
    data['content'] = data['content'].fillna('')  # Replace NaN with empty string
    features = load_pretrainModel(data['content'])
    # 2. Load weights
    model = joblib.load('save_model.pkl')
    # 3. Result
    result = model.predict(features)
    print(result)
    print(analyze(result))

predict("data_test/output_test_labeled_1.csv")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

max len: 64
padded: [  101  2624  6887  3286 10722  6672  2102 29536  2072 27699  2080  6865
  1102  5063  2175  2072  9033 13765  2064  2084  6865  8945  2278 11382
 14163  2050  6865 14841  3211  3393  1102  4887 15030  1037  1062 23564
  2100  1056  5705  4497   102     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
len padded: (97, 64)
attention mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


IndexError: too many indices for tensor of dimension 2