In [2]:
import openai
import json
import pandas as pd
import random
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Read the cleaned reviews from the CSV file and preview the first rows of the review text column
df = pd.read_csv('amazon-reviews/cleaned_reviews.csv')
print(df["cleaned_review"].head())

0    i wish would have gotten one earlier love it a...
1    i ve learned this lesson again open the packag...
2            it is so slow and lags find better option
3    roller ball stopped working within months of m...
4    i like the color and size but it few days out ...
Name: cleaned_review, dtype: object


In [None]:
# Collect the non-missing cleaned reviews as a plain Python list of strings.
reviews = (
    df['cleaned_review']
    .dropna()
    .astype(str)
    .tolist()
)

## Split the review list into consecutive fixed-size batches
def chunk_reviews(reviews, chunk_size=10):
    """Yield successive slices of ``reviews`` with at most ``chunk_size`` items.

    Args:
        reviews: A sliceable sequence (anything supporting ``len`` and slicing).
        chunk_size: Number of items per slice; the final slice may be shorter.

    Yields:
        ``reviews[start:start + chunk_size]`` for start = 0, chunk_size, 2*chunk_size, ...
    """
    starts = range(0, len(reviews), chunk_size)
    yield from (reviews[start:start + chunk_size] for start in starts)

sample_size = 20  # number of reviews to sample; adjust as needed
SAMPLE_SEED = 42  # fixed seed so the same reviews are drawn on every run
if len(reviews) > sample_size:
    # Use a dedicated seeded generator rather than the global `random` state:
    # the sample is reproducible and other users of `random` are unaffected.
    reviews = random.Random(SAMPLE_SEED).sample(reviews, sample_size)

# Preprocessing setup: resources used to remove stopwords and lemmatize words
# while keeping the original meaning of each review.

# Download the required NLTK corpora if they are not available yet
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_review(review):
    """Normalize one review string before tokenization.

    Steps, in order: strip URLs and HTML tags, drop characters outside the
    allowed set (letters including the À-ỹ range, digits, basic punctuation,
    whitespace, hyphen), collapse whitespace, lowercase, split on whitespace,
    drop words found in the module-level ``stop_words`` and lemmatize the rest
    with the module-level ``lemmatizer``.

    Punctuation stays attached to its word because splitting is on whitespace
    only, so a stopword followed directly by punctuation is not removed.

    Args:
        review: Raw review text (str).

    Returns:
        The cleaned review as one space-joined string; may be empty.
    """
    # URLs and HTML tags must be removed first: their patterns depend on '/',
    # '<' and '>', which the character filter below deletes. They are replaced
    # by a space so the neighbouring words are not glued together.
    review = re.sub(r'https?://\S+|www\.\S+', ' ', review)
    review = re.sub(r'<.*?>', ' ', review)
    # Drop special characters; keep letters, digits and basic punctuation
    review = re.sub(r"[^a-zA-ZÀ-ỹ0-9,.!?;:\s\-]", '', review)
    # Collapse runs of whitespace
    review = re.sub(r'\s+', ' ', review).strip()
    # Lowercase so the stopword lookup is case-insensitive
    review = review.lower()
    # Remove stopwords and lemmatize what remains
    kept_words = [
        lemmatizer.lemmatize(word)
        for word in review.split()
        if word not in stop_words
    ]
    # Join back into a single string
    return ' '.join(kept_words)

# Run every entry of `reviews` through preprocess_review, then show the result.
reviews = list(map(preprocess_review, reviews))

print("Processed reviews:", reviews)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hoangtuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/hoangtuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processed reviews: ['turn night light backlit keyboard sufficient enough see key letter number touch keyboard quiet enough roommate sleep soundly', 'great deal price sound amazing', 'bought february already stopped working headset stand place done using like tossed aside like use every day either eta customer service sent replacement nice work', 'cheap quality really comfortable mic good literally snap crackle pop go put head', 'mouse light fit hand well fun watch color slowly change however stayed charged long cannot used charging', 'really good', 'wireless mode nice sleek design purchased matte design love click quieter others think mattered much problem cursor speed connectivity enjoy change color feel expensive actually excellent price', 'great gaming', 'presentation work plug lap top television use mouse advance slide', 'awesome mouse slender build fit small hand light totally fun click button strong click know clicked something also quick response computer switch plus automatic s

In [None]:
%pip install torch

from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "sshleifer/distilbart-cnn-12-6" checkpoint,
# resolved by name through transformers' AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

def tokenize_reviews(reviews, max_length=512):
    """Encode each review with the module-level ``tokenizer``.

    Every review is passed to ``tokenizer.encode`` with truncation enabled,
    padding set to ``'max_length'`` and ``return_tensors='pt'``.

    Args:
        reviews: Iterable of review strings.
        max_length: Value forwarded as ``max_length`` to ``tokenizer.encode``.

    Returns:
        A list with one ``tokenizer.encode`` result per review, in input order
        (presumably PyTorch tensors given ``return_tensors='pt'`` — confirm
        against the tokenizer in use).
    """
    encode_options = dict(
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    return [tokenizer.encode(text, **encode_options) for text in reviews]

# Tokenize the current `reviews` list with the default max_length and print the result.
tokenized_reviews = tokenize_reviews(reviews)
print("Tokenized reviews:", tokenized_reviews)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:
## Split the tokenized reviews into train and test sets (test_size=0.2, fixed random_state)

from sklearn.model_selection import train_test_split
train_reviews, test_reviews = train_test_split(tokenized_reviews, test_size=0.2, random_state=42)
print("Train reviews:", train_reviews)

In [None]:
## Save the tokenized data to disk

import torch

# NOTE(review): this persists the full tokenized list, not the train/test split — confirm that is intended.
torch.save(tokenized_reviews, 'tokenized_reviews.pt')
