In [1]:
!pip install nltk spacy transformers pandas torch

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [2]:
import pandas as pd
import numpy as np
import spacy
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [4]:
# Định nghĩa từ điển contractions
contractions = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "let's": "let us",
    "might've": "might have",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

In [5]:
# Hàm mở rộng contractions
def expand_contractions(text):
    for contraction, expanded in contractions.items():
        text = re.sub(r'\b' + contraction + r'\b', expanded, text, flags=re.IGNORECASE)
    return text

# Hàm loại bỏ HTML tags
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Hàm loại bỏ ký tự đặc biệt và số
def remove_special_letters(text, remove_digits=True):
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    return text

# Hàm tiền xử lý văn bản
def preprocess_text_batch(texts):
    # Chuyển thành chuỗi và mở rộng contractions
    texts = [expand_contractions(str(t)) for t in texts]
    # Loại bỏ HTML tags
    texts = [strip_html(t) for t in texts]
    # Chuyển thành chữ thường và loại bỏ ký tự đặc biệt
    texts = [remove_special_letters(t, remove_digits=True).lower() for t in texts]
    
    # Loại bỏ stopwords
    stop_words = set(stopwords.words('english'))
    
    # Tokenization và POS tagging với spaCy, sau đó lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = []
    for doc in nlp.pipe(texts, batch_size=1000):
        # Lấy POS tag và lemmatize dựa trên POS
        tokenized = []
        for token in doc:
            if token.text not in stop_words:
                # Chuyển đổi POS tag của spaCy sang định dạng WordNet
                pos = get_wordnet_pos(token.pos_)
                if pos:  # Chỉ lemmatize nếu có POS hợp lệ
                    lemma = lemmatizer.lemmatize(token.text, pos=pos)
                else:
                    lemma = lemmatizer.lemmatize(token.text)
                tokenized.append(lemma)
        tokens.append(tokenized)
    return tokens

# Hàm ánh xạ POS tag của spaCy sang WordNet POS
def get_wordnet_pos(spacy_pos):
    from nltk.corpus import wordnet
    if spacy_pos in ['ADJ']:
        return wordnet.ADJ
    elif spacy_pos in ['VERB']:
        return wordnet.VERB
    elif spacy_pos in ['NOUN']:
        return wordnet.NOUN
    elif spacy_pos in ['ADV']:
        return wordnet.ADV
    return None

In [7]:
# Đọc dữ liệu
imdb_data = pd.read_csv('/kaggle/input/imdb-datasetin/IMDB Dataset(in).csv')
# Lưu văn bản gốc
imdb_data['original_review'] = imdb_data['review']

In [8]:
# Áp dụng tiền xử lý
print("Đang tiền xử lý dữ liệu...")
texts = imdb_data['original_review'].tolist()
preprocessed_texts = preprocess_text_batch(texts)
imdb_data['review'] = preprocessed_texts
print("Tiền xử lý hoàn tất!")

Đang tiền xử lý dữ liệu...
Tiền xử lý hoàn tất!


In [11]:
# Lưu kết quả tiền xử lý
output_df = pd.DataFrame({
    # 'review_id': range(1, len(imdb_data) + 1),
    # 'original_review': imdb_data['original_review'],
    'sentiment': imdb_data['sentiment'],
    'preprocessed_tokens': [" ".join(tokens) if isinstance(tokens, list) else "" for tokens in preprocessed_texts]
})
output_file = '/kaggle/working/preprocessed_results.csv'
output_df.to_csv(output_file, index=False)

In [12]:
# Xem trước kết quả
print("\nXem trước DataFrame đầu ra:")
print(output_df.head(5))


Xem trước DataFrame đầu ra:
  sentiment                                preprocessed_tokens
0  positive  one reviewer mention watch   oz episode hook r...
1  positive  wonderful little production filming technique ...
2  positive  think wonderful way spend time hot summer week...
3  negative  basically family little boy jake think zombie ...
4  positive  petter matteis love time money visually stunni...
