In [10]:
!pip install --upgrade datasets transformers



In [29]:
import spacy
import nltk
from datasets import load_dataset
import random

# Tải mô hình spaCy tiếng Anh
nlp = spacy.load("en_core_web_sm")

# Tải dữ liệu IMDB
dataset = load_dataset("imdb")
reviews_train = dataset["train"].select(range(1000))

# Lemma dạng gốc để khớp chính xác
aspect_keywords = {
    "plot", "story", "acting", "actor", "actress", "performance", "character",
    "dialogue", "visual", "cinematography", "music", "soundtrack", "direction",
    "ending", "scene"
}

In [12]:
def extract_noun_phrase_aspects(text, min_len=1, max_len=4):
    doc = nlp(text)
    aspects = []
    for chunk in doc.noun_chunks:

        phrase = chunk.text.lower().strip()
        if min_len <= len(phrase.split()) <= max_len:
            aspects.append(phrase)
    return aspects

example = "The plot was weak, but the visual effects and soundtrack were amazing"
aspects = extract_noun_phrase_aspects(example)
print(aspects)

['the plot', 'the visual effects', 'soundtrack']


In [13]:
def extract_aspects_combined(text, aspect_keywords):
    found_kw = [kw for kw in aspect_keywords if kw in text.lower()]
    found_np = extract_noun_phrase_aspects(text)
    return list(set(found_kw + found_np))

In [14]:
import nltk
import re
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

def get_subtree_for_aspect_extended(sentence, aspect, extra_window=2):
    doc = nlp(sentence)
    aspect_lower = aspect.lower()

    for token in doc:
        if aspect_lower in token.text.lower():
            # Nếu aspect là part of chunk thì dùng root chunk luôn
            subtree = list(token.subtree)
            start = max(0, subtree[0].i - extra_window)
            end = min(len(doc), subtree[-1].i + extra_window + 1)
            return doc[start:end].text.strip()

    return sentence.strip()

def get_sentiment_for_aspect(sentence, aspect):
    """Dự đoán cảm xúc cho 1 khía cạnh bằng VADER"""
    clause = get_subtree_for_aspect_extended(sentence, aspect)
    score = sid.polarity_scores(clause)
    compound = score['compound']

    if compound >= 0.05:
        return "positive"
    elif compound <= -0.05:
        return "negative"
    else:
        return "neutral"


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
aspect_sentiment_data_train = []

for entry in reviews_train:
    sent = entry["text"]
    aspects = extract_aspects_combined(sent, aspect_keywords)
    for aspect in aspects:
        sentiment = get_sentiment_for_aspect(sent, aspect)
        if sentiment != "neutral":
            aspect_sentiment_data_train.append({
                "sentence": sent,
                "aspect": aspect,
                "sentiment": sentiment
            })

for entry in aspect_sentiment_data_train[:11]:
    print(f"Sentence : {entry['sentence']}")
    print(f"Aspect   : {entry['aspect']}")
    print(f"Sentiment: {entry['sentiment']}")
    print("-" * 60)



Sentence : I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

In [16]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import pandas as pd

df_train = pd.DataFrame(aspect_sentiment_data_train)
print(df_train.head())
print(df_train['sentiment'].value_counts())
print(df_train['aspect'].value_counts())
print("Số mẫu tổng cộng:", len(df_train))


                                            sentence             aspect  \
0  I rented I AM CURIOUS-YELLOW from my video sto...  ordinary denizens   
1  I rented I AM CURIOUS-YELLOW from my video sto...          some sort   
2  I rented I AM CURIOUS-YELLOW from my video sto...       u.s. customs   
3  I rented I AM CURIOUS-YELLOW from my video sto...  the average swede   
4  I rented I AM CURIOUS-YELLOW from my video sto...  her drama teacher   

  sentiment  
0  negative  
1  negative  
2  negative  
3  negative  
4  negative  
sentiment
negative    20392
positive    17653
Name: count, dtype: int64
aspect
this movie           366
it                   354
this film            286
the film             254
i                    251
                    ... 
the average swede      1
u.s. customs           1
ordinary denizens      1
a wes craven           1
frat boy               1
Name: count, Length: 22617, dtype: int64
Số mẫu tổng cộng: 38045


In [28]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

# Hàm chuẩn hóa aspect bằng lemmatization
def normalize_aspect(aspect):
    doc = nlp(aspect.lower())
    return " ".join([
        token.lemma_ for token in doc
        if not token.is_punct and token.is_alpha
    ])

df_train["aspect"] = df_train["aspect"].apply(normalize_aspect)

# Bỏ các aspect chứa 'movie' hoặc 'film'
df_train = df_train[~df_train["aspect"].str.contains(r"\b(movie|film)\b", case=False)].reset_index(drop=True)

# Thống kê lại số lượng
aspect_counts_train = df_train["aspect"].value_counts()

# Chỉ giữ lại những aspect có >= 10 mẫu
valid_aspects_train = set(aspect_counts_train[aspect_counts_train >= 10].index)
df_train = df_train[df_train["aspect"].isin(valid_aspects_train)].reset_index(drop=True)

print(df_train.head())
print(df_train['sentiment'].value_counts())
print(df_train['aspect'].value_counts())
print("Số mẫu tổng cộng:", len(df_train))


                                            sentence     aspect sentiment
0  I rented I AM CURIOUS-YELLOW from my video sto...  some sort  negative
1  I rented I AM CURIOUS-YELLOW from my video sto...             positive
2  I rented I AM CURIOUS-YELLOW from my video sto...     people  negative
3  I rented I AM CURIOUS-YELLOW from my video sto...      a fan  negative
4  I rented I AM CURIOUS-YELLOW from my video sto...             negative
sentiment
negative    5152
positive    4682
Name: count, dtype: int64
aspect
I                385
                 359
it               355
they             216
he               204
                ... 
the moment        10
kevin bacon       10
the sequel        10
the scarecrow     10
kevin spacey      10
Name: count, Length: 309, dtype: int64
Số mẫu tổng cộng: 9834


  df_train = df_train[~df_train["aspect"].str.contains(r"\b(movie|film)\b", case=False)].reset_index(drop=True)


In [31]:
df_train.to_csv("/content/drive/MyDrive/Dataset/imdb_aspect_sentiment_train.csv", index=False)

In [34]:
reviews_test  = dataset["test"].select(range(300))

aspect_sentiment_data_test = []

for entry in reviews_test:
    sent = entry["text"]
    aspects = extract_aspects_combined(sent, aspect_keywords)
    for aspect in aspects:
        sentiment = get_sentiment_for_aspect(sent, aspect)
        if sentiment != "neutral":
            aspect_sentiment_data_test.append({
                "sentence": sent,
                "aspect": aspect,
                "sentiment": sentiment
            })

for entry in aspect_sentiment_data_test[:11]:
    print(f"Sentence : {entry['sentence']}")
    print(f"Aspect   : {entry['aspect']}")
    print(f"Sentiment: {entry['sentiment']}")
    print("-" * 60)

df_test = pd.DataFrame(aspect_sentiment_data_test)
print(df_test.head())
print(df_test['sentiment'].value_counts())
print(df_test['aspect'].value_counts())
print("Số mẫu tổng cộng:", len(df_test))

df_test["aspect"] = df_test["aspect"].apply(normalize_aspect)

df_test = df_test[~df_test["aspect"].str.contains(r"\b(movie|film)\b", case=False)].reset_index(drop=True)

# Thống kê lại số lượng
aspect_counts_test = df_test["aspect"].value_counts()

# Chỉ giữ lại những aspect có >= 10 mẫu
valid_aspects_test = set(aspect_counts_test[aspect_counts_test >= 10].index)
df_test = df_test[df_test["aspect"].isin(valid_aspects_test)].reset_index(drop=True)

print(df_test.head())
print(df_test['sentiment'].value_counts())
print(df_test['aspect'].value_counts())
print("Số mẫu tổng cộng:", len(df_test))

df_test.to_csv("/content/drive/MyDrive/Dataset/imdb_aspect_sentiment_test.csv", index=False)

Sentence : I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they ha

  df_test = df_test[~df_test["aspect"].str.contains(r"\b(movie|film)\b", case=False)].reset_index(drop=True)
