In [14]:
import re
import nltk
nltk.data.path.append('/Users/parkjuyong/nltk_data')
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import spacy

In [22]:
# NLTK 리소스 다운로드 (최초 1회)
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/parkjuyong/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/parkjuyong/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/parkjuyong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parkjuyong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/parkjuyong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
reviews =[
    "I loved the movie! 😄 The plot was amazing, but the ending was horrible...",
    "Terrible movie. Waste of time! <br> 0/10",
    "It was okay; not great, not bad. Could have been better!"
]

In [24]:
# POS 기반 Lemmatization
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'): # 형용사
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [25]:
def preprocess_text(text):
    #(1)노이즈 제거
    #HTML 태그 제거 : [정규식].*? => 0개 이상의 임의 문자
    text = re.sub(r'<.*?>', '', text)
    #이모지, 특수문자 제거 : [정규식] \w : 단어문자 (영문자 a-z, A-Z, 숫자 0-9, 밑줄 _)
    text = re.sub(r'[^\w\s]', '', text)
    #숫자 제거
    text = re.sub(r'\d+', '', text)
    #공백 정리
    text = re.sub(r'\s+', ' ', text).strip()

    #(2)토큰화
    tokens = nltk.word_tokenize(text)

    #(3)불용어 제거
    stop_words = set(stopwords.words('english'))
    tokens2 = [t for t in tokens if t.lower() not in stop_words]

    #(4)어간 추출
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(t) for t in tokens]

    #(5)표제어 추출
    pos_tags = pos_tag(tokens) #표제어 추출에 사용할 품사 태깅
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_tokens_POS = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]

    return {
        'original': text,
        'tokens': tokens2,
        'stemmed': stemmed_tokens,
        'lemmatized': lemmatized_tokens,
        'lemmatized_POS': lemmatized_tokens_POS
    }

In [26]:
# 전처리 실행
preprocessed_reviews = [preprocess_text(r) for r in reviews]

In [27]:
# 결과 확인
for i, result in enumerate(preprocessed_reviews):
    print(f"\n---Review {i+1} ---")
    print("Originla:", result['original'])
    print("Stopword removed Tokens:", result['tokens'])
    print("Stemmed:", result['stemmed'])
    print("Lemmatized:", result['lemmatized'])
    print("Lemmatized_POS:", result['lemmatized_POS'])


---Review 1 ---
Originla: I loved the movie The plot was amazing but the ending was horrible
Stopword removed Tokens: ['loved', 'movie', 'plot', 'amazing', 'ending', 'horrible']
Stemmed: ['i', 'love', 'the', 'movi', 'the', 'plot', 'wa', 'amaz', 'but', 'the', 'end', 'wa', 'horribl']
Lemmatized: ['I', 'loved', 'the', 'movie', 'The', 'plot', 'wa', 'amazing', 'but', 'the', 'ending', 'wa', 'horrible']
Lemmatized_POS: ['I', 'love', 'the', 'movie', 'The', 'plot', 'be', 'amaze', 'but', 'the', 'end', 'be', 'horrible']

---Review 2 ---
Originla: Terrible movie Waste of time
Stopword removed Tokens: ['Terrible', 'movie', 'Waste', 'time']
Stemmed: ['terribl', 'movi', 'wast', 'of', 'time']
Lemmatized: ['Terrible', 'movie', 'Waste', 'of', 'time']
Lemmatized_POS: ['Terrible', 'movie', 'Waste', 'of', 'time']

---Review 3 ---
Originla: It was okay not great not bad Could have been better
Stopword removed Tokens: ['okay', 'great', 'bad', 'Could', 'better']
Stemmed: ['it', 'wa', 'okay', 'not', 'great', 