In [52]:
import nltk
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import random

In [53]:
# 영화 리뷰와 해당 라벨을 데이터셋으로 로드
nltk.download('movie_reviews')
nltk.download('stopwords')

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# 데이터를 셔플링하여 무작위로 섞음
random.shuffle(documents)

# 데이터 샘플 확인
sample_review, sample_label = documents[0]
print(f"Review Sample: {' '.join(sample_review[:100])}...")
print(f"Label: {sample_label}")

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Review Sample: " gattaca " represents a solid breakthrough in the recent onslaught of science - fiction films -- it ' s a genre picture that doesn ' t rely on alien creatures or loud explosions to tell its story . the movie takes place in a futuristic world where babies are created through genetic tampering and not sexual reproduction . this allows parents to predetermine what kind of eye color , intelligence and life span they ' d like for their child , and also eliminates most pesky chances of health defects . those made the old - fashioned way are...
Label: pos


In [54]:
# 리뷰 텍스트와 라벨 분리하고, 불용어 제거
reviews = [' '.join([word for word in review if word.lower() not in stop_words]) for review, category in documents]

reviews

Output hidden; open in https://colab.research.google.com to view.

In [55]:
# 라벨을 분리
label = [category for review, category in documents]

In [56]:
# 학습에 필요한 리뷰와 라벨을 분리하여 진행
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(reviews)
y = label

X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, random_state=111)

# NaiveBayes 분류기 학습
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# 모델 정확도를 예측
acc = accuracy_score(y_test, y_pred)
print(f"Base Model Accuarcy : {acc:.4f}")

Base Model Accuarcy : 0.8100


### 전처리를 어떤 식으로 하면 좋을까?
- 단어 정규화를 통해 성능 향상
- 어간, 표제어 추출하는 방법
- countvectorizer 빈도에 대한 기준을 다르게 하는 방법
    - 빈도가 너무 낮은 값들은 제거하고, 어느 정도 빈도를 기준을 추가
- TF-IDF로 진행해서 새롭게 벡터를 만드는 방법
- N-gram으로 추가하는 방법

In [57]:
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('movie_reviews')
nltk.download('wordnet')

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
# 1. 문장 및 단어 정규화
def normalize_text(text):
    text = text.lower() # 소문자
    text = re.sub(r'\d+','',text) # 숫자 제거
    text = re.sub(r'[^\w\s]','',text) # 구두점 제거
    return text

# 전처리 진행
normalized_reviews = [normalize_text(' '.join(review)) for review, category in documents]

In [59]:
# 2.어간 추출
stemmer = PorterStemmer()
stemmed_reviews = [' '.join([stemmer.stem(word) for word in review.split()]) for review in normalized_reviews]

In [60]:
# 3. 표제어 추출
lemmatizer = WordNetLemmatizer()
lemmatized_reviews = [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in normalized_reviews]

- 정규식, nltk 패키지로 리뷰 데이터 전처리
- 임베딩을 통한 행렬 변환
- n_gram으로 추가해서 학습

### 어간 추출

In [61]:
vectorizer = CountVectorizer()

In [62]:
# 4. countvectorizer 하이퍼 파라미터 조정
# 최소빈도수 설정해서 희귀 단어 제거 가능
vectorizer_min_df = CountVectorizer(min_df=2)

# 2번의 어간 추출, 표제어 추출로 학습 테스트
X_min_df = vectorizer_min_df.fit_transform(stemmed_reviews)

In [63]:
# 5. N-그램 사용 -> (2, 2)
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2))
X_bigram = vectorizer_bigram.fit_transform(stemmed_reviews)

In [64]:
# 6. TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(stemmed_reviews)

In [65]:
# 데이터 분할

# 1. stem
X_train_stem, X_test_stem, y_train_stem, y_test_stem = train_test_split(stemmed_reviews, [label for _, label in documents], test_size=0.2, random_state=111)

# 2. X_min_df
X_train_min_df, X_test_min_df, y_train_min_df, y_test_min_df = train_test_split(X_min_df, [label for _, label in documents], test_size=0.2, random_state=111)

# 3. X_bigram
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(X_bigram, [label for _, label in documents], test_size=0.2, random_state=111)

# 4. TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, [label for _, label in documents], test_size=0.2, random_state=111)


# Naive Bayes 분류기 사용하여 학습, 평가

# 1. Navie Bayes 분류기 학습
model_stem = MultinomialNB()
model_stem.fit(vectorizer.fit_transform(X_train_stem), y_train_stem)
y_pred_stem = model_stem.predict(vectorizer.transform(X_test_stem))
accuracy_stem = accuracy_score(y_test_stem, y_pred_stem)

# 2. min_df Navie Bayes 분류기 학습
model_min_df = MultinomialNB()
model_min_df.fit(X_train_min_df, y_train_min_df)
y_pred_min_df = model_min_df.predict(X_test_min_df)
accuracy_min_df = accuracy_score(y_test_min_df, y_pred_min_df)

# 3. X_bigram Navie Bayes 분류기 학습
model_bigram = MultinomialNB()
model_bigram.fit(X_train_bigram, y_train_bigram)
y_pred_bigram = model_bigram.predict(X_test_bigram)
accuracy_bigram = accuracy_score(y_test_bigram, y_pred_bigram)

# 4. TF-IDF Navie Bayes 분류기 학습
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)

{
    'stem & Countvectorizer' : accuracy_stem,
    'stem & min_df' : accuracy_min_df,
    'stem & bigram': accuracy_bigram,
    'stem & tfidf' : accuracy_tfidf
}

{'stem & Countvectorizer': 0.825,
 'stem & min_df': 0.8275,
 'stem & bigram': 0.8625,
 'stem & tfidf': 0.835}

### 필수과제1
- base로만 진행했지만, 추가적으로 임베딩을 다양하게 진행해 보시면서 0.845 의 성능보다 더 올리기
- 다양한 텍스트 전처리를 통해 성능 개선하기
- 파생변수를 추가해도 괜찮음

### 표제어 추출

In [73]:
vectorizer = CountVectorizer()

# 4. countvectorizer 하이퍼파라미터 조정
# 최소빈도수 설정해서 희귀 단어 제거 가능
vectorizer_min_df = CountVectorizer(min_df=12)

# 2번의 어간추출, 표제어 추출로 학습 테스트
X_min_df = vectorizer_min_df.fit_transform(lemmatized_reviews)

# N-그램 사용 -> (2, 2)
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2), min_df=12)
X_bigram = vectorizer_bigram.fit_transform(lemmatized_reviews)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=12)
X_tfidf = tfidf_vectorizer.fit_transform(lemmatized_reviews)


# 데이터 분할

# 1. stem
X_train_stem, X_test_stem, y_train_stem, y_test_stem = train_test_split(lemmatized_reviews, [label for _, label in documents], test_size=0.2, random_state=111)

# 2. X_min_df
X_train_min_df, X_test_min_df, y_train_min_df, y_test_min_df = train_test_split(X_min_df, [label for _, label in documents], test_size=0.2, random_state=111)

# 3. X_bigram
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(X_bigram, [label for _, label in documents], test_size=0.2, random_state=111)

# 4. TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, [label for _, label in documents], test_size=0.2, random_state=111)


# Naive Bayes 분류기 사용하여 학습, 평가

# 1. Navie Bayes 분류기 학습
model_stem = MultinomialNB()
model_stem.fit(vectorizer.fit_transform(X_train_stem), y_train_stem)
y_pred_stem = model_stem.predict(vectorizer.transform(X_test_stem))
accuracy_stem = accuracy_score(y_test_stem, y_pred_stem)

# 2. min_df Navie Bayes 분류기 학습
model_min_df = MultinomialNB()
model_min_df.fit(X_train_min_df, y_train_min_df)
y_pred_min_df = model_min_df.predict(X_test_min_df)
accuracy_min_df = accuracy_score(y_test_min_df, y_pred_min_df)

# 3. X_bigram Navie Bayes 분류기 학습
model_bigram = MultinomialNB()
model_bigram.fit(X_train_bigram, y_train_bigram)
y_pred_bigram = model_bigram.predict(X_test_bigram)
accuracy_bigram = accuracy_score(y_test_bigram, y_pred_bigram)

# 4. TF-IDF Navie Bayes 분류기 학습
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)

{
    'lemma & Countvectorizer' : accuracy_stem,
    'lemma & min_df' : accuracy_min_df,
    'lemma & bigram' : accuracy_bigram,
    'lemma & tfidf' : accuracy_tfidf
}

{'lemma & Countvectorizer': 0.82,
 'lemma & min_df': 0.8325,
 'lemma & bigram': 0.835,
 'lemma & tfidf': 0.845}

In [69]:
vectorizer_trigram = CountVectorizer(ngram_range=(3, 3), min_df=2)
X_trigram = vectorizer_trigram.fit_transform(lemmatized_reviews)
X_train_trigram, X_test_trigram, y_train_trigram, y_test_trigram = train_test_split(X_trigram, [label for _, label in documents], test_size=0.2, random_state=111)

model_trigram = MultinomialNB()
model_trigram.fit(X_train_trigram, y_train_trigram)
y_pred_trigram = model_trigram.predict(X_test_trigram)
accuracy_trigram = accuracy_score(y_test_trigram, y_pred_trigram)

{
    'Trigram': accuracy_trigram
}

{'Trigram': 0.815}

- Trigram을 사용했을 때 오히려 성능이 더 안좋아졌다.

In [70]:
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import download
from gensim.models import Word2Vec
import numpy as np
from sklearn.naive_bayes import GaussianNB

download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(f'[{string.punctuation}]', '', text.lower())
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stop_words]
    return words

preprocessed_reviews = [preprocess_text(' '.join(review)) for review, _ in documents]

labels = [label for _, label in documents]

# Word2Vec 임베딩 방식을 적용
word2vec_model = Word2Vec(preprocessed_reviews, vector_size=100, window=5, min_count=2, workers=4)

def document_vector(words):
    words = [word for word in words if word in word2vec_model.wv.key_to_index]
    if len(words) == 0:
        return np.zeros(100)
    return np.mean(word2vec_model.wv[words], axis=0)

document_vectors = np.array([document_vector(review) for review in preprocessed_reviews])

X_train, X_test, y_train, y_test = train_test_split(document_vectors, labels, test_size=0.2, random_state=111)

model_w2v = GaussianNB()
model_w2v.fit(X_train, y_train)
y_pred_w2v = model_w2v.predict(X_test)
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)

accuracy_w2v

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0.62

In [71]:
from gensim.models import FastText
from scipy.sparse import hstack

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(f'[{string.punctuation}]', '', text.lower())
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stop_words]
    return words

preprocessed_reviews = [preprocess_text(' '.join(review)) for review, _ in documents]

labels = [label for _, label in documents]

# FastText 임베딩을 사용
fasttext_model = FastText(preprocessed_reviews, vector_size=100, window=5, min_count=2, workers=4, sg=1)

def document_vector(words):
    words = [word for word in words if word in fasttext_model.wv.key_to_index]
    if len(words) == 0:
        return np.zeros(100)
    return np.mean(fasttext_model.wv[words], axis=0)

fasttext_vectors = np.array([document_vector(review) for review in preprocessed_reviews])

raw_reviews = [' '.join(review) for review in preprocessed_reviews]

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=12)
tfidf_vectors = tfidf_vectorizer.fit_transform(raw_reviews)

combined_features = hstack([tfidf_vectors, np.array(fasttext_vectors)])

X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=111)

model_fasttext = GaussianNB()
model_fasttext.fit(X_train.toarray(), y_train)
y_pred_fasttext = model_fasttext.predict(X_test.toarray())
accuracy_fasttext = accuracy_score(y_test, y_pred_fasttext)

accuracy_fasttext

0.775

- 성능을 더 개선해보려 하였지만 실패하였고 bigram + stem(어간추출) 방식이 0.8625로 가장 좋았다.
- min_df를 조정하였을 때 성능이 약간 상승하였다.