In [1]:
! pip install nltk



In [22]:
import nltk
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import random

# NLTK의 movie_reviews 데이터셋 다운로드
nltk.download('movie_reviews')

# 영화 리뷰와 해당 라벨을 데이터셋으로 로드
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# 데이터를 셔플링하여 무작위로 섞음
random.shuffle(documents)

# 데이터 샘플 확인
sample_review, sample_label = documents[0]
print(f"Review Sample: {' '.join(sample_review[:100])}...")
print(f"Label: {sample_label}")

# 불용어
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Review Sample: i was recently told that in china their had been strong protests against the release of red corner  and this is apparently because of the way it shows the injustice of many chinese laws . but if you ask me , the real truth of the matter is that the chinese critics association were determined not to punish the population into viewing richard gere running across rooftops in search for a fellow american . or more the point , anyone that allows him to bask in his own less - than - subtle presence . this is not an...
Label: neg


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# 리뷰 텍스트와 라벨 분리하고, 불용어 제거
reviews = [' '.join([word for word in review if word.lower() not in stop_words])  for review, category in documents]

## 라벨을 분리
label = [category for review, category in documents]

# 리뷰데이터 임베딩 변환 (학습에 필요한 리뷰와 라벨을 분리하여 진행)
Vactorizer = CountVectorizer()
X = Vactorizer.fit_transform(reviews)
y = label

# 데이터셋 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 111)

# Navie Bayes 분류기 학습
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# 정확도 계산
acc = accuracy_score(y_test, y_pred)
print(f"Base Model Accuracy: {acc: .4f}")

Base Model Accuracy:  0.8300


### 전처리를 어떤 식으로 하면 좋을까?
- 단어 정규화를 통해 성능 향상
- 어간, 표제어 추출하는 방법
- countvectorizer 빈도에 대한 기준을 다르게 하는 방법
 -> 빈도가 너무 낮은 값들은 제거하고, 어느 정도 빈도를 기준을 추가
- TF-IDF로 진행해서 새롭게 벡터를 만드는 방법
- N-gram으로 추가하는 방법

In [24]:
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer

# NLTK의 movie_reviews 데이터셋 다운로드
nltk.download('movie_reviews')
nltk.download('wordnet')

# 영화 리뷰와 해당 라벨을 데이터셋으로 로드
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

# 데이터를 셔플링하여 무작위로 섞음
random.shuffle(documents)

# 불용어
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# 1. 문장 및 단어 정규화
def normalize_text(text):
  text = text.lower() # 소문자
  text = re.sub(r'\d+', '', text) # 숫자제거
  text = re.sub(r'[^\w\s]', '', text) # 특수문자 제거
  return text

#전처리 진행
normalized_reviews = [normalize_text(' '.join(review)) for review, category in documents]

# 2. 어간추출 (stemming)
stemmer = PorterStemmer()
stemmed_reviews = [' '.join([stemmer.stem(word) for word in review.split()]) for review in normalized_reviews]

# 3. 표제어추출
lemmatizer = WordNetLemmatizer()
lemmatized_reviews = [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in stemmed_reviews]

### 어간추출로 진행하는 방법

In [None]:
vectorizer = CountVectorizer()

# 4. CounterVectorizer 하이퍼파라미터 조정
# 최소 빈도수 설정해서 희귀 단어 제거 가능

# 2번의 어간추출, 표제어추출로 학습 테스트
vectorizer_min_df = CountVectorizer(min_df = 2)
X_min_df = vectorizer_min_df.fit_transform(lemmatized_reviews)

# N-gram 사용 (2,2)
vectorizer_bigram = CountVectorizer(ngram_range = (2,2))
X_bigram = vectorizer_bigram.fit_transform(lemmatized_reviews)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(lemmatized_reviews)

## 데이터 분할

# 1. stem
X_train_stem, X_test_stem, y_train_stem, y_test_stem = train_test_split(stemmed_reviews, [label for _, label in documents], test_size = 0.2, random_state = 111)

# 2. min_df
X_train_min_df, X_test_min_df, y_train_min_df, y_test_min_df = train_test_split(X_min_df, [label for _, label in documents], test_size = 0.2, random_state = 111)

# 3. bigram
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(X_bigram, [label for _, label in documents], test_size = 0.2, random_state = 111)

# 4. TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, [label for _, label in documents], test_size = 0.2, random_state = 111)

## Naive Bayes 분류기 사용하여 학습, 평가

# 1. Navie Bayes 분류기 학습
model_stem = MultinomialNB()
model_stem.fit(vectorizer.fit_transform(X_train_stem), y_train_stem)
y_pred_stem = model_stem.predict(vectorizer.transform(X_test_stem))
accuracy_stem = accuracy_score(y_test_stem, y_pred_stem)

# 2. min_df
model_min_df = MultinomialNB()
model_min_df.fit(X_train_min_df, y_train_min_df)
y_pred_min_df = model_min_df.predict(X_test_min_df)
accuracy_min_df = accuracy_score(y_test_min_df, y_pred_min_df)

# 3. bigram
model_bigram = MultinomialNB()
model_bigram.fit(X_train_bigram, y_train_bigram)
y_pred_bigram = model_bigram.predict(X_test_bigram)
accuracy_bigram = accuracy_score(y_test_bigram, y_pred_bigram)

# 4. TF-IDF
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)

{
    'stem & CountVectorizer': accuracy_stem,
    'stem & min_df': accuracy_min_df,
    'stem & bigram': accuracy_bigram,
    'stem & tfidf': accuracy_tfidf
}

{'stem & CountVectorizer': 0.8175,
 'stem & min_df': 0.815,
 'stem & bigram': 0.8475,
 'stem & tfidf': 0.8175}

### 표제어추출로 비교

In [27]:
vectorizer = CountVectorizer()

# 4. CounterVectorizer 하이퍼파라미터 조정
# 최소 빈도수 설정해서 희귀 단어 제거 가능

# 2번의 어간추출, 표제어추출로 학습 테스트
vectorizer_min_df = CountVectorizer(min_df = 2)
X_min_df = vectorizer_min_df.fit_transform(lemmatized_reviews)

# N-gram 사용 (2,2)
vectorizer_bigram = CountVectorizer(ngram_range = (2,2))
X_bigram = vectorizer_bigram.fit_transform(lemmatized_reviews)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(lemmatized_reviews)

## 데이터 분할

# 1. stem
X_train_stem, X_test_stem, y_train_stem, y_test_stem = train_test_split(lemmatized_reviews, [label for _, label in documents], test_size = 0.2, random_state = 111)

# 2. min_df
X_train_min_df, X_test_min_df, y_train_min_df, y_test_min_df = train_test_split(X_min_df, [label for _, label in documents], test_size = 0.2, random_state = 111)

# 3. bigram
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(X_bigram, [label for _, label in documents], test_size = 0.2, random_state = 111)

# 4. TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, [label for _, label in documents], test_size = 0.2, random_state = 111)

## Naive Bayes 분류기 사용하여 학습, 평가

# 1. stem
model_stem = MultinomialNB()
model_stem.fit(vectorizer.fit_transform(X_train_stem), y_train_stem)
y_pred_stem = model_stem.predict(vectorizer.transform(X_test_stem))
accuracy_stem = accuracy_score(y_test_stem, y_pred_stem)

# 2. min_df
model_min_df = MultinomialNB()
model_min_df.fit(X_train_min_df, y_train_min_df)
y_pred_min_df = model_min_df.predict(X_test_min_df)
accuracy_min_df = accuracy_score(y_test_min_df, y_pred_min_df)

# 3. bigram
model_bigram = MultinomialNB()
model_bigram.fit(X_train_bigram, y_train_bigram)
y_pred_bigram = model_bigram.predict(X_test_bigram)
accuracy_bigram = accuracy_score(y_test_bigram, y_pred_bigram)

# 4. TF-IDF
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)

{
    'lemma & CountVectorizer': accuracy_stem,
    'lemma & min_df': accuracy_min_df,
    'lemma & bigram': accuracy_bigram,
    'lemma & tfidf': accuracy_tfidf
}

{'lemma & CountVectorizer': 0.815,
 'lemma & min_df': 0.815,
 'lemma & bigram': 0.8475,
 'lemma & tfidf': 0.8175}

In [31]:
X_train_bigram, X_train_min_df, X_train_tfidf

(<1600x444832 sparse matrix of type '<class 'numpy.int64'>'
 	with 918516 stored elements in Compressed Sparse Row format>,
 <1600x16033 sparse matrix of type '<class 'numpy.int64'>'
 	with 496089 stored elements in Compressed Sparse Row format>,
 <1600x25578 sparse matrix of type '<class 'numpy.float64'>'
 	with 503746 stored elements in Compressed Sparse Row format>)