In [71]:
import nltk
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import random

In [72]:
# NLTK의 movie_reviews 데이터셋 다운로드
nltk.download('movie_reviews')

# 영화 리뷰와 해당 라벨을 데이터셋으로 로드
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# 데이터를 셔플링하여 무작위로 섞음
random.shuffle(documents)

# 데이터 샘플 확인
sample_review, sample_label = documents[0]
print(f"Review Sample: {' '.join(sample_review[:100])}...")
print(f"Label: {sample_label}")

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/jun/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Review Sample: you think that these people only exist in the movies , but trust me , they ' re as real as life . i once talked to a guy who thought the united states government was putting satellites into orbit which could fry an individual person ' s brain with microwaves . then i sat in a room full of people who believed that the government rigged state elections . i even listened to a man who swore that nicotine was an additive that cigarette companies put in their products for the specific goal of getting people addicted . these...
Label: neg


In [73]:
## 리뷰 텍스트와 라벨 분리하고, 불용어 제거 
reviews = [' '.join([word for word in review if word.lower() not in stop_words])for review, category in documents]

In [74]:
## 라벨을 분리
label =[category for review, category in documents]

In [75]:
#학습에 필요한 리뷰랑, 라벨을 분리하여 진행
vectorizer=CountVectorizer()

# 리뷰데이터 임베딩 변환
X = vectorizer.fit_transform(reviews)
y = label

In [76]:
## 데이터셋 나누기
X_train, X_test, y_train, y_test =train_test_split(X,y ,test_size=0.2, random_state=111)

In [77]:
# Navie Bayes 분류기 학습
model =MultinomialNB()
model.fit(X_train, y_train)

#예측값은
y_pred= model.predict(X_test)

In [78]:
## 모델 정확도를 예측
acc=accuracy_score(y_test, y_pred)
print(f"Base Model Accuarcy :{acc:.4f}")

Base Model Accuarcy :0.8200


### 전처리를 어떤 식으로 하면 좋을까?
- 단어 정규화를 통해 성능이 좋아지지 않을까?
- 어간, 표제어 추출해서 성능을 좀 더 올릴 수 있지 않을까?
- countvectorizer 빈도에 대한 것을 기준을 다르게 해서 성능을 좀 더 올릴 수 있지 않을까?
    - 빈도가 너무 낮은 값들은 제거하고, 어느정도 빈도를 기준을 추가하는 것
- TF-IDF로 진행해서 새롭게 벡터를 만들어서 진행해 보는 것?
- N-gram으로 추가해서 성능을 향상시킬 수 있지 않을까?

In [79]:
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer


# NLTK의 movie_reviews 데이터셋 다운로드
nltk.download('movie_reviews')
nltk.download('wordnet')

# 영화 리뷰와 해당 라벨을 데이터셋으로 로드
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# 데이터를 셔플링하여 무작위로 섞음
random.shuffle(documents)

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/jun/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package wordnet to /Users/jun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [80]:
#1.문장 및 단어 정규화
def normalize_text(text):
    text = text.lower() #소문자 
    text = re.sub(r'\d+','',text) # 숫자 제거
    text = re.sub(r'[^\w\s]','',text) #구두점 제거
    return text

# 전처리 진행
normalized_reviews =[normalize_text(' '.join(review)) for review, category in documents]

In [81]:
# 2.어간추출, 표제어 추출
# 어간추출 (stemming)
stemmer=PorterStemmer()
stemmed_reviews =[' '.join([stemmer.stem(word) for word in review.split()]) for review in normalized_reviews]

In [82]:
# 3. 표제어 추출 
lemmatizer=WordNetLemmatizer()
lemmatized_reviews=[' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in normalized_reviews]

--- 
- 정규식, nltk 패키지로 리뷰 데이터 전처리

---
- 임베딩을 통한 행렬 변환 

--- 
- n_gram으로 추가하여서 학습

### 어간추출로 진행

In [83]:
vectorizer = CountVectorizer()

In [84]:
# 4. countvectorizer 하이퍼파라미터 조정
# 최소빈도수 설정해서 희귀 단어 제거 가능

## 2번의 어간추출, 표제어 추출로 학습 테스트
vectorizer_min_df = CountVectorizer(min_df=2)
X_min_df =vectorizer_min_df.fit_transform(lemmatized_reviews)

In [85]:
# N-그램 사용
# 2,2
vectorizer_bigram=CountVectorizer(ngram_range=(2,2))
X_bigram=vectorizer_bigram.fit_transform(lemmatized_reviews)

In [86]:
# TF-IDF
tfidf_vectorizer=TfidfVectorizer()
X_tfidf=tfidf_vectorizer.fit_transform(lemmatized_reviews)

In [89]:
## 데이터를 분할해야 한다.

#1. stem
X_train_stem, X_test_stem, y_train_stem, y_test_stem=train_test_split(stemmed_reviews, [label for _, label in documents], test_size=0.2, random_state=111)

#2. X_min_df
X_train_min_df, X_test_min_df, y_train_min_df, y_test_min_df = train_test_split(X_min_df, [label for _, label in documents], test_size=0.2, random_state=111)

#3. X_bigram
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(X_bigram, [label for _, label in documents], test_size=0.2, random_state=111)

#4. TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, [label for _, label in documents], test_size=0.2, random_state=111)


## Naive Bayes 분류기 사용하여 학습, 평가

#1. Navie Bayes 분류기 학습
model_stem =MultinomialNB()
model_stem.fit(vectorizer.fit_transform(X_train_stem), y_train_stem)
y_pred_stem= model_stem.predict(vectorizer.transform(X_test_stem))
accuracy_stem = accuracy_score(y_test_stem, y_pred_stem)

#2. min_df Navie Bayes 분류기 학습
model_min_df =MultinomialNB()
model_min_df.fit(X_train_min_df, y_train_min_df)
y_pred_min_df= model_min_df.predict(X_test_min_df)
accuracy_min_df = accuracy_score(y_test_min_df, y_pred_min_df)

#3. X_bigram Navie Bayes 분류기 학습
model_bigram =MultinomialNB()
model_bigram.fit(X_train_bigram, y_train_bigram)
y_pred_bigram= model_bigram.predict(X_test_bigram)
accuracy_bigram = accuracy_score(y_test_bigram, y_pred_bigram)

#4. TF-IDF Navie Bayes 분류기 학습
model_tfidf =MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf= model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)


In [90]:
{
    'stem & Countvectorizer' : accuracy_stem,
    'stem & min_df' :accuracy_min_df,
    'stem & bigram': accuracy_bigram,
    'stem & tfidf' : accuracy_tfidf
}

{'stem & Countvectorizer': 0.82,
 'stem & min_df': 0.8275,
 'stem & bigram': 0.85,
 'stem & tfidf': 0.8025}

### 표제어 추출로 비교

In [91]:
vectorizer = CountVectorizer()

# 4. countvectorizer 하이퍼파라미터 조정
# 최소빈도수 설정해서 희귀 단어 제거 가능

## 2번의 어간추출, 표제어 추출로 학습 테스트
vectorizer_min_df = CountVectorizer(min_df=2)
X_min_df =vectorizer_min_df.fit_transform(lemmatized_reviews)

# N-그램 사용
# 2,2
vectorizer_bigram=CountVectorizer(ngram_range=(2,2))
X_bigram=vectorizer_bigram.fit_transform(lemmatized_reviews)

# TF-IDF
tfidf_vectorizer=TfidfVectorizer()
X_tfidf=tfidf_vectorizer.fit_transform(lemmatized_reviews)


## 데이터를 분할해야 한다.

#1. stem
X_train_stem, X_test_stem, y_train_stem, y_test_stem=train_test_split(lemmatized_reviews, [label for _, label in documents], test_size=0.2, random_state=111)

#2. X_min_df
X_train_min_df, X_test_min_df, y_train_min_df, y_test_min_df = train_test_split(X_min_df, [label for _, label in documents], test_size=0.2, random_state=111)

#3. X_bigram
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(X_bigram, [label for _, label in documents], test_size=0.2, random_state=111)

#4. TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, [label for _, label in documents], test_size=0.2, random_state=111)


## Naive Bayes 분류기 사용하여 학습, 평가

#1. Navie Bayes 분류기 학습
model_stem =MultinomialNB()
model_stem.fit(vectorizer.fit_transform(X_train_stem), y_train_stem)
y_pred_stem= model_stem.predict(vectorizer.transform(X_test_stem))
accuracy_stem = accuracy_score(y_test_stem, y_pred_stem)

#2. min_df Navie Bayes 분류기 학습
model_min_df =MultinomialNB()
model_min_df.fit(X_train_min_df, y_train_min_df)
y_pred_min_df= model_min_df.predict(X_test_min_df)
accuracy_min_df = accuracy_score(y_test_min_df, y_pred_min_df)

#3. X_bigram Navie Bayes 분류기 학습
model_bigram =MultinomialNB()
model_bigram.fit(X_train_bigram, y_train_bigram)
y_pred_bigram= model_bigram.predict(X_test_bigram)
accuracy_bigram = accuracy_score(y_test_bigram, y_pred_bigram)

#4. TF-IDF Navie Bayes 분류기 학습
model_tfidf =MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf= model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)


In [92]:
{
    'lemma & Countvectorizer' : accuracy_stem,
    'lemma & min_df' :accuracy_min_df,
    'lemma & bigram': accuracy_bigram,
    'lemma & tfidf' : accuracy_tfidf
}

{'lemma & Countvectorizer': 0.8275,
 'lemma & min_df': 0.8275,
 'lemma & bigram': 0.85,
 'lemma & tfidf': 0.8025}

In [66]:
X_train_bigram

<1600x474149 sparse matrix of type '<class 'numpy.int64'>'
	with 912105 stored elements in Compressed Sparse Row format>

In [67]:
X_train_min_df

<1600x21210 sparse matrix of type '<class 'numpy.int64'>'
	with 503872 stored elements in Compressed Sparse Row format>

In [68]:
X_train_tfidf

<1600x34758 sparse matrix of type '<class 'numpy.float64'>'
	with 514716 stored elements in Compressed Sparse Row format>

- I Love BDA 
- 2-그램
- 'I Love', 'Love BDA'

### 필수과제1
- base로만 진행했지만, 추가적으로 임베딩을 다양하게 진행해 보시면서 0.845 의 성능보다 더 올리기
- 다양한 텍스트 전처리를 통해 성능 개선하기 
- 파생변수를 추가해도 괜찮음