# 다항분포 나이브 베이즈 (Multinomial Naive Bayes) 분류

<br>

## ✅ 다항분포 나이브 베이즈
- 데이터의 특징이 **출현 횟수로 표현됐을 때** 사용
- e.g. 주사위를 10번 던졌을 때 1이 한 번, 2가 두 번, 3이 세 번, 4가 네 번 나왔을 경우, 주사위를 10번 던진 결과 데이터를 (1, 2, 3, 4, 0, 0)과 같이 나타낼 수 있음
    - 각 인덱스는 주사위의 면을 뜻함
    - 데이터의 숫자는 출현 횟수를 나타낸 것
- 특징(예: 단어)의 출현 횟수에 따라 값이 달라지는 데이터에 사용
- e.g. 영화 감상평을 토대로 긍정적/부정적 리뷰 분류

In [1]:
# 다항분포 나이브 베이즈는 감성분류에 잘 쓰임

import numpy as np
import pandas as pd

# 다항분포 나이브 베이즈
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# 성능평가
from sklearn import metrics
from sklearn.metrics import accuracy_score

<hr>

## 01. 문제 정의
##### *영화 리뷰에 다항분포 나이브 베이즈 분류 모델을 사용해 영화 리뷰가 긍정인지 부정인지 분류*

<hr>

## 02. 데이터 수집

In [2]:
# Labelled training corpus: five positive reviews followed by five negative
# ones. Every entry is a dict holding the raw text under 'movie_review' and
# the sentiment label under 'type'.
# NOTE(review): 'move' and 'what' in the review starting with 'regret' look
# like typos for 'movie' and 'watch'; they are kept verbatim because they are
# part of the training data and therefore of the learned vocabulary.
review_list = [
    {'movie_review': text, 'type': sentiment}
    for sentiment, texts in (
        ('positive', (
            'this is great great movie. I will watch again',
            'I like this movie',
            'amazing movie in this year',
            'cool my boyfriend also said the movie is cool',
            'awesome of the awesome movie ever',
        )),
        ('negative', (
            'shame I wasted money and time',
            'regret on this move. I will never never what movie from this director',
            'I do not like this movie',
            'I do not like actors in this movie',
            'boring boring sleeping movie',
        )),
    )
    for text in texts
]

In [3]:
# Labelled held-out reviews used only for evaluation: five positive followed
# by five negative, in the same dict layout as the training list.
# NOTE(review): 'aweful' in the last review looks like a misspelling of
# 'awful'; it is kept verbatim because it is part of the test data.
test_feedback_list = [
    {'movie_review': text, 'type': sentiment}
    for sentiment, texts in (
        ('positive', (
            'great great great movie ever',
            'I like this amazing movie',
            'my boyfriend said great movie ever',
            'cool cool cool',
            'awesome boyfriend said cool movie ever',
        )),
        ('negative', (
            'shame shame shame',
            'awesome director shame movie boring movie',
            'do not like this movie',
            'I do not like this boring movie',
            'aweful terrible boring movie',
        )),
    )
    for text in texts
]

In [4]:
# Build one DataFrame per labelled review list: training data first, then the
# held-out test data.
train_df, test_df = map(pd.DataFrame, (review_list, test_feedback_list))
# Last expression of the cell: display the training frame.
train_df

Unnamed: 0,movie_review,type
0,this is great great movie. I will watch again,positive
1,I like this movie,positive
2,amazing movie in this year,positive
3,cool my boyfriend also said the movie is cool,positive
4,awesome of the awesome movie ever,positive
5,shame I wasted money and time,negative
6,regret on this move. I will never never what m...,negative
7,I do not like this movie,negative
8,I do not like actors in this movie,negative
9,boring boring sleeping movie,negative


<hr>

## 03. 데이터 전처리

In [5]:
# Encode the string label as a numeric target: 'positive' -> 1, 'negative' -> 0.
train_df['target'] = train_df['type'].map({'positive':1, 'negative':0})
# Display the frame, now including the new 'target' column.
train_df

Unnamed: 0,movie_review,type,target
0,this is great great movie. I will watch again,positive,1
1,I like this movie,positive,1
2,amazing movie in this year,positive,1
3,cool my boyfriend also said the movie is cool,positive,1
4,awesome of the awesome movie ever,positive,1
5,shame I wasted money and time,negative,0
6,regret on this move. I will never never what m...,negative,0
7,I do not like this movie,negative,0
8,I do not like actors in this movie,negative,0
9,boring boring sleeping movie,negative,0


In [6]:
# Separate the independent variable (raw review text) from the dependent
# variable (numeric sentiment target).
df_x, df_y = train_df['movie_review'], train_df['target']

In [7]:
# Create the vectorizer object.
cv = CountVectorizer()
# With the default binary=False, repeated words are counted, so each feature
# holds how often the word occurs in the review.
# With binary=True, only the presence or absence of a word would be recorded.

x_traincv = cv.fit_transform(df_x)   # on the training data, use fit_transform()

In [8]:
# The vocabulary is listed in alphabetical order; fetch it once and report
# both the words and how many there are.
feature_names = cv.get_feature_names_out()
print("데이터에 있는 단어:\n", feature_names)
print("\n단어의 수:", len(feature_names))

데이터에 있는 단어:
 ['actors' 'again' 'also' 'amazing' 'and' 'awesome' 'boring' 'boyfriend'
 'cool' 'director' 'do' 'ever' 'from' 'great' 'in' 'is' 'like' 'money'
 'move' 'movie' 'my' 'never' 'not' 'of' 'on' 'regret' 'said' 'shame'
 'sleeping' 'the' 'this' 'time' 'wasted' 'watch' 'what' 'will' 'year']

단어의 수: 37


In [9]:
# 10 rows (one per training review), each with 37 entries (one per vocabulary word).
# binary is left at its default so that frequently repeated (emphasised) words
# carry more weight.
encoded_input = x_traincv.toarray()  # array form that exposes the per-word counts
encoded_input

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2,
        0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0

In [10]:
# Check the words again: map the first encoded review back to the vocabulary
# words it contains. Indexing with [[0]] passes that row as a one-row 2-D array.
cv.inverse_transform(encoded_input[[0]])

[array(['again', 'great', 'is', 'movie', 'this', 'watch', 'will'],
       dtype='<U9')]

<hr>

## 04. 다항분포 나이브 베이즈 모델학습

In [11]:
# Train the multinomial naive Bayes model on the word-count matrix.
mnb = MultinomialNB()
# NOTE(review): the 'target' column is already displayed as integers after
# the label mapping, so this cast is presumably a safeguard - confirm.
y_train = df_y.astype('int')            # cast the labels to an integer dtype
mnb.fit(x_traincv, y_train)

In [12]:
# Prepare the test data for evaluation: encode its labels with the same
# mapping as the training data ('positive' -> 1, 'negative' -> 0).
test_df['target'] = test_df['type'].map({'positive':1, 'negative':0})
# Display the test frame, now including the 'target' column.
test_df

Unnamed: 0,movie_review,type,target
0,great great great movie ever,positive,1
1,I like this amazing movie,positive,1
2,my boyfriend said great movie ever,positive,1
3,cool cool cool,positive,1
4,awesome boyfriend said cool movie ever,positive,1
5,shame shame shame,negative,0
6,awesome director shame movie boring movie,negative,0
7,do not like this movie,negative,0
8,I do not like this boring movie,negative,0
9,aweful terrible boring movie,negative,0


In [13]:
# Separate the test frame into the review text (independent variable) and the
# numeric sentiment target (dependent variable).
test_x, test_y = test_df['movie_review'], test_df['target']

In [14]:
# Careful: do NOT call fit() here. The test reviews are encoded with the
# vectorizer that was already fitted on the training reviews, so only
# transform() is used.
x_testcv = cv.transform(test_x)

In [15]:
# Inspect the transformed test data.
# NOTE: this rebinds encoded_input, which earlier held the encoded training data.
encoded_input = x_testcv.toarray()
encoded_input

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0

In [16]:
# Map every encoded test row back to the vocabulary words it contains.
# The rows hold word counts rather than 0/1 flags; each word is listed once
# in the result regardless of how often it occurred.
cv.inverse_transform(encoded_input)

[array(['ever', 'great', 'movie'], dtype='<U9'),
 array(['amazing', 'like', 'movie', 'this'], dtype='<U9'),
 array(['boyfriend', 'ever', 'great', 'movie', 'my', 'said'], dtype='<U9'),
 array(['cool'], dtype='<U9'),
 array(['awesome', 'boyfriend', 'cool', 'ever', 'movie', 'said'],
       dtype='<U9'),
 array(['shame'], dtype='<U9'),
 array(['awesome', 'boring', 'director', 'movie', 'shame'], dtype='<U9'),
 array(['do', 'like', 'movie', 'not', 'this'], dtype='<U9'),
 array(['boring', 'do', 'like', 'movie', 'not', 'this'], dtype='<U9'),
 array(['boring', 'movie'], dtype='<U9')]

<hr>

## 05. 성능평가

In [17]:
# Predict the class (1 = positive, 0 = negative) of every encoded test review;
# these predictions are what the accuracy evaluation is based on.
predicted = mnb.predict(x_testcv)
predicted

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [18]:
# Accuracy: compare the true test labels with the predicted ones.
accuracy_score(test_y, predicted)

1.0

In [19]:
# Print the per-class precision, recall, f1-score and support report.
print(metrics.classification_report(test_y, predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [20]:
# Confusion matrix of true test labels versus predicted labels
# (rows: true class, columns: predicted class).
metrics.confusion_matrix(test_y, predicted)

array([[5, 0],
       [0, 5]], dtype=int64)