# IMDB 영화평 - 나이브 베이즈

In [1]:
import numpy as np
import pandas as pd

# Load the labeled IMDB reviews: tab-separated, first row holds the column names.
# quoting=3 is csv.QUOTE_NONE, so double quotes are kept as literal characters
# rather than being treated as field quoting.
df = pd.read_csv(
    '../00.data/IMDB/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [2]:
import re
# Replace the HTML line-break tag with a space. regex=False forces a literal
# match on every pandas version (the default of `regex` changed in pandas 2.0).
df['review'] = df.review.str.replace('<br />', ' ', regex=False)
# Replace every character that is not an English letter (digits, punctuation,
# etc.) with a space. The leading ^ inside the brackets negates the class:
# anything that is NOT a-z or A-Z is matched. The vectorized .str accessor is
# used instead of a per-row apply/re.sub, and it passes missing values through
# rather than raising on them.

df['review'] = df.review.str.replace('[^a-zA-Z]', ' ', regex=True)

In [3]:
from sklearn.model_selection import train_test_split

# Everything except the row id and the sentiment label is treated as a feature.
feature_df = df.drop(columns=['id', 'sentiment'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    df.sentiment,
    test_size=0.3,
    random_state=156,
)

X_train.shape, X_test.shape

((17500, 1), (7500, 1))

### 나이브 베이즈 모델

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [5]:
# Bag-of-words counts over unigrams and bigrams, dropping scikit-learn's
# built-in English stop words.
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
# Learn the vocabulary from the training reviews only and vectorize them in the
# same pass; fit_transform avoids tokenizing the training corpus twice.
X_train_count = count_vect.fit_transform(X_train.review)
# Encode the held-out reviews with the vocabulary learned from the training set.
X_test_count = count_vect.transform(X_test.review)

In [6]:
# Train a multinomial naive Bayes classifier (default hyperparameters) on the
# training count matrix; fit() returns the estimator, so it can be chained.
nb = MultinomialNB().fit(X_train_count, y_train)
# Predict labels for the held-out reviews and report the share that match y_test.
pred = nb.predict(X_test_count)
accuracy_score(y_true=y_test, y_pred=pred)

0.8710666666666667

In [8]:
# Display the hyperparameters the classifier was configured with
# (deep=True is the default; it is spelled out here only for clarity).
nb.get_params(deep=True)

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}