#### 네이버 마이플레이스 리뷰 감성 분석

In [1]:
import time

import numpy as np
import pandas as pd
from konlpy.tag import Okt
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

#### 데이터 로드

In [2]:
# Load the review CSV. Because column names are supplied explicitly, pandas
# treats every row of the file (including the first) as data.
columns = ['name', 'loca', 'writer', 'review', 'point', 'emotion']
raw_datas = pd.read_csv(
    'datas/naver_myplace_review.csv',
    header=None,
    names=columns,
)

#### train, test 나누기

In [3]:
# Stratified 70/30 split on the sentiment label with a fixed seed.
# The whole frame is split, so X_train / X_test still carry the 'emotion' column.
target = raw_datas['emotion']
X_train, X_test, y_train, y_test = train_test_split(
    raw_datas,
    target,
    train_size=0.7,
    stratify=target,
    random_state=13,
)

#### 비율 확인
- 부정(negative) 리뷰 비율이 약 13.6%로 생각보다 너무 낮음 (클래스 불균형)

In [4]:
# Share of each sentiment label in the training split, in percent.
X_train['emotion'].value_counts().pipe(lambda counts: counts / counts.sum() * 100)

positive    86.370278
negative    13.629722
Name: emotion, dtype: float64

In [5]:
# Share of each sentiment label in the test split, in percent.
X_test['emotion'].value_counts().pipe(lambda counts: counts / counts.sum() * 100)

positive    86.368062
negative    13.631938
Name: emotion, dtype: float64

#### stopword 생성

In [6]:
# stop_words = pd.read_csv('datas/korean_stopwords.csv',names = ['korean'])
# stop_words = list(stop_words.korean)
# stop_words

#### 형태소 분석 함수

In [7]:
# One shared Okt analyzer instance, reused by every tokenizer call.
twitter = Okt()


def tw_tokenizer(text):
    """Tokenize ``text`` with the module-level Okt analyzer.

    Returns the result of ``twitter.morphs(text)`` unchanged.
    """
    return twitter.morphs(text)

In [8]:
# TF-IDF over Okt morphemes: unigrams and bigrams, keeping terms that occur in
# at least 3 reviews and in at most 90% of them.
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenizer,
                             ngram_range=(1, 2), min_df=3, max_df=0.9)#,stop_words=stop_words
# fit_transform learns the vocabulary and builds the training matrix in a single
# pass; separate fit() + transform() calls would run the tokenizer over every
# review twice for the same result.
tfidf_matrix_train = tfidf_vect.fit_transform(X_train['review'])



In [9]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version provides it and
# fall back to the old accessor otherwise. The vocabulary is fetched once and
# reused for both the count and the listing.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    # Convert the returned array into a plain list of str so the display
    # matches what the old accessor produced.
    feature_names = [str(name) for name in tfidf_vect.get_feature_names_out()]
else:
    feature_names = tfidf_vect.get_feature_names()
print(len(feature_names))
feature_names

6338


['0',
 '0001',
 '0001 f',
 '1',
 '1 개',
 '1 도',
 '1 인',
 '1 인분',
 '1 층',
 '10',
 '100',
 '100 프로',
 '1000원',
 '10분',
 '10시',
 '11시',
 '12시',
 '15',
 '15000원',
 '19',
 '1등',
 '1시',
 '1시간',
 '2',
 '2 개',
 '2 명',
 '2 번',
 '2 인',
 '2 인분',
 '2 종류',
 '2 층',
 '20',
 '200',
 '2000원',
 '2020년',
 '20분',
 '23',
 '2시',
 '2시간',
 '2조',
 '2조 각',
 '2천원',
 '3',
 '3 개',
 '3 대',
 '3 명',
 '3 인분',
 '3 층',
 '3000원',
 '30분',
 '30분 정도',
 '34',
 '35',
 '3년',
 '3시',
 '4',
 '4 개',
 '40분',
 '40분 정도',
 '4시',
 '4시간',
 '5',
 '5 개',
 '5 층',
 '5시',
 '5천원',
 '6',
 '6000원',
 '6500원',
 '6시',
 '6천원',
 '7',
 '7시',
 '7천원',
 '8시',
 '8천원',
 '9',
 '90',
 '90 d',
 '9시',
 'b',
 'd',
 'f',
 'f 90',
 'good',
 'httpsmblognavercomblackgta',
 'jmt',
 'm',
 'sns',
 'soso',
 'u',
 'u 0001',
 'ㄷㄷ',
 'ㅁ',
 'ㅈ',
 'ㅋ',
 'ㅋㅋ',
 'ㅋㅋㅋ',
 'ㅋㅋㅋㅋ',
 'ㅋㅋㅋㅋㅋ',
 'ㅎ',
 'ㅎㅎ',
 'ㅎㅎ 다음',
 'ㅎㅎㅎ',
 'ㅎㅎㅎㅎ',
 'ㅛ',
 'ㅜ',
 'ㅜㅜ',
 'ㅜㅜㅜ',
 'ㅜㅠ',
 'ㅠ',
 'ㅠㅜ',
 'ㅠㅠ',
 'ㅠㅠ 직원',
 'ㅠㅠㅠ',
 'ㅠㅠㅠㅠ',
 'ㅠㅡㅠ',
 'ㅡ',
 'ㅡㅡ',
 '가',
 '가 가장',
 '가 궁금해서',
 '가 깔끔하고',
 '가 나서',
 '

#### 데이터 불균형 맞추기

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample in TF-IDF feature space with a fixed seed so the class counts of
# the training labels are balanced.
smote = SMOTE(random_state=13)
# fit_resample is the supported method name; the older fit_sample alias was
# deprecated and is no longer available in recent imbalanced-learn releases.
X_train_over, y_train_over = smote.fit_resample(tfidf_matrix_train, y_train)

In [12]:
# Per-class sample counts of the original training labels as {label: count}.
unique, counts = np.unique(y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{'negative': 653, 'positive': 4138}

In [13]:
# Per-class sample counts after SMOTE oversampling as {label: count}.
unique, counts = np.unique(y_train_over, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{'negative': 4138, 'positive': 4138}

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when available (converted to a plain list of
# str), otherwise fall back to the old accessor.
(
    [str(name) for name in tfidf_vect.get_feature_names_out()]
    if hasattr(tfidf_vect, 'get_feature_names_out')
    else tfidf_vect.get_feature_names()
)

['0',
 '0001',
 '0001 f',
 '1',
 '1 개',
 '1 도',
 '1 인',
 '1 인분',
 '1 층',
 '10',
 '100',
 '100 프로',
 '1000원',
 '10분',
 '10시',
 '11시',
 '12시',
 '15',
 '15000원',
 '19',
 '1등',
 '1시',
 '1시간',
 '2',
 '2 개',
 '2 명',
 '2 번',
 '2 인',
 '2 인분',
 '2 종류',
 '2 층',
 '20',
 '200',
 '2000원',
 '2020년',
 '20분',
 '23',
 '2시',
 '2시간',
 '2조',
 '2조 각',
 '2천원',
 '3',
 '3 개',
 '3 대',
 '3 명',
 '3 인분',
 '3 층',
 '3000원',
 '30분',
 '30분 정도',
 '34',
 '35',
 '3년',
 '3시',
 '4',
 '4 개',
 '40분',
 '40분 정도',
 '4시',
 '4시간',
 '5',
 '5 개',
 '5 층',
 '5시',
 '5천원',
 '6',
 '6000원',
 '6500원',
 '6시',
 '6천원',
 '7',
 '7시',
 '7천원',
 '8시',
 '8천원',
 '9',
 '90',
 '90 d',
 '9시',
 'b',
 'd',
 'f',
 'f 90',
 'good',
 'httpsmblognavercomblackgta',
 'jmt',
 'm',
 'sns',
 'soso',
 'u',
 'u 0001',
 'ㄷㄷ',
 'ㅁ',
 'ㅈ',
 'ㅋ',
 'ㅋㅋ',
 'ㅋㅋㅋ',
 'ㅋㅋㅋㅋ',
 'ㅋㅋㅋㅋㅋ',
 'ㅎ',
 'ㅎㅎ',
 'ㅎㅎ 다음',
 'ㅎㅎㅎ',
 'ㅎㅎㅎㅎ',
 'ㅛ',
 'ㅜ',
 'ㅜㅜ',
 'ㅜㅜㅜ',
 'ㅜㅠ',
 'ㅠ',
 'ㅠㅜ',
 'ㅠㅠ',
 'ㅠㅠ 직원',
 'ㅠㅠㅠ',
 'ㅠㅠㅠㅠ',
 'ㅠㅡㅠ',
 'ㅡ',
 'ㅡㅡ',
 '가',
 '가 가장',
 '가 궁금해서',
 '가 깔끔하고',
 '가 나서',
 '

#### 학습 및 검증

In [41]:
# lgbm
# LightGBM with 400 boosting rounds, trained on the original (non-oversampled)
# TF-IDF matrix; the oversampled variant is kept below as a commented-out
# alternative.
start_time = time.time()
# random_state is pinned to 13 so this model is seeded like the train/test
# split, SMOTE and the other two classifiers in this notebook.
lgbm_clf = LGBMClassifier(n_estimators=400, random_state=13)
# lgbm_clf.fit(X_train_over, y_train_over)
lgbm_clf.fit(tfidf_matrix_train, y_train)
print('Fit time : ', time.time() - start_time)
# 5-fold stratified cross-validation. scoring=None uses the estimator's default
# scorer, which is accuracy for classifiers.
# NOTE(review): with ~86% 'positive' labels, accuracy alone says little about
# the negative class - consider also reporting a class-aware metric.
skfold = StratifiedKFold(n_splits=5)
# NOTE(review): cross-validating on the pre-oversampled data would put
# synthetic neighbours of validation rows into the training folds - confirm
# before re-enabling the line below.
# cross_val_score(lgbm_clf, X_train_over, y_train_over, scoring=None, cv=skfold)
cross_val_score(lgbm_clf, tfidf_matrix_train, y_train, scoring=None, cv=skfold)

Fit time :  1.5997188091278076


array([0.86965589, 0.88204593, 0.8611691 , 0.87682672, 0.87056367])

In [42]:
# randomforest
# Shallow random forest (depth capped at 5, seed 13) fitted on the original
# TF-IDF matrix; wall-clock fit time is printed.
start_time = time.time()
random_clf = RandomForestClassifier(max_depth=5, random_state=13)
# Oversampled alternative, kept for reference:
# random_clf.fit(X_train_over, y_train_over)
random_clf.fit(X=tfidf_matrix_train, y=y_train)
print('Fit time : ', time.time() - start_time)
# 5-fold stratified cross-validation with the default scorer (accuracy).
# NOTE(review): the fold scores recorded for this cell (~0.863) sit at the
# 'positive' share of the training split (86.37%) - worth checking whether the
# forest predicts anything other than 'positive'.
skfold = StratifiedKFold(n_splits=5)
# Oversampled alternative, kept for reference:
# cross_val_score(random_clf, X_train_over, y_train_over, scoring=None, cv=skfold)
cross_val_score(estimator=random_clf, X=tfidf_matrix_train, y=y_train, cv=skfold)

Fit time :  0.1625986099243164


array([0.86339937, 0.86430063, 0.86430063, 0.86325678, 0.86325678])

In [43]:
# decisiontree
# Single decision tree (depth capped at 5, seed 13) fitted on the original
# TF-IDF matrix; wall-clock fit time is printed.
start_time = time.time()
tree_clf = DecisionTreeClassifier(max_depth=5, random_state=13)
# Oversampled alternative, kept for reference:
# tree_clf.fit(X_train_over, y_train_over)
tree_clf.fit(X=tfidf_matrix_train, y=y_train)
print('Fit time : ', time.time() - start_time)
# 5-fold stratified cross-validation with the default scorer (accuracy).
skfold = StratifiedKFold(n_splits=5)
# Oversampled alternative, kept for reference:
# cross_val_score(tree_clf, X_train_over, y_train_over, scoring=None, cv=skfold)
cross_val_score(estimator=tree_clf, X=tfidf_matrix_train, y=y_train, cv=skfold)

Fit time :  0.03893446922302246


array([0.87904067, 0.8736952 , 0.87682672, 0.8736952 , 0.87160752])

#### 테스트 정확도(accuracy) 확인

In [44]:
# Vectorize the held-out reviews with the vocabulary learned on the training
# split, then score LightGBM's predictions by plain accuracy.
# NOTE(review): the 'positive' share of the test split is 86.37%, so compare
# the accuracy against that majority-class baseline.
tfidf_matrix_test = tfidf_vect.transform(X_test['review'])
preds = lgbm_clf.predict(tfidf_matrix_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.8612463485881208

In [45]:
# Vectorize the held-out reviews and score the random forest by plain accuracy.
# NOTE(review): the accuracy recorded for this cell (0.8636806...) equals the
# 'positive' share of the test split (86.368062%), i.e. what an
# always-'positive' predictor would score - verify the prediction distribution.
tfidf_matrix_test = tfidf_vect.transform(X_test['review'])
preds = random_clf.predict(tfidf_matrix_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.8636806231742941

In [46]:
# Vectorize the held-out reviews and score the decision tree by plain accuracy.
# NOTE(review): the 'positive' share of the test split is 86.37%, so compare
# the accuracy against that majority-class baseline.
tfidf_matrix_test = tfidf_vect.transform(X_test['review'])
preds = tree_clf.predict(tfidf_matrix_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.8631937682570594

#### 실제 문장 넣고 확인

In [51]:
# Show the review text and its label for the row at position 100 of the test split.
X_test.iloc[100][['review', 'emotion']]

review      개판이에요개판
emotion    negative
Name: 2730, dtype: object

In [52]:
# LightGBM prediction for the single review stored under index label 2730.
lgbm_clf.predict(tfidf_vect.transform([X_test.loc[2730, 'review']]))

array(['positive'], dtype=object)

In [53]:
# Random forest prediction for the single review stored under index label 2730.
random_clf.predict(tfidf_vect.transform([X_test.loc[2730, 'review']]))

array(['positive'], dtype=object)

In [54]:
# Decision tree prediction for the single review stored under index label 2730.
tree_clf.predict(tfidf_vect.transform([X_test.loc[2730, 'review']]))

array(['positive'], dtype=object)

#### 감성 분류 적용

In [55]:
# Probe all three classifiers with one hand-written phrase.
word = '별로에요'
# Vectorize the phrase once and feed the same row to every model.
word_vec = tfidf_vect.transform([word])
print(f'"{word}"는 lgbm에서', lgbm_clf.predict(word_vec))
print(f'"{word}"는 random_forest에서', random_clf.predict(word_vec))
print(f'"{word}"는 decisiontree에서', tree_clf.predict(word_vec))

"별로에요"는 lgbm에서 ['negative']
"별로에요"는 random_forest에서 ['positive']
"별로에요"는 decisiontree에서 ['negative']


In [56]:
# Probe all three classifiers with one hand-written phrase.
word = '싫어요'
# Vectorize the phrase once and feed the same row to every model.
word_vec = tfidf_vect.transform([word])
print(f'"{word}"는 lgbm에서', lgbm_clf.predict(word_vec))
print(f'"{word}"는 random_forest에서', random_clf.predict(word_vec))
print(f'"{word}"는 decisiontree에서', tree_clf.predict(word_vec))

"싫어요"는 lgbm에서 ['positive']
"싫어요"는 random_forest에서 ['positive']
"싫어요"는 decisiontree에서 ['positive']


In [57]:
# Probe all three classifiers with one hand-written phrase.
word = '맛없어요'
# Vectorize the phrase once and feed the same row to every model.
word_vec = tfidf_vect.transform([word])
print(f'"{word}"는 lgbm에서', lgbm_clf.predict(word_vec))
print(f'"{word}"는 random_forest에서', random_clf.predict(word_vec))
print(f'"{word}"는 decisiontree에서', tree_clf.predict(word_vec))

"맛없어요"는 lgbm에서 ['positive']
"맛없어요"는 random_forest에서 ['positive']
"맛없어요"는 decisiontree에서 ['positive']


In [58]:
# Probe all three classifiers with one hand-written phrase.
word = '불친절 해요'
# Vectorize the phrase once and feed the same row to every model.
word_vec = tfidf_vect.transform([word])
print(f'"{word}"는 lgbm에서', lgbm_clf.predict(word_vec))
print(f'"{word}"는 random_forest에서', random_clf.predict(word_vec))
print(f'"{word}"는 decisiontree에서', tree_clf.predict(word_vec))

"불친절 해요"는 lgbm에서 ['positive']
"불친절 해요"는 random_forest에서 ['positive']
"불친절 해요"는 decisiontree에서 ['negative']


In [37]:
# Probe all three classifiers with one hand-written phrase.
word = '개판이에요'
# Vectorize the phrase once and feed the same row to every model.
word_vec = tfidf_vect.transform([word])
print(f'"{word}"는 lgbm에서', lgbm_clf.predict(word_vec))
print(f'"{word}"는 random_forest에서', random_clf.predict(word_vec))
print(f'"{word}"는 decisiontree에서', tree_clf.predict(word_vec))

"개판이에요"는 lgbm에서 ['positive']
"개판이에요"는 random_forest에서 ['positive']
"개판이에요"는 decisiontree에서 ['negative']
