In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# 데이터 불러오기 (트레인,테스트)

ratings_train = pd.DataFrame(pd.read_csv('C:/Users/CHOCOLATESTARFISH/ratings_train.txt', sep='\t', quoting=3))
ratings_test = pd.DataFrame(pd.read_csv('C:/Users/CHOCOLATESTARFISH/ratings_test.txt', sep='\t', quoting=3))

In [3]:
ratings_train.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [4]:
ratings_test.head(3)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0


In [5]:
# 데이터정제
# 토큰화
# sklearn input에 맞게 넣기
# nb 학습
# test데이터 예측
# 정확도 확인

In [6]:
# 데이터정제
# 한글만 남기고 지워줌

import re

ratings_train['document'] = ratings_train['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]","")
ratings_test['document'] = ratings_test['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]","")

In [7]:
ratings_train.head(3)

Unnamed: 0,id,document,label
0,9976970,아더빙진짜짜증나네요목소리,0
1,3819312,흠포스터보고초딩영화줄오버연기조차가볍지않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [8]:
ratings_test.head(3)

Unnamed: 0,id,document,label
0,6270596,굳ㅋ,1
1,9274899,,0
2,8544678,뭐야이평점들은나쁘진않지만점짜리는더더욱아니잖아,0


In [9]:
# null값 제거 (한글이 아닌 리뷰들은 공백으로 변환됨 -> null값으로 변환 -> 제거)

ratings_train['document'].replace('', np.nan, inplace=True)
ratings_train.isnull().sum()

id             0
document    1260
label          0
dtype: int64

In [10]:
ratings_test['document'].replace('',np.nan,inplace=True)
ratings_test.isnull().sum()

id            0
document    425
label         0
dtype: int64

In [11]:
ratings_train = ratings_train.dropna(how='any')
ratings_test = ratings_test.dropna(how='any')

In [12]:
print(len(ratings_train))
print(len(ratings_test))

148740
49575


In [13]:
# 토큰화 (okt 사용)

from konlpy.tag import Okt
okt = Okt()

In [14]:
okt_tokens = okt.morphs('흠포스터보고초딩영화줄오버연기조차가볍지않구나')
okt_tokens

['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍지', '않구나']

In [15]:
oktTag = []
for token in okt_tokens:
    oktTag += okt.pos(token)
print(oktTag)

[('흠', 'Noun'), ('포스터', 'Noun'), ('보고', 'Noun'), ('초딩', 'Noun'), ('영화', 'Noun'), ('줄', 'Noun'), ('오버', 'Noun'), ('연기', 'Noun'), ('조차', 'Noun'), ('가볍지', 'Adjective'), ('않구나', 'Verb')]


In [16]:
from collections import Counter
Counter(oktTag).most_common()

[(('흠', 'Noun'), 1),
 (('포스터', 'Noun'), 1),
 (('보고', 'Noun'), 1),
 (('초딩', 'Noun'), 1),
 (('영화', 'Noun'), 1),
 (('줄', 'Noun'), 1),
 (('오버', 'Noun'), 1),
 (('연기', 'Noun'), 1),
 (('조차', 'Noun'), 1),
 (('가볍지', 'Adjective'), 1),
 (('않구나', 'Verb'), 1)]

In [17]:
stopPos = ['Determiner','Adverb','Conjunction','Josa','PreEomi','Eomi','Suffix','Exclamation','Puncutation', 'Foreign', 'Alpha', 'Number', 'Unknown', 'KoreanParticle', 'Hashtag', 'URL']
stopwords = ['흠','줄','조차'] 
word = []
for tag in oktTag:
    if tag[1] not in stopPos:
        if tag[0] not in stopwords:
            word.append(tag[0])
print(word)

['포스터', '보고', '초딩', '영화', '오버', '연기', '가볍지', '않구나']


In [18]:
# 다시 데이터로 (train -> test 순서로 진행)

In [19]:
%time

X_train = []
for sentence in ratings_train['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # stem을 True로 하면 어느정도 정규화해줌(않지 -> 않다)
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
    X_train.append(temp_X)

Wall time: 0 ns


In [20]:
print(X_train[:3])

[['아더', '빙', '진짜', '짜증나다', '목소리'], ['포스터', '보고', '초딩', '영화', '오버', '연기', '가볍다', '않다'], ['너', '무재', '밓었', '다그', '래서', '보다', '추천', '한', '다']]


In [43]:
X_train_new = []
for item in X_train:
    X_train_new.append(str(item))

In [21]:
X_test = []
for sentence in ratings_test['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True)
    temp_X = [word for word in temp_X if not word in stopwords]
    X_test.append(temp_X)

In [44]:
X_test_new = []
for item in X_test:
    X_test_new.append(str(item))

In [46]:
X_train_new

["['아더', '빙', '진짜', '짜증나다', '목소리']",
 "['포스터', '보고', '초딩', '영화', '오버', '연기', '가볍다', '않다']",
 "['너', '무재', '밓었', '다그', '래서', '보다', '추천', '한', '다']",
 "['교도소', '이야기', '구먼', '솔직하다', '재미', '는', '없다', '평점', '조정']",
 "['사이', '몬페', '그', '의', '익살스럽다', '연기', '가', '돋보이다', '영화', '스파이더맨', '에서', '늙다', '보이다', '하다', '크다', '스틴던스트', '가', '너무나도', '이쁘다', '보이다']",
 "['막', '걸음', '마', '떼다', '초등학교', '학년', '생인', '살다', '영화', 'ㅋㅋㅋ', '별', '반개', '도', '아깝다', '움']",
 "['원작', '의', '긴장감', '을제', '대로', '살리다', '하다']",
 "['별', '반개', '도', '아깝다', '다욕', '나오다', '이응경', '길용우', '연기', '생활', '이', '몇년', '인지', '정말', '발', '로', '해도', '그것', '보단', '낫다', '납치', '감금', '만', '반복', '반복', '이', '드라마', '는', '가족', '도', '없다', '연기', '못', '하', '는', '사람', '만', '모', '엿', '네']",
 "['액션', '이', '없다', '재미있다', '몇', '안되다', '영화']",
 "['왜케', '평점', '이', '낮다', '꽤볼', '만', '한', '데', '헐리우드', '식', '화려하다', '너무', '길들이다']",
 "['걍', '인피니트', '가', '짱', '이다', '진짜', '짱', '이다']",
 "['볼때', '마다', '눈물나다', '죽다', '년대', '의', '향수', '자극', '허진호', '는', '감성', '절제', '멜로', '의', '달인', '

In [47]:
X_test_new

["['굳다', 'ㅋ']",
 "['뭐', '야', '이', '평점', '들', '은', '나쁘다', '않다', '만점', '짜다', '리', '는', '더', '더욱', '아니다']",
 "['지루하다', '않다', '완전', '막장', '임돈', '주다', '보기', '에는']",
 "['만', '아니다', '별', '다섯', '개', '주다', '왜', '로나', '와', '서다', '제', '심기', '를', '불편하다']",
 "['음악', '이', '주가', '되다', '최고', '의', '음악', '영화']",
 "['진정하다', '쓰레기']",
 "['마치', '미국', '애니', '에서', '튀어나오다', '한', '창의력', '없다', '로봇', '디자인', '부터가', '고개', '를', '젖다', '다']",
 "['갈수록', '개판', '되다', '중국영화', '유치하다', '없다', '폼', '잡다', '끝나다', '안되다', '무기', '에', '유치하다', '남무', '아', '그리다', '동사서독', '같다', '영화', '가', '이건', '류', '아', '류작', '이다']",
 "['이별', '의', '아픔', '뒤', '에', '찾아오다', '새롭다', '인연', '의', '기쁨', '모든', '사람', '이', '그렇다', '않다']",
 "['괜찮다', '오랜', '만', '포켓몬스터', '잼밌', '어', '요']",
 "['한국', '독립영화', '의', '한계', '그렇게', '아버지', '가', '되다', '비교', '되다']",
 "['청춘', '은', '아름답다', '다그', '아름답다', '움', '은', '이성', '을', '흔들다', '찰나', '의', '아름답다', '움', '을', '자다', '포착', '한', '섬세하다', '아름답다', '수채화', '같다', '퀴어', '영화', '이다']",
 "['눈', '에', '보이다', '반전', '이다', '영화', '의', '흡인', '력', '은'

In [31]:
# y_train, y_test

y_train = np.array(ratings_train['label'])

In [32]:
y_train

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [33]:
y_test = np.array(ratings_test['label'])

In [34]:
y_test

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [48]:
cv = CountVectorizer()
X_train_counts = cv.fit_transform(X_train_new)

In [49]:
clf = MultinomialNB().fit(X_train_counts, y_train)

In [50]:
print(clf.predict(cv.transform(X_test_new)))

[1 1 0 ... 0 0 0]


In [51]:
print(clf.predict_proba(cv.transform(X_test_new)))

[[0.08747907 0.91252093]
 [0.27481815 0.72518185]
 [0.99571452 0.00428548]
 ...
 [0.61955377 0.38044623]
 [0.97085406 0.02914594]
 [0.82740606 0.17259394]]


In [58]:
pred = clf.predict(cv.transform(X_test_new))

In [55]:
# 정확도

from sklearn.metrics import accuracy_score

In [59]:
sklearn.metrics.accuracy_score(y_test, pred)

0.8259203227433182