In [97]:
from konlpy.tag import Komoran

In [98]:
komoran = Komoran()
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))
print(komoran.nouns(u'오픈소스에 관심 많은 멋진 개발자님들!'))
print(komoran.pos(u'혹시 바람과 함께 사라지다 봤어?'))

['우왕', '코', '모란', '도', '오픈', '소스', '가', '되', '었', '어요']
['오픈', '소스', '관심', '개발자']
[('혹시', 'MAG'), ('바람과 함께 사라지다', 'NNP'), ('보', 'VV'), ('았', 'EP'), ('어', 'EF'), ('?', 'SF')]


# 네이버 영화 리뷰 텍스트 분석

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

In [100]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ablearn/main/ratings_train.txt", sep='\t')

In [101]:
data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [102]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [103]:
data.isna().sum()

id          0
document    5
label       0
dtype: int64

In [104]:
data[data['document'].isna() ==True]

Unnamed: 0,id,document,label
25857,2172111,,1
55737,6369843,,1
110014,1034280,,0
126782,5942978,,0
140721,1034283,,0


In [105]:
data = data.dropna()

In [106]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149995 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        149995 non-null  int64 
 1   document  149995 non-null  object
 2   label     149995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


In [107]:
data['label'].value_counts()

label
0    75170
1    74825
Name: count, dtype: int64

In [108]:
import string

In [109]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [110]:
def clean(x):    
    cleaned = []
    for i in x:
        if i in string.punctuation:
            cleaned.append(i.replace(i, " "))
        else:
            cleaned.append(i)
    cleaned = ''.join(cleaned)
    cleaned = cleaned.replace("   ", " ").replace("  ", " ")
    return cleaned            

In [111]:
clean("흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나")

'흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나'

In [112]:
data['document'] = data['document'].apply(clean)

In [113]:
data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다 평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [114]:
data.loc[4]['document']

'사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다'

In [115]:
data['doc_len'] = data['document'].apply(len)

In [116]:
data.head()

Unnamed: 0,id,document,label,doc_len
0,9976970,아 더빙 진짜 짜증나네요 목소리,0,17
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1,28
2,10265843,너무재밓었다그래서보는것을추천한다,0,17
3,9045019,교도소 이야기구먼 솔직히 재미는 없다 평점 조정,0,26
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 ...,1,61


In [117]:
data.groupby('label')['doc_len'].mean()

label
0    34.062498
1    33.109549
Name: doc_len, dtype: float64

In [118]:
test = pd.read_csv("https://raw.githubusercontent.com/haram4th/ablearn/main/ratings_test.txt", sep='\t')
test.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [119]:
test.isna().sum()

id          0
document    3
label       0
dtype: int64

In [120]:
test= test.dropna()

In [121]:
test['document'] = test['document'].apply(clean)

In [122]:
test.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데 왜 3D로 나와서 제 심기를 불편하게 하죠,0


In [129]:
data['document'].apply(len) == 0

0         False
1         False
2         False
3         False
4         False
          ...  
149995    False
149996    False
149997    False
149998    False
149999    False
Name: document, Length: 149995, dtype: bool

# 문자를 숫자형 벡터로 변환해주는 라이브러리
* countvectorizer: 단어의 빈도를 세어서 숫자로 변환해주는 것
* TfidfVectorizer: 자주 나오는 단어에 가중치를 주어서 숫자로 변환해 주는 것

In [131]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [124]:
# from konlpy.tag import Komoran
# komoran = Komoran()
# def tw_tokenizer(text):
#     tokens_ko = komoran.morphs(text)
#     return tokens_ko

In [133]:
cv = CountVectorizer(lowercase=False)
cv.fit(data['document'])
X = cv.transform(data['document'])


In [134]:
X

<149995x293925 sparse matrix of type '<class 'numpy.int64'>'
	with 1074747 stored elements in Compressed Sparse Row format>

In [135]:
y = data['label']

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=7)

In [140]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.8047601586719557
              precision    recall  f1-score   support

           0       0.79      0.84      0.81     30158
           1       0.82      0.77      0.80     29840

    accuracy                           0.80     59998
   macro avg       0.81      0.80      0.80     59998
weighted avg       0.81      0.80      0.80     59998



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [141]:
dct = DecisionTreeClassifier(max_depth=3)
dct.fit(X_train, y_train)
pred = dct.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.5143504783492783
              precision    recall  f1-score   support

           0       0.96      0.04      0.07     30158
           1       0.51      1.00      0.67     29840

    accuracy                           0.51     59998
   macro avg       0.73      0.52      0.37     59998
weighted avg       0.74      0.51      0.37     59998



In [143]:
rfc = RandomForestClassifier(max_depth=3, n_estimators=1000, random_state=7)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.7309410313677123
              precision    recall  f1-score   support

           0       0.68      0.88      0.77     30158
           1       0.83      0.58      0.68     29840

    accuracy                           0.73     59998
   macro avg       0.75      0.73      0.72     59998
weighted avg       0.75      0.73      0.72     59998



In [144]:
from sklearn.naive_bayes import MultinomialNB

In [145]:
model = MultinomialNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.8199439981332711
              precision    recall  f1-score   support

           0       0.85      0.78      0.81     30158
           1       0.79      0.86      0.83     29840

    accuracy                           0.82     59998
   macro avg       0.82      0.82      0.82     59998
weighted avg       0.82      0.82      0.82     59998



In [146]:
cv2 = CountVectorizer(lowercase=False)
cv2.fit(test['document'])
X2 = cv2.transform(test['document'])

In [149]:
X2

<49997x127153 sparse matrix of type '<class 'numpy.int64'>'
	with 358911 stored elements in Compressed Sparse Row format>

In [147]:
y2 = test['label']

In [148]:
pred_test = model.predict(X2)
print(accuracy_score(y2, pred_test))
print(classification_report(y2, pred_test))

ValueError: X has 127153 features, but MultinomialNB is expecting 293925 features as input.