# Text Analysis - 텍스트 분석

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mglearn

from matplotlib import rcParams

# Global plotting defaults, applied in one update:
#   - default figure size of 10 x 6
#   - 'New Gulim' font at size 10 (presumably chosen so the Korean text in
#     labels renders - confirm the font is installed on the target machine)
#   - axes.unicode_minus disabled (presumably because that font lacks the
#     Unicode minus glyph - confirm)
rcParams.update({
    'figure.figsize': (10, 6),
    'font.family': 'New Gulim',
    'font.size': 10,
    'axes.unicode_minus': False,
})

# 1 영화 리뷰 감성 분석 - IMDB

### 1.1  데이터 로딩 - IMDB

In [None]:
from sklearn.datasets import load_files

# load_files returns a Bunch object that holds both the texts and the labels.
reviews_train = load_files('data/aclImdb/train/')
reviews_test = load_files('data/aclImdb/test/')

# Pull the documents (.data) and the class labels (.target) out of each Bunch.
text_train = reviews_train.data
y_train = reviews_train.target
text_test = reviews_test.data
y_test = reviews_test.target

In [None]:
# Replace the HTML line-break tags embedded in the reviews with a space.
# The documents are bytes objects, hence the bytes literals.
text_train = [review.replace(b'<br />', b' ') for review in text_train]
text_test = [review.replace(b'<br />', b' ') for review in text_test]

In [None]:
# Report the number of training documents and show the first one.
print('text_train의 길이:', len(text_train))
print('text_train[0]:\n', text_train[0])

In [None]:
# Report the number of test documents and show the first one.
print('text_test의 길이:', len(text_test))
print('text_test[0]:\n', text_test[0])

In [None]:
# np.bincount tallies how many samples carry each integer label,
# for the training labels and then for the test labels.
print('클래스별 샘플 수 (훈련 데이터):',   np.bincount(y_train))
print('클래스별 샘플 수 (테스트 데이터):', np.bincount(y_test))

### 1.2  BOW 표현

In [None]:
# Sample data: a two-sentence toy corpus for the bag-of-words walkthrough.
bards_words = [
    'The fool doth think he is wise,',
    'but the wise man knows himself to be a fool',
]

In [None]:
# Create the bag-of-words model and fit it on the sample sentences.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
# Left as a bare trailing expression so the notebook displays the fitted object.
vect.fit(bards_words)

In [None]:
# Show the size and the contents of the vocabulary learned by the vectorizer.
print('어휘 사전의 크기:', len(vect.vocabulary_))
print('어휘 사전의 내용:\n', vect.vocabulary_)

In [None]:
# Apply the fitted bag-of-words model to the sample sentences.
bag_of_words = vect.transform(bards_words)

In [None]:
# Convert to a dense array so the notebook can display it.
bag_of_words.toarray()

### 1.3 영화 리뷰 감성분석 - BOW

#### 1.3.1 영화 리뷰 감성분석 - BOW

In [None]:
# Create the bag-of-words model (default settings) and fit it on the
# training reviews.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(text_train)

In [None]:
# Apply the bag-of-words model to the training data.
X_train = vect.transform(text_train)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the bag-of-words features. max_iter is set to
# 5000, presumably to give the solver room to converge - confirm.
lreg = LogisticRegression(C=0.1, max_iter=5000).fit(X_train, y_train)

# Score on the same data the model was fitted to.
train_score = lreg.score(X_train, y_train)
print('학습 점수: {:.2f}'.format(train_score))

In [None]:
# Apply the bag-of-words model (fitted on the training set) to the test data.
X_test = vect.transform(text_test)

# Score of the trained classifier on the test data.
test_score = lreg.score(X_test, y_test)
print('테스트 점수: {:.2f}'.format(test_score))

#### 1.3.2 영화 리뷰 감성분석 - BOW(min_df=5)

In [None]:
# Create the bag-of-words model and fit it on the training reviews.
# min_df=5: per the scikit-learn docs, terms that occur in fewer than
# 5 documents are left out of the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(min_df=5).fit(text_train)

In [None]:
# Apply the bag-of-words model (min_df=5) to the training data.
X_train = vect.transform(text_train)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

# Same classifier settings as the earlier runs, now fitted on the
# min_df=5 bag-of-words features.
lreg = LogisticRegression(C=0.1, max_iter=5000).fit(X_train, y_train)

# Score on the same data the model was fitted to.
train_score = lreg.score(X_train, y_train)
print('학습 점수: {:.2f}'.format(train_score))

In [None]:
# Apply the bag-of-words model (min_df=5, fitted on the training set) to the
# test data.
X_test = vect.transform(text_test)

# Score of the trained classifier on the test data.
test_score = lreg.score(X_test, y_test)
print('테스트 점수: {:.2f}'.format(test_score))

#### 1.3.3 영화 리뷰 감성분석 - 불용어 적용

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
# Number of entries in scikit-learn's built-in English stop-word collection.
len(ENGLISH_STOP_WORDS)

In [None]:
# Create the bag-of-words model and fit it on the training reviews.
# stop_words='english' selects scikit-learn's built-in English stop-word
# list; min_df=5 is kept from the previous experiment.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(min_df=5, stop_words='english').fit(text_train)

In [None]:
# Apply the bag-of-words model (stop words removed) to the training data.
X_train = vect.transform(text_train)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

# Same classifier settings as the earlier runs, now fitted on the
# stop-word-filtered bag-of-words features.
lreg = LogisticRegression(C=0.1, max_iter=5000).fit(X_train, y_train)

# Score on the same data the model was fitted to.
train_score = lreg.score(X_train, y_train)
print('학습 점수: {:.2f}'.format(train_score))

In [None]:
# Apply the bag-of-words model (stop words removed, fitted on the training
# set) to the test data.
X_test = vect.transform(text_test)

# Score of the trained classifier on the test data.
test_score = lreg.score(X_test, y_test)
print('테스트 점수: {:.2f}'.format(test_score))

#### 1.3.4 영화 리뷰 감성분석 - n-gram

In [None]:
# Create the n-gram bag-of-words model and fit it on the training reviews.
# ngram_range=(1, 3): per the scikit-learn docs, the vocabulary covers
# unigrams, bigrams and trigrams.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(ngram_range=(1, 3)).fit(text_train)

In [None]:
# Apply the n-gram bag-of-words model to the training data.
X_train = vect.transform(text_train)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

# Same classifier settings as the earlier runs, now fitted on the
# n-gram bag-of-words features.
lreg = LogisticRegression(C=0.1, max_iter=5000).fit(X_train, y_train)

# Score on the same data the model was fitted to.
train_score = lreg.score(X_train, y_train)
print('학습 점수: {:.2f}'.format(train_score))

In [None]:
# Apply the n-gram bag-of-words model (fitted on the training set) to the
# test data.
X_test = vect.transform(text_test)

# Score of the trained classifier on the test data.
test_score = lreg.score(X_test, y_test)
print('테스트 점수: {:.2f}'.format(test_score))

### 1.4 영화 리뷰 감성 분석 - TF–IDF
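$$\text{tfidf}(w, d) = \text{tf} \times \left(\log\left(\frac{N + 1}{N_w + 1}\right) + 1\right)$$

여기서 $\text{tf}$는 문서 $d$에 단어 $w$가 나타난 횟수, $N$은 훈련 세트의 전체 문서 수, $N_w$는 단어 $w$가 나타난 훈련 문서의 수입니다.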

In [None]:
# Create the TF-IDF model and fit it on the training reviews.
# min_df=5: per the scikit-learn docs, terms that occur in fewer than
# 5 documents are left out of the vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=5).fit(text_train)

In [None]:
# Apply the TF-IDF model to the training data.
X_train = vect.transform(text_train)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

# Same classifier settings as the bag-of-words runs, now fitted on the
# TF-IDF features.
lreg = LogisticRegression(C=0.1, max_iter=5000).fit(X_train, y_train)

# Score on the same data the model was fitted to.
train_score = lreg.score(X_train, y_train)
print('학습 점수: {:.2f}'.format(train_score))

In [None]:
# Apply the TF-IDF model (fitted on the training set) to the test data.
X_test = vect.transform(text_test)

# Score of the trained classifier on the test data.
test_score = lreg.score(X_test, y_test)
print('테스트 점수: {:.2f}'.format(test_score))

# 2 토픽 모델링과 문서 군집화

- LDA(Latent Dirichlet Allocation) - 잠재 디리클레 할당

### 2.1 Vectorization: text -> vector

In [None]:
# Bag-of-words input for topic modelling. Per the scikit-learn docs,
# max_df=0.15 drops terms found in more than 15% of the documents and
# max_features=10000 keeps only the 10,000 most frequent remaining terms.
vect = CountVectorizer(
    max_features=10000,
    max_df=0.15,
)

# Learn the vocabulary and encode the training reviews in a single pass.
X = vect.fit_transform(text_train)

### 2.2 LDA 모델 생성 및 변환 - 10개 토픽

In [None]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

# LDA with 10 topics, using the 'batch' learning method for at most
# 25 iterations; random_state is pinned to 0.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='batch',
    max_iter=25,
    random_state=0,
)

# Fit the model and transform the documents in one step.
document_topics = lda.fit_transform(X)

In [None]:
# Show the shape of the fitted model's components_ array.
print('lda.components_.shape:', lda.components_.shape)

In [None]:
# Sort: within each row of components_, feature indices ordered from the
# largest value to the smallest (ascending argsort, then columns reversed).
sorting = np.fliplr(np.argsort(lda.components_, axis=1))

# Feature names from the vectorizer, as an array that `sorting` can index.
feature_names = np.array(vect.get_feature_names_out())

#### 2.2.1 10개의 토픽을 출력

In [None]:
# Print 10 words for each of the 10 topics, 5 topics per chunk.
mglearn.tools.print_topics(
    topics=range(10),
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=10,
)

### 2.3 LDA 모델 생성 및 변환 - 100개 토픽

In [None]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

# Same LDA configuration as the 10-topic model, but with 100 topics.
lda100 = LatentDirichletAllocation(
    n_components=100,
    learning_method='batch',
    max_iter=25,
    random_state=0,
)

# Fit the model and transform the documents in one step.
document_topics100 = lda100.fit_transform(X)

In [None]:
# Topic selection: a hand-picked subset of the 100 topic indices.
topics = np.array([
    7, 16, 24, 25, 28,
    36, 37, 41, 45, 51,
    53, 54, 63, 89, 97,
])

In [None]:
# Sort: within each row of the 100-topic components_, feature indices ordered
# from the largest value to the smallest (ascending argsort, columns reversed).
sorting = np.fliplr(np.argsort(lda100.components_, axis=1))

# Feature names from the vectorizer, as an array that `sorting` can index.
feature_names = np.array(vect.get_feature_names_out())

#### 2.3.1 선택된 토픽을 출력

In [None]:
# Print 20 words for each selected topic, 5 topics per chunk.
mglearn.tools.print_topics(
    topics=topics,
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=20,
)

#### 2.3.2 음악적인 토픽(45번)

In [None]:
# Sort: document indices ordered by their value for topic 45, largest first.
topic45_weights = document_topics100[:, 45]
music = np.argsort(topic45_weights)[::-1]

# Print the ten documents in which this topic carries the most weight.
for i in music[:10]:
    # Show only the first two sentences. The reviews are bytes, so they are
    # split and re-joined with bytes literals.
    leading_sentences = text_train[i].split(b'.')[:2]
    print(b'.'.join(leading_sentences) + b'.\n')

#### 2.3.3 LDA로 학습한 토픽 가중치

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 10))

# Label each topic with its index followed by its two top-ranked words.
top_two_words = feature_names[sorting[:, :2]]
topic_names = [
    '{:>2} '.format(i) + ' '.join(words)
    for i, words in enumerate(top_two_words)
]

# Column sums of document_topics100 (one total per topic) and the bar
# positions are the same for both panels, so compute them once.
topic_totals = np.sum(document_topics100, axis=0)
bar_positions = np.arange(50)

# Bar chart in two columns: topics 0-49 on the left, 50-99 on the right.
for col in [0, 1]:
    start = col * 50
    end = start + 50
    panel = ax[col]
    panel.barh(bar_positions, topic_totals[start:end])

    panel.set_yticks(bar_positions)
    panel.set_yticklabels(topic_names[start:end], ha='left', va='top')
    # Flip the y axis so the lowest-numbered topic sits at the top.
    panel.invert_yaxis()
    panel.set_xlim(0, 2000)
    # Large tick padding; presumably makes room for the left-aligned
    # labels beside the axis - confirm visually.
    yax = panel.get_yaxis()
    yax.set_tick_params(pad=130)
plt.tight_layout()
plt.show()

---

In [None]:
# End of file