In [5]:
import pandas as pd

# Load the movie-review dataset; later cells read its 'review' (text) and
# 'sentiment' (label) columns.
df = pd.read_csv(filepath_or_buffer='movie_data.csv', encoding='utf-8')

# Preview the first three rows.
df.head(n=3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


## 8.2 BoW모델
### BoW란?

In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents used to illustrate the bag-of-words representation.
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet, and one and one is two'])

# CountVectorizer builds the bag-of-words model from per-document word counts.
count = CountVectorizer()

# fit_transform builds the vocabulary and converts the three sentences into
# sparse feature vectors.
bag = count.fit_transform(docs)

# Mapping from each word to its column index.
print(count.vocabulary_)

# Dense view: one row per sentence, one column per vocabulary index, each cell
# holding how many times that word occurs in the sentence.
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


### tf-idf

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with tf-idf and L2-normalise each row.
# Terms that occur in many of the documents (e.g. 'is', 'the') receive a lower
# weight than terms concentrated in few documents, even when their raw count
# is higher.
tfidf = TfidfTransformer(smooth_idf=True, norm='l2', use_idf=True)
raw_counts = count.fit_transform(docs)
print(tfidf.fit_transform(raw_counts).toarray())

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


### 텍스트 데이터 정제

In [12]:
import re
def preprocessor(text):
    """Clean a raw review string.

    Steps:
      1. Strip HTML tags.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` from the
         original-case text.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with the ``-`` nose removed) to the end,
         separated from the last word by a space.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by any emoticons that were found.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being read as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)  # drop HTML tags
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())  # non-word runs -> one space
    suffix = ' '.join(emoticons).replace('-', '')  # ':-)' -> ':)'
    # Without a separator the first emoticon would be glued onto the last word
    # whenever the text ends with a word character ("really:)").
    if suffix and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix
# Sanity checks: the last 50 characters of the first review, then a synthetic
# string containing a tag and emoticons.
first_review_tail = df.loc[0, 'review'][-50:]
print(preprocessor(first_review_tail))
print(preprocessor("</a>This :) is :( a test :-)!"))

# Clean every review; this overwrites the 'review' column.
df['review'] = df['review'].apply(preprocessor)

is seven title brazil not available
this is a test :) :( :)


### 문서를 토큰으로 나누기
  

In [13]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

# Split text into individual words on whitespace.
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

# Stemming
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return each word's stem from ``porter``."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

# Compare the plain and the stemming tokenizer on one sentence. Only the value
# of the last expression is displayed by the notebook.
tokenizer(
    'runners like running and thus they run'
)  # ['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
tokenizer_porter(
    'runners like running and thus they run'
)  # ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [15]:
import nltk

# Fetch the stop-word corpus.
nltk.download('stopwords')

from nltk.corpus import stopwords  # stop words

stop = stopwords.words('english')

# Stem a sample sentence, keep at most its last ten tokens, and drop the
# English stop words.
stemmed_tokens = tokenizer_porter('a runner likes running and runs a lot')[-10:]
[token for token in stemmed_tokens if token not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunk2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

## 3. 문서 분류를 위한 로지스틱 회귀 모델 훈련


In [18]:
# First 25,000 rows for training, the remainder for testing.
# Positional slicing (.iloc) is end-exclusive. The label-based df.loc[:25000]
# includes row 25000 as well, which would put that row in both the training
# and the test set.
X_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
X_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Vectorizer with accent stripping, lower-casing and its own preprocessing
# switched off (presumably because the reviews were already cleaned in an
# earlier cell).
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Settings shared by both sub-grids: unigrams only, with/without stop-word
# removal, plain vs. stemming tokenizer, L1/L2 penalty, three values of C.
shared_grid = {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}

# First grid: vectorizer defaults (tf-idf weighting).
# Second grid: same settings with idf and normalisation turned off.
param_grid = [shared_grid,
              {**shared_grid,
               'vect__use_idf': [False],
               'vect__norm': [None]}]

logreg = LogisticRegression(random_state=0, solver='liblinear')
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', logreg)])

# 5-fold cross-validated grid search on accuracy, using all available cores.
gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1)

# Train the model.
gs_lr_tfidf.fit(X_train, y_train)

print('최적의 매개변수 조합: %s ' % gs_lr_tfidf.best_params_)
print('CV 정확도: %.3f' % gs_lr_tfidf.best_score_)

# Evaluate the best pipeline on the held-out reviews.
clf = gs_lr_tfidf.best_estimator_
print('테스트 정확도: %.3f' % clf.score(X_test, y_test))

## 4. 대용량 데이터 처리

In [36]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

# Tokenise and drop stop words.
# NOTE: this redefines `tokenizer`; an earlier cell defines a whitespace-only
# function with the same name, which is shadowed once this cell has run.
def tokenizer(text):
    """Clean a raw review and return its tokens without stop words.

    HTML tags are removed, emoticons are collected, the text is lower-cased
    and every run of non-word characters becomes a space. The emoticons (with
    the ``-`` nose removed) are appended as separate tokens, and tokens found
    in ``stop`` are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining tokens in order, followed by any emoticons.
    """
    # Raw strings keep the regex escapes from being read as invalid Python
    # string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: after lower-casing, the upper-case 'D'
    # and 'P' alternatives of the pattern could never match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # The explicit ' ' keeps the first emoticon from being glued onto the last
    # word; split() below discards any surplus whitespace.
    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' '
            + ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# Read the file and yield one document at a time.
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The first line is treated as a header and skipped. Every other non-empty
    line is split at its last comma: everything before it is returned
    unchanged as the text (CSV quoting is left in place), and the part after
    it is converted to ``int`` as the label.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        Raw review text and its integer label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Skip the header; the default avoids an error on an empty file.
        next(handle, None)
        for line in handle:
            record = line.rstrip('\n')
            if not record:
                continue  # ignore blank lines
            # Splitting at the last comma does not depend on a trailing
            # newline being present, unlike fixed offsets from the line end.
            text, _, label = record.rpartition(',')
            yield text, int(label)

In [37]:
# Pull the first (text, label) pair from a fresh stream to check the parsing.
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [38]:
# Collect up to `size` documents from the stream.
def get_minibatch(doc_stream, size):
    """Take the next ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)``: a list of texts and a list of labels. If the stream
        ends before ``size`` documents were read, the documents read so far
        are returned. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            # Keep the partial batch instead of discarding what was read.
            break
        docs.append(text)
        y.append(label)
    return docs, y

In [39]:
# HashingVectorizer: uses the hashing trick; it is applied below through
# transform() only, without a fit step.
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# distutils was removed from the standard library in Python 3.12, so an
# unconditional import would stop this cell there. `Version` is not used in
# this cell; the name is kept in case other code refers to it.
try:
    from distutils.version import LooseVersion as Version
except ImportError:
    Version = None

# Logistic-regression loss trained with SGD, updated below via partial_fit.
clf = SGDClassifier(loss='log_loss', random_state=1)

# One shared stream: each later get_minibatch call continues where the
# previous one stopped.
doc_stream = stream_docs(path='movie_data.csv')

In [40]:
# Out-of-core learning: train on 45 mini-batches of 1,000 documents each.
import pyprind

n_batches = 45
pbar = pyprind.ProgBar(n_batches)

# partial_fit is given the full set of class labels on every call.
classes = np.array([0, 1])

for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        # Nothing left in the stream.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:26


In [41]:
# Score the model on the next 5,000 streamed documents.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
test_accuracy = clf.score(X_test, y_test)
print('정확도: %.3f' % test_accuracy)

# Afterwards, update the model with those documents as well.
clf = clf.partial_fit(X_test, y_test)

정확도: 0.868


## 5. 잠재 디리클레 할당을 사용한 토픽 모델링

In [None]:
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')


# Build the bag-of-words matrix that serves as input to LDA.
from sklearn.feature_extraction.text import CountVectorizer

# max_df=.1       -> ignore words that occur in more than 10% of the documents
#                    (too common to be informative).
# max_features=5000 -> cap the vocabulary at the 5,000 most frequent words.
count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)
reviews = df['review'].values
X = count.fit_transform(reviews)

# Run LDA with 10 topics.
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)

# Display the shape of the fitted components matrix.
lda.components_.shape

In [None]:
# Print the five highest-weighted words of every topic.
n_top_words = 5
feature_names = count.get_feature_names_out()

for topic_no, topic in enumerate(lda.components_, start=1):
    print("Topic %d:" % topic_no)
    # argsort is ascending, so reverse it and keep the first n_top_words.
    top_indices = topic.argsort()[::-1][:n_top_words]
    print(" ".join(feature_names[i] for i in top_indices))

In [None]:
# Print the three reviews most strongly associated with the horror topic.
# Column 5 of X_topics is the topic the author treats as "horror".
horror_scores = X_topics[:, 5]
horror = horror_scores.argsort()[::-1]  # review indices, highest score first

for rank, movie_idx in enumerate(horror[:3], start=1):
    print('\n공포 영화 #%d:' % rank)
    excerpt = df['review'][movie_idx][:300]  # first 300 characters
    print(excerpt, '...')