In [1]:
import numpy as np
import pandas as pd

# Part 1.

Part 1 은 Bag-of-words 모델을 사용하여 IMDB 리뷰의 sentiment 를 분석한다. Bag-of-words 는 문장에 들어있는 단어의 빈도 수를 근거로 분류를 하는 방법으로, 문법이나 반어적 표현을 해석하는 데에는 한계가 존재한다. (https://en.wikipedia.org/wiki/Bag-of-words_model) 이를 보완하기 위한 방법으로 n-gram (연속되는 n 개의 단어로 분석)이 있다. 이 노트북에서는 Bag-of-words 모델을 사용하여 감상 분류를 해본다.

## 순서
1) 데이터 전처리

2) 전처리된 review 를 vectorize. 이때 bag-of-words 모델 사용

3) Random Forest 모델로 학습 후 test set 에 적용

In [2]:
train = pd.read_csv('labeledTrainData.tsv', header=0, sep='\t', quoting=3)
print(train.head(),'\n')
print(train['review'][0][:1000])

         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ... 

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the

## 1) 데이터 전처리
아래와 같은 전처리 과정을 거쳐서 의미있는 단어들의 구성으로 만든다.
1. html 제거 (w/ BeautifulSoup)
2. 특수문자 제거 (w/ re)
3. 줄글을 단어 리스트로 변환
4. 기능어 제거 (w/ nltk stopword)
5. stemmer 로 어간 추출
6. 리스트를 text 로 변환 후 반환

### 1. html 제거

In [3]:
# train['review'][0] 을 대상으로 전처리 함수를 만들고, 그 후 apply 로 일괄 적용
# html 은 BeautifulSoup 으로 제거
from bs4 import BeautifulSoup
train_html_parser = BeautifulSoup(train['review'][0], 'html.parser')
train_html_parser.get_text()[:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 2

### 2. 특수문자 제거 

In [4]:

# 특수문자 제거는 정규식을 사용. 알파벳이 아닌 str 을 공백으로 대체
import re
train_letters = re.sub('[^a-zA-Z]', ' ', train_html_parser.get_text())
#train_letters = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', ' ', train_html_parser.get_text()) #이렇게 할 경우 I've 에 사용되는 따옴표도 제거됨.
#train_html_parser.get_text()[:1000]
train_letters[:1000]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

### 3. 소문자로 바꾼 후 리스트로 변환

In [5]:
# 특수문자를 없앤 후 소문자로 변환 후, words list 를 만든다. 
train_words_list = train_letters.lower().split()
train_words_list[:200]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again',
 'maybe',
 'i',
 'just',
 'want',
 'to',
 'get',
 'a',
 'certain',
 'insight',
 'into',
 'this',
 'guy',
 'who',
 'i',
 'thought',
 'was',
 'really',
 'cool',
 'in',
 'the',
 'eighties',
 'just',
 'to',
 'maybe',
 'make',
 'up',
 'my',
 'mind',
 'whether',
 'he',
 'is',
 'guilty',
 'or',
 'innocent',
 'moonwalker',
 'is',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'which',
 'i',
 'remember',
 'going',
 'to',
 'see',
 'at',
 'the',
 'cinema',
 'when',
 'it',
 'was',
 'originally',
 'released',
 'some',
 'of',
 'it',
 'has',
 'subtle',
 'messages',
 'about',
 'mj',
 's',
 'feeling',
 'towards',
 'the',
 'press',
 'and',
 'also',
 'the',
 'obvious',
 'message',
 'of',
 'drugs',

### 4. 기능어 제거 (불용어 제거)

In [9]:
# 기능어를 제거하여 의미있는 단어들로만 구성한다. nltk stopwords 라이브러리 사용
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
train_meaningWords = [w for w in train_words_list if not w in stopwords]
train_meaningWords[:200]
# nltk 실행시 리소스 다운로드가 필요하다는 결과를 얻을 수 있음. 터미널에서 nltk.download('') 로 다운로드하면 해결됨
# 결과로부터 all, this 와 같은 기능어가 제거된 것 확인 가능

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary',
 'watched',
 'wiz',
 'watched',
 'moonwalker',
 'maybe',
 'want',
 'get',
 'certain',
 'insight',
 'guy',
 'thought',
 'really',
 'cool',
 'eighties',
 'maybe',
 'make',
 'mind',
 'whether',
 'guilty',
 'innocent',
 'moonwalker',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'remember',
 'going',
 'see',
 'cinema',
 'originally',
 'released',
 'subtle',
 'messages',
 'mj',
 'feeling',
 'towards',
 'press',
 'also',
 'obvious',
 'message',
 'drugs',
 'bad',
 'kay',
 'visually',
 'impressive',
 'course',
 'michael',
 'jackson',
 'unless',
 'remotely',
 'like',
 'mj',
 'anyway',
 'going',
 'hate',
 'find',
 'boring',
 'may',
 'call',
 'mj',
 'egotist',
 'consenting',
 'making',
 'movie',
 'mj',
 'fans',
 'would',
 'say',
 'made',
 'fans',
 'true',
 'really',
 'nice',
 'actual',
 'feature',
 'film',
 'bit',
 'finally',
 'starts',
 'minutes',
 'excluding',
 'smooth',
 'crim

### 5. 어간 추출  
listening 과 litens, liten 은 같은 의미를 지녔기 때문에 어근을 추출하는 작업이 필요함. 이때 사용되는 것이 stemmer 인데, 여기서는 snowball stemmer 를 사용한다. http://www.nltk.org/howto/stem.html

In [10]:
from nltk.stem.snowball import SnowballStemmer
#SnowballStemmer 에서 제공하는 언어
print(" ".join(SnowballStemmer.languages))

arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [11]:
stemmer = SnowballStemmer('english')
stemmer.stem('watching')

'watch'

In [13]:
word_stem = [stemmer.stem(w) for w in train_meaningWords]
word_stem[:20]

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentari',
 'watch',
 'wiz',
 'watch',
 'moonwalk',
 'mayb',
 'want',
 'get',
 'certain',
 'insight',
 'guy']

### 전처리 함수 만들기
review 글이 전처리된 words 로 구성된 text 를 반환하는 함수.
새로웉 컬럼을 만들거나, text 를 출력하는 것은 결과를 처리하는 것에 따름

In [28]:
def review_to_words(sentence):
    parsed_sentence = BeautifulSoup(sentence, 'html.parser').get_text()
    letters_sentence = re.sub('[^a-zA-Z]', ' ', parsed_sentence)
    words_lst = letters_sentence.lower().split()
    
    # stopwords 제거
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    meaning_words_lst = [w for w in words_lst if not w in stopwords]
    
    # 어근 추출
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer('english')
    stem_words_lst = [stemmer.stem(w) for w in meaning_words_lst]
    
    return (' '.join(stem_words_lst))


In [None]:
%time train['review_to_words']=train['review'].apply(review_to_words)

In [17]:
train.head(3)

Unnamed: 0,id,sentiment,review,review_to_words
0,"""5814_8""",1,"""With all this stuff going down at the moment ...","""with stuff go moment mj i'v start listen musi..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...","""\""the classic war worlds\"" timothi hine enter..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...","""the film start manag (nichola bell) give welc..."


#### 멀티프로세싱
25000개의 데이터를 전처리할 때 더 표율적으로 하기 위하여 멀티프로세싱을 적용해볼 수 있다. 

In [30]:
# 참고 : https://gist.github.com/yong27/7869662
# http://www.racketracer.com/2016/07/06/pandas-in-parallel/

In [31]:
import multiprocessing
import pandas as pd
import numpy as np

def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    workers = kwargs.pop('workers')
    pool = multiprocessing.Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, kwargs)
            for d in np.array_split(df, workers)])
    pool.close()
    return pd.concat(list(result))

In [32]:
%time clean_train_reviews = apply_by_multiprocessing(train['review'], review_to_words, workers=4)

CPU times: user 152 ms, sys: 210 ms, total: 363 ms
Wall time: 51.7 s


In [33]:
type(clean_train_reviews)

pandas.core.series.Series

## 2) 전처리된 review 를 vectorize
Bag-of-words 모델: 각 review 에서 사용된 단어를 통틀어 하나의 dictionary 를 만들고, 각 review 에서 dictionary 의 단어의 사용 횟수에 대하여 vector 생성

vector 로 변경할 때에는 sklearn 의 Countvector 를 사용 (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)


In [34]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(lowercase = True, 
                             preprocessor = None,
                             tokenizer = None, 
                             stop_words = None,
                             analyzer = 'word',
                             max_features=20000)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [35]:
# vectorizer 객체에 전처리된 review 를 바탕으로 dictionary 만든 후 벡터화 시키도록 한다. 이때 fit_transform 매서드 사용.
# fit_transform 할 경우 vectorizer 도 학습이 되게 되고, 이를 바로 적용한 값을 출력한다.
%time train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features

CPU times: user 3.33 s, sys: 288 ms, total: 3.62 s
Wall time: 3.67 s


<25000x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 2301515 stored elements in Compressed Sparse Row format>

In [36]:
# vectorizer 를 실행한 결과, 각 row 별로 20,000 개의 단어 사용 횟수를 나타내는 vector 로 구성된 matrix 가 나온다. 
# 이를 array 형태로 바꾼다.
train_data_features = train_data_features.toarray()
print(train_data_features.shape)

(25000, 20000)


In [37]:
# vectorizer 의 vocabulary dict 를 확인하는 작업
vocab = vectorizer.get_feature_names()
vocab[:10]

['aa',
 'aaa',
 'aag',
 'aam',
 'aamir',
 'aankhen',
 'aapk',
 'aardman',
 'aaron',
 'ab']

In [38]:
# 각 단어가 나온 횟수 세우기
dist = np.sum(train_data_features, axis=0)
print(len(vocab))
print(len(dist))

20000
20000


In [39]:
vocab_count_dict={}
for tag, count in zip(vocab, dist):
    vocab_count_dict[tag]=count
vocab_count_dict

{'aa': 5,
 'aaa': 9,
 'aag': 26,
 'aam': 15,
 'aamir': 5,
 'aankhen': 6,
 'aapk': 6,
 'aardman': 12,
 'aaron': 48,
 'ab': 22,
 'aback': 9,
 'abandon': 288,
 'abba': 14,
 'abbey': 24,
 'abbi': 30,
 'abbot': 29,
 'abbott': 30,
 'abbrevi': 6,
 'abc': 125,
 'abduct': 55,
 'abductor': 5,
 'abe': 24,
 'abel': 9,
 'aberr': 6,
 'abet': 17,
 'abhay': 28,
 'abhishek': 9,
 'abhorr': 20,
 'abid': 28,
 'abigail': 26,
 'abil': 562,
 'abject': 11,
 'abl': 1259,
 'abli': 27,
 'abnorm': 16,
 'abo': 14,
 'aboard': 37,
 'abod': 6,
 'abolish': 6,
 'abomin': 83,
 'aborigin': 69,
 'abort': 92,
 'abound': 63,
 'abraham': 93,
 'abras': 14,
 'abridg': 12,
 'abroad': 38,
 'abrupt': 136,
 'abscond': 6,
 'absenc': 118,
 'absent': 83,
 'absente': 5,
 'absolut': 1850,
 'absolutley': 5,
 'absolv': 6,
 'absorb': 154,
 'absorpt': 6,
 'abstract': 49,
 'absurd': 427,
 'absurdist': 18,
 'abu': 40,
 'abund': 73,
 'abus': 398,
 'abut': 5,
 'abysm': 110,
 'abyss': 19,
 'ac': 10,
 'academ': 41,
 'academi': 298,
 'acceler': 1

## 3) 랜덤포레스트 모델로 학습 후 테스트
랜덤포레스트는 다양한 트리 모델을 만들고, 데이터를 랜덤하게 뽑은 트리에 적용한 후에 나오는 값의 평균으로 학습하는 모델. (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

### train data set 으로 학습

In [40]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100)

# train 데이터로 학습
random_forest = random_forest.fit(train_data_features, train['sentiment'])

### test set 에 적용

In [41]:
# 테스트 데이터의 전처리 과정
test = pd.read_csv('testData.tsv', header=0, quoting=3, sep='\t')
%time clean_test_reviews = apply_by_multiprocessing(test['review'], review_to_words, workers=4)

CPU times: user 159 ms, sys: 211 ms, total: 370 ms
Wall time: 50.1 s


In [42]:
# test 에서는 train 데이터로부터 만들어진 vocab vector 로 벡터화 해야하므로 fit_transform 이 아니라 transform 을 사용해야한다.
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

result = random_forest.predict(test_data_features)

In [43]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

In [44]:
output.to_csv('part1_bag-of-words.csv', index=False, quoting=3)

### Score: 0.84660

### Bi-gram 으로 vectorization

In [49]:
bigram_vectorization = CountVectorizer(lowercase = True, 
                             preprocessor = None,
                             tokenizer = None, 
                             stop_words = None,
                             analyzer = 'word',
                             max_features=20000,
                             ngram_range=(2,5))
bigram_vectorization

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(2, 5), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [50]:
%time bi_train_features = bigram_vectorization.fit_transform(clean_train_reviews)

KeyboardInterrupt: 

In [47]:
bigram_vectorization.get_feature_names()

['abbott costello',
 'abil make',
 'abl captur',
 'abl enjoy',
 'abl find',
 'abl get',
 'abl keep',
 'abl make',
 'abl see',
 'abl sit',
 'abl take',
 'abl watch',
 'abraham lincoln',
 'abrupt end',
 'absolut amaz',
 'absolut aw',
 'absolut beauti',
 'absolut best',
 'absolut brilliant',
 'absolut fantast',
 'absolut hate',
 'absolut hilari',
 'absolut horribl',
 'absolut love',
 'absolut noth',
 'absolut one',
 'absolut perfect',
 'absolut reason',
 'absolut ridicul',
 'absolut sens',
 'absolut stun',
 'absolut terribl',
 'absolut worst',
 'abus father',
 'abus husband',
 'academi award',
 'accept role',
 'accident kill',
 'accord dvd',
 'accord imdb',
 'accord movi',
 'accur depict',
 'accur portray',
 'accus murder',
 'achiev goal',
 'acid trip',
 'acquir tast',
 'across board',
 'across countri',
 'across eye',
 'across film',
 'across movi',
 'across screen',
 'act abil',
 'act absolut',
 'act act',
 'act actor',
 'act actual',
 'act also',
 'act although',
 'act amaz',
 'act atr

In [48]:
from sklearn.ensemble import RandomForestClassifier
forest_bigram = RandomForestClassifier(n_estimators=100)

# train 데이터로 학습
forest_bigram = forest_bigram.fit(bi_train_features, train['sentiment'])

KeyboardInterrupt: 