### Kaggle Competition
# Bag of Words meets Bag of Popcorn
## - IMDB 영화 리뷰 감정분석

## I. 데이터 불러오기

In [1]:
import pandas as pd

# Labeled training set (includes the sentiment column)
train = pd.read_csv(
    'data/labeledTrainData.tsv',
    delimiter='\t',  # fields are TAB-separated
    header=0,        # column names are on the first line of the file
    quoting=3,       # ignore double quotes
)
# Unlabeled test set (no sentiment column)
test = pd.read_csv(
    'data/testData.tsv',
    delimiter='\t',
    header=0,
    quoting=3,
)
train.shape

(25000, 3)

In [3]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
test.shape

(25000, 2)

In [5]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [7]:
train.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [8]:
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [10]:
train['review'][0][:700]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik'

## II. 데이터 정제 및 텍스트 전처리
1. HTML 태그 제거
2. 알파벳 이외의 문자 공백으로
3. 불용어 제거
4. 어간 추출

### 1. HTML 태그 제거

In [2]:
from bs4 import BeautifulSoup

example1 = BeautifulSoup(train['review'][0], 'html5lib')
example1.get_text()[:700]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

### 2. 알파벳 이외의 문자 공백으로

In [3]:
import re

letters_only = re.sub('[^a-zA-Z]', ' ', example1.get_text())
letters_only[:700]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

### 3. 불용어 제거

In [4]:
# Convert to lowercase
lower_case = letters_only.lower()
# Sentence -> tokens (split on whitespace)
words = lower_case.split()
print(len(words))
words[:8]

437


['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the']

In [5]:
import nltk
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [6]:
# Evaluate stopwords.words('english') once and keep it in a set, instead of
# calling it inside the comprehension for every token and searching a list.
english_stopwords = set(stopwords.words('english'))
words = [w for w in words if w not in english_stopwords]
print(len(words))
words[:8]

219


['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching']

### 4. 어간 추출

In [7]:
# Porter stemmer (conservative) usage example
p_stemmer = nltk.stem.PorterStemmer()
print(p_stemmer.stem('maximum'))
print('The stemmed form of running is: {}'.format(p_stemmer.stem('running')))
print('The stemmed form of runs is: {}'.format(p_stemmer.stem('runs')))
print('The stemmed form of run is: {}'.format(p_stemmer.stem('run')))

maximum
The stemmed form of running is: run
The stemmed form of runs is: run
The stemmed form of run is: run


In [8]:
# Lancaster stemmer (aggressive) usage example
l_stemmer = nltk.stem.LancasterStemmer()
print(l_stemmer.stem('maximum'))
print('The stemmed form of running is: {}'.format(l_stemmer.stem('running')))
print('The stemmed form of runs is: {}'.format(l_stemmer.stem('runs')))
print('The stemmed form of run is: {}'.format(l_stemmer.stem('run')))

maxim
The stemmed form of running is: run
The stemmed form of runs is: run
The stemmed form of run is: run


In [9]:
# 처리 전 단어
words[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

In [10]:
# Use the Snowball stemmer; `stemmer` is reused later by review2words
stemmer = nltk.stem.snowball.SnowballStemmer('english')
words = [stemmer.stem(w) for w in words]
# Words after stemming
words[:10]

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentari']

#### 표제어 추출(Lemmatization)
굴절된 형태의 단어들을 하나의 표제어(lemma, 사전에 실리는 기본형)로 묶어, 단일 항목으로 분석할 수 있도록 하는 과정

In [11]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
print(wnl.lemmatize('fly'))
print(wnl.lemmatize('flies'))

fly
fly


In [12]:
# Lemmatize every token currently in `words`
words = [wnl.lemmatize(w) for w in words]
# Words after lemmatization
words[:10]

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentari']

### 문자열 처리

In [13]:
# 앞의 과정을 함수로 제작
def review2words(raw_review):
    """Clean one raw review and return it as a space-joined string of tokens.

    Steps, in order: strip HTML markup, replace every non-alphabetic
    character with a space, lowercase and split on whitespace, drop English
    stopwords, stem each token, then lemmatize each stem.

    Relies on the notebook-level ``stemmer`` and ``wnl`` objects.

    Args:
        raw_review: Review text, possibly containing HTML tags.

    Returns:
        The processed tokens joined by single spaces.
    """
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    words = letters_only.lower().split()
    # Evaluate stopwords.words('english') once per review rather than once
    # per token, and use a set so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))
    meaningful_words = [w for w in words if w not in stop_words]
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    lemmatized_words = [wnl.lemmatize(w) for w in stemming_words]
    return ' '.join(lemmatized_words)

In [14]:
clean_review = review2words(train['review'][0])
clean_review

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obvious messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay

### 모든 데이터 처리

In [15]:
num_reviews = len(train)
num_reviews

25000

In [16]:
# Collects the cleaned text of each review, in processing order.
clean_train_reviews = []
import time
# Print the current minute:second before the loop so the elapsed time can be
# read off by comparing it with the second timestamp printed below.
now = time.localtime()
print("%02d:%02d" % (now.tm_min, now.tm_sec))

# Only the first 1000 reviews are cleaned in this cell.
for i in range(0, 1000):
    clean_train_reviews.append(review2words(train['review'][i]))
    
# Print the minute:second again after the loop has finished.
now = time.localtime()
print("%02d:%02d" % (now.tm_min, now.tm_sec))

 9:50
11:41


In [None]:
# Clean the remaining reviews, printing a progress counter every 100 reviews.
for i in range(1000, num_reviews):
    if (i + 1) % 100 == 0:
        # Integer division keeps the counter an int (11, 12, ...) rather than
        # the float (11.0, 12.0, ...) that true division produces.
        print((i + 1) // 100, end=' ')
    clean_train_reviews.append(review2words(train['review'][i]))

In [52]:
# 출처: https://gist.github.com/yong27/7869662
# 과정설명: http://www.racketracer.com/2016/07/06/pandas-in-parallel
from multiprocessing import Pool
import numpy as np

def _apply_df(args):
    """Apply a function to one chunk of data.

    ``args`` is a 3-item sequence ``(chunk, function, keyword_options)``
    bundled into a single object so that it can be passed through
    ``Pool.map``. Returns ``chunk.apply(function, **keyword_options)``.
    """
    chunk, fn, options = args
    applied = chunk.apply(fn, **options)
    return applied

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel across worker processes.

    The data is split into ``workers`` pieces with ``np.array_split``; each
    piece is processed in a pool process via ``_apply_df`` and the partial
    results are concatenated with ``pd.concat``.

    Args:
        df: pandas object to split and process.
        func: Callable forwarded to ``apply`` on each piece.
        **kwargs: Must contain ``workers`` (used both as the number of pool
            processes and as the number of pieces); every other keyword is
            forwarded to ``apply``.

    Returns:
        The result of ``pd.concat`` over the per-piece results.

    Raises:
        KeyError: If ``workers`` is not supplied.
    """
    # Pull the pool size out of kwargs; whatever remains goes to ``apply``.
    workers = kwargs.pop('workers')
    # Split the data into one piece per worker.
    chunks = np.array_split(df, workers)
    # The context manager shuts the pool down even when a worker raises,
    # so no worker processes are left behind.
    with Pool(processes=workers) as pool:
        result = pool.map(_apply_df, [(chunk, func, kwargs) for chunk in chunks])
    # Combine the per-piece results and return them.
    return pd.concat(result)

In [None]:
%time clean_train_reviews = apply_by_multiprocessing(\
            train['review'], review2words, workers=4)

### Word Cloud

In [17]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

def displayWordCloud(data=None, bgc='white', width=800, height=600):
    """Generate a word cloud from ``data`` and show it with matplotlib.

    Args:
        data: Text passed to ``WordCloud.generate``.
        bgc: Value passed to ``WordCloud`` as ``background_color``.
        width: Value passed to ``WordCloud`` as ``width``.
        height: Value passed to ``WordCloud`` as ``height``.
    """
    cloud_options = {
        'stopwords': STOPWORDS,
        'background_color': bgc,
        'width': width,
        'height': height,
    }
    cloud = WordCloud(**cloud_options).generate(data)
    plt.figure(figsize=(15, 10))
    plt.imshow(cloud)
    plt.axis('off')  # hide the axes around the image
    plt.show()

In [None]:
%time displayWordCloud(' '.join(clean_train_reviews))

In [18]:
# Split the first cleaned review into whitespace-separated tokens
x = clean_train_reviews[0]
x = str(x).split()
# Total number of tokens
print(len(x))
# NOTE(review): bare expression in the middle of the cell; it is evaluated
# but not displayed (only the two print calls produce output).
x[:10]
# Number of distinct tokens
print(len(set(x)))

219
156


In [None]:
# clean_train_reviews is a plain list when it was built by the append loops
# and a pandas object when it came from apply_by_multiprocessing; wrapping it
# in pd.Series makes .apply available in both cases.
cleaned_reviews = pd.Series(clean_train_reviews)
# Number of words per review
train['num_words'] = cleaned_reviews.apply(lambda x: len(str(x).split()))
# Number of distinct words per review
train['num_unique_words'] = cleaned_reviews.apply(lambda x: len(set(str(x).split())))

#### 데이터 시각화

In [None]:
import seaborn as sns

# NOTE(review): distplot is deprecated in recent seaborn releases (histplot is
# the suggested replacement) — confirm against the installed version.
fig, axes = plt.subplots(ncols=2)
fig.set_size_inches(18, 6)

# Left panel: distribution of the number of words per review
print('리뷰별 단어 평균 값 :', train['num_words'].mean())
print('리뷰별 단어 중간 값 :', train['num_words'].median())
# The keyword is `bins`; `bin` is not a distplot parameter.
sns.distplot(train['num_words'], bins=100, ax=axes[0])
axes[0].axvline(train['num_words'].median(), linestyle='dashed')
axes[0].set_title('리뷰별 단어 수 분포')

# Right panel: distribution of the number of distinct words per review
print('리뷰별 고유단어 평균 값 :', train['num_unique_words'].mean())
print('리뷰별 고유단어 중간 값 :', train['num_unique_words'].median())
sns.distplot(train['num_unique_words'], bins=100, color='g', ax=axes[1])
# The median line and title go on the second axes; drawing them on axes[0]
# would overwrite the left panel's title and leave this panel unlabeled.
axes[1].axvline(train['num_unique_words'].median(), linestyle='dashed')
axes[1].set_title('리뷰별 고유단어 수 분포')