In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd

from tensorflow import keras
from keras.datasets import imdb

Using TensorFlow backend.


In [2]:
train = pd.read_csv('./labeledTrainData.tsv', 'r', delimiter='\t', encoding= 'utf-8')
test = pd.read_csv('./testData.tsv', 'r', delimiter='\t', encoding= 'utf-8')


In [3]:
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [4]:
train['review'][0][:100]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching t"

In [5]:
train.isnull().sum()

id           0
sentiment    0
review       0
dtype: int64

In [6]:
from bs4 import BeautifulSoup

example1 = BeautifulSoup(train['review'][0])
print(train['review'][0][:700])
example1.get_text()[:700]

With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like


"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anywa"

In [7]:
# 정규표현식을 사용하게 하는 모듈 설치
import re

# ^ : 시작을 의미, 알파벳 소대문자로 시작하는 문자만 추출
letters_only = re.sub('[^a-zA-z]',' ',example1.get_text())
letters_only[:100]

'With all this stuff going down at the moment with MJ i ve started listening to his music  watching t'

In [8]:
# 모두 소문자로 변환한다.
lower_case = letters_only.lower()

# 문자를 나눈다. => 토큰화
words = lower_case.split()
print(len(words))
words[:10]

437


['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with']

In [9]:
import nltk

from nltk.stem import PorterStemmer

nltk.download('stopwords')

from nltk.corpus import stopwords
stopwords.words('english')[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [10]:
# 반복문을 이용하여 stopwords를 제거한 토큰들
words = [w for w in words if not w in stopwords.words('english')]

In [11]:
def review_to_words(raw_review):
    stemmer = PorterStemmer()
    # 1. HTML 제거
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. 영문자가 아닌 문자는 공백으로 변환
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. 소문자 변환
    words = letters_only.lower().split()
    # 4. Stopwords를 세트로 변환
    # 파이썬에서는 리스트보다 세트로 찾는게 훨씬 빠르다.
    stops = set(stopwords.words('english'))
    # 5. Stopwords 제거
    meaningful_words = [w for w in words if not w in stops]
    # 6. 어간추출
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    # 7. 공백으로 구분된 문자열로 결합하여 결과를 반환
    return(' '.join(stemming_words))


In [12]:
# 예시로 첫 리뷰만 적용

clean_review = review_to_words(train['review'][0])
clean_review

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obviou messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay 

In [13]:
# 5000개 단위로 상태를 찍도록 개선, test data set도 동일하게 적용
clean_train_reviews = []
for i in range(0,len(train['review'])) :
    if (i+1) % 5000 == 0:
        print('Review {} of {}'.format(i+1, train['review']))
    clean_train_reviews.append(review_to_words(train['review'][i]))
    
    
clean_test_reviews = []
for i in range(0,len(test['review'])) :
    if (i+1) % 5000 == 0:
        print('Review {} of {}'.format(i+1, test['review']))
    clean_test_reviews.append(review_to_words(test['review'][i]))    
    

Review 5000 of 0        With all this stuff going down at the moment w...
1        \The Classic War of the Worlds\" by Timothy Hi...
2        The film starts with a manager (Nicholas Bell)...
3        It must be assumed that those who praised this...
4        Superbly trashy and wondrously unpretentious 8...
                               ...                        
24995    It seems like more consideration has gone into...
24996    I don't believe they made this film. Completel...
24997    Guy is a loser. Can't get girls, needs to buil...
24998    This 30 minute documentary Buñuel made in the ...
24999    I saw this movie as a child and it broke my he...
Name: review, Length: 25000, dtype: object
Review 10000 of 0        With all this stuff going down at the moment w...
1        \The Classic War of the Worlds\" by Timothy Hi...
2        The film starts with a manager (Nicholas Bell)...
3        It must be assumed that those who praised this...
4        Superbly trashy and wondrously u

In [14]:
# 리뷰의 토큰을 피쳐로 변환
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# 튜토리얼과 다르게 파라미터 값을 수정
vectorizer = CountVectorizer(analyzer = 'word', 
                             tokenizer = None,
                             preprocessor = None, 
                             stop_words = None, 
                             min_df = 2, # 토큰이 나타날 최소 문서 개수
                             ngram_range=(1, 3),
                             max_features = 20000)

# 속도 개선을 위해 파이프라인을 사용하도록 개선
pipeline = Pipeline([('vect', vectorizer),])

# 벡터화
train_data_features = pipeline.fit_transform(clean_train_reviews)
train_data_features



<25000x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 2757814 stored elements in Compressed Sparse Row format>

In [16]:
train_data_features.shape

(25000, 20000)

In [17]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

20000


['aag',
 'aaron',
 'ab',
 'abandon',
 'abbey',
 'abbi',
 'abbot',
 'abbott',
 'abc',
 'abduct']

In [18]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤포레스트 분류기를 사용
forest = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state=2018)
forest = forest.fit(train_data_features, train['sentiment'])

In [19]:
from sklearn.model_selection import cross_val_score
score = np.mean(cross_val_score(forest,train_data_features,train['sentiment'], cv=10, scoring='roc_auc'))

In [20]:
score

0.9269520960000002

In [21]:
# test 데이터를 벡터화 함
test_data_features = pipeline.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# 벡터화한 test 데이터를 넣고 예측한다.
result = forest.predict(test_data_features)

In [23]:
output = pd.DataFrame(data = {'id':test['id'], 'sentent':result})
output.head()

Unnamed: 0,id,sentent
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [24]:
result

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [25]:
good = 0;
bad  = 0
for r in result:
    if r == 1:
        good += 1
    else :
        bad += 1

In [31]:
len(result)

25000

In [35]:
good = good / len(result)

In [36]:
bad = bad / len(result)

In [51]:
print('긍정적 리뷰 비율 :',good ,end = ' ')
print('부정적 리뷰 비율 :',bad ,end = ' ')

긍정적 리뷰 비율 : 0.49992 부정적 리뷰 비율 : 0.50008 