In [8]:
import pandas as pd
import os
import numpy as np

import re
from time import time
# from ml_module.progbar import ProgBar

In [4]:
class ProgBar():
    def __init__(self, step = 100):
        self.step = int(step/20)
        self.count = 1
        self.progress = 0

    def update(self):
        if self.count % self.step == 0 :
            self.progress += 1
            print('\r[%s%s]' % ('#' * self.progress, ' '*(20-self.progress)), end = '')
        self.count += 1

## Preprocessing
### Labeling

In [7]:
path = './aclImdb/'

pbar = ProgBar(50000)
labels = {'pos': 1, 'neg': 0}
df = pd.DataFrame()

for s in ('test', 'train'):
    for name in ('pos','neg'):
        subpath = '%s/%s' % (s, name)
        dirpath = path + subpath
        for file in os.listdir(dirpath):
            with open(os.path.join(dirpath, file), 'r', encoding='UTF8') as f:
                txt = f.read()
            df = df.append([[txt, labels[name]]], ignore_index=True)
            pbar.update()

df.columns = ['review', 'sentiment']

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./input/movie_review.csv', index=False)

import pandas as pd
df = pd.DataFrame()
df = pd.read_csv('./input/movie_review.csv')
print(df.head())
print(df.tail())

[####################]                                              review  sentiment
0  Based on an actual story, John Boorman shows t...          1
1  This is a gem. As a Film Four production - the...          1
2  I really like this show. It has drama, romance...          1
3  This is the best 3-D experience Disney has at ...          1
4  Of the Korean movies I've seen, only three had...          1
                                                  review  sentiment
49995  My comments may be a bit of a spoiler, for wha...          0
49996  The "saucy" misadventures of four au pairs who...          0
49997  Oh, those Italians! Assuming that movies about...          0
49998  Eight academy nominations? It's beyond belief....          0
49999  Not that I dislike childrens movies, but this ...          0


In [9]:
def preprocessor(text) :
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)|\^.?\^', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

df = pd.read_csv('./input/movie_review.csv')

stime = time()
print('전처리 시작')
df['review'] = df['review'].apply(preprocessor)
print('전처리 완료: 소요시간 [%d] 초' % (time() - stime))

df.to_csv('./input/refined_movie_review.csv', index=False)

전처리 시작
전처리 완료: 소요시간 [5] 초


## Preprocessing
### Removing Special Characters

#### toeknizer

In [11]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

porter = PorterStemmer()
stop = stopwords.words('english')

def sgd_tokenizer(text) :
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)|\^.?\^', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeongwhanchoi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
porter = PorterStemmer()
stop = stopwords.words('english')

def tokenizer(text) :
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]


## Logistic Regression

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle

# from chap8.mylib.tokenizer import tokenizer, tokenizer_porter

df = pd.read_csv('./input/refined_movie_review.csv')
x_train = df.loc[:35000, 'review'].values
y_train = df.loc[:35000, 'sentiment'].values
x_test = df.loc[35000:, 'review'].values
y_test = df.loc[35000:, 'sentiment'].values

tfidf = TfidfVectorizer()
lr = LogisticRegression(C=10.0, penalty = 'l2', random_state=0)

lr_tfidf = Pipeline([('vect', tfidf), ('clf', lr)])

stime = time()
print('머신러닝 시작')
lr_tfidf.fit(x_train, y_train)
print('머신러닝 종료')

y_pred = lr_tfidf.predict(x_test)
print('테스트 종료: 소요시간 [%d]초' % (time() - stime))
print('정확도: %.3f' % accuracy_score(y_test, y_pred))

curDir = os.getcwd()
dest = os.path.join(curDir, 'classifier')
if not os.path.exists(dest) :
    os.makedirs(dest)

pickle.dump(lr_tfidf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
print('머신러닝 데이터 저장 완료')



머신러닝 시작




머신러닝 종료
테스트 종료: 소요시간 [9]초
정확도: 0.832
머신러닝 데이터 저장 완료


## Interaction

In [18]:
df = pd.read_csv('./input/refined_movie_review.csv')

x_train = df.loc[:35000, 'review'].values
y_train = df.loc[:35000, 'sentiment'].values
x_test = df.loc[35000:, 'review'].values
y_test = df.loc[35000:, 'sentiment'].values

curDir = os.getcwd()
clf = pickle.load(open (os.path.join(curDir, 'classifier', 'classifier.pkl'), 'rb'))
# clf = pickle.load(open (os.path.join(curDir, 'classifier', 'best_classifier.pkl'), 'rb'))
y_pred = clf.predict(x_test)
print('테스트 정확도: %.3f' % accuracy_score(y_test, y_pred))
label = {0: '부정적 의견', 1: '긍정적 의견'}
while True:
    txt = input('영문으로 리뷰를 작성하세요: ')
    if txt == '':
        break
    example = [txt]
    print('예측: %s\n확율: %.3f%%' % (label[clf.predict(example)[0]], np.max(clf.predict_proba(example))*100))

테스트 정확도: 0.832
영문으로 리뷰를 작성하세요: I wasted my time
예측: 부정적 의견
확율: 89.999%
영문으로 리뷰를 작성하세요: 


## Optimal Parameter with Grid

In [20]:
porter = PorterStemmer()
stop = stopwords.words('english')

def sgd_tokenizer(text) :
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)|\^.?\^', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

import datetime

if __name__ == "__main__" :
    stop = stopwords.words('english')
    df = pd.read_csv('./input/refined_movie_review.csv')
    x_train = df.loc[:35000, 'review'].values
    y_train = df.loc[:35000, 'sentiment'].values
    x_test = df.loc[35000:, 'review'].values
    y_test = df.loc[35000:, 'sentiment'].values

    tfidf = TfidfVectorizer(lowercase=False)

    param_grid = [{'vect__ngram_range':[(1,1)], 'vect__stop_words':[stop, None],
                   'vect__tokenizer' : [tokenizer, tokenizer_porter],
                   'clf__penalty' : ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]},
                   {'vect__ngram_range':[(1,1)], 'vect__stop_words':[stop, None],
                   'vect__tokenizer' : [tokenizer, tokenizer_porter],
                   'vect__use_idf' : [False],'vect__norm' : [None],
                   'clf__penalty' : ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}]

    lr = LogisticRegression(random_state=0)

    lr_tfidf = Pipeline([('vect', tfidf), ('clf', lr)])
    gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)


    print('머신러닝 최적 파라미터 계산 시작: %s' % (datetime.datetime.now()))
    gs_lr_tfidf.fit(x_train, y_train)
    print('머신러닝 최적 파라미터 계산 종료: %s' % (datetime.datetime.now()))
    print(gs_lr_tfidf.best_params_)

    clf=gs_lr_tfidf.best_estimator_
    print('테스트 정확도: %.3f' % clf.score(x_test, y_test))

    stime = time()
    y_pred = lr_tfidf.predict(x_test)
    print('테스트 종료: 소요시간 [%d]초' % (time() - stime))
    print('정확도: %.3f' % accuracy_score(y_test, y_pred))

    curDir = os.getcwd()
    dest = os.path.join(curDir, 'classifier')
    if not os.path.exists(dest) :
        os.makedirs(dest)

    pickle.dump(clf, open(os.path.join(dest, 'best_classifier.pkl'), 'wb'), protocol=4)
    print('머신러닝 데이터 저장 완료')

머신러닝 최적 파라미터 계산 시작: 2018-11-16 16:35:55.533640
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
exception calling callback for <Future at 0x1a2c24b160 state=finished raised BrokenProcessPool>
sklearn.externals.joblib.externals.loky.process_executor._RemoteTraceback: 
'''
Traceback (most recent call last):
  File "/Users/jeongwhanchoi/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py", line 393, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/Users/jeongwhanchoi/anaconda3/lib/python3.6/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'tokenizer' on <module 'sklearn.externals.joblib.externals.loky.backend.popen_loky_posix' from '/Users/jeongwhanchoi/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/backend/popen_loky_posix.py'>
'''

The above exception was the direct cause of the following exception:

Traceback (most recent 

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.