In [69]:
import urllib.request
import numpy as np
import re


In [70]:
def preprocess_text(text):
   
    # input  text ='이 SF영화 재미^^;; 없는데.....왜 7점이지???'
    # output text ='$이_SF영화_재미_없는데_왜_7점이지$'
    
    non_alpha_numeric_hangul = re.compile('[^0-9a-zA-Z\u3131-\u3163\uac00-\ud7a3]')
    SS = '$'  # Sentence Separator
    
    t = non_alpha_numeric_hangul.sub('_', text)
    text = re.sub(r'_+', '_', t)
    return SS+text+SS

In [71]:
def prepare_data_file(FILE_PATH):
    
    # file load from URL
    with urllib.request.urlopen(FILE_PATH) as f:
        lines = f.read().decode('utf-8').split('\n')

    data, target = [], []
    for l in lines[1:]:   # 첫번째 줄은 skip
        try:
            _, text, label = l.strip().split('\t')   # ID\treview\tlabel 
        except ValueError:
            pass
        text = text.strip()
        if text == '': continue
        data.append(preprocess_text(text))
        target.append(int(label))
            
    return data, target

In [72]:
def extract_features(data, MAX_FEATURES):
    
    # 학습데이터 data로부터 bigram feature 추출
    # MAX_FEATURES 만큼만 feature로 사용
    # 발생 빈도로 정렬
    
    FEATURES = dict()
    features_dict = dict()
    
    for line in data:
        uni_list = list(line)
        bi_list = [''.join(uni_list[z:z+2]) for z in range(0, len(uni_list)-1)]
        for bigram in bi_list:
            if bigram in FEATURES:
                FEATURES[bigram] += 1
            else:
                FEATURES[bigram] = 1
    
    features_list = [(x, f) for (x, f) in FEATURES.items()]
    features_list.sort(reverse=True, key=lambda z:z[1])
    
    for (idx, (x, y)) in enumerate(features_list[:MAX_FEATURES]):
        features_dict[x] = idx

    return features_dict

In [73]:
def make_feature_vector(feature_set, data, target):

    fv_base = [0 for _ in range(0, len(feature_set))]
    feature_list = []
    for (x, label) in zip(data, target):
        uni_list = list(x)
        fv = fv_base[:]
        bi_list = [''.join(uni_list[z:z+2]) for z in range(0, len(uni_list)-1)]
        for bigram in bi_list: 
            if bigram in feature_set:
                fv[feature_set[bigram]] = 1
                
        feature_list.append(fv + [label])
    feature_list = np.array(feature_list)
    np.random.shuffle(feature_list)
    
    return feature_list[:, :-1], feature_list[:, -1]

In [74]:
# NAVER MOVIE REVIEW corpus로부터 데이터 loading
TRAIN_FILE = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt'
TEST_FILE = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt'

print('prepare_data_file START...')
train_data, train_target = prepare_data_file(TRAIN_FILE)
test_data, test_target = prepare_data_file(TEST_FILE)
print('prepare_data_file END...')


prepare_data_file START...
prepare_data_file END...


In [75]:
# Bigram Feature 추출
print('extract_features START...')
MAX_FEATURES = 500  # 사용할 feature 개수
feature_set = extract_features(train_data, MAX_FEATURES)
with open('features.out', 'w', encoding='utf8') as fo:
    fo.write('\n'.join([x+'\t'+str(idx) for x, idx in feature_set.items()]))
    
print('extract_features END...')

extract_features START...
extract_features END...


In [76]:
# 입력 파일을 feature vector로 변환
print('make_feature_vector START...')
x_train, y_train = make_feature_vector(feature_set, train_data, train_target)
x_test,  y_test  = make_feature_vector(feature_set, test_data, test_target)
print('make_feature_vector END...')

make_feature_vector START...
make_feature_vector END...


In [77]:
# Machine Learning Tool을 사용하여 학습

#from sklearn.naive_bayes import MultinomialNB

#print('train START...')
#model = MultinomialNB()
#model.fit(x_train, y_train)
#print('train END...')


In [78]:
# Model의 정확도 평가
#print('eval START...')
#print("훈련 세트 정확도: {:.3f}".format(model.score(x_train, y_train)))  # Eval
#print("테스트 세트 정확도: {:.3f}".format(model.score(x_test, y_test)))  # Eval
#print('eval END...')

In [79]:
# 수행해 보기
#text = '최고의 명작... 왜 이제서야 이 영화를 봤을꼬??'   
#text = preprocess_text(text)
#x_test, _ = make_feature_vector(feature_set, [text], [None])
#result = model.predict(x_test)
#print(text, '==> ', ['Negative', 'Positive'][result[0]])

In [80]:
from sklearn.tree import DecisionTreeClassifier

print('train START...')
tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)
print('train END...')

train START...
train END...


In [81]:
print('eval START...')
print("훈련 세트 정확도: {:.3f}".format(tree.score(x_train, y_train)))
print("테스트 세트 정확도: {:.3f}".format(tree.score(x_test, y_test)))
print('eval END...')

eval START...
훈련 세트 정확도: 0.979
테스트 세트 정확도: 0.694
eval END...


In [82]:
text = '최고의 명작... 왜 이제서야 이 영화를 봤을꼬??'   
text = preprocess_text(text)
x_test, _ = make_feature_vector(feature_set, [text], [None])
result = tree.predict(x_test)
print(text, '==> ', ['Negative', 'Positive'][result[0]])

$최고의_명작_왜_이제서야_이_영화를_봤을꼬_$ ==>  Positive
