In [5]:
import os
import random
import re
import pickle
import numpy as np

from hangul_utils import split_syllables
from tqdm import tqdm
from keras.preprocessing import sequence
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from konlpy.tag import Mecab

### _Load data_

In [6]:
# Directory that holds the raw review text files.
PWD = './../data/rawData/'
FILES_neg = []
FILES_pos = []

# Sort every file name found under PWD into one of two buckets:
# names containing 'pos' are positive, everything else is negative.
# Only the bare file name is stored (not the directory it was found in).
for _root, _subdirs, _names in os.walk(PWD):
    for _name in _names:
        _bucket = FILES_pos if 'pos' in _name else FILES_neg
        _bucket.append(_name)
            
def fileTolist(file_name):

    '''

    read every file listed in file_name (names are relative to PWD)
    and return all of their lines as one flat list of movie reviews

    file_name : sequence of file names, opened as PWD + name (utf-8)
    return    : list of str, one entry per line, in file order

    only the exact suffix "two spaces + newline" is removed from a line;
    a line that ends differently keeps its trailing newline

    '''

    allReviews = []
    for name in file_name:
        with open(PWD + name, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        allReviews.extend(line.replace('  \n', '') for line in lines)

    return allReviews

### _split function_

In [7]:
def textToPhon(text):
    '''

    decompose each review into phonemes with split_syllables
    and strip every character outside the two Hangul ranges below

    text   : iterable of str
    return : list of str, one per input item (may be an empty string
             when nothing is left after stripping)

    '''

    # matches runs of anything that is NOT in U+3131-U+3163 or U+AC00-U+D7A3
    non_hangul = re.compile('[^\u3131-\u3163\uac00-\ud7a3]+')

    # empty results are kept on purpose so the output stays aligned with the input
    return [non_hangul.sub('', split_syllables(sentence)) for sentence in text]

def textToMorp(text):

    '''

    split each review into morphemes using Mecab().morphs

    text   : sequence of str
    return : list with one morphs() result per review, in input order

    '''

    # a single tagger instance is shared by all reviews
    tagger = Mecab()
    return [tagger.morphs(sentence) for sentence in text]

def textToWord(text):

    '''

    split each review into whitespace-separated words

    text   : sequence of str
    return : list of word lists, one per review (an empty or
             whitespace-only review gives an empty list)

    '''

    return [sentence.split() for sentence in text]

### _Make dictionary_

In [8]:
def makeDict_phon(text, path):

    '''

    build the phoneme dictionary for the movie reviews

    text   : sequence of phoneme strings (each item is iterated
             element by element)
    path   : directory prefix; the label -> phoneme dictionary is
             pickled to path + 'dictionary_phon.pkl'
    return : (unqPhon, phon_label)
             unqPhon    - unique phonemes in first-seen order
             phon_label - phoneme -> integer label, starting at 1

    '''

    print('make phon-dictionary...')

    # collect unique phonemes, keeping the order in which they first appear
    unqPhon = []
    for idx in tqdm(range(len(text))):
        for phon in text[idx]:
            if phon in unqPhon:
                continue
            unqPhon.append(phon)

    print('# of unique Phoneme : {}\nexample : {}'.format(len(unqPhon), unqPhon))

    # labels start at 1; 0 is never assigned to a phoneme
    phon_label = {}
    label_phon = {}
    for label, ch in enumerate(unqPhon, start=1):
        phon_label[ch] = label
        label_phon[label] = ch

    #save dictionary
    with open(path+'dictionary_phon.pkl', 'wb') as p:
        pickle.dump(label_phon, p)

    return unqPhon, phon_label


def makeDict_morp(text, path, max_vocab=10000):

    '''

    build the morpheme dictionary for the movie reviews

    text      : sequence of morpheme lists (one list per review)
    path      : directory prefix; the label -> morpheme dictionary is
                pickled to path + 'dictionary_morp.pkl'
    max_vocab : keep only this many most frequent morphemes
                (None keeps all of them)
    return    : (unqMorp, morp_label)
                unqMorp    - unique raw morphemes in first-seen order
                morp_label - cleaned morpheme -> integer label, starting
                             at 1 for the most frequent one

    morphemes are cleaned (characters in [-=.#/?:^~!$}0-9] removed) before
    counting, so a raw morpheme that contains one of those characters is
    not itself a key of morp_label

    '''

    print('make morp-dictionary...')

    # dict keys keep first-seen order and give O(1) membership; the previous
    # "x not in list" test made this pass quadratic in the vocabulary size
    seen = {}
    for i in tqdm(range(len(text))):
        for morp in text[i]:
            seen.setdefault(morp, None)
    unqMorp = list(seen)

    # random.sample raises when asked for more items than exist
    print('# of unique Morpheme : {}\nexample : {}'.format(
        len(unqMorp), random.sample(unqMorp, min(10, len(unqMorp)))))

    # compiled once instead of once per token
    strip_chars = re.compile('[-=.#/?:^~!$}0-9]')

    morps_count = {}
    for i in tqdm(range(len(text))):
        for morp in text[i]:
            cleaned = strip_chars.sub('', morp)
            if len(cleaned) != 0:
                morps_count[cleaned] = morps_count.get(cleaned, 0) + 1

    # stable sort: equally frequent morphemes stay in first-seen order
    sorted_morps = sorted(morps_count.items(),
                          key=lambda morp_count: -morp_count[1])[:max_vocab]
    print(sorted_morps)

    label_morp = {i+1 : ch[0] for i, ch in enumerate(sorted_morps)}
    morp_label = {y:x for x,y in label_morp.items()}

    #save dictionary
    with open(path+'dictionary_morp.pkl', 'wb') as p:
        pickle.dump(label_morp, p)

    return unqMorp, morp_label


def makeDict_word(text, path, max_vocab=40000):

    '''

    build the word dictionary for the movie reviews

    text      : sequence of word lists (one list per review)
    path      : directory prefix; the label -> word dictionary is
                pickled to path + 'dictionary_word.pkl'
    max_vocab : keep only this many most frequent words
                (None keeps all of them)
    return    : (unqWord, word_label)
                unqWord    - unique raw words in first-seen order
                word_label - cleaned word -> integer label, starting
                             at 1 for the most frequent one

    words are cleaned (characters in [-=.#/?:^~!$}0-9] removed) before
    counting, so a raw word that contains one of those characters is
    not itself a key of word_label

    '''

    print('make word-dictionary...')

    # dict keys keep first-seen order and give O(1) membership; the previous
    # "x not in list" test made this pass quadratic in the vocabulary size
    seen = {}
    for i in tqdm(range(len(text))):
        for word in text[i]:
            seen.setdefault(word, None)
    unqWord = list(seen)

    # random.sample raises when asked for more items than exist
    print('# of unique Word : {}\nexample : {}'.format(
        len(unqWord), random.sample(unqWord, min(10, len(unqWord)))))

    # compiled once instead of once per token
    strip_chars = re.compile('[-=.#/?:^~!$}0-9]')

    words_count = {}
    for i in tqdm(range(len(text))):
        for word in text[i]:
            cleaned = strip_chars.sub('', word)
            if len(cleaned) != 0:
                words_count[cleaned] = words_count.get(cleaned, 0) + 1

    # stable sort: equally frequent words stay in first-seen order
    sorted_words = sorted(words_count.items(),
                          key=lambda word_count: -word_count[1])[:max_vocab]
    print(sorted_words)

    label_word = {i+1 : ch[0] for i, ch in enumerate(sorted_words)}
    word_label = {y:x for x,y in label_word.items()}

    #save dictionary
    with open(path+'dictionary_word.pkl', 'wb') as p:
        pickle.dump(label_word, p)

    return unqWord, word_label

### _Make array_

###### phoneme

In [9]:
def phonToArray(neg, pos, phon_label, np_path):

    '''
    NO ONE_HOT

    encode every review as a list of phoneme labels and build the
    matching 2-column label array

    neg, pos   : sequences of phoneme strings (negative / positive reviews)
    phon_label : phoneme -> integer label; phonemes missing from it are skipped
    np_path    : directory prefix; X is saved to np_path + 'X_phon.npz'
                 and y to np_path + 'y_phon.npz'
    return     : (X, y)
                 X - 1-D object array, one list of labels per review
                 y - int array of shape (n, 2); [1,0] = negative, [0,1] = positive

    '''

    def _encode(sentences):
        # reviews differ in length, so a rectangular array is impossible;
        # fill an object array slot by slot instead of np.asarray on ragged
        # lists (which raises on recent NumPy versions)
        encoded = np.empty(len(sentences), dtype=object)
        for i, sent in enumerate(sentences):
            encoded[i] = [phon_label[w] for w in sent if w in phon_label]
        return encoded

    #make array
    negPhonArray = _encode(neg)
    posPhonArray = _encode(pos)

    #make X, y
    X = np.concatenate((negPhonArray, posPhonArray), axis = 0)

    #2-dimension y
    y_neg = [[1,0] for _ in range(len(negPhonArray))]
    y_pos = [[0,1] for _ in range(len(posPhonArray))]
    y = np.asarray(y_neg+y_pos, dtype=int).reshape(-1, 2)

    #shuffle and save
    # one seeded permutation applied to both arrays keeps X and y aligned
    # without touching the global NumPy random state
    order = np.random.RandomState(618).permutation(len(X))
    X = X[order]
    y = y[order]

    np.savez(np_path+'X_phon.npz', X)
    np.savez(np_path+'y_phon.npz', y)

    return X, y



def phonToArray_oneHot(neg, pos, phon_label, np_path='./data/phon_npz/', num_classes=52):

    '''

    ONE_HOT & PADDING

    encode every review as phoneme labels, pad all reviews to the longest
    one, one-hot encode the labels and split into train / test sets

    neg, pos    : sequences of phoneme strings (negative / positive reviews)
    phon_label  : phoneme -> integer label; phonemes missing from it are
                  skipped; labels must be in 1 .. num_classes-1
    np_path     : directory prefix for the four saved .npz files
    num_classes : width of the one-hot vectors (label 0 is the padding value)
    return      : X_train, X_test, y_train, y_test
                  X_* - float array (n, maxlen, num_classes); padding
                        positions are all-zero vectors
                  y_* - int array (n, 2); [1,0] = negative, [0,1] = positive

    raises ValueError when neg and pos are both empty

    '''

    #make array
    neg_ids = [[phon_label[w] for w in sent if w in phon_label] for sent in neg]
    pos_ids = [[phon_label[w] for w in sent if w in phon_label] for sent in pos]
    all_ids = neg_ids + pos_ids
    if len(all_ids) == 0:
        raise ValueError('neg and pos are both empty; nothing to encode')

    #confirm max length for X
    maxlen = max(len(ids) for ids in all_ids)
    X = sequence.pad_sequences(all_ids, maxlen=maxlen)

    # labels follow the real input sizes (previously hard-coded to 45000 each)
    y_neg = [[1,0] for _ in range(len(neg_ids))]
    y_pos = [[0,1] for _ in range(len(pos_ids))]
    y = np.asarray(y_neg+y_pos)

    # shuffle the small index arrays before expanding to one-hot;
    # one permutation for both keeps X and y aligned
    order = np.random.RandomState(618).permutation(len(X))
    X = X[order]
    y = y[order]

    print('set one-hot vector for phoneme...')
    # row k of the identity matrix is the one-hot vector of label k
    newX = np.eye(num_classes)[X]

    print('change blank to 0...')
    # column 0 is only ever set by the padding value, so clear it entirely
    newX[..., 0] = 0

    X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.3, random_state=618)

    np.savez(np_path+'Xtrain_phon_oneHot.npz', X_train)
    np.savez(np_path+'Xtest_phon_oneHot.npz', X_test)
    np.savez(np_path+'ytrain_phon_oneHot.npz', y_train)
    np.savez(np_path+'ytest_phon_oneHot.npz', y_test)

    return X_train, X_test, y_train, y_test


###### morpheme

In [10]:
def morpToArray(neg, pos, morp_label, np_path):

    '''

    encode every review as a list of morpheme labels (no padding) and
    build the matching 2-column label array

    neg, pos   : sequences of morpheme lists (negative / positive reviews)
    morp_label : morpheme -> integer label; morphemes missing from it are skipped
    np_path    : directory prefix; X is saved to np_path + 'X_morp.npz'
                 and y to np_path + 'y_morp.npz'
    return     : (X, y)
                 X - 1-D object array, one non-empty list of labels per review
                 y - int array of shape (n, 2); [1,0] = negative, [0,1] = positive

    reviews that end up with no known morpheme are dropped

    '''

    def _encode(sentences):
        rows = [[morp_label[w] for w in sent if w in morp_label] for sent in sentences]
        # delete length of array[i] == 0
        rows = [row for row in rows if len(row) != 0]
        # reviews differ in length, so fill an object array slot by slot
        # instead of np.asarray on ragged lists (raises on recent NumPy)
        encoded = np.empty(len(rows), dtype=object)
        for i, row in enumerate(rows):
            encoded[i] = row
        return encoded

    #make array
    negArray = _encode(neg)
    posArray = _encode(pos)

    #make X, y
    X = np.concatenate((negArray, posArray), axis = 0)

    y_neg = [[1,0] for _ in range(len(negArray))]
    y_pos = [[0,1] for _ in range(len(posArray))]
    y = np.asarray(y_neg+y_pos, dtype=int).reshape(-1, 2)

    #shuffle and save
    # one seeded permutation applied to both arrays keeps X and y aligned
    # without touching the global NumPy random state
    order = np.random.RandomState(618).permutation(len(X))
    X = X[order]
    y = y[order]

    np.savez(np_path+'X_morp.npz', X)
    np.savez(np_path+'y_morp.npz', y)

    return X, y

###### word

In [11]:
def wordToArray(neg, pos, word_label, np_path):

    '''

    encode every review as a list of word labels (no padding) and
    build the matching 2-column label array

    neg, pos   : sequences of word lists (negative / positive reviews)
    word_label : word -> integer label; words missing from it are skipped
    np_path    : directory prefix; X is saved to np_path + 'X_word.npz'
                 and y to np_path + 'y_word.npz'
    return     : (X, y)
                 X - 1-D object array, one non-empty list of labels per review
                 y - int array of shape (n, 2); [1,0] = negative, [0,1] = positive

    reviews that end up with no known word are dropped

    '''

    def _encode(sentences):
        rows = [[word_label[w] for w in sent if w in word_label] for sent in sentences]
        # delete length of array[i] == 0
        rows = [row for row in rows if len(row) != 0]
        # reviews differ in length, so fill an object array slot by slot
        # instead of letting NumPy convert ragged lists (raises on recent NumPy)
        encoded = np.empty(len(rows), dtype=object)
        for i, row in enumerate(rows):
            encoded[i] = row
        return encoded

    #make array
    negArray = _encode(neg)
    posArray = _encode(pos)

    #make X, y
    X = np.concatenate((negArray, posArray), axis = 0)

    y_neg = [[1,0] for _ in range(len(negArray))]
    y_pos = [[0,1] for _ in range(len(posArray))]
    y = np.asarray(y_neg+y_pos, dtype=int).reshape(-1, 2)

    #shuffle and save
    # one seeded permutation applied to both arrays keeps X and y aligned
    # without touching the global NumPy random state
    order = np.random.RandomState(618).permutation(len(X))
    X = X[order]
    y = y[order]

    np.savez(np_path+'X_word.npz', X)
    np.savez(np_path+'y_word.npz', y)

    return X, y

# RUN

In [12]:
neg = fileTolist(FILES_neg)
pos = fileTolist(FILES_pos)

# Draw the same number of reviews from each class, in random order.
# Capping at the smaller class keeps random.sample from raising when
# there are fewer positive than negative reviews.
random.seed(618)  # fixed seed so a re-run draws the same sample
n_per_class = min(len(neg), len(pos))
random_neg = random.sample(neg, n_per_class)
random_pos = random.sample(pos, n_per_class)

# output directory for the pickled label dictionaries
dict_path = '../data/dict/'

# output directories for the saved .npz arrays, one per tokenization level
phon_path = '../data/dataNp/phon/'
morp_path = '../data/dataNp/morp/'
word_path = '../data/dataNp/word/'

In [13]:
#phoneme
neg_phon = textToPhon(random_neg)
pos_phon = textToPhon(random_pos)

unqPhon, phon_label = makeDict_phon(neg_phon+pos_phon, dict_path)
X, y = phonToArray(neg_phon, pos_phon, phon_label, phon_path)

print('X shape : {}\n'.format(X.shape, X[0]))
print('y shape : {}\n'.format(y.shape))

  5%|▍         | 4252/90522 [00:00<00:02, 42458.90it/s]

make phon-dictionary...


100%|██████████| 90522/90522 [00:01<00:00, 49374.76it/s]


# of unique Phoneme : 51
example : ['ㄴ', 'ㅓ', 'ㅁ', 'ㅜ', 'ㅃ', 'ㅎ', 'ㅏ', 'ㄱ', 'ㅗ', 'ㅈ', 'ㅣ', 'ㄹ', 'ㅅ', 'ㅡ', 'ㅌ', 'ㅆ', 'ㅇ', 'ㅛ', 'ㅋ', 'ㅑ', 'ㅂ', 'ㄷ', 'ㅔ', 'ㄶ', 'ㅟ', 'ㅕ', 'ㅊ', 'ㅐ', 'ㅄ', 'ㅝ', 'ㅘ', 'ㅍ', 'ㄸ', 'ㅚ', 'ㄲ', 'ㅒ', 'ㅖ', 'ㅢ', 'ㅉ', 'ㅠ', 'ㄵ', 'ㅙ', 'ㄻ', 'ㅀ', 'ㄺ', 'ㅞ', 'ㄼ', 'ㄾ', 'ㄿ', 'ㄳ', 'ㄽ']
X shape : (90522,)

y shape : (90522, 2)



In [14]:
#morpheme
neg_morph = textToMorp(random_neg)
pos_morph = textToMorp(random_pos)

unqMorp, morp_label = makeDict_morp(neg_morph+pos_morph, dict_path)

X, y = morpToArray(neg_morph, pos_morph, morp_label, morp_path)
print('X shape : {}\nexample : {}\n'.format(X.shape, X[0]))
print('y shape : {}\n'.format(y.shape))

print('preprocess for morpheme clear..!')

  1%|          | 776/90522 [00:00<00:11, 7703.91it/s]

make morp-dictionary...


100%|██████████| 90522/90522 [00:56<00:00, 1611.01it/s]
 60%|█████▉    | 54138/90522 [00:00<00:00, 270650.90it/s]

# of unique Morpheme : 24197
example : ['살아갔', '123', '설탕', '수첩', '괴', '이을', '어두웠', '야만성', '잊어버렸', '깊숙히']


100%|██████████| 90522/90522 [00:00<00:00, 273028.36it/s]


[('이', 37241), ('는', 34933), ('고', 29358), ('영화', 27516), ('하', 25806), ('가', 18945), ('다', 18471), ('도', 17665), ('보', 17399), ('은', 16492), ('에', 15851), ('어요', 14709), ('게', 14699), ('들', 14374), ('을', 13489), ('의', 13309), ('한', 13212), ('네요', 12814), ('좋', 12095), ('었', 11524), ('습니다', 10703), ('있', 10544), ('너무', 9758), ('봤', 9299), ('지', 9123), ('음', 8797), ('했', 8657), ('재밌', 8520), ('없', 8213), ('를', 8171), ('았', 8135), ('만', 7774), ('는데', 7174), ('지만', 6672), (',', 6595), ('잘', 6589), ('기대', 6102), ('나', 5860), ('같', 5847), ('기', 5839), ('재미있', 5724), ('아이', 5655), ('않', 5597), ('것', 5590), ('ㅎ', 5584), ('지루', 5549), ('적', 5528), ('생각', 5434), ('내용', 5307), ('잼', 5249), ('그냥', 5183), ('안', 5073), ('볼', 5063), ('아', 5018), ('과', 4828), ('스토리', 4796), ('였', 4733), ('연기', 4574), ('던', 4351), ('좀', 4347), ('으로', 4338), ('어', 4288), ('보다', 4104), ('재미', 4083), ('로', 4040), ('수', 3933), ('듯', 3932), ('거', 3816), ('감동', 3773), ('인', 3742), ('정말', 3713), ('별로', 3491), ('되', 3445), ('

X shape : (90196,)
example : [122, 122, 122, 1, 301, 122, 122, 122]

y shape : (90196, 2)

preprocess for morpheme clear..!


In [15]:
#word
# word-level pipeline: split on whitespace -> build dictionary -> encode and save
neg_word = textToWord(random_neg)
pos_word = textToWord(random_pos)

unqWord, word_label = makeDict_word(neg_word+pos_word, dict_path)

X, y = wordToArray(neg_word, pos_word, word_label, word_path)
print('X shape : {}\nexample : {}\n'.format(X.shape, X[0]))
print('y shape : {}\n'.format(y.shape))

# this cell processes words; the message previously said "morpheme"
print('preprocess for word clear..!')

  0%|          | 0/90522 [00:00<?, ?it/s]

make word-dictionary...


100%|██████████| 90522/90522 [17:59<00:00, 83.88it/s] 
100%|██████████| 90522/90522 [00:00<00:00, 495779.40it/s]


# of unique Word : 125054
example : ['진짜!!', '감사했습니다.', '장산범보단', '됨..', '잼날줄알았는데', '아닌듯....운전씬은', 'end', '엉덩이는', '낱낱이', '어이없었음']
[('영화', 7533), ('너무', 7111), ('그냥', 3489), ('정말', 2784), ('좀', 2712), ('잘', 2549), ('더', 2013), ('진짜', 2006), ('많이', 1574), ('수', 1565), ('재밌게', 1496), ('조금', 1349), ('봤는데', 1346), ('본', 1311), ('재미있게', 1281), ('보고', 1224), ('역시', 1209), ('내용이', 1149), ('좋은', 1134), ('별로', 1122), ('것', 1108), ('생각보다', 1104), ('다', 1102), ('보는', 1081), ('있는', 1064), ('봤어요', 1052), ('아이들이', 1047), ('영화를', 1004), ('영화가', 1001), ('스토리', 999), ('봤습니다', 988), ('스토리가', 902), ('넘', 886), ('연기', 851), ('ㅠㅠ', 842), ('영화는', 840), ('왜', 813), ('그', 803), ('이', 800), ('볼', 793), ('완전', 791), ('꼭', 741), ('재밌어요', 740), ('없고', 728), ('하는', 726), ('없는', 715), ('뭔가', 714), ('ㅎㅎ', 713), ('느낌', 712), ('그래도', 710), ('연기가', 706), ('영화입니다', 696), ('아이가', 687), ('재미', 679), ('기대를', 673), ('내내', 667), ('보기', 659), ('하지만', 638), ('한', 635), ('함께', 628), ('연기는', 621), ('또', 605), ('재밌었어요', 603), ('참',

X shape : (81586,)
example : [17455]

y shape : (81586, 2)

preprocess for morpheme clear..!
