# Train Data Preprocessing 

## 1) Importing Libraries & Datasets

#### (1) importing Libraries

In [None]:
# Build the khaiii morphological analyzer from source and install its Python binding.
# Every `!` line runs in its own shell, which is why each build step re-enters the build directory.
!git clone https://github.com/kakao/khaiii.git
!pip install cmake
# NOTE(review): `build` is created relative to the current working directory while the later
# steps use /content/build -- presumably the notebook always starts in /content (Colab); confirm.
!mkdir build
!cd build && cmake /content/khaiii
!cd /content/build/ && make all
!cd /content/build/ && make resource
!cd /content/build && make install
!cd /content/build && make package_python
!pip install /content/build/package_python

In [None]:
from khaiii import KhaiiiApi
import pandas as pd
import numpy as np
import re

#### (2) Mount Drive

In [None]:
# Mount Google Drive under /content/drive so the shared-drive CSV files below can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training reviews (X_train) and their labels (y_train) from the shared drive.
# NOTE(review): x_train.csv carries two leftover index columns ("Unnamed: 0.1", "Unnamed: 0")
# next to "document"; y_train.csv may carry similar index columns -- confirm before the
# whole frame is converted to the label array further down.
X_train=pd.read_csv("/content/drive/Shareddrives/20-2 KUBIG 자연어처리 프로젝트/최종 트레이닝데이터/x_train.csv")
y_train=pd.read_csv("/content/drive/Shareddrives/20-2 KUBIG 자연어처리 프로젝트/최종 트레이닝데이터/y_train.csv")

#### (3) Check Data

In [None]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,document
0,0,평점에 스포 달린 영화는 무조건 점
1,1,재밌습니다 잘 만든 영화라는
2,2,딱 점
3,3,만남과 헤어짐 그리고 재회
4,4,롱기스트 야드를 먼저 봐서 그런지 진짜 쓰레기 특히 주인공 진짜 맘에 안 든다


#### (4) Morpheme Analysis

In [None]:
# Single shared khaiii analyzer instance; pos_text() calls api.analyze() on every sentence.
api = KhaiiiApi()

In [None]:
# POS tags that pos_text() keeps; morphemes with any other tag are dropped from the corpus.
significant_tags = ['NNG', 'NNP', 'NNB', 'VV', 'VA', 'VX', 'MAG', 'MAJ', 'XSV', 'XSA']

In [None]:
def pos_text(texts):
    """Tag each sentence with khaiii and keep only the significant morphemes.

    Args:
        texts: iterable of raw sentence strings.

    Returns:
        list of str, one entry per input sentence, holding space-separated
        ``lexeme/TAG`` tokens whose tag is listed in ``significant_tags``.
        A sentence without any significant morpheme yields an empty string.
    """
    tagged_sentences = []
    for sentence in texts:
        # Flatten word -> morph and keep only morphemes with a whitelisted tag.
        tokens = [
            morph.lex + '/' + morph.tag
            for word in api.analyze(sentence)
            for morph in word.morphs
            if morph.tag in significant_tags
        ]
        tagged_sentences.append(' '.join(tokens).strip())
    return tagged_sentences

In [None]:
tagged_corpus=pos_text(X_train["document"])

In [None]:
# Patterns over the "lexeme/TAG" tokens produced by pos_text(); stemming_text() applies them in order.
_TOKEN = '[가-힣A-Za-z0-9]+'  # lexeme part of a token: Hangul syllables, Latin letters, digits

# p1: noun token (tag "NN" + one char) followed by a suffix token (tag "XS" + one char)
p1 = re.compile(_TOKEN + '/NN. ' + _TOKEN + '/XS.')
# p2: noun token + XSA suffix token + VX token
p2 = re.compile(_TOKEN + '/NN. ' + _TOKEN + '/XSA ' + _TOKEN + '/VX')
# p3: a single VV token
p3 = re.compile(_TOKEN + '/VV')
# p4: a single VX token
p4 = re.compile(_TOKEN + '/VX')

## 2) Preprocessing

#### (1) Stemming

In [None]:
def stemming_text(text):
    """Rewrite tagged predicates into dictionary form (``lemma`` + ``다/VV``).

    Each sentence goes through four passes, in this order, using the
    module-level patterns:

    1. ``p1`` (noun + XS* suffix): lexemes are concatenated and ``다/VV`` appended.
    2. ``p2`` (noun + XSA + VX): same, but the VX lexeme is left out.
    3. ``p3`` (VV token): ``다`` is appended unless the lexeme already ends in it.
    4. ``p4`` (VX token): same as ``p3``; the result is re-tagged ``/VV``.

    In every pass the matches are collected first and then substituted one by
    one with ``str.replace``, so each occurrence of a matched text in the
    sentence is rewritten.

    NOTE(review): the ``p1`` pass already merges every noun + XSA pair into a
    single ``.../VV`` token, so the ``p2`` pass appears to have nothing left to
    match -- confirm whether ``p2`` was meant to run before ``p1``.

    Args:
        text: iterable of tagged sentences (space-separated ``lexeme/TAG`` tokens).

    Returns:
        list of str with the rewritten sentences, in input order.
    """
    def rewrite(sentence, pattern, skipped_tag, force_suffix):
        # Replace each match of `pattern` by its merged lexemes tagged as /VV.
        for found in re.findall(pattern, sentence):
            pieces = [piece.split('/') for piece in found.split(' ')]
            stem = ''.join(parts[0] for parts in pieces if parts[-1] != skipped_tag)
            if force_suffix or stem[-1] != '다':
                stem += '다'
            sentence = sentence.replace(found, stem + '/VV')
        return sentence

    # (pattern, tag whose lexeme is dropped, always append '다')
    stages = (
        (p1, None, True),
        (p2, 'VX', True),
        (p3, None, False),
        (p4, None, False),
    )

    stemmed = []
    for sentence in text:
        for pattern, skipped_tag, force_suffix in stages:
            sentence = rewrite(sentence, pattern, skipped_tag, force_suffix)
        stemmed.append(sentence)
    return stemmed

In [None]:
stemming_corpus = stemming_text(tagged_corpus)

In [None]:
# Preview the first 30 stemmed sentences.
for i in range(0, 30):
    print(stemming_corpus[i])

평점/NNG 스포/NNG 달리다/VV 영화/NNG 무조건/MAG 점/NNG
재미있/VA 잘/MAG 만들다/VV 영화/NNG
딱/MAG 점/NNG
만남/NNG 헤어지다/VV 그리고/MAJ 재회/NNG
롱/NNG 기스트/NNP 야/NNG 드/NNP 먼저/MAG 보다/VV 그렇/VA 진짜/NNG 쓰레기/NNG 특히/MAG 주인공/NNG 진짜/NNG 맘/NNG 안/MAG 들다/VV
오늘/NNG 고지전/NNG 다시/MAG 보다/VV 나라/NNG 위하다/VV 휴전/NNG 직전/NNG 싸우다/VV 전사하다/VV 참전/NNG 용사/NNG 감사/NNG 드다/VV 리/XSV
술/NNG 담배/NNG 더/MAG 해롭/VA 것/NNB 막장/NNG 드라마/NNG
정말/MAG 좋/VA 특히/MAG 영상/NNG 음악/NNG
ㅡ/NNP
예고편/NNG 낚이다/VV 최악/NNG 영화/NNG 중/NNB
현실/NNG 재밌/VA
하/XSA 영화/NNG 만들다/VV 내다/VV 수/NNB 없/VA 구로/NNP 사와/NNG 재능/NNG
우/MAG 굳ㅋ/MAG 재미/NNG 있/VA
쩌르다/VV 진짜/MAG 국영/NNG 형/NNG 최고/NNG
쓰레기/NNG 영화/NNG 다/MAG 있다/VV 인물/NNG 관계/NNG 정리/NNG 안/MAG 하다/VV 주다/VV 줄거리/NNG 없/VA 내용/NNG ㅉ/NNG
무섭/VA 오페라/NNG 유령/NNG 프레데/NNP 터/NNG
결혼하다/VV 보다/VV 미혼/NNG 때/NNG 또/MAG 감동/NNG 받다/VV 영화/NNG
캐서린/NNP 제타/NNG 존스/NNP 보톡스/NNG 자연스럽다/VV 영화/NNG 마치/MAG 전성기/NNG 훨씬/MAG 지나다/VV 톱스타/NNG 뒤늦/VA 공연/NNG 하다/VV 듯하다/VV 느낌/NNG 마디/NNG 철/NNG 지나다/VV 급/NNG 액션/NNG
이상하다/VV 것/NNB 만들다/VV 것/NNB 왠지/MAG 시간/NNG 지나다/VV 허접하다/VV 지다/VV
드라마/NNG 재밌/VA 보다/VV 것/NNB 없/VA 것/NNB 같/VA 구다

In [None]:
def text_cleaning(text):
    """Strip everything except Hangul (jamo and syllables) and spaces from ``text``.

    Applied to a tagged sentence this removes the ``/TAG`` parts, Latin letters
    and digits, leaving space-separated Hangul lexemes.
    """
    return re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]+", '', text)

In [None]:
# NOTE(review): this loop has no effect. map() returns a lazy iterator that is never
# consumed (and is discarded), so text_cleaning is never called and stemming_corpus is
# left unchanged. The cleaning that is actually used happens in the next cell.
for sent in stemming_corpus:
  map(text_cleaning, sent)

In [None]:
# Strip tags and non-Hangul characters from every stemmed sentence.
corpus = [text_cleaning(tagged_sentence) for tagged_sentence in stemming_corpus]

In [None]:
clean_corpus=corpus

In [None]:
# Split each cleaned sentence on whitespace into a list of tokens.
corpus = [cleaned_sentence.split() for cleaned_sentence in clean_corpus]

In [None]:
corpus_comma=corpus

In [None]:
# Preview the first 30 tokenized sentences.
for i in range(0, 30):
    print(corpus_comma[i])

['평점', '스포', '달리다', '영화', '무조건', '점']
['재미있', '잘', '만들다', '영화']
['딱', '점']
['만남', '헤어지다', '그리고', '재회']
['롱', '기스트', '야', '드', '먼저', '보다', '그렇', '진짜', '쓰레기', '특히', '주인공', '진짜', '맘', '안', '들다']
['오늘', '고지전', '다시', '보다', '나라', '위하다', '휴전', '직전', '싸우다', '전사하다', '참전', '용사', '감사', '드다', '리']
['술', '담배', '더', '해롭', '것', '막장', '드라마']
['정말', '좋', '특히', '영상', '음악']
['ㅡ']
['예고편', '낚이다', '최악', '영화', '중']
['현실', '재밌']
['하', '영화', '만들다', '내다', '수', '없', '구로', '사와', '재능']
['우', '굳ㅋ', '재미', '있']
['쩌르다', '진짜', '국영', '형', '최고']
['쓰레기', '영화', '다', '있다', '인물', '관계', '정리', '안', '하다', '주다', '줄거리', '없', '내용', 'ㅉ']
['무섭', '오페라', '유령', '프레데', '터']
['결혼하다', '보다', '미혼', '때', '또', '감동', '받다', '영화']
['캐서린', '제타', '존스', '보톡스', '자연스럽다', '영화', '마치', '전성기', '훨씬', '지나다', '톱스타', '뒤늦', '공연', '하다', '듯하다', '느낌', '마디', '철', '지나다', '급', '액션']
['이상하다', '것', '만들다', '것', '왠지', '시간', '지나다', '허접하다', '지다']
['드라마', '재밌', '보다', '것', '없', '것', '같', '구다']
['결말', '좀', '슬프', '다', '전두엽', '수순', '끝내', '받다', '맥머피', '이제', '예전', '맥머피', '어쩌다',

#### (2) Stopwords

In [None]:
# Read the Korean stopword list: one stopword per line, surrounding whitespace stripped.
path="/content/drive/Shareddrives/20-2 KUBIG 자연어처리 프로젝트/korean_stopwords.txt"
with open(path, encoding='utf-8') as f:
  stopwords = [line.strip() for line in f]

In [None]:
# Drop stopwords from every tokenized sentence.
# The stopword list is turned into a set once so that each membership test is a hash
# lookup instead of a scan over the whole list; the filtered result is unchanged.
stopword_set = set(stopwords)
corpus = [
    [word for word in sentence if word not in stopword_set]
    for sentence in corpus_comma
]

In [None]:
removed_stopword_corpus = corpus

In [None]:
# Preview the first 30 sentences after stopword removal.
for i in range(0, 30):
    print(removed_stopword_corpus[i])

['평점', '스포', '달리다', '영화', '무조건', '점']
['재미있', '잘', '만들다', '영화']
['점']
['만남', '헤어지다', '재회']
['롱', '기스트', '드', '먼저', '보다', '그렇', '진짜', '쓰레기', '특히', '주인공', '진짜', '맘', '안', '들다']
['오늘', '고지전', '다시', '보다', '나라', '위하다', '휴전', '직전', '싸우다', '전사하다', '참전', '용사', '감사', '드다', '리']
['술', '담배', '더', '해롭', '막장', '드라마']
['정말', '좋', '특히', '영상', '음악']
['ㅡ']
['예고편', '낚이다', '최악', '영화', '중']
['현실', '재밌']
['영화', '만들다', '내다', '수', '없', '구로', '사와', '재능']
['우', '굳ㅋ', '재미', '있']
['쩌르다', '진짜', '국영', '형', '최고']
['쓰레기', '영화', '다', '인물', '관계', '정리', '안', '하다', '주다', '줄거리', '없', '내용', 'ㅉ']
['무섭', '오페라', '유령', '프레데', '터']
['결혼하다', '보다', '미혼', '감동', '받다', '영화']
['캐서린', '제타', '존스', '보톡스', '자연스럽다', '영화', '전성기', '지나다', '톱스타', '뒤늦', '공연', '하다', '듯하다', '느낌', '마디', '철', '지나다', '급', '액션']
['이상하다', '만들다', '왠지', '지나다', '허접하다', '지다']
['드라마', '재밌', '보다', '없', '같', '구다']
['결말', '슬프', '다', '전두엽', '수순', '끝내', '받다', '맥머피', '이제', '예전', '맥머피', '어쩌다', '수', '없이', '치프', '죽이다', '주다', '영혼', '탈출', '시도하다', '하다', '거', '같', '빌리', '맥머피', '죽음', 

In [None]:
len(removed_stopword_corpus)

99438

## 3) Word Embedding

In [None]:
# Train 200-dimensional Word2Vec vectors on the preprocessed training corpus
# (sg=1, window=5, min_count=3).
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`).
# The name `model` is rebound to the Keras classifier in the Modeling section, so the
# embedding-matrix cells must run before that point.
from gensim.models import Word2Vec
model = Word2Vec(sentences=removed_stopword_corpus, size=200, window=5, min_count=3, workers=4, sg=1)

In [None]:
model.wv.vectors.shape

(15542, 200)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit the Keras tokenizer on the training corpus to build the word -> integer id mapping.
t = Tokenizer()
t.fit_on_texts(removed_stopword_corpus)
# +1 leaves room for id 0, which the padded sequences use as filler.
vocab_size = len(t.word_index) + 1

print(vocab_size)

44701


In [None]:
X_encoded = t.texts_to_sequences(removed_stopword_corpus)
print(X_encoded[:10])

[[25, 2239, 776, 1, 651, 17], [20, 21, 16, 1], [17], [1895, 1653, 5514], [2780, 21548, 288, 749, 2, 84, 24, 49, 225, 98, 24, 346, 10, 61], [317, 15543, 54, 2, 460, 145, 12520, 3192, 614, 12521, 9293, 9294, 1654, 1218, 123], [1534, 1655, 28, 21549, 212, 33], [8, 6, 225, 206, 144], [121], [743, 495, 86, 1, 39]]


In [None]:
# Longest encoded training sentence; used as the padded sequence length and the model input length.
max_len=max(len(l) for l in X_encoded)
print(max_len)

66


In [None]:
# Pad every encoded sentence with trailing zeros up to max_len.
# Note: this rebinds X_train from the raw DataFrame to the padded id matrix.
X_train=pad_sequences(X_encoded, maxlen=max_len, padding='post')
# NOTE(review): this converts the whole y_train DataFrame; if y_train.csv has index
# columns like x_train.csv does, the label array will have more than one column -- confirm.
y_train=np.array(y_train)
print(X_train)

[[   25  2239   776 ...     0     0     0]
 [   20    21    16 ...     0     0     0]
 [   17     0     0 ...     0     0     0]
 ...
 [10343  3498  1864 ...     0     0     0]
 [ 5904   369     2 ...     0     0     0]
 [  526   220     2 ...     0     0     0]]


In [None]:
# One 200-dimensional row per tokenizer id (200 matches the Word2Vec `size`);
# rows that are never filled in stay all-zero.
embedding_matrix = np.zeros((vocab_size, 200))

In [None]:
def get_vector(word):
    """Return the Word2Vec vector of ``word``, or None when it is out of vocabulary.

    The lookup goes through ``model.wv`` (the model's KeyedVectors) rather than
    indexing the Word2Vec object itself; the direct ``word in model`` /
    ``model[word]`` access is deprecated in gensim 3.x and removed in gensim 4.

    Relies on the module-level ``model`` being the trained Word2Vec model at
    call time.
    """
    vectors = model.wv
    if word in vectors:
        return vectors[word]
    return None

In [None]:
# Copy the Word2Vec vector of every tokenizer word into its row of the embedding matrix;
# words without a vector keep their all-zero row.
for token, row in t.word_index.items():
    vector = get_vector(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

  
  This is separate from the ipykernel package so we can avoid doing imports until


## 4) Modeling

In [None]:
from tensorflow.keras.layers import Embedding, Dense, GRU, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Classifier: frozen Word2Vec embedding -> GRU -> dense head with a sigmoid output.
# Note: `model` is rebound here from the Word2Vec model to the Keras model.
# NOTE(review): the inputs are post-padded with zeros and the Embedding layer does not set
# mask_zero, so the GRU also steps through the padding -- worth checking given the flat
# validation accuracy; confirm.
e = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=max_len, trainable=False)
model = Sequential([
    e,
    GRU(128),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
])

# Stop when val_loss has not improved for 4 epochs; keep the checkpoint with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Train for up to 15 epochs with early stopping and best-model checkpointing;
# 20% of the training data is held out for validation via validation_split.
history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)

Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.50096, saving model to best_model
Epoch 2/15
Epoch 00002: val_acc did not improve from 0.50096
Epoch 3/15
Epoch 00003: val_acc did not improve from 0.50096
Epoch 4/15
Epoch 00004: val_acc did not improve from 0.50096
Epoch 5/15
Epoch 00005: val_acc did not improve from 0.50096
Epoch 00005: early stopping


## 5) Evaluate Model

In [None]:
# Load the test reviews; the review text is in column "0" and the target in column "label".
test= pd.read_csv("/content/drive/Shareddrives/20-2 KUBIG 자연어처리 프로젝트/Test_Real_final.csv")
test.head()

Unnamed: 0,label,0
0,1,굳 ㅋ
1,0,
2,0,뭐 야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아
3,0,지루하지는 않은데 완전 막장 임 돈 주고 보기에는
4,0,만 아니었어도 별 다섯 개 줬을 텐데 왜 로 나와서 제 심기를 불편하게 하죠


In [None]:
# Run the test reviews through the same preprocessing steps as the training data:
# drop empty rows -> POS tagging -> stemming -> cleaning -> tokenizing -> stopword removal.
# Note: this rebinds the intermediate names used for the training corpus
# (tagged_corpus, stemming_corpus, clean_corpus, corpus_comma, removed_stopword_corpus, corpus).
test = test.dropna(how = 'any')
tagged_corpus = pos_text(test["0"])
stemming_corpus = stemming_text(tagged_corpus)

# The former `map(text_cleaning, sent)` loop was removed: the lazy map object was never
# consumed, so it did nothing. The cleaning below is the one that takes effect.
clean_corpus = [text_cleaning(tagged_sentence) for tagged_sentence in stemming_corpus]
corpus_comma = [cleaned_sentence.split() for cleaned_sentence in clean_corpus]

# Set lookup instead of scanning the stopword list for every word; same result.
stopword_set = set(stopwords)
removed_stopword_corpus = [
    [word for word in sentence if word not in stopword_set]
    for sentence in corpus_comma
]
corpus = removed_stopword_corpus
Test = removed_stopword_corpus

# Word2Vec trained on the test corpus only.
# NOTE(review): the saved classifier's embedding layer was built from the training-corpus
# vectors, so these test-corpus vectors never reach `loaded_model`.
model_test = Word2Vec(sentences=Test, size=100, window=5, min_count=3, workers=4, sg=1)

In [None]:
y_test=test["label"]

In [None]:
len(y_test)

52244

In [None]:
# Encode the test set with the tokenizer `t` and the sequence length `max_len` that were
# fitted on the TRAINING corpus. The saved model's embedding rows and input length are tied
# to those; fitting a fresh Tokenizer on the test data (as before) assigns unrelated ids to
# the same words and pads to a different length, which makes the evaluation meaningless.
# Words that never occurred in training are skipped by texts_to_sequences, and sentences
# longer than max_len are truncated by pad_sequences.
# The test-only embedding matrix and the second get_vector definition were dropped: no model
# ever consumed that matrix.
print(vocab_size)
X_encoded = t.texts_to_sequences(Test)
print(X_encoded[:10])
print(max_len)

X_test = pad_sequences(X_encoded, maxlen=max_len, padding='post')
y_test = np.array(y_test)

37744
[[907, 23], [24, 273, 15, 20, 548, 5498, 15], [15, 79, 254, 188, 76, 8, 2], [122, 89, 8, 243, 21, 15848, 597, 3], [152, 15849, 9, 27, 152, 1], [57], [270, 302, 2505, 186, 15850, 4, 1527, 2741, 16, 3365, 2300, 3], [1042, 5499, 454, 281, 44, 4, 2397, 300, 141, 40, 10, 9, 15851, 15852, 808, 10934, 15853, 13, 1, 15854], [2869, 1263, 294, 2506, 419, 2398, 64, 2021, 26, 71, 15], [101, 4932, 122, 10935, 15855]]
448


  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [None]:
# Reload the best checkpoint written by ModelCheckpoint and report its accuracy on the test set.
loaded_model = load_model('best_model')
test_metrics = loaded_model.evaluate(X_test, y_test)  # index 1 is the 'acc' metric
print("\n 테스트 정확도: %.4f" % test_metrics[1])




 테스트 정확도: 0.4875
