In [None]:
pip install tensorflow-addons

In [None]:
# Clone the e9t/nsmc GitHub repository into the current working directory.
# NOTE(review): no visible cell reads from the cloned directory (the CSVs are
# loaded from Drive) - confirm this clone is still needed.
! git clone https://github.com/e9t/nsmc.git

In [None]:
# System-level setup for KoNLPy (imported below for the Mecab and Okt taggers):
# a C++ compiler and a JDK, the Python 3 headers plus the konlpy package itself,
# curl, and finally KoNLPy's own mecab.sh installer script.
# NOTE(review): pykospacing, gensim and wordcloud are imported below but not
# installed by any visible cell - presumably preinstalled; confirm.
!sudo apt-get install g++ openjdk-7-jdk # Install Java 1.7+
# !sudo apt-get install python-dev; pip install konlpy     # Python 2.x
!sudo apt-get install python3-dev; pip3 install konlpy   # Python 3.x
!sudo apt-get install curl
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
import pandas as pd
import re
import numpy as np

from tqdm import tqdm
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow import keras
from gensim.models import Word2Vec

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, BatchNormalization, Flatten, Concatenate, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras import datasets, layers, Model


from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import binary_accuracy
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from gensim import models
import gensim

from pykospacing import Spacing
spacing = Spacing()

from konlpy.tag import Mecab, Okt
mecab = Mecab()
okt = Okt()

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

import random

# Seed every random number generator the notebook relies on so that a
# "Restart & Run All" reproduces the same numbers.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
# TensorFlow keeps its own global seed; without this call the Keras model
# built further down is not repeatable from run to run.
tf.random.set_seed(seed_val)

%matplotlib inline 

In [None]:

# Load the train / test splits from Google Drive.
# Later cells read the `document` (review text) and `label` columns of both frames.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount
# cell is visible in this notebook; confirm.
train = pd.read_csv("/content/drive/MyDrive/머신러닝, 딥러닝 논문/Convolutional_Neural_Networks_for_Sentence_Classification/train_spacing.csv", encoding='utf-8')
test = pd.read_csv("/content/drive/MyDrive/머신러닝, 딥러닝 논문/Convolutional_Neural_Networks_for_Sentence_Classification/test_spacing.csv", encoding='utf-8')


In [None]:

# Replace every missing value, in place, with the literal string 'NaN' so the
# text-cleaning steps below always receive a string.
# NOTE(review): this applies to all columns, and an empty review presumably
# ends up as the token 'nan' once clean_text lower-cases it - confirm that is intended.
train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True) 

# **전처리**

In [None]:
def clean_text(texts):
    """Normalise raw review strings before morphological analysis.

    Every element is converted with ``str`` and then passed through these
    steps, in order: delete the first punctuation set, delete digit runs,
    lower-case, collapse whitespace runs, replace the second symbol set with
    a space, trim both ends, and delete Hanja (the 一-龥 range).

    Args:
        texts: Iterable of values to clean (list, pandas Series, ...).

    Returns:
        list[str]: One cleaned string per input element, in input order.
    """
    corpus = []
    # Iterate over the values themselves instead of texts[i]; positional
    # lookups fail on a Series whose index is not 0..n-1.
    for text in tqdm(texts):
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '', str(text))  # remove punctuation
        # Keep working on `review`: restarting from the raw text here would
        # silently discard the punctuation removal done on the previous line.
        review = re.sub(r'\d+', '', review)  # remove numbers
        review = review.lower()  # lower case
        review = re.sub(r'\s+', ' ', review)  # collapse runs of whitespace
        review = re.sub('[-=+,#:;//●<>▲\?:^$.☆!★()Ⅰ@*\"※~>`\'…》→←·]', ' ', review)  # remaining symbols become a space
        review = re.sub(r"^\s+", '', review)  # remove space from the start
        review = re.sub(r'\s+$', '', review)  # remove space from the end
        review = re.sub("[一-龥]", '', review)  # remove hanja
        corpus.append(review)
    return corpus


# Clean the raw `document` column of each split.
# Bracket assignment is required to create a column: `train.data_text = [...]`
# only sets a plain Python attribute on the DataFrame (pandas warns about it)
# and no column is created. `train.data_text` still resolves afterwards.
train['data_text'] = clean_text(train.document)
test['data_text'] = clean_text(test.document)

# Drop literal backslash-n sequences (the two characters '\' and 'n') that
# may remain in the cleaned text.
clear_text1 = [str(text).replace('\\n', '') for text in train['data_text']]
clear_text2 = [str(text).replace('\\n', '') for text in test['data_text']]

train['clear_text'] = clear_text1
test['clear_text'] = clear_text2

In [None]:
"""
{'Adjective': '형용사',
 'Adverb': '부사',
 'Alpha': '알파벳',
 'Conjunction': '접속사',
 'Determiner': '관형사',
 'Eomi': '어미',
 'Exclamation': '감탄사',
 'Foreign': '외국어, 한자 및 기타기호',
 'Hashtag': '트위터 해쉬태그',
 'Josa': '조사',
 'KoreanParticle': '(ex: ㅋㅋ)',
 'Noun': '명사',
 'Number': '숫자',
 'PreEomi': '선어말어미',
 'Punctuation': '구두점',
 'ScreenName': '트위터 아이디',
 'Suffix': '접미사',
 'Unknown': '미등록어',
 'Verb': '동사'}
"""
def pos(df):
    """Tag every sentence with Okt and keep only the content-bearing tokens.

    Each sentence is analysed with ``okt.pos(..., stem=True, norm=True)``
    (normalised and stemmed tokens). Tokens whose tag is a suffix, particle
    (Josa), determiner, modifier, Korean particle (e.g. "ㅋㅋ"), exclamation
    or conjunction are dropped; the rest are joined with single spaces.

    Args:
        df: Iterable of sentence strings (e.g. a DataFrame column).

    Returns:
        list[str]: One filtered, space-joined string per input sentence.

    Raises:
        Any exception raised by ``okt.pos`` propagates. Swallowing it made the
        function return ``None``, which the caller then stored as a whole
        column and which only failed much later.
    """
    excluded_tags = {'Suffix', 'Josa', 'Determiner', 'Modifier',
                     'KoreanParticle', 'Exclamation', 'Conjunction'}

    pos_text = []
    for sentence in df:
        tagged = okt.pos(sentence, stem=True, norm=True)
        # `tag` rather than `pos`, so the loop variable does not shadow this function.
        kept = [word for word, tag in tagged if tag not in excluded_tags]
        pos_text.append(' '.join(kept))
    return pos_text

# Store the Okt-filtered version of `clear_text` as a new `pos_text` column of each split.
train["pos_text"] = pos(train.clear_text)
test["pos_text"] = pos(test.clear_text)

In [None]:
# Materialise the POS-filtered sentences as plain Python lists
# (tqdm only adds a progress bar while the column is walked).
train_corpus = list(tqdm(train.pos_text))
test_corpus = list(tqdm(test.pos_text))


* ## **불용어 처리**



In [None]:
STOPWORDSPATH ="/content/drive/MyDrive/Colab Notebooks/한국어불용어100.txt"

# Hand-picked stopwords; the entries of the stopword file are appended below.
stopwords = [
    '로', '은', '는', '였', '엿', '고', '를', '을', '에', '도', '의', '입니다.', 'ㅋ' ,'ㅋㅋ', 'ㅋㅋㅋ', 'ㅎ', 'ㅎㅎ','ㅎㅎㅎ', '어요', '께', '게', '과', '왔', 'ㄷ', 'ㅠ', 'ㅠㅠ', '네요', '거', '더라구요', '었', '에서', '으로', '봤', '는데', '겟', '겠', '았', '앗', '밋', '와', '뭐', '습니다', '된', 'ㅇ','ㅇㅇ', '합니다', '했', '아요', '요',
    '입니다', '듯', '지만', '인데', '까지', '로써', '로서', '된다', '임', '그리고', '그래서', '던', '에요', 'ㅜ','ㅜㅜ', '해', '요', '냐', '한다', '셈', "끼리", '_', '더군요','니까', '에게', '%', '라는','으루', '는지', '잖아요', '잖아', '세요', '네여',
    '하다', '나다', '돼다', '되다', '이다', '싶다', '있다', '어떻다', '같다', '것', '그', '데',
]

# The first tab-separated field of every line is the stopword itself.
# The encoding is explicit so decoding does not depend on the platform default
# ('utf-8-sig' also drops a leading byte-order mark if the file has one).
with open(STOPWORDSPATH, encoding='utf-8-sig') as f:
    for line in f:
        # strip(): a line without a tab would otherwise keep its trailing
        # newline and could never match a token.
        word = line.split('\t')[0].strip()
        if word:  # ignore blank lines
            stopwords.append(word)
        # print(line[0])

In [None]:
def clear_sw(s):
    """Return ``s`` without the tokens listed in the module-level ``stopwords``.

    The sentence is split on whitespace; the surviving tokens are joined back
    together with single spaces.
    """
    kept_tokens = [word for word in s.split() if word not in stopwords]
    return ' '.join(kept_tokens)
# Remove stopwords from every POS-filtered sentence of both splits.
clear_corpus_train = [clear_sw(sentence) for sentence in train_corpus]
clear_corpus_test = [clear_sw(sentence) for sentence in test_corpus]

# Labels as NumPy arrays; the test labels are cast to int32 explicitly.
train_label = np.array(train.label)
test_label = np.array(test.label, dtype=np.int32)

# **Tokenizing, padding**

In [None]:
# Every review is padded / truncated to exactly this many token ids.
# NOTE(review): 42 is not explained anywhere visible - presumably chosen from
# the token-length distribution of the corpus; confirm.
max_length = 42

# Build the vocabulary from the training corpus only; words the tokenizer has
# not seen during fitting are mapped to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(clear_corpus_train)

# word -> integer id
train_word_index = tokenizer.word_index
# integer id -> word (inverse of word_index)
train_vocabulary_inv = tokenizer.index_word

# Encode both splits with the tokenizer fitted on the training data.
train_sequences = tokenizer.texts_to_sequences(clear_corpus_train)
test_sequences = tokenizer.texts_to_sequences(clear_corpus_test)

# Pad short sequences at the end and cut long ones at the end ('post').
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# **pre-trained 모델 적용(fasttext)**

In [None]:
def load_dic(dic_file, encoding='utf-8'):
    """Load word vectors from a text ``.vec`` file into a dictionary.

    Expected layout (the fastText ``.vec`` format): a header line, followed by
    one line per word consisting of the word and its vector components, all
    separated by single spaces.

    Args:
        dic_file: Path of the vector file.
        encoding: Text encoding of the file. Defaults to UTF-8; decoding with
            a single-byte codec such as ISO-8859-1 turns non-ASCII words into
            mojibake keys that never match a tokenizer vocabulary.

    Returns:
        dict[str, np.ndarray]: Maps each word to its float32 vector. The
        vector length is whatever the file stores, so downstream matrices
        must be sized to the same dimension.
    """
    embeddings_index = dict()
    # `with` closes the file even when parsing fails; undecodable bytes are
    # dropped instead of aborting the whole load.
    with open(dic_file, encoding=encoding, newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                continue  # header line, not a word vector
            if not line.strip():
                continue  # blank line
            # Split on plain spaces only, so a word that contains an unusual
            # Unicode whitespace character is not torn apart.
            values = line.rstrip().split(' ')
            # The parsing must happen inside the loop: placed after it, only
            # the very last line of the file would ever be stored.
            try:
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed line: report its position and content, then go on.
                print(i)
                print(values)
                continue
            embeddings_index[word] = coefs

    return embeddings_index

# Load the pretrained word vectors (word -> vector dictionary) from Drive.
embeddings_index = load_dic('/content/drive/MyDrive/pre_trained_data/cc.ko.300.vec')

In [None]:
def gen_embedding_matrix(vocab, embeddings_index, embedding_size) :
    """Build the initial weight matrix for an embedding layer.

    Row ``i`` receives the pretrained vector of the word whose id is ``i``.
    Rows of words without a pretrained vector stay all-zero, as does every
    row no word id points at.

    Args:
        vocab: Mapping word -> integer id.
        embeddings_index: Mapping word -> pretrained vector.
        embedding_size: Number of columns of the matrix; must equal the
            length of the pretrained vectors.

    Returns:
        np.ndarray: Array of shape ``(len(vocab) + 1, embedding_size)``.

    Raises:
        ValueError: If a pretrained vector's length differs from ``embedding_size``.
    """
    vocabulary_size = len(vocab) + 1

    embedding_matrix = np.zeros((vocabulary_size, embedding_size))
    for word, index in vocab.items():
        if index > vocabulary_size - 1:
            # Skip only this id. Stopping the loop here would drop every
            # later word whenever the mapping is not ordered by id.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue  # no pretrained vector for this word
        if len(embedding_vector) != embedding_size:
            raise ValueError(
                f"pretrained vector for {word!r} has {len(embedding_vector)} "
                f"dimensions but embedding_size is {embedding_size}"
            )
        embedding_matrix[index] = embedding_vector
    return embedding_matrix

embedding_matrix = gen_embedding_matrix(train_word_index, embeddings_index, embedding_size=200)

# **모델 설정 및 layer 쌓기**

In [None]:
vocab_size = len(train_word_index) +1 # 40821
embedding_dim = 200
max_length = max_length 

filter_sizes = (3, 4, 5)
hidden_dims = 100

num_filters = 100

input_shape = (max_length, ) # 18, 200

cnn_model_input = Input(shape = input_shape)
cnn_embedding = Embedding(vocab_size , embedding_dim, input_length=max_length,weights=[embedding_matrix], trainable=True)(cnn_model_input) 
cnn_Dropout_1 = Dropout(0.2)(cnn_embedding) 

cnn_Convs_block = []
for size in filter_sizes:
    cnn_Conv_1 = Conv1D(filters=num_filters, kernel_initializer='glorot_normal', kernel_size=size, padding='valid', activation='relu', strides=1)(cnn_Dropout_1)
    
    cnn_MaxP = MaxPooling1D(pool_size=2,padding='valid')(cnn_Conv_1)
    cnn_Conv_1 = Conv1D(64, 3, kernel_initializer='glorot_normal', padding='same', activation='relu', strides=1)(cnn_MaxP)
    cnn_MaxP = MaxPooling1D(pool_size=2,padding='valid')(cnn_Conv_1)
    cnn_Flatten = Flatten()(cnn_MaxP)
    cnn_Convs_block.append(cnn_Flatten)

concate_cnn_layers = Concatenate()(cnn_Convs_block) if len(cnn_Convs_block) > 1 else cnn_Convs_block[0]

cnn_Dropout_2 = Dropout(0.5)(concate_cnn_layers)
cnn_Dense_1 = Dense(1024, kernel_initializer='glorot_normal', activation='relu')(cnn_Dropout_2)

cnn_Dropout_3 = Dropout(0.5)(concate_cnn_layers)
cnn_Dense_2 = Dense(256, activation='relu')(cnn_Dropout_3)

cnn_Dropout_4 = Dropout(0.2)(cnn_Dense_2)
cnn_Dense_3 = Dense(64, activation='relu')(cnn_Dropout_4)

cnn_Dropout_5 = Dropout(0.2)(cnn_Dense_3)
cnn_Dense_4 = Dense(16, activation='relu')(cnn_Dropout_5)

model_output = Dense(1, activation='sigmoid')(cnn_Dense_4)



* ## Optimizer: RAdam (Rectified Adam)

In [None]:
# Wrap the functional graph in a Model and compile it for binary classification
# (binary cross-entropy loss, accuracy metric) with the Rectified Adam optimizer.
# NOTE(review): per the tensorflow-addons documentation a positive total_steps
# enables warm-up followed by decay towards min_lr; with warmup_proportion=0.2
# that is presumably the first 1000 of 5000 steps. Confirm that 5000 matches the
# real number of optimizer steps (epochs x batches per epoch of the fit call).
model = Model(cnn_model_input, model_output)
model.compile(loss='binary_crossentropy', 
              optimizer= tfa.optimizers.RectifiedAdam(learning_rate=7.0e-4, total_steps = 5000, warmup_proportion=0.2, min_lr=1e-5, epsilon=1e-08, clipnorm=1.0), 
              metrics=['accuracy'])
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train for 20 epochs with mini-batches of 512 samples. validation_split=0.2
# makes Keras hold out the last 20 % of the arrays passed in (taken before any
# shuffling) as the validation set; verbose=2 prints one line per epoch.
# NOTE(review): EarlyStopping and ModelCheckpoint are imported at the top but
# no callbacks are passed here - confirm that is intended.
model.fit(train_padded, train_label, batch_size=512, epochs=20,
          validation_split=0.2, verbose=2)

# **결과**

In [None]:
# Score the test split: a sigmoid output of at least 0.5 counts as class 1.
pred = model.predict(test_padded)
result = [1 if score >= 0.5 else 0 for score in pred]

# Element-wise comparison against the true labels, reported as accuracy in percent.
matches = result == test['label']
print(f"{ round(sum(matches)/len(test)*100, 4) }%")