In [None]:
# Mount Google Drive inside the Colab VM; the dataset paths used later in this
# notebook all live under /content/drive/MyDrive/dataset.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install bpemb



### **훈련**

ratings_train, ratings_test 파일을 불러옴

In [None]:
from keras import *
from keras.layers import *
from keras.utils import to_categorical

import re
import os
import sys
import collections

from bpemb import BPEmb
import numpy as np


# --- Tokenizer / embedding settings -------------------------------------
d_emb = 300  # Embedding Dimension: 25 / 50 / 100 / 200 / 300
vocab_size = 100_000  # Vocab Size: 5000 / 10000 / 25000 / 50000 / 100000 / 200000

# --- Batching / sequence settings ---------------------------------------
# seq_max is the fixed token length: longer samples are dropped while loading,
# shorter ones are zero-padded up to it.
batch_size, seq_max = 128, 200

# --- Dataset locations (files, despite the *_DIR names) ------------------
TRAIN_DIR = '/content/drive/MyDrive/dataset/ratings_train.txt'
TEST_DIR = '/content/drive/MyDrive/dataset/ratings_test.txt'

# --- Model switches ------------------------------------------------------
pre_trained = True  # If True, Init Embedding Weight with Pre-Trained Vectors
quad_layer = True  # If True, Use 4 CNN Layer

# Number of filters in each convolution layer.
filter_size = 100

# Token-window heights of the parallel convolutions; the fourth window (5)
# is only present when quad_layer is enabled.
windows = [2, 3, 4, 5] if quad_layer else [2, 3, 4]

In [None]:
# Load the Korean BPEmb model. The embedding dimension is only requested
# explicitly when the pre-trained vectors will seed the model's embedding
# layer; otherwise BPEmb's default dimension is used.
bpemb_ko = (
    BPEmb(lang="ko", dim=d_emb, vs=vocab_size)
    if pre_trained
    else BPEmb(lang="ko", vs=vocab_size)
)

In [None]:
def parsing(data):
    """Split one tab-separated line into ``(_id, document, label)``.

    The line is cut at its first two tab characters, so any further tabs stay
    inside ``label``. The trailing line terminator (``\\n`` or ``\\r\\n``) is
    removed, so ``label`` comes back as e.g. ``'1'`` rather than ``'1\\n'``.

    Args:
        data: A single raw line, with or without its line terminator.

    Returns:
        A 3-tuple of strings ``(_id, document, label)``. For a line with fewer
        than two tabs (blank or malformed row) ``document`` and ``label`` are
        empty strings, so callers can detect and skip the row instead of
        receiving arbitrary slices of the line.
    """
    fields = data.rstrip('\r\n').split('\t', 2)

    if len(fields) < 3:
        # Not enough columns: report an empty document/label rather than guessing.
        return fields[0], '', ''

    _id, document, label = fields
    return _id, document, label


def add_padding(arr, max_len):
    """Right-pad every sequence in ``arr`` with zeros up to ``max_len`` items.

    The input sequences are left untouched: each row of the result is a new
    list. Sequences that already hold ``max_len`` or more items are copied
    unchanged (they are not truncated).

    Args:
        arr: Iterable of sequences (e.g. lists of token ids).
        max_len: Minimum length of every returned row.

    Returns:
        A list of lists, one per input sequence, in the same order.
    """
    # A negative repeat count yields an empty list, so long rows get no padding.
    return [list(seq) + [0] * (max_len - len(seq)) for seq in arr]


def raw_to_data(raw):
    """Turn raw dataset lines into padded token-id rows and one-hot labels.

    The first element of ``raw`` is skipped (presumably the header row of the
    ratings file -- confirm against the data). Every remaining line is split
    with ``parsing``, its document is encoded with ``bpemb_ko.encode_ids``, and
    the sample is dropped when it encodes to no tokens or to more than
    ``seq_max`` tokens.

    Args:
        raw: List of text lines as returned by ``file.readlines()``.

    Returns:
        ``(x, y)`` where ``x`` is an array of token-id rows zero-padded to
        ``seq_max`` and ``y`` is the one-hot label matrix with exactly two
        columns.

    Raises:
        ValueError: If a kept sample's label is not an integer literal.
    """
    x = []
    y = []
    for data in raw[1:]:
        _, document, label = parsing(data)

        tmp = bpemb_ko.encode_ids(document)

        # Keep only non-empty samples that fit into the fixed sequence length.
        if not 0 < len(tmp) <= seq_max:
            continue

        x.append(tmp)
        # int() tolerates surrounding whitespace such as a trailing newline.
        y.append(int(label))

    # num_classes is pinned to the model's two outputs; otherwise a split that
    # happens to contain a single class would produce a one-column target.
    return (
        np.array(add_padding(x, seq_max)),
        to_categorical(np.array(y, dtype='int32'), num_classes=2),
    )

# Read both rating files completely into memory, one list entry per line.
with open(TRAIN_DIR, mode="r", encoding='utf8') as f:
    train_raw_data = f.readlines()

with open(TEST_DIR, mode="r", encoding='utf8') as f:
    test_raw_data = f.readlines()

# Tokenise, filter and pad each split, and one-hot encode its labels.
x_train, y_train = raw_to_data(train_raw_data)
x_test, y_test = raw_to_data(test_raw_data)

# Report the resulting array shapes as a quick sanity check.
for caption, array in (
    ('x_train shape: ', x_train),
    ('y_train shape: ', y_train),
    ('x_test shape: ', x_test),
    ('y_test shape: ', y_test),
):
    print(caption, array.shape)

x_train shape:  (149995, 200)
y_train shape:  (149995, 2)
x_test shape:  (49997, 200)
y_test shape:  (49997, 2)


In [None]:
class SentimentalCNN:
    """Two-class text classifier: embedding -> parallel convolutions -> softmax.

    Each convolution spans the full embedding width and ``windows[i]``
    consecutive tokens. Its feature map is max-pooled over the whole sequence
    axis, the pooled vectors are concatenated, and a 2-unit softmax layer
    produces the class probabilities.

    Besides the constructor arguments, the class reads the module-level
    settings ``windows``, ``filter_size`` and ``quad_layer``. ``predict`` also
    uses the module-level ``bpemb_ko`` tokenizer and the ``add_padding`` helper.
    """

    def __init__(self, vocab_size, d_emb, seq_max, weight=None, trainable=False):
        """Create all layers of the network (the Keras model is built in ``compile``).

        Args:
            vocab_size: Number of rows of the embedding table.
            d_emb: Embedding dimension.
            seq_max: Fixed number of tokens per input sequence.
            weight: Optional initial embedding matrix, handed to the Embedding
                layer as ``weights=[weight]`` (presumably shaped
                ``(vocab_size, d_emb)`` -- confirm against the BPEmb vectors).
            trainable: Whether the embedding is trainable; only applied when
                ``weight`` is given.
        """
        # Remember the geometry so compile() and predict() stay consistent with
        # the layers built here instead of re-reading the module globals.
        self.seq_max = seq_max
        self.d_emb = d_emb

        if weight is None:
            self.emb_layer = Embedding(vocab_size, d_emb, input_length=seq_max)
        else:
            self.emb_layer = Embedding(vocab_size, d_emb, input_length=seq_max, weights=[weight], trainable=trainable)

        # The remaining layers are required for both embedding variants. They
        # used to be created only in the pre-trained branch, which made
        # compile() fail with AttributeError whenever weight was None.
        self.reshape_layer = Reshape((seq_max, d_emb, 1), input_shape=(seq_max, d_emb))

        def make_conv(window):
            # One convolution covering `window` tokens and the whole embedding width.
            return Conv2D(filter_size, kernel_size=(window, d_emb), activation='relu', input_shape=(seq_max, d_emb, 1))

        self.bi_cnn = make_conv(windows[0])
        self.tri_cnn = make_conv(windows[1])
        self.quad_cnn = make_conv(windows[2])

        if quad_layer:
            self.penta_cnn = make_conv(windows[3])

        self.output_layer = Dense(2, activation='softmax')

    def compile(self, optimizer="adam"):
        """Wire the layers into a Keras ``Model`` and compile it as ``self.model``.

        Args:
            optimizer: Optimizer name or instance, passed on to ``Model.compile``.
        """
        _input = Input(shape=(None, ), dtype='int32')

        emb = self.emb_layer(_input)
        reshape_emb = self.reshape_layer(emb)

        conv_layers = [self.bi_cnn, self.tri_cnn, self.quad_cnn]
        if quad_layer:
            conv_layers.append(self.penta_cnn)

        pooled = []
        for window, conv in zip(windows, conv_layers):
            features = conv(reshape_emb)
            features = Dropout(0.3)(features)
            # With Conv2D's default 'valid' padding and stride 1 the feature map
            # has seq_max - window + 1 positions; pooling over all of them gives
            # one maximum per filter. (The previous size, seq_max - window - 1,
            # left the last two positions out of the pooling window.)
            pool_len = self.seq_max - window + 1
            pooled.append(MaxPool2D(pool_size=(pool_len, 1))(features))

        result = Concatenate(axis=-1)(pooled)
        result = Flatten()(result)
        result = self.output_layer(result)

        self.model = Model(_input, result)
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy']
        )

    def predict(self, _input):
        """Classify a single raw sentence.

        Args:
            _input: Sentence text; it is tokenised with ``bpemb_ko.encode_ids``.

        Returns:
            ``1`` when the second softmax output is strictly larger than the
            first, otherwise ``0``.
        """
        # Cut over-long inputs to the fixed sequence length the network was
        # built for; shorter ones are zero-padded below.
        x = bpemb_ko.encode_ids(_input)[:self.seq_max]
        x = np.array(add_padding([x], self.seq_max))

        res = self.model.predict(x)

        return 1 if res[0][0] < res[0][1] else 0

In [None]:
# Only the pre-trained variant hands initial vectors to the embedding layer
# (and keeps them trainable); otherwise the constructor defaults apply.
embedding_init = {"weight": bpemb_ko.vectors, "trainable": True} if pre_trained else {}

sent_cnn = SentimentalCNN(
    vocab_size=vocab_size,
    d_emb=d_emb,
    seq_max=seq_max,
    **embedding_init
)

# Build and compile the Keras model, then print its layer overview.
adam = optimizers.Adam(lr=0.005, decay=1e-6)
sent_cnn.compile(adam)
sent_cnn.model.build(input_shape=(seq_max, ))
sent_cnn.model.summary()

# Train for five epochs; the test split is passed as the validation data.
hist = sent_cnn.model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=batch_size,
    epochs=5,
    verbose=1,
)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    30000000    input_3[0][0]                    
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 200, 300, 1)  0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv2d_8 (Conv2D)               (None, 199, 1, 100)  60100       reshape_2[0][0]                  
____________________________________________________________________________________________

### **결과**

ko_data를 불러와 실행시켜 최종 분석 파일 출력

In [None]:
import pandas as pd
# Load the sentences to classify. The file is decoded as cp949, and its
# 'Sentence' column is the one read by the prediction loop further down.
ko = pd.read_csv('/content/drive/MyDrive/dataset/ko_data.csv',encoding='cp949')

In [None]:
# The first row carries the column titles, so the frame is written below
# without pandas' own header or index.
sentences = ko['Sentence']
result_data = [['Id','Predicted']]
result_data += [
    [row, sent_cnn.predict(sentences[row])]
    for row in range(len(ko))
]

df = pd.DataFrame(result_data)
df.to_csv(
    '/content/drive/MyDrive/dataset/ko_result.csv',
    encoding='cp949',
    header=False,
    index=False,
)

In [None]:
# Preview the exported table; row 0 is the ['Id', 'Predicted'] title row that
# was stored as ordinary data.
df.head()

Unnamed: 0,0,1
0,Id,Predicted
1,0,1
2,1,0
3,2,1
4,3,1
