In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive/ (Colab-only).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Change into the project folder on the mounted Drive; the local imports and
# the relative 'data/...' paths used in later cells are resolved from here.
%cd /content/drive/MyDrive/nlp/relation_extraction_colab

/content/drive/.shortcut-targets-by-id/1EUPf2jO5GIuqe8drdlYxVZpxZd9rV6kp/relation_extraction_colab


In [6]:
from process_data import RE_DataEncoder
from BaseModel import BaseModel

import numpy as np
import seaborn as sns
import pickle

from keras.layers import Dense, Embedding, Conv1D

from tensorflow.keras.layers import Embedding, Dense, Dropout, Input, concatenate, Reshape, LSTM
from tensorflow.keras.layers import GlobalMaxPool1D
from keras.models import Model


def gen_glove_vector(glove_path='glove.6B.300d.txt', emb_dim=300, encoder=None):
    """Build an embedding matrix for the encoder vocabulary from a GloVe text file.

    Each non-empty line of the file is split on whitespace: the first token is
    the word and the remaining tokens are parsed as its float vector.

    Args:
        glove_path: Path of the GloVe text file to read.
        emb_dim: Number of columns of the returned matrix; must match the
            vector length stored in the file.
        encoder: Object exposing ``word_size`` (int) and ``word_index``
            (mapping word -> row index). Defaults to the notebook-level
            global ``Encoder``.

    Returns:
        A ``(encoder.word_size + 1, emb_dim)`` float array. Row ``i`` holds the
        GloVe vector of the word whose index is ``i``; rows of words that are
        absent from the file stay all-zero.
    """
    if encoder is None:
        # Fall back to the global created by the data-loading cell.
        encoder = Encoder

    word_index = encoder.word_index
    emb_matrix = np.zeros((encoder.word_size + 1, emb_dim))
    n_rows = emb_matrix.shape[0]

    with open(glove_path, 'r', encoding='UTF-8') as f:
        for line in f:
            w_line = line.split()
            if not w_line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Only vocabulary words are parsed, so the full GloVe table is
            # never held in memory.
            index = word_index.get(w_line[0])
            if index is None:
                continue
            # Per the original author's note, word_index is not limited by the
            # configured num_words, so an index may fall outside the matrix;
            # such words are skipped instead of raising IndexError.
            if not 0 <= index < n_rows:
                continue
            emb_matrix[index, :] = np.array(w_line[1:], dtype=np.float64)
    return emb_matrix


class CNN_model(BaseModel):
    """Conv1D + LSTM classifier built on per-token feature embeddings.

    ``build_model`` reads the notebook-level globals ``max_len``, ``Encoder``
    and ``emb_matrix``; they must be defined before it is called.
    """

    def build_model(self, using=('word_emb', 'position_emb', 'gram_emb', 'sp_emb')):
        """Build the Keras graph and store it in ``self.model``.

        Args:
            using: Collection of feature-group names to feed into the network.
                Recognised names are ``'word_emb'``, ``'position_emb'``
                (both entity-position embeddings), ``'gram_emb'`` and
                ``'sp_emb'``. The default (an immutable tuple, so it cannot be
                mutated between calls) enables all of them.

        Raises:
            ValueError: If ``using`` selects none of the recognised groups.

        Note:
            The model always declares all five inputs, regardless of ``using``,
            so the input signature stays the same for every feature selection.
        """
        # Number of convolution filters; Reshape below must use the same value.
        n_filters = 306

        # NOTE(review): every Embedding sets mask_zero=True, but the features
        # go straight into Conv1D - confirm the installed Keras version handles
        # (or ignores) the mask as intended.
        input_sentence = Input(shape=(max_len,), name='sentence')

        embed_sentence_glove = Embedding(input_dim=Encoder.word_size+1, 
                                    output_dim=300, # must match the width of the weights matrix below
                                    input_length=max_len,
                                    weights = [emb_matrix],
                                    name='sentence_glove', mask_zero=True)(input_sentence)

        # Embedding(input_dim, output_dim): the input_dim literals below are
        # presumably the vocabulary sizes of each encoded feature - TODO confirm
        # against the data encoder.
        input_e1_pos = Input(shape=(max_len,), name='e1_position')
        embed_e1_pos = Embedding(126,200, input_length=max_len, mask_zero=True)(input_e1_pos)
        input_e2_pos = Input(shape=(max_len,), name='e2_position')
        embed_e2_pos = Embedding(122,200, input_length=max_len, mask_zero=True)(input_e2_pos)

        input_grammar = Input(shape=(max_len,), name='grammar_relation')
        embed_grammar = Embedding(45,100, input_length=max_len, mask_zero=True)(input_grammar)

        input_sp = Input(shape=(max_len,), name='shortest_path')
        embed_sp = Embedding(62, 500, input_length=max_len, mask_zero=True)(input_sp)
        
        # Collect the embeddings of the selected feature groups, in fixed order.
        input_list=[]
        if 'word_emb' in using:
            input_list.append(embed_sentence_glove)
        if 'position_emb' in using:
            input_list.extend([embed_e1_pos, embed_e2_pos])
        if 'gram_emb' in using:
            input_list.append(embed_grammar)
        if 'sp_emb' in using:
            input_list.append(embed_sp)

        if not input_list:
            raise ValueError(
                "build_model: 'using' must contain at least one of "
                "'word_emb', 'position_emb', 'gram_emb', 'sp_emb'")

        # A single selected feature needs no concatenation (and some Keras
        # versions reject concatenate() on a one-element list).
        if len(input_list) == 1:
            visible = input_list[0]
        else:
            visible = concatenate(input_list)

        interp = Conv1D(filters=n_filters, kernel_size=5, activation='relu')(visible)
        interp = GlobalMaxPool1D()(interp)
        # Turn the pooled vector into a length-1 sequence so the LSTM accepts it.
        interp = Reshape((1, n_filters))(interp)
        interp = LSTM(122, dropout=.4)(interp)
        # 19-way softmax over the output classes.
        output = Dense(19, activation='softmax')(interp)
        self.model = Model(inputs=[input_sentence, input_e1_pos, input_e2_pos, input_grammar, input_sp], outputs=output)

In [4]:
# Restore the pickled data encoder.
# NOTE: pickle.load can execute arbitrary code from the file - only load
# artifacts produced by a trusted run of this project.
with open('data/data_encoder.obj', 'rb') as encoder_file:
    Encoder = pickle.load(encoder_file)

# Notebook-level globals read by gen_glove_vector and CNN_model.build_model.
vocab_size, max_len = Encoder.vocab_size, Encoder.max_len
emb_matrix = gen_glove_vector()

# Pre-encoded feature and label arrays for the train and test splits.
X_train, X_test = np.load('data/X_train.npy'), np.load('data/X_test.npy')
y_train, y_test = np.load('data/y_train.npy'), np.load('data/y_test.npy')

In [7]:
# CNN + LSTM run: build the model with all feature groups, train for 5 epochs,
# evaluate on the test arrays and save under the name 'cnn_lstm'.
# train_model / evaluate / save_model come from BaseModel (not shown here).
cnn_model=CNN_model()
cnn_model.build_model()
cnn_model.train_model(X_train, y_train, epochs=5)
cnn_model.evaluate(X_test, y_test, Encoder.dict_labels)
cnn_model.save_model('cnn_lstm')

# Alternative path: reload the saved 'cnn_lstm' model and evaluate it without retraining.
# cnn_model=CNN_model()
# cnn_model.load_model('cnn_lstm')
# cnn_model.evaluate(X_test, y_test, Encoder.dict_labels)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Full classes:
                           precision    recall  f1-score   support

      Cause-Effect(e1,e2)       0.88      0.91      0.89       134
      Cause-Effect(e2,e1)       0.86      0.90      0.88       194
   Component-Whole(e1,e2)       0.81      0.81      0.81       162
   Component-Whole(e2,e1)       0.78      0.69      0.73       150
 Content-Container(e1,e2)       0.77      0.87      0.82       153
 Content-Container(e2,e1)       0.78      0.74      0.76        39
Entity-Destination(e1,e2)       0.88      0.89      0.88       291
Entity-Destination(e2,e1)       0.00      0.00      0.00         1
     Entity-Origin(e1,e2)       0.78      0.88      0.83       211
     Entity-Origin(e2,e1)       0.90      0.79      0.84        47
 Instrument-Agency(e1,e2)       0.56      0.68      0.61        22
 Instrument-Agency(e2,e1)       0.74      0.69      0.71       134
 Member-Collection(e1,e2)       0.81      0.53      0.64        