# Modules Installation

In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): none of these installs pins a version, so a fresh runtime may
# resolve to releases different from the ones that produced the saved outputs.
!pip install web_anno_tsv
!pip install underthesea
!pip install tensorflow



# Functions

In [None]:
import re

def getData(basePath, optionPath, data1=None, data2=None):
  """Collect sentences and entity annotations from WebAnno TSV exports.

  Lists ``basePath + optionPath``, enters every entry whose name ends with
  ``.conll`` and reads each ``.tsv`` file found there with
  ``open_web_anno_tsv``.

  Args:
    basePath: Root directory of the data set; concatenated as-is, so it must
      end with a path separator.
    optionPath: Sub-path of the split to read, appended to ``basePath``.
    data1: Optional list that receives the sentence texts with every
      character that is not a word character or whitespace removed. A new
      list is created when omitted.
    data2: Optional list that receives ``[text, label, sentenceIndex]`` rows,
      one per annotation. A new list is created when omitted.

  Returns:
    The tuple ``(data1, data2)``.
  """
  # Create the containers per call. List defaults in the signature would be
  # built once and shared, so successive calls would keep appending to the
  # very same objects.
  if data1 is None:
    data1 = []
  if data2 is None:
    data2 = []

  root = basePath + optionPath
  for folderName in os.listdir(root):
    if not folderName.endswith('.conll'):
      continue
    folderPath = root + folderName + '/'
    for filename in os.listdir(folderPath):
      if not filename.endswith('.tsv'):
        continue
      # Build the path from the folder for every file; appending to a running
      # string would corrupt the path of every file after the first one.
      filePath = folderPath + filename
      with open_web_anno_tsv(filePath) as f:
        try:
          # NOTE: the sentence index restarts at 0 for every file.
          for i, sentence in enumerate(f):
            data1.append(re.sub(r'[^\w\d\s]', '', sentence.text))
            for annotation in sentence.annotations:
              data2.append([annotation.text, annotation.label, i])
        except Exception as exc:
          # Best effort: keep what was read so far and move on to the next
          # file, but say so instead of hiding the failure.
          print('Skipping the rest of {}: {!r}'.format(filePath, exc))
  return data1, data2

In [None]:
from underthesea import word_tokenize

def getTokens(data):
  """Tokenize every line of ``data``.

  Args:
    data: Iterable of strings.

  Returns:
    A list with one ``word_tokenize`` result per input line, in input order.
  """
  return [word_tokenize(line) for line in data]

In [None]:
import numpy as np

def sum(vectors, size):
  """Add up ``vectors`` element-wise, starting from a zero vector.

  NOTE: this name shadows Python's builtin ``sum`` for the rest of the
  notebook.

  Args:
    vectors: Iterable of array-likes that can be added to a vector of
      length ``size``.
    size: Shape passed to ``np.zeros`` for the starting value.

  Returns:
    The accumulated array; the zero vector when ``vectors`` is empty.
  """
  total = np.zeros(size)
  for vec in vectors:
    total = total + vec
  return total

In [None]:
def getTokenByNgrams(sent, tokenizer=None):
  """Build the bigrams and trigrams of a sentence.

  Every character that is not a word character or whitespace is removed, the
  rest is tokenized, and consecutive tokens are joined with ``_``.

  Args:
    sent: The sentence to process.
    tokenizer: Optional callable mapping a string to a list of tokens.
      Defaults to ``word_tokenize``.

  Returns:
    A list holding all bigrams followed by all trigrams. Sentences with
    fewer than two tokens give an empty list.
  """
  if tokenizer is None:
    tokenizer = word_tokenize
  sent = re.sub(r'[^\w\d\s]', '', sent)
  token = tokenizer(sent)
  bigrams = ['{}_{}'.format(a, b) for a, b in zip(token, token[1:])]
  # The trigram loop used to iterate over the (still empty) result list, so
  # no trigram was ever produced; walk the token windows instead.
  trigrams = ['{}_{}_{}'.format(a, b, c)
              for a, b, c in zip(token, token[1:], token[2:])]
  return bigrams + trigrams

In [None]:
def token2vec(model, tokens):
  """Turn ``_``-joined n-gram tokens into summed word vectors.

  Each token is split on ``_``; the vectors of the parts that ``model.wv``
  knows are added together. Unknown parts are skipped, so a token without any
  known part maps to the zero vector.

  Args:
    model: Object exposing the word vectors as ``model.wv[word]``.
    tokens: Iterable of strings such as those built by ``getTokenByNgrams``.

  Returns:
    A list with one float64 array per token.
  """
  wv = model.wv
  # Take the width from the model when it advertises one; 100 is the value
  # that used to be hard-coded here.
  size = getattr(wv, 'vector_size', 100)
  vec = []
  for token in tokens:
    # Accumulate locally rather than through the notebook-level ``sum``
    # helper, which shadows the builtin of the same name.
    total = np.zeros(size)
    for value in token.split('_'):
      try:
        wordVector = wv[value]
      except KeyError:
        # Out-of-vocabulary part: contributes nothing.
        continue
      total = np.add(total, wordVector)
    vec.append(total)
  return vec

# Data Preparation

In [None]:
# Connect Google Drive so the data set can be read from it.
# The mount point is absolute ('/content/drive'); the data-loading cell uses
# the relative path './drive/...', which only resolves when the working
# directory is '/content' -- NOTE(review): confirm on the target runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Relation entities dictionary.
# Maps a relation label to the ordered (first entity type, second entity type)
# pairs that are allowed to carry it.
# - A pair may appear under several labels, e.g. ('ORGANIZATION', 'ORGANIZATION')
#   is listed under four of them, so a pair of types does not identify a
#   single relation.
# - The keys are used as runtime labels. Their spelling ('AFFILICATION') and
#   the mix of en dashes and hyphens must stay as they are unless every
#   consumer of the labels is changed together.
relationships = {
    'LOCATED': [('PERSON', 'LOCATION'), ('ORGANIZATION', 'LOCATION')],
    'IS_LOCATED': [('LOCATION', 'PERSON'), ('LOCATION', 'ORGANIZATION')],
    'PART-WHOLE': [('LOCATION', 'LOCATION'), ('ORGANIZATION', 'ORGANIZATION'), ('ORGANIZATION', 'LOCATION')],
    'WHOLE-PART': [('LOCATION', 'LOCATION'), ('ORGANIZATION', 'ORGANIZATION'), ('LOCATION', 'ORGANIZATION')],
    'PERSONAL–SOCIAL': [('PERSON', 'PERSON')],
    'ORGANIZATION–AFFILICATION': [('PERSON', 'ORGANIZATION'), ('PERSON', 'LOCATION'), ('ORGANIZATION', 'ORGANIZATION'), ('LOCATION', 'ORGANIZATION')],
    'AFFILICATION-ORGANIZATION': [('ORGANIZATION', 'PERSON'), ('LOCATION', 'PERSON'), ('ORGANIZATION', 'ORGANIZATION'), ('ORGANIZATION', 'LOCATION')],
}

In [None]:
# Get data from file
from web_anno_tsv import open_web_anno_tsv
import os

basePath = './drive/MyDrive/MassiveData/'
devPath = 'VLSP2020/VLSP2020_RE_dev/'
trainPath = 'VLSP2020/VLSP2020_RE_train/'
# NOTE(review): unlike the two paths above, this one has no 'VLSP2020/'
# prefix -- confirm that this matches the folder layout on Drive.
testPath = 'VLSP2020_RE_test/'

# Hand every split its own fresh lists so the three results can never end up
# sharing storage through getData's default arguments.
devSent, devLabel = getData(basePath, devPath, [], [])
trainSent, trainLabel = getData(basePath, trainPath, [], [])
testSent, testLabel = getData(basePath, testPath, [], [])

# Same five-line summary for every split: size and first sentence, then size
# and first two annotation rows.
for title, sents, labels in (
    ('DEV DATA:', devSent, devLabel),
    ('TRAIN DATA:', trainSent, trainLabel),
    ('TEST DATA:', testSent, testLabel),
):
  print(title)
  print(len(sents))
  print(sents[0])
  print(len(labels))
  print(labels[:2])

DEV DATA:
778
U16 Việt Nam dội mưa gôn vào lưới Mông Cổ Không nằm ngoài dự đoán U16 Việt Nam đã có chiến thắng dễ dàng trước U16 Mông Cổ  Tỷ số chung cuộc là 90 Như vậy tại bảng I vòng loại U16 châu Á 2018  U16 Việt Nam và U16 Australia tạm bằng điểm nhau Hai đội sẽ đối đầu trực tiếp để phân ngôi thứ vào chiều 249 tới U16 Việt Nam thắng dễ U16 Mông Cổ  Trong trận ra quân tại bảng I vòng loại U16 châu Á 2018 gặp U16 Campuchia  dù bị gỡ hòa 11 và bị mất người ở phút 20 nhưng U16 Việt Nam vẫn chơi xuất sắc để có chiến thắng chung cuộc 52 Bước vào trận thứ 2 gặp chủ nhà U16 Mông Cổ  U16 Việt Nam tràn đầy tự tin hướng tới một chiến thắng đậm nhằm tạo đà tâm lý trước cuộc quyết đấu với U16 Australia vào chiều ngày 249 tới Trước một đối thủ bị đánh giá thấp hơn về mọi mặt U16 Việt Nam không gặp nhiều khó khăn để làm chủ cuộc chơi và nhanh chóng có bàn vượt lên dẫn trước do công của Nguyên Hoàng ngay phút thứ 13 10 phút sau Thanh Trung số 7 nâng tỷ số lên 20 từ chấm phạt đền sau khi thủ môn đố

# Preprocessing

In [None]:
# Word2vec model.
# Trains 100-dimensional word vectors on the tokenized sentences of devSent
# only; min_count=1 keeps every token that occurs at least once.
# NOTE(review): the `size=` keyword and `wv.vocab` look like gensim 3.x names;
# confirm against the gensim version installed on the runtime.
from gensim.models import Word2Vec

sentences = getTokens(devSent)
modelW2V = Word2Vec(sentences=sentences, size=100, window=5, min_count=1, workers=4)
print(len(modelW2V.wv.vocab))
print(modelW2V.wv['học'])

24175
[-6.93123490e-02  8.30389500e-01  2.48795107e-01  5.22420645e-01
 -1.68590043e-02 -3.06406468e-01  3.08981478e-01  1.45688698e-01
 -2.21776277e-01  3.60210724e-02 -2.05216929e-01  1.87274194e+00
  9.46120977e-01  2.06890941e-01  3.37640435e-01 -3.50866020e-01
 -2.82857507e-01  1.13848098e-01  1.62697956e-01 -4.14211035e-01
 -4.53993917e-01  1.59385090e-03  9.72450793e-01 -5.22391438e-01
 -2.91964114e-01  1.80574954e-02  6.51769519e-01 -4.66526121e-01
 -4.90313411e-01 -1.54860044e+00  7.63378441e-02 -5.63335598e-01
  2.65152007e-01  5.38328409e-01  2.99835414e-01 -2.25577667e-01
 -1.29824176e-01 -6.88299477e-01  1.38061373e-02  1.44123793e-01
 -2.33598948e-01 -2.06595566e-02 -3.53856266e-01  1.71862411e+00
 -1.80758312e-01  7.53067017e-01 -1.07908082e+00 -1.26604068e+00
  6.94004476e-01  1.25839235e-02 -3.99133563e-01  1.07967389e+00
 -1.86187983e-01 -9.88634825e-01  9.40245032e-01 -1.89787194e-01
  1.02241993e+00  4.56993222e-01 -1.07957661e-01  9.15585339e-01
 -3.59731019e-01 -2

In [None]:
# Word embedding - Entity
# One vector per annotated entity: the sum of the word vectors of its tokens.
# Tokens missing from the Word2Vec vocabulary are skipped, so an entity made
# only of unknown tokens becomes the zero vector.
entity = []
# Take the width from the trained model so it cannot drift from the value
# used when the model was built.
size = modelW2V.wv.vector_size
for row in devLabel:
  vectors = []
  for item in word_tokenize(row[0]):
    try:
      vectors.append(modelW2V.wv[item])
    except KeyError:
      # Out-of-vocabulary token.
      continue
  entity.append(sum(vectors, size))
entity = np.array(entity)
print(len(entity))
print(entity[:5])

20435
[[ 9.02046025e-01  5.96048176e-01  6.63512111e-01 -4.91902933e-01
   2.98648927e-01  1.90999048e-01  4.90861163e-01  4.68006954e-01
  -6.46609396e-01  1.36671501e+00  9.45960015e-01  2.84238398e+00
   2.01802027e+00  2.37799801e-01  2.51163486e-01  1.49675697e+00
   4.30397540e-01  2.23950535e+00  6.18352711e-01 -7.92132705e-01
  -1.93238705e+00  6.42701127e-02  1.02904835e+00 -1.71223873e+00
   3.34829748e-01  2.80324742e-01  8.16576213e-01 -9.36333865e-01
   5.38722351e-01 -1.75958592e+00  5.11766672e-02 -9.57907677e-01
   3.71036641e-01  1.79993039e+00 -2.52766743e-01  7.11578652e-02
  -7.87073433e-01  5.76186068e-02 -4.98641893e-01  1.01414967e+00
  -1.42051507e-01  6.24081753e-02  7.36079812e-01  2.67582178e+00
   4.98459250e-01  2.05384171e+00 -2.47094572e+00 -1.50892311e+00
   1.01583779e+00  1.56290110e-01 -6.60838202e-01  2.33445507e+00
  -1.22750092e+00 -1.75381601e+00  2.53968406e+00 -1.45491791e+00
   1.95290673e+00  4.88287345e-01 -6.55684233e-01  1.59483683e+00
  -9

In [None]:
# Label Encoding - Name of Entity
from sklearn.preprocessing import LabelEncoder

# Column 1 of every annotation row is the entity type.
entityLabelEncoder = LabelEncoder()
nameOfEntity = [row[1] for row in devLabel]
entityLabelEncoder.fit(nameOfEntity)
encodedEntity = np.array(entityLabelEncoder.transform(nameOfEntity))
print(encodedEntity[:10])

[3 1 3 3 2 3 3 3 3 2]


In [None]:
# Train test split for SVM model data
from sklearn.model_selection import train_test_split

# Hold out 20% of the entity vectors; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    entity,
    encodedEntity,
    test_size=0.2,
    random_state=42,
)
for split in (X_train, y_train, X_test, y_test):
  print(split.shape)

(16348, 100)
(16348,)
(4087, 100)
(4087,)


In [None]:
# Train an SVM that predicts the entity type from an entity vector.
# Fitted on the training part of the split only, so the next cell can
# evaluate it on the held-out part.
from sklearn.svm import SVC

modelSVM = SVC(kernel='linear')
modelSVM.fit(X_train, y_train)

SVC(kernel='linear')

In [None]:
# Test the SVM model on the held-out split.
# The class ids in the report are the LabelEncoder codes; use
# entityLabelEncoder.inverse_transform to map them back to entity types.
from sklearn.metrics import classification_report

y_pred = modelSVM.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.73      0.61      0.66      1439
           2       0.92      0.30      0.45       150
           3       0.86      0.37      0.52      1102
           4       0.55      0.93      0.70      1396

    accuracy                           0.65      4087
   macro avg       0.76      0.55      0.58      4087
weighted avg       0.71      0.65      0.63      4087



In [None]:
# Train the SVM for entity-type prediction on the full data.
# This rebinds modelSVM: the model evaluated above is replaced by one fitted
# on every entity vector, so the report above does not describe this model.
# (SVC was already imported by the earlier training cell.)
from sklearn.svm import SVC

modelSVM = SVC(kernel='linear')
modelSVM.fit(entity, encodedEntity)

SVC(kernel='linear')

In [None]:
# Assign relationship label for vector of entity couples
import random as rd

# The relation label of a couple is drawn at random among the labels whose
# type pairs allow it, so the draw must be reproducible: use a dedicated,
# seeded generator instead of the shared module-level one.
labelRng = rd.Random(42)

entityCoupleVectors = []
nameOfRelationship = []
for i in range(len(entity)-1):
  # Only neighbouring annotations that carry the same sentence index are
  # paired.
  if devLabel[i][2] != devLabel[i+1][2]:
    continue
  couple = (devLabel[i][1], devLabel[i+1][1])
  pick = [key for key, couples in relationships.items() if couple in couples]
  # NOTE(review): labels are not read from the corpus; they are chosen among
  # the type-compatible relations, or 'OTHERS' when none matches.
  relationshipLabel = labelRng.choice(pick) if pick else 'OTHERS'
  entityCoupleVectors.append(np.add(entity[i], entity[i+1]))
  nameOfRelationship.append(relationshipLabel)
entityCoupleVectors = np.array(entityCoupleVectors)
print(entityCoupleVectors.shape)
print(entityCoupleVectors[:10])
print(len(nameOfRelationship))
print(nameOfRelationship[:10])

(20434, 100)
[[ 0.94950821  0.73737697  0.73929703 -0.46345811  0.31915152  0.17213948
   0.56335504  0.50489896 -0.68058293  1.42429229  0.95240314  3.22668512
   2.26162141  0.29889115  0.29407947  1.5314326   0.43443784  2.40885799
   0.68620959 -0.91103929 -2.08811148  0.06238643  1.18306544 -1.87453222
   0.28495832  0.29388108  0.94933904 -1.04867657  0.48347128 -2.04171575
   0.02523378 -1.11144141  0.40416962  1.98053521 -0.21100227  0.06839452
  -0.87667215 -0.03499948 -0.51680946  1.10931962 -0.18190408  0.05803499
   0.72564436  3.02260569  0.49220377  2.23024341 -2.72413924 -1.76369537
   1.184244    0.16275917 -0.75034976  2.57029033 -1.31511766 -1.9718173
   2.79228032 -1.55807433  2.18889912  0.56716915 -0.73611919  1.78086809
  -1.04590616 -0.78886664  1.77061833  2.35624012  0.65296415  0.5549279
  -0.52749618 -0.2734767  -0.06555002  0.98616583 -1.26203081 -2.49984795
  -1.64679908 -1.20358605 -0.67578344 -2.82341953 -0.85009358  2.13552004
   0.44210072  0.07665512  

In [None]:
# Label encoding - Name of Relationship
from sklearn.preprocessing import LabelEncoder

# Map every relation label to an integer class id.
relationshipLabelEncoder = LabelEncoder()
relationshipLabelEncoder.fit(nameOfRelationship)
encodedRelationship = np.array(relationshipLabelEncoder.transform(nameOfRelationship))
print(encodedRelationship.shape)
print(encodedRelationship[:10])

(20434,)
[2 7 7 4 4 3 0 5 4 4]


# Training

In [None]:
# RNN Model
from keras.models import Sequential
from keras.layers import Dense, Input

# Despite its name, modelRNN is a stack of fully connected (Dense) layers:
# one relu layer, four tanh layers and a softmax output.
# The output width follows the fitted label encoder so it always matches the
# class ids used as training targets, instead of a hard-coded 8.
numClasses = len(relationshipLabelEncoder.classes_)

modelRNN = Sequential()
modelRNN.add(Dense(128, input_shape=(entityCoupleVectors.shape[1],), activation='relu'))
for _ in range(4):
  modelRNN.add(Dense(512, activation='tanh'))
modelRNN.add(Dense(numClasses, activation='softmax'))
# Sparse loss: the targets are integer class ids, not one-hot rows.
modelRNN.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
modelRNN.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 128)               12928     
                                                                 
 dense_7 (Dense)             (None, 512)               66048     
                                                                 
 dense_8 (Dense)             (None, 512)               262656    
                                                                 
 dense_9 (Dense)             (None, 512)               262656    
                                                                 
 dense_10 (Dense)            (None, 512)               262656    
                                                                 
 dense_11 (Dense)            (None, 8)                 4104      
                                                                 
Total params: 871,048
Trainable params: 871,048
Non-tr

In [None]:
# Train RNN model
from keras.callbacks import ModelCheckpoint
import os

dirname = 'checkpoints'
batch_size = 1024
epochs = 500
# Portable and safe to re-run; no shell involved.
os.makedirs(dirname, exist_ok=True)

# Weights are written only when the monitored value improves. The file name
# carries the epoch and the accuracy so the best file can be found later.
# NOTE(review): validation_split is set below, yet the checkpoint monitors the
# training 'accuracy' -- confirm that 'val_accuracy' is not what was intended.
checkpoint = ModelCheckpoint(
    filepath=os.path.join(dirname, '{epoch:03d}-{accuracy:.2f}.hdf5'),
    save_weights_only=True,
    monitor='accuracy',
    mode='max',
    save_best_only=True)

modelRNN.fit(
    entityCoupleVectors,
    encodedRelationship,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    verbose=1,
    callbacks=[checkpoint],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7fae3e204190>

# Prediction

In [None]:
# Load best RNN model
# Checkpoint files are named '<epoch>-<accuracy>.hdf5'. Pick the highest
# accuracy and, among equal accuracies, the latest epoch. The whole accuracy
# field is parsed: reading only its first three characters would turn 0.95
# into 0.9. Every matching file in the directory competes, including any
# left over from earlier runs.
best = None
filename = ''
for f in os.listdir(dirname):
  if not f.endswith('.hdf5'):
    continue
  stem = f[:-len('.hdf5')]
  try:
    epochText, accuracyText = stem.split('-', 1)
    candidate = (float(accuracyText), int(epochText))
  except ValueError:
    # Not a file written by the checkpoint callback of this notebook.
    continue
  if best is None or candidate > best:
    best = candidate
    filename = f
if not filename:
  raise FileNotFoundError('No checkpoint found in {!r}'.format(dirname))
modelRNN.load_weights(os.path.join(dirname, filename))

In [None]:
# RNN model for relationship tagging
# sent = 'Bản thân tôi hễ nghe thấy tiếng xe cứu thương từ xa sẽ tìm cách tránh vào lề, nhường đường liền chứ không phải chờ nhắc nhở theo cách này.'
sent = 'Anh Thanh là cán bộ Uỷ ban nhân dân Thành phố Hà Nội'
ngrams = getTokenByNgrams(sent)
tokens = token2vec(modelW2V, ngrams)
# Each entry is [vector, array holding the predicted entity type].
entities = [[token, entityLabelEncoder.inverse_transform([modelSVM.predict([token])[0]])] for token in tokens]

# Two distinct candidates are drawn at random. Fail loudly when fewer than
# two exist; retrying random draws in a loop would never terminate then.
candidates = [k for k, pair in enumerate(entities) if pair[1][0] != 'OTHERS']
if len(candidates) < 2:
  raise ValueError('Need at least two candidate entities, found {}'.format(len(candidates)))
index = rd.sample(candidates, 2)

relationship = modelRNN.predict(np.array([np.add(entities[index[0]][0], entities[index[1]][0])]))
# classes_ is ordered by class id, so position i is the label of id i.
relaList = list(relationshipLabelEncoder.classes_)
relaIndex = int(np.argmax(relationship[0]))
print('sentence:\t {}'.format(sent))
# The entity types were already predicted while building `entities`.
print('entity 1:\t {}'.format(entities[index[0]][1][0]))
print('entity 2:\t {}'.format(entities[index[1]][1][0]))
print('relationship:\t {}'.format(relaList[relaIndex]))

sentence:	 Anh Thanh là cán bộ Uỷ ban nhân dân Thành phố Hà Nội
entity 1:	 LOCATION
entity 2:	 LOCATION
relationship:	 ORGANIZATION–AFFILICATION
