In [8]:
import os
import sys
import time
import pickle
import numpy as np
import random
import json
import re
from nltk.stem.porter import *

In [9]:
def match_relation(s):
    """Map a relation label string to its integer class id.

    There are 19 known labels: nine directed relations, each in the
    (e1,e2) and (e2,e1) direction, followed by "Other". The class id is
    the position of the label in the ordered tuple below.

    Args:
        s: the label to look up; it must match a known label exactly.

    Returns:
        An int in 0..18 for a known label, or -1 for anything else.
    """
    # Order matters: the index of each label is its class id.
    known_labels = (
        "Cause-Effect(e1,e2)",
        "Cause-Effect(e2,e1)",
        "Instrument-Agency(e1,e2)",
        "Instrument-Agency(e2,e1)",
        "Product-Producer(e1,e2)",
        "Product-Producer(e2,e1)",
        "Content-Container(e1,e2)",
        "Content-Container(e2,e1)",
        "Entity-Origin(e1,e2)",
        "Entity-Origin(e2,e1)",
        "Entity-Destination(e1,e2)",
        "Entity-Destination(e2,e1)",
        "Component-Whole(e1,e2)",
        "Component-Whole(e2,e1)",
        "Member-Collection(e1,e2)",
        "Member-Collection(e2,e1)",
        "Message-Topic(e1,e2)",
        "Message-Topic(e2,e1)",
        "Other",
    )

    try:
        return known_labels.index(s)
    except ValueError:
        # Unknown label: same sentinel the caller has always received.
        return -1

In [10]:
def clean_text(line):
    """Extract the two tagged entities and the text spanning them.

    The line is lower-cased first. The returned content is the text from
    the opening <e1> tag through the closing </e2> tag, with the four
    entity tags removed and every run of digits replaced by " <num> "
    (note the surrounding spaces).

    Args:
        line: one raw sentence line containing <e1>...</e1> and
            <e2>...</e2> markup.

    Returns:
        A tuple (e1, e2, content) of strings. If the line does not contain
        both entity spans (for example a blank line), ("", "", "") is
        returned so that callers can skip it by testing for empty content
        instead of hitting an IndexError.
    """
    line = line.lower()

    e1_match = re.search(r"<e1.*/e1>", line)
    e2_match = re.search(r"<e2.*/e2>", line)
    span_match = re.search(r"<e1.*/e2>", line)
    if e1_match is None or e2_match is None or span_match is None:
        return "", "", ""

    # Strip the tags before masking digits so the "1"/"2" inside the tag
    # names are never turned into <num>.
    content = re.sub(r"<e1>|</e1>|<e2>|</e2>", "", span_match.group(0))
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    content = re.sub(r"\d+", " <num> ", content)

    e1 = re.sub(r"^<e1>|</e1>$", "", e1_match.group(0))
    e2 = re.sub(r"^<e2>|</e2>$", "", e2_match.group(0))

    return e1, e2, content

In [18]:

TRAIN_PATH = "./data/TRAIN_FILE.txt"

# Parallel lists: position i in each one describes the i-th kept sample.
train_entities = []  # (e1, e2) tuples returned by clean_text
train_contents = []  # cleaned content strings returned by clean_text
comments = []        # stripped third line of each record
ys = []              # integer class ids returned by match_relation

_class = {}          # class id -> number of samples seen with that id


with open(TRAIN_PATH, 'r') as f:
    n = 0  # never read in this cell; kept so the notebook namespace is unchanged
    # The file is consumed in records of four consecutive lines:
    # sentence, relation label, a third line stored in `comments`, and a
    # fourth line that is skipped.
    for n_line, line in enumerate(f):
        slot = n_line % 4

        if slot == 0:
            # Sentence line. `empty_content` stays in effect for the rest of
            # the record so its label and comment are skipped together.
            e1, e2, content = clean_text(line)
            empty_content = (content == '')

            if not empty_content:
                train_entities.append((e1, e2))
                train_contents.append(content)

        elif slot == 1:
            # Relation label line.
            if not empty_content:
                line = line.strip()
                relation_class = match_relation(line)
                ys.append(relation_class)

                _class[relation_class] = _class.get(relation_class, 0) + 1

        elif slot == 2:
            # Third line of the record, stored verbatim after stripping.
            if not empty_content:
                comment = line.strip()
                comments.append(comment)

        # slot == 3: fourth line of the record, nothing to do.
        # NOTE(review): the original note here said "ignore Comment", but the
        # line stored in `comments` is the third one; presumably this fourth
        # line is a blank separator (confirm against the data file).

In [27]:
TEST_PATH = "./data/TEST_FILE.txt"

test_entities = []  # (e1, e2) tuples returned by clean_text
test_contents = []  # cleaned content strings returned by clean_text


with open(TEST_PATH, 'r') as f:
    # Every line of the test file is treated as one sentence.
    for line in f:
        e1, e2, content = clean_text(line)

        # Decide per line whether to keep it. Previously this cell read an
        # `empty_content` flag left behind by the training cell, which fails
        # with NameError on a fresh kernel and otherwise reflects the last
        # training record rather than the current test line.
        empty_content = (content == '')

        if not empty_content:
            test_entities.append((e1, e2))
            test_contents.append(content)

In [32]:
# Tokenise every cleaned sentence on single spaces, training sentences first
# and test sentences after them, so row order is train followed by test.
# NOTE(review): split(' ') keeps an empty-string token wherever two spaces are
# adjacent, which clean_text produces around " <num> " -- confirm that
# downstream code expects those empty tokens before switching to split().
sentences = []
for cleaned in train_contents + test_contents:
    sentences.append(cleaned.split(' '))

## Build ELMo embeddings

In [33]:
# Compute ELMo representations for every tokenised sentence in `sentences`
# (training sentences first, then test sentences) in a single forward call.
import sys
# Make the local ./allennlp directory importable; this must run before the
# allennlp imports below.
sys.path.append('./allennlp')
from allennlp.modules.elmo import Elmo, batch_to_ids
from allennlp.common.file_utils import cached_path  # not referenced in this cell
import spacy
import h5py  # not referenced in this cell
# The loaded spaCy pipeline is discarded.
# NOTE(review): presumably this only checks that the "en" model is installed -- confirm.
spacy.load("en")


# Local paths to the pretrained ELMo options (JSON) and weights (HDF5).
options_file = "./pretrained_word2vec/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
weight_file = './pretrained_word2vec/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

# Third positional argument is 2 and dropout is disabled.
# NOTE(review): in AllenNLP's Elmo that argument is num_output_representations;
# only index 0 is read in this notebook -- confirm against the installed version.
elmo = Elmo(options_file, weight_file, 2, dropout=0)

# use batch_to_ids to convert sentences to character ids
# sentences = [['cat', 'dogdsfeefws', '.']]
# sentences = [['cat', 'dogdsfeefws', '.'] for i in range(8000)]

character_ids = batch_to_ids(sentences)

# All sentences go through the model in one call; nothing here batches the
# input, so memory use grows with the size of the whole corpus.
embeddings = elmo(character_ids)
# Prints the full character-id tensor, then the shape of the first
# representation converted to a NumPy array.
print (character_ids)
print (embeddings["elmo_representations"][0].data.numpy().shape)
# print ('-'*30)
# print (embeddings["elmo_representations"][1])
# print (embeddings["elmo_representations"][0].shape)
# print (embeddings["elmo_representations"][1].shape)


tensor([[[ 259,  100,  112,  ...,  261,  261,  261],
         [ 259,  112,  103,  ...,  261,  261,  261],
         [ 259,   98,  111,  ...,  261,  261,  261],
         ...,
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]],

        [[ 259,  100,  105,  ...,  261,  261,  261],
         [ 259,  120,   98,  ...,  261,  261,  261],
         [ 259,  100,   98,  ...,  261,  261,  261],
         ...,
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]],

        [[ 259,   98,  118,  ...,  261,  261,  261],
         [ 259,  112,  103,  ...,  261,  261,  261],
         [ 259,   98,  260,  ...,  261,  261,  261],
         ...,
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]],

In [36]:
# `sentences` was built as train_contents + test_contents, so the rows after
# the first len(train_contents) belong to the test set. Deriving the boundary
# from the data keeps the split correct if a training sentence is ever skipped,
# instead of relying on a hard-coded 8000.
n_train = len(train_contents)
np.save('./data/elmo_x_test.npy', embeddings["elmo_representations"][0].data.numpy()[n_train:])
print ("create file: ./data/elmo_x_test.npy")

create file: ./data/elmo_x_test.npy


In [37]:
# `sentences` was built as train_contents + test_contents, so the first
# len(train_contents) rows are the training set. Deriving the boundary from
# the data avoids relying on a hard-coded 8000.
n_train = len(train_contents)
np.save('./data/elmo_x_train.npy', embeddings["elmo_representations"][0].data.numpy()[:n_train])
print ("create file: ./data/elmo_x_train.npy")

create file: ./data/elmo_x_train.npy


In [35]:
# Shape of the test portion: every row after the len(train_contents) training
# rows (the boundary is taken from the data, not a hard-coded 8000).
embeddings["elmo_representations"][0].data.numpy()[len(train_contents):].shape

(2717, 32, 1024)