In [1]:
!git clone https://www.github.com/keras-team/keras-contrib.git

%cd keras-contrib

!python setup.py install

Cloning into 'keras-contrib'...
remote: Enumerating objects: 3634, done.[K
remote: Total 3634 (delta 0), reused 0 (delta 0), pack-reused 3634[K
Receiving objects: 100% (3634/3634), 861.24 KiB | 13.89 MiB/s, done.
Resolving deltas: 100% (2330/2330), done.
/content/keras-contrib
running install
running bdist_egg
running egg_info
creating keras_contrib.egg-info
writing keras_contrib.egg-info/PKG-INFO
writing dependency_links to keras_contrib.egg-info/dependency_links.txt
writing requirements to keras_contrib.egg-info/requires.txt
writing top-level names to keras_contrib.egg-info/top_level.txt
writing manifest file 'keras_contrib.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'keras_contrib.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build
creating build/lib
creating build/lib/keras_contrib
copying keras_contrib/__init__.py -> build/lib/keras_contrib
creating build/lib/keras_contrib

In [2]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from pathlib import Path

In [3]:
from utils import extract_data

# SemEval 2016 
TRAIN = extract_data("/content/ABSA16_Restaurants_Train_SB1_v2.xml")
TEST = extract_data("/content/EN_REST_SB1_TEST.xml.gold")

# SemEval 2015
TRAIN.update(extract_data("/content/ABSA-15_Restaurants_Train_Final.xml"))
TEST.update(extract_data("/content/ABSA15_Restaurants_Test.xml"))

# SemEval 2014 
TRAIN.update(extract_data("/content/Restaurants_Train.xml"))
TEST.update(extract_data("/content/ABSA15_Restaurants_Test.xml"))

ModuleNotFoundError: ignored

In [3]:
def tagging_IOB(s, aspects):
    tags = ['O'] * len(s)
    for aspect in aspects:
        pre_index = 0
        for word in s: 
            if word in aspect: # 'good' in 'a good place'
                cur_index = s.index(word) 
                if cur_index - pre_index == 1: # inside an aspect term
                    tags[cur_index] = 'I'
                else:                       # beginning of an aspect term
                    tags[cur_index] = 'B'
                pre_index = cur_index 
    return tags

In [4]:
def dict2df(train):
    data = pd.DataFrame()
    idx = 0
    for review, opinions in train.items():
        sentence = {}
        sentence['Sentence #'] = idx
        sentence['Word'] = review.split(" ")  # split text to words
        s_length = len(sentence['Word']) # the length of sentence, used to generate tag
        if len(opinions)==0 or opinions[0][0] == 'NULL': # tagging: if no aspect term
            sentence['Tag'] = ['O'] * s_length
        else:                                               # IOB format tag if aspect exist
            aspect_terms = [x.lower() for x,_ in opinions]  
            sentence['Tag'] = tagging_IOB(sentence['Word'], aspect_terms)

        # convert each setence to dataframe 
        sentence_df = pd.DataFrame.from_dict(sentence)
        data = data.append(sentence_df, ignore_index=True)
        idx+=1
    
    return data

In [5]:
data_train = dict2df(TRAIN)
data_test = dict2df(TEST)
data_train.head(30)

Unnamed: 0,Sentence #,Word,Tag
0,0,Judging,O
1,0,from,O
2,0,previous,O
3,0,posts,O
4,0,this,O
5,0,used,O
6,0,to,O
7,0,be,O
8,0,a,B
9,0,good,O


In [6]:
data = data_train

# Save all words as a list
words = list(set(data['Word'].values))
n_words = len(words)

tags = list(set(data["Tag"].values))
n_tags = len(tags)

max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
n_words = len(word2idx)
# word2idx['<unk>'] = len(word2idx) + 1
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx['<pad>'] = 0
n_tags = len(tag2idx) # Due to <pad>, here total tag number is from 17 to 18


# Sentence class
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
#                                                            s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

getter = SentenceGetter(data)
sentences = getter.sentences # get all sentences

max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
n_words = len(word2idx)
# word2idx['<unk>'] = len(word2idx) + 1
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx['<pad>'] = 0
n_tags = len(tag2idx) # Due to <pad>, here total tag number is from 17 to 18

# Word2inx & Padding for X
X = [[word2idx[w[0]] for w in s] for s in sentences]
X_train = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)

# Word2inx & Padding for y
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=0)

# Get one-hot labels
y_train = [to_categorical(i, num_classes=n_tags) for i in y]

In [7]:
#==============Bi-LSTM CRF=============
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="tanh"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

AttributeError: ignored

In [16]:
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=8,
                    validation_split=0.1, verbose=1)

AttributeError: ignored