In [2]:
!pip install anago

Collecting anago
  Downloading https://files.pythonhosted.org/packages/0e/09/a62ba9564e488376966f771105522c4d7783ec141964c0b955230b1f5f63/anago-1.0.8-py3-none-any.whl
Collecting seqeval>=0.0.3
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 4.4MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp37-none-any.whl size=16184 sha256=f0aaf0a825e3bf0a09b677125358de4b1693b1c7354035b4ce3ca5320b3a73f1
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval, anago
Successfully installed anago-1.0.8 seqeval-1.2.2


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import anago
from anago.utils import load_data_and_labels, load_glove

ImportError: ignored

In [14]:
from utils import extract_data

# SemEval 2016 
TRAIN = extract_data("/content/ABSA16_Restaurants_Train_SB1_v2.xml")
TEST = extract_data("/content/EN_REST_SB1_TEST.xml.gold")

# SemEval 2015
TRAIN.update(extract_data("/content/ABSA-15_Restaurants_Train_Final.xml"))
TEST.update(extract_data("/content/ABSA15_Restaurants_Test.xml"))

# SemEval 2014 
TRAIN.update(extract_data("/content/Restaurants_Train.xml"))
TEST.update(extract_data("/content/ABSA15_Restaurants_Test.xml"))

In [15]:
def tagging_IOB(s, aspects):
    tags = ['O'] * len(s)
    for aspect in aspects:
        pre_index = 0
        for word in s: 
            if word in aspect: # 'good' in 'a good place'
                cur_index = s.index(word) 
                if cur_index - pre_index == 1: # inside an aspect term
                    tags[cur_index] = 'I'
                else:                       # beginning of an aspect term
                    tags[cur_index] = 'B'
                pre_index = cur_index 
    return tags

In [16]:
def dict2df(train):
    data = pd.DataFrame()
    idx = 0
    for review, opinions in train.items():
        sentence = {}
        sentence['Sentence #'] = idx
        sentence['Word'] = review.split(" ")  # split text to words
        s_length = len(sentence['Word']) # the length of sentence, used to generate tag
        if len(opinions)==0 or opinions[0][0] == 'NULL': # tagging: if no aspect term
            sentence['Tag'] = ['O'] * s_length
        else:                                               # IOB format tag if aspect exist
            aspect_terms = [x.lower() for x,_ in opinions]  
            sentence['Tag'] = tagging_IOB(sentence['Word'], aspect_terms)

        # convert each setence to dataframe 
        sentence_df = pd.DataFrame.from_dict(sentence)
        data = data.append(sentence_df, ignore_index=True)
        idx+=1
    
    return data

In [17]:
data_train = dict2df(TRAIN)
data_test = dict2df(TEST)
data_train.head(30)

Unnamed: 0,Sentence #,Word,Tag
0,0,Judging,O
1,0,from,O
2,0,previous,O
3,0,posts,O
4,0,this,O
5,0,used,O
6,0,to,O
7,0,be,O
8,0,a,B
9,0,good,O


In [18]:
def df2data(df):
    """Read data and labels from dataframe
    Input:
        df: three columns, ['Sentence #', 'Tag', 'Word']
    Output:
        data: datasize * ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
        label: datasize * ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
    """
    agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                 s["Tag"].values.tolist())]
    grouped = df.groupby("Sentence #").apply(agg_func)
    data = [[w[0] for w in s] for s in grouped]
    label = [[w[1] for w in s] for s in grouped]  
    
    return data, label

In [19]:
x_train, y_train = df2data(data_train)
x_test, y_test = df2data(data_test)

In [20]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

In [23]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2021-06-24 10:11:10--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2021-06-24 10:11:11--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2021-06-24 10:11:11--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [24]:
!unzip glove*.zip

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [25]:
embeddings = load_glove("glove.840B.300d.txt")

In [41]:
%tensorflow_version

Currently selected TF version: 2.x
Available versions:
* 1.x
* 2.x


In [1]:
model = anago.Sequence(embeddings=embeddings, word_embedding_dim=300)
model.fit(x_train, y_train, x_test, y_test, epochs=50)

NameError: ignored