In [1]:
import xmltodict

# Import spaCy and load a language model
import spacy
nlp = spacy.load("en_core_web_sm")

data = 'ABSA14_Restaurants_Test_Gold.xml'

with open(data) as fd:
    xml_source = xmltodict.parse(fd.read())

In [2]:
xml_source

OrderedDict([('sentences',
              OrderedDict([('sentence',
                            [OrderedDict([('@id', '32897564#894393#2'),
                                          ('text',
                                           'The bread is top notch as well.'),
                                          ('aspectTerms',
                                           OrderedDict([('aspectTerm',
                                                         OrderedDict([('@term',
                                                                       'bread'),
                                                                      ('@polarity',
                                                                       'positive'),
                                                                      ('@from',
                                                                       '4'),
                                                                      ('@to',
                                      

In [3]:
# Extracting text, term, and polarity values
sentences = xml_source['sentences']['sentence']
extracted_info = []

for sentence in sentences:
    text = sentence['text']
    
    # Check if 'aspectTerms' exist in the sentence
    aspect_terms = sentence.get('aspectTerms')
    
    if aspect_terms:
        aspect_term = aspect_terms['aspectTerm']
        
        # If there are multiple aspectTerms in a sentence
        if isinstance(aspect_term, list):
            for term in aspect_term:
                term_value = term['@term']
                polarity = term['@polarity']
                extracted_info.append({'text': text, 'term': term_value, 'polarity': polarity})
        else:  # Only one aspectTerm in a sentence
            term_value = aspect_term['@term']
            polarity = aspect_term['@polarity']
            extracted_info.append({'text': text, 'term': term_value, 'polarity': polarity})

# Print or process extracted information
for item in extracted_info:
    print(item)

{'text': 'The bread is top notch as well.', 'term': 'bread', 'polarity': 'positive'}
{'text': 'I have to say they have one of the fastest delivery times in the city.', 'term': 'delivery times', 'polarity': 'positive'}
{'text': 'Food is always fresh and hot- ready to eat!', 'term': 'Food', 'polarity': 'positive'}
{'text': 'Did I mention that the coffee is OUTSTANDING?', 'term': 'coffee', 'polarity': 'positive'}
{'text': 'Certainly not the best sushi in New York, however, it is always fresh, and the place is very clean, sterile.', 'term': 'sushi', 'polarity': 'conflict'}
{'text': 'Certainly not the best sushi in New York, however, it is always fresh, and the place is very clean, sterile.', 'term': 'place', 'polarity': 'positive'}
{'text': 'I trust the people at Go Sushi, it never disappoints.', 'term': 'people', 'polarity': 'positive'}
{'text': 'Straight-forward, no surprises, very decent Japanese food.', 'term': 'Japanese food', 'polarity': 'positive'}
{'text': 'BEST spicy tuna roll, gr

In [4]:
# -*- coding: utf-8 -*-


import os
import pickle
import torch
import torch.nn.functional as F
import argparse

from data_utils import ABSADatesetReader, ABSADataset, Tokenizer, build_embedding_matrix
from bucket_iterator import BucketIterator
from models import LSTM, PWCN_POS, PWCN_DEP
from dependency_dist import dependency_dist_func

class Inferer:
    """A simple inference example"""
    def __init__(self, opt):
        self.opt = opt
        print("loading {0} tokenizer...".format(opt.dataset))
        with open(opt.dataset+'_word2idx.pkl', 'rb') as f:
            word2idx = pickle.load(f)
            self.tokenizer = Tokenizer(word2idx=word2idx)
        embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx, opt.embed_dim, opt.dataset)
        self.model = opt.model_class(embedding_matrix, opt)
        print('loading model {0} ...'.format(opt.model_name))
        self.model.load_state_dict(torch.load(opt.state_dict_path, map_location=lambda storage, loc: storage))
        # switch model to evaluation mode
        self.model.eval()
        torch.autograd.set_grad_enabled(False)

    def evaluate(self, raw_text, aspect):
        text_seqs = [self.tokenizer.text_to_sequence(raw_text.lower().strip())]
        aspect_seqs = [self.tokenizer.text_to_sequence(aspect.lower())]
        left_seqs = [self.tokenizer.text_to_sequence(raw_text.lower().split(aspect.lower())[0])]
        text_indices = torch.tensor(text_seqs, dtype=torch.int64)
        aspect_indices = torch.tensor(aspect_seqs, dtype=torch.int64)
        left_indices = torch.tensor(left_seqs, dtype=torch.int64)
        dependency_dist = torch.tensor([dependency_dist_func(raw_text, aspect)], dtype=torch.int64)
        data = {
            'text_indices':text_indices, 
            'aspect_indices':aspect_indices, 
            'left_indices':left_indices,
            'dependency_dist':dependency_dist,
        }
        t_inputs = [data[col] for col in self.opt.inputs_cols]
        t_outputs = self.model(t_inputs)

        t_probs = F.softmax(t_outputs, dim=-1).cpu().numpy()
        return t_probs

if __name__ == '__main__':
    model_classes = {
        'lstm': LSTM,
        'pwcn_pos': PWCN_POS,
        'pwcn_dep': PWCN_DEP,
    }
    dataset = 'restaurant'
    # set your trained models here
    model_state_dict_paths = {
        'lstm': 'state_dict/lstm_'+dataset+'.pkl',
        'pwcn_pos': 'state_dict/pwcn_pos_'+dataset+'.pkl',
        'pwcn_dep': 'state_dict/pwcn_dep_'+dataset+'.pkl',
    }
    input_colses = {
        'lstm': ['text_indices'],
        'pwcn_pos': ['text_indices', 'aspect_indices', 'left_indices'], 
        'pwcn_dep': ['text_indices', 'aspect_indices', 'left_indices', 'dependency_dist'],
    }
    class Option(object): pass
    opt = Option()
    opt.model_name = 'pwcn_dep'
    opt.model_class = model_classes[opt.model_name]
    opt.inputs_cols = input_colses[opt.model_name]
    opt.dataset = dataset
    opt.state_dict_path = model_state_dict_paths[opt.model_name]
    opt.embed_dim = 300
    opt.hidden_dim = 300
    opt.polarities_dim = 3
    opt.device = torch.device('cpu')

    inf = Inferer(opt)

    wrong_count = 0
    wrong_items = []
    for item in extracted_info:    
        

        sentence = item['text'].lower()
        aspect = item['term'].lower()
        polarity = item['polarity']
        t_probs = inf.evaluate(sentence, aspect)
        result_polarity = t_probs.argmax(axis=-1) 

        if result_polarity[0] == 2:
            sentiment_result = "positive"
        elif result_polarity[0] == 1:
            sentiment_result = "neutral"
        else:
            sentiment_result = "negative"

        # print(sentiment_result, polarity)

        if sentiment_result != polarity:
            print('WRONG!!', 'expected: ', polarity, 'result: ', sentiment_result)
            print(sentence, aspect, result_polarity[0])
            wrong_count = wrong_count+1
            wrong_items.append({'text': sentence, 'aspect': aspect, 'expected': polarity, 'result': sentiment_result})

            
    print(wrong_count, ' out of ', len(extracted_info))   

        # print(t_probs.argmax(axis=-1))

    





loading restaurant tokenizer...
loading embedding_matrix: 300_restaurant_embedding_matrix.pkl
loading model pwcn_dep ...
WRONG!! expected:  positive result:  negative
i have to say they have one of the fastest delivery times in the city. delivery times 0
WRONG!! expected:  conflict result:  negative
certainly not the best sushi in new york, however, it is always fresh, and the place is very clean, sterile. sushi 0
WRONG!! expected:  neutral result:  positive
try the rose roll (not on menu). menu 2
WRONG!! expected:  negative result:  neutral
in fact, this was not a nicoise salad and was barely eatable. nicoise salad 1
WRONG!! expected:  negative result:  neutral
the sangria's - watered down. sangria 1
WRONG!! expected:  negative result:  positive
menu - uneventful, small. menu 2
WRONG!! expected:  negative result:  positive
great food but the service was dreadful! service 2
WRONG!! expected:  positive result:  neutral
desserts are almost incredible: my personal favorite is their tart o

In [5]:
import pandas as pd

# Create a DataFrame
df = pd.DataFrame(wrong_items)

# Adjusting display options to show all rows and columns
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)


# Displaying the DataFrame
print(df)

                                                  text          aspect  \
0    i have to say they have one of the fastest del...  delivery times   
1    certainly not the best sushi in new york, howe...           sushi   
2                     try the rose roll (not on menu).            menu   
3    in fact, this was not a nicoise salad and was ...   nicoise salad   
4                        the sangria's - watered down.         sangria   
..                                                 ...             ...   
279  its good to go there for drinks if you don't w...          drinks   
280                        anyway, the owner was fake.           owner   
281  i have never in my life sent back food before,...            food   
282  i have never in my life sent back food before,...          waiter   
283  creamy appetizers--taramasalata, eggplant sala...      warm pitas   

     expected    result  
0    positive  negative  
1    conflict  negative  
2     neutral  positive  
3    ne

In [6]:
df

Unnamed: 0,text,aspect,expected,result
0,i have to say they have one of the fastest del...,delivery times,positive,negative
1,"certainly not the best sushi in new york, howe...",sushi,conflict,negative
2,try the rose roll (not on menu).,menu,neutral,positive
3,"in fact, this was not a nicoise salad and was ...",nicoise salad,negative,neutral
4,the sangria's - watered down.,sangria,negative,neutral
...,...,...,...,...
279,its good to go there for drinks if you don't w...,drinks,neutral,positive
280,"anyway, the owner was fake.",owner,negative,positive
281,"i have never in my life sent back food before,...",food,negative,neutral
282,"i have never in my life sent back food before,...",waiter,negative,positive


In [7]:
import spacy
import networkx as nx
import numpy as np
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

def dependency_dist_func(text, aspect_term):
    document = nlp(text)
    edges = []
    potential_candidate_term = []
    patterns = [
        [{'DEP': "amod"}],
        [{'DEP': "advmod"}],
        [{'DEP': "attr"}],
        [{'DEP': "ccomp"}],
        [{'DEP': "acomp"}],
    ]

    matcher = Matcher(nlp.vocab)
    matcher.add("potentialOpinion", patterns)
    matches = matcher(document)
    for match_id, start, end, in matches:
        string_id = nlp.vocab.strings[match_id]
        span = document[start:end]
        potential_candidate_term.append(span)
        # print(string_id, span.text)

    for token in document:
        for child in token.children:
            edges.append((token.i, child.i))
    graph = nx.Graph(edges)

    text_lst = text.split()
    seq_len = len(text_lst)
    text_left, _, _ = text.partition(aspect_term)
    start = len(text_left.split())
    end = start + len(aspect_term.split())
    asp_idx = [i for i in range(start, end)]
    dist_matrix = seq_len*np.ones((seq_len, len(asp_idx))).astype('float32')
    for i, asp in enumerate(asp_idx):
        for j in range(seq_len):
            try:
                dist_matrix[j][i] = nx.shortest_path_length(graph, source=asp, target=j)
            except:
                dist_matrix[j][i] = seq_len/2
    dist_matrix = np.min(dist_matrix, axis=1)
    return dist_matrix, potential_candidate_term

def find_closest_words(sentence, aspect):
    print("context: ", sentence)
    print("aspect: ", aspect)
    result, potential_opinion_term_from_dependency_dist = dependency_dist_func(sentence, aspect)
    print("Distances:", result)

    try:

        closest_word_index = np.argsort(result)[1]  # Excluding the aspect term itself
        closest_word = sentence.split()[closest_word_index]
        closest_second_word = sentence.split()[closest_word_index+1]
        closest_third_word = sentence.split()[closest_word_index+2]
        closest_fourth_word = sentence.split()[closest_word_index+3]
        print("The closest word to '",aspect,"' is:", closest_word)
        print("The closest second word to '",aspect,"' is:", closest_second_word)
        print("The closest third word to '",aspect,"' is:", closest_third_word)
        print("The closest fourth word to '",aspect,"' is:", closest_fourth_word)
        

    except:
        pass
    
    print("While the potential opinion words are: ", potential_opinion_term_from_dependency_dist)
    print()

In [8]:
for item in wrong_items:
    print("expected: ", item['expected'])
    print("result: ", item['result'])
    find_closest_words(item['text'], item['aspect'])

expected:  positive
result:  negative
context:  i have to say they have one of the fastest delivery times in the city.
aspect:  delivery times
Distances: [6. 5. 5. 4. 4. 3. 2. 1. 1. 1. 0. 0. 1. 3. 2.]
The closest word to ' delivery times ' is: times
The closest second word to ' delivery times ' is: in
The closest third word to ' delivery times ' is: the
The closest fourth word to ' delivery times ' is: city.
While the potential opinion words are:  [have, fastest]

expected:  conflict
result:  negative
context:  certainly not the best sushi in new york, however, it is always fresh, and the place is very clean, sterile.
aspect:  sushi
Distances: [1. 1. 1. 1. 0. 1. 3. 2. 2. 2. 2. 2. 1. 2. 2. 2. 2. 4. 3. 2.]
The closest word to ' sushi ' is: certainly
The closest second word to ' sushi ' is: not
The closest third word to ' sushi ' is: the
The closest fourth word to ' sushi ' is: best
While the potential opinion words are:  [certainly, best, however, always, fresh, very, clean]

expected:  