In [1]:
import re
import pandas as pd
from collections import defaultdict

import stanfordnlp
from textblob import TextBlob
from allennlp.predictors.predictor import Predictor

import spacy
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Infix patterns handed to the spaCy tokenizer: spaCy's own ellipsis and icon
# lists followed by four hand-written rules. None of the hand-written rules
# splits on a hyphen between letters (presumably the reason the pipeline that
# uses them is named `nlp_without_hyphen`).
infixes = [
    *LIST_ELLIPSES,
    *LIST_ICONS,
    # One of + - * ^ directly after a digit and before a digit or "-".
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # A period between a lowercase letter/quote and an uppercase letter/quote.
    r"(?<=[{lower}{quotes}])\.(?=[{upper}{quotes}])".format(
        lower=ALPHA_LOWER, upper=ALPHA_UPPER, quotes=CONCAT_QUOTES
    ),
    # A comma squeezed between two letters.
    r"(?<=[{alpha}]),(?=[{alpha}])".format(alpha=ALPHA),
    # One of : < > = / after a letter or digit and before a letter.
    r"(?<=[{alpha}0-9])[:<>=/](?=[{alpha}])".format(alpha=ALPHA),
]

# Compiled once here; SelfLearnABSA.__init__ installs `infix_re.finditer` on
# its tokenizer.
infix_re = compile_infix_regex(infixes)

In [12]:
class SelfLearnABSA:
    """Rule-based aspect/opinion miner: coreference resolution, then dependency rules.

    Pipeline:

    1. ``resolve_coreference`` takes the coreference-resolved text from the
       AllenNLP predictor and joins the words of each cluster's first mention
       with hyphens (e.g. ``the screen resolution`` -> ``the-screen-resolution``),
       presumably so that the mention survives as one token in the spaCy
       tokenizer whose infix rules are replaced in ``__init__``.
    2. ``dependency_parser`` walks the spaCy parse and emits
       ``{'target', 'opinion', 'polarity'}`` dicts from ``dobj`` and ``amod`` arcs.

    The class relies on the module-level ``infix_re`` compiled in an earlier cell.
    """

    DEFAULT_SPACY_MODEL = "en_core_web_sm"
    DEFAULT_COREF_URL = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
    # Only relevant to the AllenNLP dependency parser, which is currently disabled.
    DEFAULT_DEP_PARSER_URL = "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
    # Every run of characters outside this set becomes one space when a mention
    # is normalised.
    _NON_WORD_PATTERN = '[^A-Za-z0-9-]+'

    def __init__(self, spacy_model='', coref_res_url='', dep_parser_url='', stanfordnlp_lang='en', coref_remove_word=None):
        """Load the spaCy pipeline and the AllenNLP coreference predictor.

        Args:
            spacy_model: spaCy model name; empty/None selects ``DEFAULT_SPACY_MODEL``.
            coref_res_url: path or URL of the coreference model archive;
                empty/None selects ``DEFAULT_COREF_URL``.
            dep_parser_url: accepted for backward compatibility; unused while the
                AllenNLP dependency parser is disabled.
            stanfordnlp_lang: accepted for backward compatibility; unused while
                the StanfordNLP pipeline is disabled.
            coref_remove_word: words to strip from coreference mentions before
                they are hyphenated (used by ``fit`` and ``get_parsed_output``).
        """
        print('Loading models and setting up environment ...')
        spacy_model = spacy_model or self.DEFAULT_SPACY_MODEL
        coref_res_url = coref_res_url or self.DEFAULT_COREF_URL

        # Copy so later changes to the caller's list do not leak into the instance.
        self.coref_remove_word = list(coref_remove_word) if coref_remove_word else []

        self.nlp_without_hyphen = spacy.load(spacy_model)
        # Swap in the custom infix rules compiled at module level.
        self.nlp_without_hyphen.tokenizer.infix_finditer = infix_re.finditer
        self.coreference_resolver = Predictor.from_path(coref_res_url)
        # Disabled alternatives (not loaded): an AllenNLP dependency parser via
        # Predictor.from_path(dep_parser_url) and a stanfordnlp.Pipeline for
        # stanfordnlp_lang.
        print('Models loaded and environment setup completed!!')

    @staticmethod
    def _polarity(word):
        """Return TextBlob's sentiment polarity for ``word``."""
        return TextBlob(word).sentiment.polarity

    def _effective_remove_words(self, coref_remove_word):
        """Pick the removal list for one call without touching instance state.

        The list given to the constructor wins; the per-call list is used only
        when the instance has none.
        """
        return self.coref_remove_word or coref_remove_word or []

    def resolve_coreference(self, text, remove_words=None):
        """Return the coreference-resolved text with first mentions hyphenated.

        Args:
            text: raw input string.
            remove_words: words to drop from a mention before hyphenating it
                (first occurrence only, exact match). ``None`` means no removal.

        Returns:
            The predictor's ``coref_resolved`` text in which every occurrence of
            a cluster's (normalised) first mention is replaced by its
            hyphen-joined form.
        """
        remove_words = remove_words or []
        coref_res_pred = self.coreference_resolver.predict(text)
        coref_resolved_text = self.coreference_resolver.coref_resolved(text)

        # Cluster spans index the predictor's own token list, not whitespace
        # chunks of the raw text; fall back to whitespace splitting only when
        # the prediction carries no token list.
        tokens = coref_res_pred.get('document') or text.split()

        for cluster in coref_res_pred['clusters']:
            if not cluster:
                continue
            first_mention = cluster[0]
            start, end = first_mention[0], first_mention[1]
            coref_list = list(tokens[start:end + 1])  # span end is inclusive
            for bad_word in remove_words:
                if bad_word in coref_list:
                    coref_list.remove(bad_word)

            coref_hyphenated_text = re.sub(self._NON_WORD_PATTERN, ' ', '-'.join(coref_list)).strip()
            coref_spaced_text = re.sub(self._NON_WORD_PATTERN, ' ', ' '.join(coref_list)).strip()

            # str.replace('', x) would insert x between every character, so a
            # mention that normalises to nothing must be skipped.
            if not coref_spaced_text:
                continue
            coref_resolved_text = coref_resolved_text.replace(coref_spaced_text, coref_hyphenated_text)
        return coref_resolved_text

    def dependency_parser(self, resolved_text):
        """Extract aspect target / opinion pairs from the spaCy dependency parse.

        Rules:
            * ``dobj`` whose head is a VERB: the object is the target, the verb
              is the opinion, polarity is scored on the verb.
            * ``amod`` whose dependent is an ADJ: the modified head is the
              target, the adjective is the opinion, polarity is scored on the
              adjective.

        Returns:
            A list of ``{'target': str, 'opinion': str, 'polarity': float}`` dicts
            in token order.
        """
        coref_res_doc = self.nlp_without_hyphen(resolved_text)
        aspect_target_opinion = []
        for token in coref_res_doc:
            dependency, head = token.dep_, token.head

            # Rule 1 (nsubj) is not implemented yet.

            # Rule 2 - direct object of a verb.
            if dependency == 'dobj' and head.pos_ == 'VERB':
                aspect_target_opinion.append({
                    'target': token.text,
                    'opinion': head.text,
                    'polarity': self._polarity(head.text),
                })

            # Rule 3 - adjectival modifier: the adjective (dependent) is the
            # opinion about the word it modifies (head).
            elif dependency == 'amod' and token.pos_ == 'ADJ':
                aspect_target_opinion.append({
                    'target': head.text,
                    'opinion': token.text,
                    'polarity': self._polarity(token.text),
                })

        return aspect_target_opinion

    def get_parsed_output(self, input_, coref_remove_word=None):
        """Return the raw dependency tuples for a coreference-resolved string.

        Args:
            input_: the text to parse; must be a ``str``.
            coref_remove_word: removal list used only when the instance was
                built without one. It is not stored on the instance.

        Returns:
            One ``(dep_, head, head.pos_, token, pos_)`` tuple per token.

        Raises:
            TypeError: if ``input_`` is not a string.
        """
        if not isinstance(input_, str):
            raise TypeError("parsed_outputs is only available for strings.")

        remove_words = self._effective_remove_words(coref_remove_word)
        resolved_text = self.resolve_coreference(input_, remove_words)
        return [
            (token.dep_, token.head, token.head.pos_, token, token.pos_)
            for token in self.nlp_without_hyphen(resolved_text)
        ]

    def fit(self, input_, coref_remove_word=None):
        """Run coreference resolution and rule-based extraction on one string.

        Args:
            input_: the text to analyse; must be a ``str``.
            coref_remove_word: removal list used only when the instance was
                built without one. It is not stored on the instance.

        Returns:
            The list of dicts produced by ``dependency_parser``.

        Raises:
            TypeError: if ``input_`` is not a string.
        """
        # Validate first: previously a non-string fell through to an unbound
        # local and surfaced as UnboundLocalError.
        if not isinstance(input_, str):
            raise TypeError("fit is only available for strings.")

        remove_words = self._effective_remove_words(coref_remove_word)
        print('Resolving coreference ..')
        coref_res_test = self.resolve_coreference(input_, remove_words)
        print('Resolving coreference completed ..')
        print('Mining aspect target and opinions ...')
        aspects = self.dependency_parser(coref_res_test)
        print('Aspect target and opinions extracted!! ...')
        return aspects

In [13]:
# Build the extractor once (this loads the models). The word list given here is
# what fit() and get_parsed_output() strip from coreference mentions.
sl = SelfLearnABSA(coref_remove_word=['the'])

Loading models and setting up environment ...
Models loaded and environment setup completed!!


In [14]:
# Example sentences for trying the pipeline. Keeping them in a list (instead of
# commented-out assignments) makes it explicit which one is active and lets a
# later cell loop over all of them.
SAMPLE_TEXTS = [
    "I enjoyed the screen resolution, it is amazing for such a cheap laptop.",
    "But the staff was so horrible to us.",
    "To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.",
    "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.",
    "contracting covid after vaccines are proven but before receiving a shot would be like being a war casualty.",
]

# Sentence used by the cells below; change the index to try another sample.
text = SAMPLE_TEXTS[0]

In [15]:
# Direct call without `remove_words`: nothing is stripped from the mention, so
# "the" stays part of the hyphenated mention in the output below.
sl.resolve_coreference(text)

'I enjoyed the-screen-resolution, the-screen-resolution is amazing for such a cheap laptop.'

In [79]:
# Full pipeline on the active sentence: coreference resolution, then extraction.
# NOTE(review): the saved output below lists the target 'shot', a word that does
# not occur in the sentence currently assigned to `text`; the cell was presumably
# last run with a different sentence. Re-run after Restart & Run All.
asp_pairs = sl.fit(text)
# An empty first field plus sep="\n" prints a blank line before the result.
print("", asp_pairs, sep="\n")

Resolving coreference ..
Resolving coreference completed ..
Mining aspect target and opinions ...
Aspect target and opinions extracted!! ...

[{'target': 'shot', 'opinion': 'receiving', 'polarity': 0.0}]


In [229]:
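# Disabled experiments with alternative dependency parsers, kept for reference.
# The `dependency_parser` and `stanford_dependency_parser` objects used below
# are not defined in the cells shown here, so these lines cannot run as-is.

# AllenNLP dependency parsing:
# dep_parser_pred = dependency_parser.predict(coref_resolved_text)

# Stanford dependency parsing:
# st_dep_parser_pred = stanford_dependency_parser(text)
# st_dep_parser_pred.sentences[0].print_dependencies()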