In [1]:
import warnings
warnings.filterwarnings('ignore')
from functools import reduce
from collections import defaultdict

import nltk, os, math
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

from datasets import Dataset, DatasetDict

import spacy
nlp = spacy.load('en_core_web_lg')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hgupta35/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# ASTREX Pipeline

In [2]:
def reconstruct_strings(df, col):
    """
    Reconstruct strings to dictionaries when loading csv/xlsx files.
    """
    reconstructed_col = []
    for text in df[col]:
        if text != '[]' and isinstance(text, str):
            text = text.replace('[', '').replace(']', '').replace('{', '').replace('}', '').split(", '")
            req_list = []
            for idx, pair in enumerate(text):
                splitter = ': ' if ': ' in pair else ':'
                if idx%2==0:
                    reconstructed_dict = {} 
                    reconstructed_dict[pair.split(splitter)[0].replace("'", '')] = pair.split(splitter)[1].replace("'", '')
                else:
                    reconstructed_dict[pair.split(splitter)[0].replace("'", '')] = pair.split(splitter)[1].replace("'", '')
                    req_list.append(reconstructed_dict)
        else:
            req_list = text
        reconstructed_col.append(req_list)
    df[col] = reconstructed_col
    return df

def extract_rowwise_aspect_polarity(df, on, key, min_val = None):
    """
    Create duplicate records based on number of aspect term labels in the dataset.
    Extract each aspect term for each row for reviews with muliple aspect term entries. 
    Do same for polarities and create new column for the same.
    """
    try:
        df.iloc[0][on][0][key]
    except:
        df = reconstruct_strings(df, on)

    df['len'] = df[on].apply(lambda x: len(x))
    if min_val is not None:
        df.loc[df['len'] == 0, 'len'] = min_val
    df = df.loc[df.index.repeat(df['len'])]
    df['record_idx'] = df.groupby(df.index).cumcount()
    df['aspect'] = df[[on, 'record_idx']].apply(lambda x : (x[0][x[1]][key], x[0][x[1]]['polarity']) if len(x[0]) != 0 else ('',''), axis=1)
    df['polarity'] = df['aspect'].apply(lambda x: x[-1])
    df['aspect'] = df['aspect'].apply(lambda x: x[0])
    df = df.drop(['len', 'record_idx'], axis=1).reset_index(drop = True)
    return df

def create_data_in_atsc_format(df, on, key, text_col, aspect_col, bos_instruction = '', 
                    delim_instruction = '', eos_instruction = ''):
    """
    Prepare the data in the input format required.
    """
    if df is None:
        return
    df = extract_rowwise_aspect_polarity(df, on=on, key=key, min_val=1)
    df['text'] = df[[text_col, aspect_col]].apply(lambda x: bos_instruction + x[0] + delim_instruction + x[1] + eos_instruction, axis=1)
    df = df.rename(columns = {'polarity': 'labels'})
    return df

## Step 1: Massive Pre Finetuning for Domain Understanding

In [63]:
class MLM:
    def __init__(self, text_input, text_column = 'text', seed = 42, test_size = None, chunk_size=128, model_ckpt='bert-large-uncased'):
        self.seed = 42
        self.test_size = test_size
        self.chunk_size = chunk_size
        self.text_column = text_column

        # Load the BERT-large tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        self.model = AutoModelForMaskedLM.from_pretrained(model_ckpt)

        if isinstance(text_input, list):
            df = pd.DataFrame({text_column:text_input})
            if test_size is not None:
                train, test = train_test_split(df, random_state = self.seed, test_size = self.test_size)
                self.dataset = DatasetDict({'train':Dataset.from_pandas(train), 'test':Dataset.from_pandas(test)})
            else:
                self.dataset = DatasetDict({'train':Dataset.from_pandas(df), 'test':Dataset.from_pandas(df)})
        else:
            self.dataset = text_input


    def tokenize_function(self, examples):
        result = self.tokenizer(examples[self.text_column])
        if self.tokenizer.is_fast:
            result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        return result


    def group_texts(self, examples):
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        total_length = (total_length // self.chunk_size) * self.chunk_size
        result = {
            k: [t[i : i + self.chunk_size] for i in range(0, total_length, self.chunk_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result


    def pre_finetune(self, root_path, mlm_proba = 0.15, batch_size=16, epochs = 8, return_trainer = False):
        remove_cols = self.dataset['train'].column_names
        tokenized_datasets = self.dataset.map(self.tokenize_function, batched=True, remove_columns=remove_cols)
        lm_datasets = tokenized_datasets.map(self.group_texts, batched=True)
        data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm_probability=mlm_proba)

        training_args = TrainingArguments(
            output_dir=root_path,
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            save_strategy='epoch',
            evaluation_strategy='epoch',
            learning_rate=2e-5,
            weight_decay=0.01,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            push_to_hub=False,
            fp16=True,
            logging_strategy='epoch',
            save_total_limit = 2,
            load_best_model_at_end=True 
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=lm_datasets["train"],
            eval_dataset=lm_datasets["test"],
            data_collator=data_collator
        )

        eval_results = self.trainer.evaluate()
        print(f">>> Initial Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
        print('Model training started ...')
        self.trainer.train()
        eval_results = self.trainer.evaluate()
        print(f">>> Final Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

        if return_trainer:
            return self.trainer

In [64]:
df = pd.read_csv("/data/data/hgupta35/causal_lm_kjs/unsup/Restaurants_Test.csv")
df = reconstruct_strings(df, 'aspectTerms')

input_text = df['raw_text'].tolist()
model_ckpt = "microsoft/deberta-v2-xlarge"
root_path = f"/data/data/hgupta35/causal_lm_kjs/unsup/{model_ckpt}"
chunk_size = 128

ft = MLM(text_input=input_text, text_column='raw_text', chunk_size=chunk_size, model_ckpt=model_ckpt)
ft.pre_finetune(root_path, epochs = 20)

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 800/800 [00:00<00:00, 11065.96 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 14852.35 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 9099.04 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 9017.63 examples/s]
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Initial Perplexity: 436334.35
Model training started ...


Epoch,Training Loss,Validation Loss
1,12.4803,11.334242
2,11.0019,10.116465
3,10.3194,9.759091
4,9.6167,9.278866
5,9.0831,8.533404
6,8.3491,8.005946
7,7.9144,7.939423
8,7.8239,7.717659
9,7.6447,7.426126
10,7.4338,7.274204


>>> Final Perplexity: 1139.78


## Step 2: Aspect Terms Extraction using POS tagger

In [4]:
# Define the input sentence
sentence = "The cab ride was amazing but the driver was pathetic."
sentence = "Did not enjoy the new Windows 8 and touchscreen functions."

# Tokenize the sentence and perform POS tagging
tokens = nltk.word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)

# Extract the aspect terms (nouns) from the POS tags
aspect_terms = [word for word, tag in pos_tags if tag.startswith('NN')]

# Print the aspect terms
print(aspect_terms)

['Did', 'Windows', 'functions']


## Step 3: Opinion Mining using Dependency Reweighting using Attention Scores

In [63]:
# def _get_aligned_subwords_embeddings_old(self, text, tokenized_text):
#         """
#         Return word embeddings from hidden states when word piece/byte pair tokenizer is used.
#         This method aligns the subwords into words by averaging the embeddings of subwords together
#         """
#         tokens = self.tokenizer.tokenize(text)
#         token_ids = tokenized_text.input_ids[0].detach().cpu().numpy()
#         new_tokens = []
#         new_token_ids = []
#         new_position_ids = []

#         for i in range(len(tokens)):
#             if tokens[i].startswith('##') and i > 0:
#                 new_tokens[-1] += tokens[i][2:]
#                 if isinstance(new_token_ids[-1], list):
#                     new_token_ids[-1] = new_token_ids[-1] + [token_ids[i]]
#                     new_position_ids[-1] = new_position_ids[-1] + [i]
#                 else:
#                     new_token_ids[-1] = [new_token_ids[-1]] + [token_ids[i]]
#                     new_position_ids[-1] = [new_position_ids[-1]] + [i]
#             else:
#                 new_tokens.append(tokens[i])
#                 new_token_ids.append(token_ids[i])
#                 new_position_ids.append(i)

#         print("TOKENS ORIGINAL: ", tokens)
#         print("TOKENS NEW: ", new_tokens)
#         print("TOKENS IDS NEW: ", new_token_ids)
#         print("TOKENS POS IDS NEW: ", new_position_ids)

#         word_embeddings_aligned_list = []
#         subword_embedding_list = []

#         for pos_ids in new_position_ids:       
#             if isinstance(pos_ids, list):
#                 for sub_word_pos_id in pos_ids:
#                     subword_embedding_list.append(self.token_level_embeddings[sub_word_pos_id])
                
#                 subword_embedding_list = torch.tensor(subword_embedding_list)
#                 subword_embedding_list = subword_embedding_list.mean(dim=0)
#                 word_embeddings_aligned_list.append(subword_embedding_list)
#                 subword_embedding_list = []
#             else:
#                 word_embeddings_aligned_list.append(self.token_level_embeddings[pos_ids])

#         word_embeddings_aligned = torch.tensor(word_embeddings_aligned_list)

#         if self.score_by == 'attentions':
#             # Block to handle the columns of the attention weights
#             word_embeddings_aligned_list = []
#             subword_embedding_list = []

#             for pos_ids in new_position_ids:       
#                 if isinstance(pos_ids, list):
#                     for sub_word_pos_id in pos_ids:
#                         subword_embedding_list.append(word_embeddings_aligned[:, sub_word_pos_id])
                    
#                     subword_embedding_list = [list(t) for t in zip(*subword_embedding_list)]
#                     subword_embedding_list = torch.tensor(subword_embedding_list)
#                     subword_embedding_list = subword_embedding_list.mean(dim=1).view(-1, 1)
#                     word_embeddings_aligned_list.append(subword_embedding_list)
#                     subword_embedding_list = []
#                 else:
#                     word_embeddings_aligned_list.append(word_embeddings_aligned[:, sub_word_pos_id].view(-1, 1))
            
#             word_embeddings_aligned = torch.cat(word_embeddings_aligned_list, dim=1)
        
#         return word_embeddings_aligned, new_tokens

In [9]:
class OpinionWordMiner:
    """
    The class is written for a batch_size 1 scenario
    TODO: Convert to a batch processing scenario 
    """
    def __init__(self, tokenizer, model, gamma = 0.0005, 
                 we_layer_list=[-1], score_by='attention'):
        self.tokenizer = tokenizer
        self.model = model
        self.gamma = gamma
        self.token_level_embeddings = None
        self.score_by = 'attentions' if score_by == 'attention' else 'hidden_states'


    def _get_word_embeddings(self, tokenized_text):
        """
        This function extracts the word/token embeddings from the specified hidden layers.
        """
        tokenized_sentence_output = self.model(**tokenized_text)
        tokenized_sentence_hidden_states = torch.stack(tokenized_sentence_output[self.score_by], dim=0)
        tokenized_sentence_hidden_states = torch.squeeze(tokenized_sentence_hidden_states, dim=1)
        if self.score_by == 'attentions':
            # Extract the lower layer attention heads
            tokenized_sentence_embeddings = tokenized_sentence_hidden_states[2] # (Num Heads, Seq Length, Seq Length)
        else:
            tokenized_sentence_embeddings = tokenized_sentence_hidden_states.permute(1,0,2) # (Num Layers, Seq Length, Emb_Size)


        if self.score_by == 'attentions':
            # Extract specific specific heads for attention based scoring
            self.token_level_embeddings = tokenized_sentence_embeddings.mean(dim=0).detach().numpy()
        else:
            # Extract specific layer outputs for embedding based scoring
            token_vecs_cat = []
            for token in tokenized_sentence_embeddings:
                cat_vec = torch.cat((
                    token[0],
                    token[1],
                    token[2],
                    # token[-1]
                ), dim=0)
                token_vecs_cat.append(cat_vec.detach().numpy())
            self.token_level_embeddings = np.array(token_vecs_cat)
        return self.token_level_embeddings, tokenized_sentence_output
    

    def _get_aligned_subwords_embeddings(self, text):
        """
        Return word embeddings from hidden states when word piece/byte pair tokenizer is used.
        This method aligns the subwords into words by averaging the embeddings of subwords together
        """
        word_embeddings_aligned_list = []
        index_handler_for_cols = []
        tokens = self.tokenizer.tokenize(text)
        new_tokens = word_tokenize(text)

        print("TOKENS ORIGINAL: ", tokens)
        print("TOKENS NEW: ", new_tokens)
        
        for word in new_tokens:
            tokenized_token = self.tokenizer.tokenize(word)
            start_idx = tokens.index(tokenized_token[0])
            end_idx = start_idx + len(tokenized_token)
            word_embeddings = self.token_level_embeddings[start_idx:end_idx]
            # if word == 'car':
            #     print("ll: ", word_embeddings)
            if word_embeddings.shape[0] > 1:
                word_embeddings =np.mean(word_embeddings, axis=0).reshape(1, -1)
                index_handler_for_cols.append([start_idx, end_idx])

            word_embeddings_aligned_list.append(word_embeddings)

        word_embeddings_aligned = np.array(word_embeddings_aligned_list).squeeze(axis=1)

        if self.score_by == 'attentions':
            diff_idx_ = 0
            for start_idx_, end_idx_ in index_handler_for_cols:
                start_idx_-=diff_idx_
                end_idx_-=diff_idx_
                mean_val = np.mean(word_embeddings_aligned[:, start_idx_:end_idx_], axis=1, keepdims=True)
                word_embeddings_aligned[:, [start_idx_]] = mean_val
                word_embeddings_aligned = np.delete(word_embeddings_aligned, np.s_[start_idx_+1:end_idx_], axis=1)
                diff_idx_ = end_idx_ - start_idx_-1

        assert len(new_tokens) == word_embeddings_aligned.shape[0]
        
        if self.score_by == 'attentions':
            assert len(new_tokens) == word_embeddings_aligned.shape[1]

        return torch.tensor(word_embeddings_aligned), new_tokens
    

    def _filter_candidates(self, dataframe, shift_index_filter, pos_filter):
        """
        Method to create rules for extracting compound phrases.
        Arguments:
        dataframe: This argument is the dataframe with the POS and dependency information.
        index_filters: This argument handles the sequential indexes to check for the pos tags.
        pos_filters: This argument handles the pos tags at each index in the dependency dataframe.
        """
        filter_ = [(idx_, pos_) for idx_, pos_ in zip(shift_index_filter, pos_filter)]
        filter_condition = reduce(lambda x, y: x & y, [dataframe['pos'].shift(pos_) == val_ for pos_, val_ in filter_])
        
        compound_phrase_idx = []
        comp_phrase_record = []
        for idx in dataframe[filter_condition].index:
            if (idx + 1) < len(dataframe):
                comp_phrase_record = [
                    ' '.join([dataframe.loc[idx-idx_val]['opinion_word'] for idx_val in shift_index_filter]),
                    dataframe.loc[idx]['attention_score'],
                    '-'.join([dataframe.loc[idx-idx_val]['pos'] for idx_val in shift_index_filter]),
                    dataframe.loc[idx]['dep']
                    ]
                for idx_val in shift_index_filter:
                    compound_phrase_idx.append(idx-idx_val)
                dataframe.loc[len(dataframe)] = comp_phrase_record
        dataframe.drop(index=compound_phrase_idx, inplace=True)
        return dataframe

    
    def mine_opinion_words(self, text, aspect_word, display_df = False):
        # Tokenize the text
        tokenized_text = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')

        # Query the word embeddings/attention_weights for each token
        word_embeddings, temp_ = self._get_word_embeddings(tokenized_text)

        # Align the word embeddings if the tokenizer splits words into sub words
        word_embeddings, aligned_tokens = self._get_aligned_subwords_embeddings(text)

        # Extract POS tags and Dependency tags
        spacy_tokens_pos_tags = [token.pos_ for token in nlp(text)]
        spacy_tokens_deps = [token.dep_ for token in nlp(text)]

        try:
            # Set default attention score for the aspect word
            aspect_word_score = [0 for i in range(len(aligned_tokens))]

            if self.score_by != 'attentions':
                # Only use RBF kernel similarity if it is embedding based method
                self_attention_matrix = rbf_kernel(word_embeddings, word_embeddings, self.gamma)
            else:
                # RBF Kernel is not required since attention weights already show where the Query is attending to other Key vectors
                self_attention_matrix = word_embeddings

            if ' ' in aspect_word:
                drop_records = []
                aspect_words = aspect_word.split()
                combined_aspect_word_embeddings = []

                for aspect_word in aspect_words:
                    aspect_word_idx = aligned_tokens.index(aspect_word)
                    drop_records.append(aspect_word_idx)
                    combined_aspect_word_embeddings.append(self_attention_matrix[aspect_word_idx])
                aspect_word_score = np.mean(combined_aspect_word_embeddings, axis=0)
            else:
                aspect_word_idx = aligned_tokens.index(aspect_word)
                drop_records = [aspect_word_idx]
                aspect_word_score = self_attention_matrix[:, aspect_word_idx]

            dep_df = pd.DataFrame(aspect_word_score, columns = ['attention_score'], index=aligned_tokens)
            dep_df['pos'] = spacy_tokens_pos_tags
            dep_df['dep'] = spacy_tokens_deps            
            dep_df = dep_df.reset_index().rename(columns = {'index':'opinion_word'})
            dep_df.drop(index=drop_records, inplace=True) # Remove aspect word scores (since it will be highest)

            # Reset index before applying rule based filtering
            dep_df.reset_index(drop=True, inplace=True)
            display(dep_df)

            """
            TODO: Add option to generalize the rules 
            """            
    
            # RULE 1: COMPOUND PHRASE EXTRACTION -> Compound Noun (Adjective + Noun) [AMOD]
            dep_df = self._filter_candidates(dataframe=dep_df, 
                                             shift_index_filter=[0, -1], 
                                             pos_filter=['ADJ', 'NOUN'])

            # RULE 2: COMPOUND PHRASE EXTRACTION -> Compound Noun (Adjective + Noun + Noun) [AMOD~COMPOUND]
            dep_df = self._filter_candidates(dataframe=dep_df, 
                                             shift_index_filter=[0, -1, -2], 
                                             pos_filter=['ADJ', 'NOUN', 'NOUN'])

            # RULE 3: COMPOUND PHRASE EXTRACTION -> Adverbial Phrase (Adverb + Adjective)
            dep_df = self._filter_candidates(dataframe=dep_df, 
                                             shift_index_filter=[0, -1], 
                                             pos_filter=['ADV', 'ADJ'])
        
            # RULE 4: COMPOUND PHRASE EXTRACTION -> Adverbial Phrase (AdP + NOUN)
            dep_df = self._filter_candidates(dataframe=dep_df, 
                                             shift_index_filter=[0, -1], 
                                             pos_filter=['ADP', 'NOUN'])

            if display_df:
                display(dep_df)

            dep_df.reset_index(drop=True, inplace=True)

            # Final filters
            dep_df = dep_df[(dep_df['pos'] == 'ADJ') | \
                            (dep_df['pos'] == 'ADJ-NOUN') |\
                            (dep_df['pos'] == 'ADJ-NOUN-NOUN') |\
                            (dep_df['pos'] == 'ADV-ADJ') |\
                            (dep_df['pos'] == 'ADP-NOUN')
                            ]

            if display_df:
                display(dep_df)
            if dep_df.shape[0] == 0:
                return ''

            # Candidate Reweighting
            opinion_word = dep_df.sort_values(by = 'attention_score', ascending = False).head(1)['opinion_word'].values[0]
            return opinion_word, None
        except:
            return 'NoOptinionTerm', None

In [14]:
owm = OpinionWordMiner(tokenizer, model, gamma=0.003, score_by='attention')

text = "The cabdriver was amazing, but the car was in pathetic condition"
aspect_term = "car"

text = "I had chocolate for lunch. It was really delicious. But the ambience was decent."
aspect_term = "chocolate"
# aspect_term = "ambience"

text = "I charge it at night and skip taking the cord with me because of the good batterylife."
aspect_term = "batterylife"
xy, xx = owm.mine_opinion_words(text, aspect_term, display_df=True)
print(text)
print(aspect_term, " --> ", xy)

TOKENS ORIGINAL:  ['i', 'charge', 'it', 'at', 'night', 'and', 'skip', 'taking', 'the', 'cord', 'with', 'me', 'because', 'of', 'the', 'good', 'battery', '##life', '.']
TOKENS NEW:  ['I', 'charge', 'it', 'at', 'night', 'and', 'skip', 'taking', 'the', 'cord', 'with', 'me', 'because', 'of', 'the', 'good', 'batterylife', '.']


Unnamed: 0,opinion_word,attention_score,pos,dep
0,I,0.015123,PRON,nsubj
1,charge,0.013026,VERB,ROOT
2,it,0.010823,PRON,dobj
3,at,0.008657,ADP,prep
4,night,0.013295,NOUN,pobj
5,and,0.007944,CCONJ,cc
6,skip,0.009113,VERB,conj
7,taking,0.014255,VERB,xcomp
8,the,0.017507,DET,det
9,cord,0.017336,NOUN,dobj


Unnamed: 0,opinion_word,attention_score,pos,dep
0,I,0.015123,PRON,nsubj
1,charge,0.013026,VERB,ROOT
2,it,0.010823,PRON,dobj
5,and,0.007944,CCONJ,cc
6,skip,0.009113,VERB,conj
7,taking,0.014255,VERB,xcomp
8,the,0.017507,DET,det
9,cord,0.017336,NOUN,dobj
10,with,0.006101,ADP,prep
11,me,0.010881,PRON,pobj


Unnamed: 0,opinion_word,attention_score,pos,dep
13,good,0.014061,ADJ,amod
15,at night,0.008657,ADP-NOUN,prep


I charge it at night and skip taking the cord with me because of the good batterylife.
batterylife  -->  good


In [12]:
spacy.explain("prep")

'prepositional modifier'

In [5]:
model_ckpt = 'bert-large-uncased'
local_model = '/data/data/hgupta35/nps/unsup/bert-large-uncased-local/checkpoint-16'

# model_ckpt = 'microsoft/deberta-v2-xlarge'
# local_model = '/data/data/hgupta35/causal_lm_kjs/unsup/deberta-v2-xlarge/checkpoint-40'

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForMaskedLM.from_pretrained(local_model, output_hidden_states=True, output_attentions=True)

In [399]:
# df = pd.read_csv("/data/data/hgupta35/causal_lm_kjs/unsup/Restaurants_Test.csv")
# df = create_data_in_atsc_format(df, 'aspectTerms', 'term', 'raw_text', 'aspect')

# idx = 0
# print(df.iloc[idx]['text'])
# print(df.iloc[idx]['aspect'])

In [79]:
# sub_word_embedding = []
# for token, token_id in zip(tokenizer.tokenize("The techpota is great"), tokenizer.encode_plus("The techpota is great")['input_ids'][1:-1]):
#     print(token, token_id)
#     if "#" in token:

## Step 4: Polarity Matching using Semantic Similarity

## Testing

In [None]:
# # Testing on text inputs
# text = df.loc[14, 'text']
# # text = "The cab ride was amazing but the driver was pathetic."
# # text = "Delhi palace has probably the best service and their chicken is mindblow."

# text = resolve_coreference(text)
# mark_text = "[CLS] " + text + " [SEP]"
# print('Text: ', mark_text)

# # Get word embeddings from BERT domain specific model
# wts, _ = get_word_embeddings(mark_text)

# # Algining word embeddings for subwords by averaging
# word_emb = align_subwords_embeddings(mark_text, tokenizer, wts)

# idx2word_map = defaultdict(list)
# for token, subword_idx in zip(_, tokenizer(mark_text).word_ids()[1:-1]):
#     idx2word_map[subword_idx].append(token.replace('#', ''))

# for key, val in idx2word_map.items():
#     idx2word_map[key] = ''.join(val)

# word2idx_map = {val:key for key, val in idx2word_map.items()}  
    
# we_tokens_pos_tags, we_tokens_deps = [], []
# spacy_tokens_pos_tags = {token.text.lower():token.pos_ for token in nlp(text)}
# spacy_tokens_deps = {token.text.lower():token.dep_ for token in nlp(text)}

# for we_tokens in list(idx2word_map.values()):
#     if we_tokens in spacy_tokens_pos_tags:
#         we_tokens_pos_tags.append(spacy_tokens_pos_tags[we_tokens])
#         we_tokens_deps.append(spacy_tokens_deps[we_tokens])
#     else:
#         we_tokens_pos_tags.append(None)
#         we_tokens_deps.append(None)

# print(word2idx_map)
# word_i = 8
# dep_df = pd.DataFrame(rbf_kernel(word_emb, word_emb, gamma)[word_i], columns = ['dep'], 
#                       index = idx2word_map.values())
# dep_df['pos'] = we_tokens_pos_tags
# dep_df['deps'] = we_tokens_deps
# dep_df[dep_df['pos'] == 'ADJ'].sort_values(by = 'dep', ascending = False)

In [None]:
# Testing on text inputs
# text, asp_word = df.loc[8, 'text'], 'bagels'
# text, asp_word = "The cab ride was amazing but the driver was pathetic.", 'ride'
# text, asp_word = "The cab ride was amazing but the driver was pathetic.", 'driver'
# text, asp_word = "DelhiPalace has probably the best service and their chicken is yummy.", 'chicken'
# text, asp_word = "DP has probably the best service but their chicken is yummy.", 'service'
# text, asp_word = "I goto Olive Garden for their beautiful ambience and calming music.", 'music'
# text, asp_word = "I goto Olive Garden for their beautiful ambience and calming music.", 'ambience'
# text, asp_word = "Cafe Bistro had very bad waitress, but I got there just for their amazing coffee.", 'waitress'
# text, asp_word = "Cafe Bistro had very bad waitress, but I got there just for their amazing coffee.", 'coffee'
# text, asp_word = "The food joint had some issues with their payments, but ambience was topnotch.", 'payments'
# text, asp_word = "The food joint had some issues with their payments, but ambience was topnotch.", 'ambience'
# text, asp_word = "I went to watch a movie. The actor was not good.", 'actor'

# text = resolve_coreference(text)
# print(text, asp_word)

# owm.mine_opinion_words(text, asp_word)

In [2]:
import requests

In [8]:
def get_api_reponse(payload, model, API_KEY):
    API_URL = f"https://api-inference.huggingface.co/models/{model}"
    headers_ = {"Authorization": f"Bearer {API_KEY}"}
    print(API_URL)
    print(headers_)
    try:
        response = requests.post(API_URL, headers=headers_, json=payload)
        return response.json()
    except Exception as e:
        return "Error: " + e

In [10]:
input_ = {"inputs": "My name is Sarah Jessica Parker but you can call me Jessica", 
          "options":{"wait_for_model":True,
                    "use_cache":True}
         }
MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english"

response = get_api_reponse(payload=input_, model=MODEL, API_KEY=API_TOKEN)
print(response)

https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english
{'Authorization': 'Bearer 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'}
[{'entity_group': 'PER', 'score': 0.9991335868835449, 'word': 'Sarah Jessica Parker', 'start': 11, 'end': 31}, {'entity_group': 'PER', 'score': 0.9979913234710693, 'word': 'Jessica', 'start': 52, 'end': 59}]
