In [1]:
# Load the model in fairseq
import torch
import numpy as np
from fairseq.models.bart import BARTModel
from fairseq.data.data_utils import collate_tokens

# Fetch BART-large fine-tuned on MNLI through torch.hub (the hub cache is reused when present).
bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli')
bart.eval()  # disable dropout for evaluation
# Use the GPU when one is available; fall back to the CPU instead of raising
# on machines without CUDA. `.to()` returns the module, so the cell still
# displays the model summary.
bart.to('cuda' if torch.cuda.is_available() else 'cpu')

Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_fairseq_master


BARTHubInterface(
  (model): BARTModel(
    (encoder): TransformerEncoder(
      (dropout_module): FairseqDropout()
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): LearnedPositionalEmbedding(1026, 1024, padding_idx=1)
      (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (dropout_module): FairseqDropout()
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout_module): FairseqDropout()
          (activation_dropout_module): FairseqDro

In [2]:
# Encode a pair of sentences and make a prediction
# 0 = contradiction, 1 = neutral, 2 = entailment
batch_of_pairs = [
    ['BART is a seq2seq model.', 'BART is not sequence to sequence.'],
    ['BART is denoising autoencoder.', 'BART is version of autoencoder.'],
]

# Encode every (premise, claim) pair and pad them (pad_idx=1) into a single batch.
batch = collate_tokens(
    [bart.encode(premise, claim) for premise, claim in batch_of_pairs],
    pad_idx=1,
)
# Scores of the 'mnli' head, copied to the host as a NumPy array.
logits = bart.predict('mnli', batch).detach().cpu().numpy()
# Softmax over the last axis: each row becomes a distribution over the 3 classes.
probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
probs

array([[0.9967863 , 0.00117084, 0.00204286],
       [0.0711104 , 0.0326806 , 0.896209  ]], dtype=float32)

In [3]:
# more testing
batch_of_pairs = [
    ['I like Apple', 'Apple is positive'],
    ['I hate Apple.', 'Apple is positive'],
]

# Same pipeline as the previous cell: encode pairs, pad into a batch, score, softmax.
batch = collate_tokens(
    [bart.encode(premise, claim) for premise, claim in batch_of_pairs],
    pad_idx=1,
)
logits = bart.predict('mnli', batch).detach().cpu().numpy()
probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
probs

array([[0.00358193, 0.5513655 , 0.44505256],
       [0.9937598 , 0.00393454, 0.00230566]], dtype=float32)

# Zero Shot Entity Classifier

In [4]:

class ZeroShotEntityClassifier:
    """
    Zero-shot entity classifier built on a BART-large model fine-tuned on MNLI.

    For every (sentence, entity, label) combination a hypothesis string is
    rendered from a template (e.g. "{} is {}" -> "Apple is positive.") and
    scored against the sentence. The model outputs 3 scores per
    (sentence, hypothesis) pair: contradiction, neutral and entailment.

    Scoring modes
    -------------
    multi_label=False (multiclass):
        softmax over the entailment scores of all labels of one entity, so the
        label probabilities of an entity sum to 1.
    multi_label=True:
        for each (entity, label) pair independently, softmax over the three
        model outputs and keep the entailment probability.

    Labels
    ------
    Every method that needs the label set accepts an optional ``labels``
    argument. When it is omitted the notebook-level global ``labels`` is used,
    which is how the methods obtained the label set before the argument existed.

    Note and todo:
    1. Batches are not handled carefully: ``classify`` encodes everything it is
       given as one batch, so callers must chunk large inputs themselves.
    """

    def __init__(self, model, finetune="mnli"):
        """
        :param model: object exposing ``encode(sentence, hypothesis)`` and
                      ``predict(head_name, batch)`` (the BART hub interface in this notebook)
        :param finetune: name of the classification head passed to ``model.predict``
        """
        self.model = model
        # Honour the argument; it used to be overwritten with the literal "mnli".
        self.finetune = finetune

    @staticmethod
    def _resolve_labels(label_names):
        """Return the explicit label list, or the notebook-level ``labels`` global when None."""
        if label_names is None:
            # Backward-compatible fallback; raises NameError if the global is undefined.
            return labels
        return list(label_names)

    @staticmethod
    def get_sentence_pairs(sentences, entities, hypothesis, labels=None):
        """
        Build one [sentence, hypothesis] pair per (sentence, entity, label).

        :param sentences: iterable of sentence strings
        :param entities: iterable of entity lists, aligned with ``sentences``
        :param hypothesis: template with two ``{}`` slots: entity (title-cased) and label
        :param labels: optional label list (defaults to the global ``labels``)
        :return: list of two-element lists; a "." is appended to both strings.
                 Order: sentences, then their entities, then labels.
        """
        label_names = ZeroShotEntityClassifier._resolve_labels(labels)
        return [
            ["{}.".format(sentence), "{}.".format(hypothesis.format(entity.title(), label))]
            for sentence, sentence_entities in zip(sentences, entities)
            for entity in sentence_entities
            for label in label_names
        ]

    def prepare_batch(self, sen_hyp_pairs):
        """Encode each [sentence, hypothesis] pair with the model and pad them into one batch."""
        return collate_tokens(
            [self.model.encode(pair[0], pair[1]) for pair in sen_hyp_pairs], pad_idx=1
        )

    def get_logits(self, batch, total_entities, multi_label, labels=None):
        """
        Score a batch and reshape the scores per entity and label.

        :param batch: batch produced by ``prepare_batch``
        :param total_entities: total number of entities across all sentences
        :param multi_label: see the class docstring
        :param labels: optional label list (defaults to the global ``labels``)
        :return: array of shape (total_entities, n_labels) holding only the last
                 (entailment) score when ``multi_label`` is False, otherwise
                 (total_entities, n_labels, n_model_outputs)
        """
        num_labels = len(self._resolve_labels(labels))
        # contradiction, neutral and entailment scores for each sentence/hypothesis pair.
        # Inference only: no_grad avoids building an autograd graph for every batch.
        with torch.no_grad():
            logits = self.model.predict(self.finetune, batch).detach().cpu().numpy()
        if not multi_label:
            # we only look at entailment scores, where the hypothesis matches the sentence
            entailment_logits = logits[:, -1]
            return entailment_logits.reshape(total_entities, num_labels)
        return logits.reshape(total_entities, num_labels, -1)

    @staticmethod
    def get_softmax(lgts, multi_label=True):
        """
        Softmax over the last axis, rounded to 3 decimals.

        :param lgts: 2-D scores (multiclass) or 3-D scores (multi-label) from ``get_logits``
        :param multi_label: when True, keep only the last (entailment) entry of
                            the softmaxed last axis
        :return: 2-D array of probabilities
        """
        lgts = np.asarray(lgts)
        # Subtract the row maximum before exponentiating: mathematically a no-op
        # for softmax, but it prevents overflow/underflow turning rows into nan.
        shifted = lgts - lgts.max(-1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / exps.sum(-1, keepdims=True)
        if multi_label:
            probs = probs[:, :, -1]  # only entailment for multilabel; multiclass already selected it
        return np.around(probs, decimals=3)

    @staticmethod
    def display_results(sentences, entities, probs, labels=None):
        """
        Print, per sentence and entity, the probability of every label.

        ``probs`` rows must follow the same order in which entities appear in
        ``entities`` (sentence by sentence).
        """
        label_names = ZeroShotEntityClassifier._resolve_labels(labels)
        prob_index = 0
        for sentence, sentence_entities in zip(sentences, entities):
            print("Sentence: {}".format(sentence))
            print()
            for entity in sentence_entities:
                print("Entity: {}".format(entity))
                print(' | '.join("{}:{:.4f}".format(l, p) for l, p in zip(label_names, probs[prob_index])))
                print()
                prob_index += 1

    def classify(self, sentences, entities, hypothesis, multi_label=True, print_result=False, labels=None):
        """
        Classify using zero shot and return label probabilities.

        :param sentences: iterable of sentence strings
        :param entities: iterable of entity lists, aligned with ``sentences``
        :param hypothesis: template with two ``{}`` slots (entity, label)
        :param multi_label: scoring mode, see the class docstring
        :param print_result: also print a readable report
        :param labels: optional label list (defaults to the global ``labels``)
        :return: array of shape (total_entities, n_labels)
        """
        label_names = self._resolve_labels(labels)

        # prepare batch
        sentence_hyp_pairs = self.get_sentence_pairs(sentences, entities, hypothesis, labels=label_names)
        if not sentence_hyp_pairs:
            # Nothing to score; skip the model call, which cannot collate an empty batch.
            return np.empty((0, len(label_names)), dtype=np.float32)
        batch = self.prepare_batch(sentence_hyp_pairs)

        # get entailment scores and turn them into probabilities
        total_entities = sum(len(e) for e in entities)
        logits_reshape = self.get_logits(batch, total_entities, multi_label, labels=label_names)
        probs = self.get_softmax(logits_reshape, multi_label)

        # display results if requested
        if print_result:
            self.display_results(sentences, entities, probs, labels=label_names)

        return probs

# zero shot sentiment classification using entities
labels = ["positive", "neutral", "negative"]
hypothesis = "{} is {}"
sentences = [
    "Apple is a great company but iphone sucks",
    "Microsoft xbox is not better than ps4",
    "I shopped at Target",
]
# One entity list per sentence, in the same order as `sentences`.
entities = [
    ["Apple", "iphone"],
    ["Microsoft", "ps4"],
    ["Target"],
]
ZeroShotEntityClassifier(bart).classify(
    sentences,
    entities,
    hypothesis,
    multi_label=True,
    print_result=False,
)

array([[0.756, 0.016, 0.004],
       [0.001, 0.013, 0.956],
       [0.017, 0.052, 0.307],
       [0.666, 0.147, 0.035],
       [0.049, 0.052, 0.005]], dtype=float32)

# Zero Shot Entity Sentiment Analysis using Entailment scores

### Twitter api and preprocessor

In [5]:

import json
import re
from string import printable, punctuation
from nltk.tokenize.casual import TweetTokenizer
from emoji import UNICODE_EMOJI
from twitter_api import Twitter
from unidecode import unidecode


# Tokens dropped during tweet cleanup; compared against the lower-cased, stripped token
# in TweetPreprocessor.cleanup_non_alpha ("rt" is presumably the retweet marker).
REMOVE_TAGS = ["rt"]
# Pattern for http/https URLs; substituted with a space by TweetPreprocessor.remove_links.
link_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Pattern for a "#tag" located at the start of the text or right after a character that is
# not a letter, digit, "-", "_" or "."; the tag must start with letters and be at least two
# characters long. Substituted with a space by TweetPreprocessor.remove_hashtags.
hashtag_regex = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z]+[A-Za-z0-9]+)'

    
class TweetPreprocessor:
    """Fetch tweets through the `Twitter` API wrapper and clean/tokenize them."""

    def __init__(self):
        self.tw_api = Twitter()
        # strip_handles=False keeps @mentions as tokens.
        self.tknzr = TweetTokenizer(strip_handles=False)

    def tokenize_and_filter(self, tweet):
        """
        Tokenize ``tweet`` and drop every token containing one of REMOVE_TAGS.

        Previously this was a lambda that referenced the nonexistent attributes
        ``self.tnzr`` and ``self.remove_tags`` and therefore failed on any call.

        NOTE(review): the test is a substring match on the lower-cased token, so
        tokens such as "start" are dropped as well; ``cleanup_non_alpha`` uses
        an exact match instead. Confirm which behaviour is intended.
        """
        return [t for t in self.tknzr.tokenize(tweet)
                if not any(r in t.lower() for r in REMOVE_TAGS)]

    @staticmethod
    def return_case_insen_keyword(string, keyword):
        """
        Return ``keyword`` with the casing of its first whole-word occurrence in ``string``.

        The match is case-insensitive. When the keyword does not occur,
        ``keyword.title()`` is returned.
        """
        # Escape the keyword so characters such as "." or "+" are matched literally
        # instead of being interpreted as regex syntax.
        match = re.search(r'\b({})\b'.format(re.escape(keyword)), string, re.IGNORECASE)
        return match.group(1) if match else keyword.title()

    @staticmethod
    def remove_links(text):
        """Replace every URL matched by ``link_regex`` with a single space."""
        return re.sub(link_regex, ' ', text)

    @staticmethod
    def remove_hashtags(text):
        """Replace every hashtag matched by ``hashtag_regex`` with a single space."""
        return re.sub(hashtag_regex, ' ', text)

    @staticmethod
    def remove_non_ascii(text):
        """
        Drop every character that is not in ``string.printable``.

        ``preprocess_tweet`` calls this when ``remove_emoticons=True``; the method
        was missing, so that option raised AttributeError.
        """
        return ''.join(ch for ch in text if ch in printable)

    @staticmethod
    def is_emoji(character):
        """True when ``character`` is a key/member of ``UNICODE_EMOJI``."""
        return character in UNICODE_EMOJI

    @staticmethod
    def is_punct(character):
        """True when ``character`` occurs in ``string.punctuation`` (substring test)."""
        return character in punctuation

    @classmethod
    def cleanup_non_alpha(cls, tokens):
        """
        Filter a token list:

        * drop punctuation tokens (other than quotation marks) that appear before
          the first alphanumeric token,
        * drop a punctuation/emoji token equal to the previously kept-or-seen
          punctuation/emoji token,
        * drop tokens listed in REMOVE_TAGS,
        * drop single-character tokens that are neither alphanumeric,
          punctuation nor emoji.
        """
        prev_char = '329'  # random alphanum string: can never equal a punctuation/emoji token
        first_alpha = False
        new_tokens = []
        for token in tokens:
            is_char_alpha = token.isalnum()
            is_char_punct = cls.is_punct(token)
            is_char_emoji = cls.is_emoji(token)

            if is_char_alpha:
                first_alpha = True

            # remove leading punctuation (other than quotations)
            if (not first_alpha) and (token not in '"\'') and is_char_punct:
                continue

            # remove repeating punctuations or emojis; prev_char is only updated
            # by punctuation/emoji tokens, so words in between do not reset it
            if is_char_emoji or is_char_punct:
                if token == prev_char:
                    continue
                prev_char = token

            # check remove tags
            if token.lower().strip() in REMOVE_TAGS:
                continue

            # single character outside alphanumerics, punctuation and emoji
            if not (is_char_alpha or is_char_punct or is_char_emoji) and len(token) == 1:
                continue

            new_tokens.append(token)
        return new_tokens

    @staticmethod
    def normalize_case(tokens):
        """
        Lower-case all tokens when the tweet is mostly upper/title case.

        Returns [] when there is no alphabetic token. Otherwise, if the number of
        upper- or title-case tokens divided by the number of alphabetic tokens
        exceeds 0.7, every token is lower-cased; else the tokens are returned unchanged.
        """
        num_tokens = sum(1 for t in tokens if t.isalpha())
        if not num_tokens:
            return []
        num_caps = sum(1 for t in tokens if (t.isupper() or t.istitle()))
        return [t.lower() for t in tokens] if num_caps / num_tokens > 0.7 else tokens

    @staticmethod
    def is_headline(tweet):
        """
        Check whether a tweet is a headline, e.g. 'Donald Trump Has Started Trade Wars':
        True when more than 80% of its whitespace-separated words are title case.
        An empty/whitespace-only tweet is not a headline.
        """
        sen_splits = tweet.split()
        if not sen_splits:
            # Avoid ZeroDivisionError on empty input.
            return False
        num_title = len([s for s in sen_splits if s.istitle()])
        return (num_title / len(sen_splits)) > 0.8

    @staticmethod
    def strip_non_alpha_and_lower(tweet):
        """Key for near-duplicate elimination: first 10 purely alphabetic words, lower-cased."""
        return ' '.join([t.lower() for t in tweet.split() if t.isalpha()][:10])

    def preprocess_tweet(self, tweet, min_alpha_token_length=3, remove_emoticons=False, keep_hashtags=True):
        """
        Clean a raw tweet and return it as a space-joined token string.

        Steps:
        1. pass the text through ``unidecode`` and turn newlines into spaces
        2. if ``remove_emoticons``: drop non-printable/non-ASCII characters
        3. if not ``keep_hashtags``: remove hashtags
        4. remove URLs (at-mentions are kept)
        5. tokenize, then apply ``cleanup_non_alpha`` (repeated punctuation/emojis,
           REMOVE_TAGS, leading punctuation)
        6. apply ``normalize_case`` (mostly upper/title case -> all lower case)
        7. return '' when fewer than ``min_alpha_token_length`` alphabetic tokens remain
        8. replace "'s" with " 's" in the joined result
        """
        tweet = unidecode(tweet)
        tweet = tweet.replace("\n", " ")
        if remove_emoticons:
            tweet = self.remove_non_ascii(tweet)
        if not keep_hashtags:
            tweet = self.remove_hashtags(tweet)
        tweet = self.remove_links(tweet)
        tokens = self.tknzr.tokenize(tweet)
        tokens = self.cleanup_non_alpha(tokens)
        tokens = self.normalize_case(tokens)
        num_alpha_tokens = sum(1 for t in tokens if t.isalpha())
        if num_alpha_tokens < min_alpha_token_length:
            return ''
        return ' '.join(tokens).replace("'s", " 's")

    def get_cleaned_tweets(self, search_term, entity_term, count):
        """
        Fetch up to ``count`` tweets for ``search_term`` and clean them.

        :return: (tweets, keywords) where ``tweets`` are the preprocessed,
                 non-empty tweets containing ``search_term`` as a whole
                 whitespace-separated token (case-insensitive), and ``keywords``
                 holds, per tweet, ``entity_term`` cased as it appears in that tweet.
        """
        tweets = self.tw_api.get_tweets(search_term, count)
        # preprocess, then keep tweets where the term is a full token,
        # not a subset of a larger word
        tweets = [self.preprocess_tweet(t) for t in tweets]
        wanted = search_term.lower()  # hoisted: constant for every tweet
        tweets = [t for t in tweets if t and any(wanted == w for w in t.lower().split())]
        # keywords with cases as they appear in tweets
        keywords = [self.return_case_insen_keyword(t, entity_term) for t in tweets]
        return tweets, keywords

In [6]:
def chop_tweet_around_entity(tweet, entity_term, sen_len_offset = 10, max_token_len = 40):
    """
    Return a window of ``tweet`` around ``entity_term`` that fits the token budget.

    The window spans ``sen_len_offset`` words before the entity and the entity
    plus ``sen_len_offset - 1`` words after it. The offset is reduced one step
    at a time until the encoded (window, hypothesis) pair has at most
    ``max_token_len`` tokens.

    Relies on the notebook-level globals ``bart`` (encoder) and ``hypothesis``
    (template with entity and label slots).

    :param tweet: cleaned tweet text
    :param entity_term: entity to centre the window on; must be a
                        whitespace-separated token of the tweet (exact match
                        preferred, case-insensitive match as fallback)
    :param sen_len_offset: initial half-width of the window, in words
    :param max_token_len: maximum encoded length. The default of 40 was chosen
                          to fit the GPU of a G4dn instance with batch size 8
                          during inference.
    :return: the chopped tweet; '' when no window (not even the empty one) fits
    :raises ValueError: when the entity is not a token of the tweet
    """
    tw_splits = tweet.split()
    try:
        en_index = tw_splits.index(entity_term)
    except ValueError:
        # Fall back to a case-insensitive lookup before giving up, and fail
        # with a message that names the offending tweet.
        wanted = entity_term.lower()
        en_index = next((i for i, tok in enumerate(tw_splits) if tok.lower() == wanted), None)
        if en_index is None:
            raise ValueError(
                "entity term {!r} is not a token of the tweet {!r}".format(entity_term, tweet)
            ) from None

    # Iterative shrinking replaces the former recursion, which re-split the
    # tweet on every step and never terminated when even an empty window
    # exceeded the budget.
    for offset in range(sen_len_offset, -1, -1):
        start_index = max(0, en_index - offset)
        end_index = min(len(tw_splits), en_index + offset)
        tw_chopped = ' '.join(tw_splits[start_index:end_index])
        # Encoded length of the (window, hypothesis) pair as the model will see it.
        token_len = collate_tokens([bart.encode(tw_chopped, hypothesis.format(entity_term, "positive"))], pad_idx=1).shape[1]
        if token_len <= max_token_len:
            return tw_chopped
    return ''

In [17]:
from os import path
import pandas as pd
import matplotlib
import glob
from time import sleep

import os  # needed for makedirs below; only `os.path` was imported above

# sleep(15 * 60)
# Uncomment the line above to wait for the API rate-limit window to reset.

tp = TweetPreprocessor()

data_dir = "data"
search_term = "@amazon"
entity_term = "amazon"
count = 50000

# Cached tweets live under data_dir (the "data/" prefix used to be hard-coded
# while data_dir was left unused).
csv_file = path.join(data_dir, "{}_{}.csv".format(search_term.lower(), count))
if path.exists(csv_file):
    df = pd.read_csv(csv_file)
    df = df.dropna(subset=['tweets'])  # remove rows whose tweet text is missing
    tweets, keywords = df['tweets'], df['entities']
else:
    # Create the cache directory before the long-running fetch so the final
    # to_csv cannot fail on a missing directory after the download finished.
    os.makedirs(data_dir, exist_ok=True)
    tweets, keywords = tp.get_cleaned_tweets(search_term, entity_term, count)
    pd.DataFrame({'tweets': tweets, 'entities': keywords}).to_csv(csv_file, index=False)

# replace at mention with the entity term; the search term is escaped so it is
# matched literally rather than interpreted as a regular expression
tweets = [re.sub(re.escape(search_term), entity_term, t, flags=re.IGNORECASE) for t in tweets]
# shorten every tweet to a window around the entity so it fits the token budget
tweets = [chop_tweet_around_entity(t, entity_term, 15) for t in tweets]

500 requests and 2 15 min sleeps required
Max requests per window reached. Sleeping ...
Max requests per window reached. Sleeping ...
Time Taken 1949.4088015556335 Seconds


In [18]:
# Size of the `tweets` list built in the previous cell.
len(tweets)

47778

In [19]:
def chunks(lst1,lst2, n):
    """
    Lazily yield aligned slices of two sequences.

    Each item is a tuple ``(lst1[s:s + n], lst2[s:s + n])`` for
    ``s = 0, n, 2n, ...`` up to ``len(lst1)``; the last slices may be shorter.
    """
    starts = range(0, len(lst1), n)
    yield from ((lst1[s:s + n], lst2[s:s + n]) for s in starts)

In [20]:
from tqdm import tqdm

labels = ["positive","neutral","negative"]
hypothesis = "{} is {}"
probs_all = []
chunk_size = 8
zc = ZeroShotEntityClassifier(bart)
# classify() expects one list of entities per sentence; each tweet has exactly one.
entities = [[e] for e in keywords]

# Ceiling division: a trailing partial chunk is still one iteration. Flooring
# made the progress bar overrun its total by one.
num_chunks = (len(tweets) + chunk_size - 1) // chunk_size
for tweet_chunk, entity_chunk in tqdm(chunks(tweets, entities, chunk_size), total=num_chunks):

    # get classification probs
    probs = zc.classify(tweet_chunk, entity_chunk, hypothesis, multi_label=True, print_result=False)
    probs_all.extend(probs)
    # release cached GPU memory between batches
    torch.cuda.empty_cache()


5973it [30:20,  3.28it/s]                          


In [21]:
# None disables column-width truncation so full tweets are shown. The former
# value -1 is deprecated and caused the warning emitted by this cell.
pd.set_option('display.max_colwidth', None)
# One row per tweet: the three label probabilities, with the tweet text as first column.
sentiment_df = pd.DataFrame(probs_all, columns=["Positive", "Neutral", "Negative"])
sentiment_df.insert(0, "tweets", tweets)
sentiment_df

  """Entry point for launching an IPython kernel.


Unnamed: 0,tweets,Positive,Neutral,Negative
0,It 's Sunday ... let 's turn up the VOLUME @NicoleBrizee amazon amazonlaunchpad,0.164,0.000,0.001
1,Fixed that for you amazon amazonUK amazonHelp with thanks to @SavingDowns1 . Hope this cheers you up @AmazingBrent,0.074,0.001,0.001
2,A friend of @ciarale01 has had this very human response from amazon,0.104,0.001,0.023
3,the art and craft of writing christian fiction : #religion #christianfiction #sponsored amazon,0.057,0.000,0.000
4,Hi amezon win me prize Today best Contest mi notebook @MukeshK41711839 amazonInQuiz @aqt11u amazon,0.286,0.000,0.000
...,...,...,...,...
47773,fox news . so should their advertisers amazon @zappos @audible_com @pillpack @ring @adt @tecovas,0.049,0.001,0.004
47774,"amazon to boost hiring , will host upcoming Career Day . #retail #ecommerce #amazon #housewares",0.548,0.000,0.000
47775,Last week amazon was caught trying to hire former private military contractors to spy on workers and,0.000,0.000,0.336
47776,Last week amazon was caught trying to hire former private military contractors to spy on workers and,0.000,0.000,0.336


In [23]:
# Tweets whose Positive score is more than 100x their Negative score,
# the 50 highest Positive scores first.
(
    sentiment_df
    .loc[sentiment_df["Positive"].div(sentiment_df["Negative"]).gt(100)]
    .sort_values(by='Positive', ascending=False)
    .head(50)
)

Unnamed: 0,tweets,Positive,Neutral,Negative
4336,amazon cheers for that,0.985,0.001,0.0
33737,"The Real Napoleon : The Untold Story by John Tarttelin via amazon "" Excellent read , I can't put this book down "" #paperback #book",0.985,0.001,0.0
4997,"The Real Napoleon : The Untold Story by John Tarttelin via amazon "" Excellent read , I can't put this book down "" #paperback #book",0.985,0.001,0.0
4339,amazon cheers for that,0.985,0.001,0.0
17945,@ciarale01 amazon That 's great news . I'd like to see spreading hatred against disability become,0.983,0.003,0.001
18003,@sallyephillips @ciarale01 amazon This is very positive encouraging from amazon and it needs to be applied to,0.981,0.001,0.0
17984,@sallyephillips @ciarale01 amazon This is very positive encouraging from amazon and it needs to be applied to,0.981,0.001,0.0
17999,@sallyephillips @ciarale01 amazon This is very positive encouraging from amazon and it needs to be applied to,0.981,0.001,0.0
30311,the like should be knocking on my social media . The amazon reviews have been lovely so far #comedy #books #dvd #comedybook,0.981,0.001,0.001
30313,the like should be knocking on my social media . The amazon reviews have been lovely so far #comedy #books #dvd #comedybook,0.981,0.001,0.001


In [None]:
# Tweets with Negative below 0.01 and Positive above 0.2, highest Positive first.
(
    sentiment_df
    .loc[sentiment_df["Negative"].lt(0.01) & sentiment_df["Positive"].gt(0.2)]
    .sort_values(by='Positive', ascending=False)
)

In [None]:
# Histogram of the Negative scores across all tweets.
sentiment_df["Negative"].plot.hist()

# TODO: REMOVE CUSTOM TOKENIZATION OR KEEP IT MINIMAL. USE THE MODEL'S PREPROCESSING LOGIC (collate_tokens?)

In [None]:
# Sanity check: apply the softmax formula used in the cells above to three hand-entered scores.
np.exp([-3.10656931e-02, -3.50330782e+00, -7.62020540e+00]) / np.exp([-3.10656931e-02, -3.50330782e+00, -7.62020540e+00]).sum(-1, keepdims=True)

In [None]:
# Show the current value of `probs` (whatever the most recently executed cell assigned to it).
probs


# Notes:
1. Need a multilabel approach: a separate 0-1 score for each of the positive, negative and neutral labels, obtained by applying softmax across the model outputs for each (sentence, hypothesis) pair.
2. Use the model "neutral" output for sentiment "neutral" label?
3. Multilabel sometimes gives low scores. Softmax is taken across contradiction, neutral and entailment for each label.
4. Experiment with different hypothesis
5. Effect of hashtags and at mentions?