## Solving dependencies


### Git repository, embeddings, NLTK

In [0]:
# Fetch the project sources and make its src/ directory the working directory.
# Presumably this is what lets later cells import project modules such as
# data_utils — confirm against the repository layout.
! git clone https://github.com/josipjukic/Adversarial-NLP.git
% cd /content/Adversarial-NLP/src

# Stage the pre-trained vector files in .vector_cache/ under the working
# directory (GloVe tensor cache and the counter-fitted vectors).
# NOTE(review): assumes Google Drive is already mounted at /content/drive;
# the mount step is not shown in this notebook section.
% mkdir .vector_cache
% cp '/content/drive/My Drive/Master Thesis/glove/glove.6B.100d.txt.pt' .vector_cache/
% cp '/content/drive/My Drive/Master Thesis/glove/counter-fitted-vectors.txt' .vector_cache/

# NLTK resources used further down: the stop-word list, WordNet, and the
# averaged-perceptron POS tagger model.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

## IMDb experiments

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import spacy
from data_utils import load_dataset
from nltk.corpus import stopwords

In [0]:
# --- Experiment configuration -------------------------------------------
SEED = 42
torch.manual_seed(SEED)
LOAD_PATH = '/content/drive/My Drive/Master Thesis/IMDB'
MAX_VOCAB_SIZE = 25_000
EMBEDDINGS_FILE = 'glove.6B.100d'

# --- Data ----------------------------------------------------------------
# load_dataset returns the (train, valid, test) splits together with the
# field objects; only the first two fields (text and label) are used here.
splits, fields = load_dataset(
    LOAD_PATH,
    include_lengths=True,
    lower=False,
    stop_words=None,
    load_raw=False,
    load_id=False,
)
train_data, valid_data, test_data = splits
TEXT, LABEL, _, _ = fields

# --- Vocabularies --------------------------------------------------------
# Both vocabularies are built from the training split only. The text
# vocabulary is capped at MAX_VOCAB_SIZE entries and initialised from the
# vectors named by EMBEDDINGS_FILE; words without a pre-trained vector are
# initialised with torch.Tensor.normal_.
LABEL.build_vocab(train_data)
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors=EMBEDDINGS_FILE,
    unk_init=torch.Tensor.normal_,
)

In [0]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# demo does not crash on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

# encode() is called with its default settings here; the next cell passes
# add_special_tokens=False explicitly when it wants bare word ids.
tokens = tokenizer.encode("hello, my [MASK] is cute")
input_ids = torch.tensor(tokens).unsqueeze(0).to(device)  # Batch size 1

# Pure inference: no autograd graph is needed for inspecting the scores.
with torch.no_grad():
    outputs = model(input_ids, masked_lm_labels=input_ids)

# With labels supplied, the first two outputs are the loss and the
# per-position vocabulary scores.
loss, prediction_scores = outputs[:2]
prediction_scores

In [0]:
# Ids of the candidate fillers; special tokens are left out so the list holds
# only the ids of the words themselves.
tks = tokenizer.encode("cat dog weekend", add_special_tokens=False)

# Read the scores at the [MASK] position itself. The previous hard-coded
# index 7 is the last position of the encoded sentence (expected to be the
# trailing special token), not the masked slot.
mask_position = tokens.index(tokenizer.mask_token_id)
prediction_scores[0, mask_position, tks]

In [0]:
# ! pip install transformers
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import torch

import matplotlib.pyplot as plt
% matplotlib inline


class MaskedLM():
    """Rank and propose word substitutions with a masked language model.

    The target token of a tokenised sentence is replaced by ``[MASK]`` and the
    model's vocabulary scores at that position are used either to order a
    given list of substitutes or to propose the highest-scoring tokens.

    Args:
        LM: A masked language model. It is called with a ``(1, seq_len)``
            tensor of token ids and its first output must hold the
            per-position vocabulary scores. When ``None``, the pre-trained
            ``bert-base-uncased`` model and tokenizer are loaded.
        tokenizer: Tokenizer providing ``convert_tokens_to_ids`` and
            ``convert_ids_to_tokens``. Replaced by the BERT tokenizer when
            ``LM`` is ``None``.
        device: Device on which input tensors (and the default model) live.
    """

    def __init__(self, LM=None, tokenizer=None,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        self.LM = LM
        self.tokenizer = tokenizer
        self.device = device
        if self.LM is None:
            name = 'bert-base-uncased'
            self.tokenizer = BertTokenizer.from_pretrained(name)
            # Place the model on the configured device (not a hard-coded
            # 'cuda') so that it matches the inputs built in this class.
            self.LM = BertForMaskedLM.from_pretrained(name).to(self.device)

    def _masked_scores(self, sentence, target_index):
        """Return the vocabulary scores predicted at the masked target slot.

        Works on a copy of ``sentence`` so the caller's list is never left in
        a modified state, even if the model call raises.
        """
        masked = list(sentence)
        masked[target_index] = '[MASK]'
        mask_in = torch.tensor([self.tokenizer.convert_tokens_to_ids(masked)],
                               device=self.device)
        with torch.no_grad():
            # No labels are passed: only the scores are needed, and without
            # labels they are the first output of the model.
            return self.LM(mask_in)[0][0, target_index]

    def substitution_score(self, sentence, target_index, subs):
        """Order ``subs`` from most to least likely at ``target_index``.

        Args:
            sentence: List of tokens (as produced by the tokenizer).
            target_index: Position of the token to be replaced.
            subs: List of candidate substitute tokens.

        Returns:
            A new list with the elements of ``subs`` sorted by descending
            model score at the masked position.
        """
        if not subs:
            return []
        preds = self._masked_scores(sentence, target_index)
        sub_ids = self.tokenizer.convert_tokens_to_ids(subs)
        order = torch.argsort(preds[sub_ids], descending=True).tolist()
        return [subs[i] for i in order]

    def _get_candidates(self, sentence, target_index, n_substitutes=10):
        """Return the ``n_substitutes`` highest-scoring tokens for the slot.

        Args:
            sentence: List of tokens (as produced by the tokenizer).
            target_index: Position of the token to be replaced.
            n_substitutes: Number of tokens to return (capped at the number
                of scores the model produces).

        Returns:
            List of token strings, best first.
        """
        preds = self._masked_scores(sentence, target_index)
        k = min(n_substitutes, preds.numel())
        _, top_k_index = torch.topk(preds, k)
        return self.tokenizer.convert_ids_to_tokens(top_k_index.tolist())

# Demo: rank a few candidate replacements for the token at position 3.
lex_sub = MaskedLM()
a = 'You are the son of my dad.'
a = lex_sub.tokenizer.tokenize(a)
print(a)
# substitution_score returns a single list: the candidates ordered from the
# most to the least likely at the masked position.
subs = lex_sub.substitution_score(a, 3, ['daughter', 'wife', 'sister', 'brick', 'shoe'])

print(subs)

In [0]:

from collections import defaultdict
import numpy as np 
from numpy.linalg import norm
from gensim.models import KeyedVectors
import string
import re
from abc import ABC, abstractmethod
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.corpus import lin_thesaurus

import nltk
# Corpora/models used by the classes in this cell. 'lin_thesaurus' backs
# nltk.corpus.lin_thesaurus, which LinModel queries; without it that lookup
# fails at first use.
for resource in ('wordnet', 'stopwords', 'averaged_perceptron_tagger',
                 'lin_thesaurus'):
    nltk.download(resource)


class WordNetTagger():
    """POS-tag a tokenised sentence and expose the tags as WordNet POS values.

    Call ``tag(sentence)`` first; ``get_tag(index)`` then returns the WordNet
    POS constant for the token at ``index``, or ``None`` when the tag has no
    entry in ``tag_map``.
    """

    def __init__(self):
        # Keyed by the first two characters of the tag returned by pos_tag;
        # any other prefix maps to None.
        self.tag_map = defaultdict(
            lambda: None,
            {'NN': wordnet.NOUN, 'JJ': wordnet.ADJ,
             'VB': wordnet.VERB, 'RB': wordnet.ADV}
        )
        # Filled by tag(); defined here so the attribute always exists.
        self.tags = []

    def tag(self, sentence):
        """Tag ``sentence`` (a list of tokens) and remember the result."""
        self.tags = pos_tag(sentence)

    def get_tag(self, index):
        """Return the WordNet POS for the token at ``index`` (or ``None``)."""
        # .get() returns None for unknown prefixes without inserting a new
        # key into the defaultdict on every miss.
        return self.tag_map.get(self.tags[index][1][:2])



def load_vocab_embeddings(path, vocab, emb_dim=300):
    """Load word vectors for the words of ``vocab`` from a text file.

    Each line of the file is expected to be a word followed by its vector
    components, separated by single spaces.

    Args:
        path: Path to the vector file.
        vocab: Vocabulary object with a ``stoi`` mapping (word -> index) and
            a length. Index 0 is treated as "not in the vocabulary".
        emb_dim: Dimensionality of the vectors in the file.

    Returns:
        A ``(emb_dim, len(vocab))`` float array whose column ``i`` is the
        vector of the word with index ``i``. Columns of words that do not
        occur in the file stay all-zero.
    """
    emb_mat = np.zeros((emb_dim, len(vocab)))
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            row = line.strip().split(' ')
            word = row[0]
            # .get() keeps a defaultdict-based stoi from growing by one entry
            # for every out-of-vocabulary word in the file.
            i = vocab.stoi.get(word, 0)
            if i == 0:
                continue
            # np.float was only an alias of the builtin float and has been
            # removed from NumPy; float64 is the same dtype.
            emb_mat[:, i] = np.array(row[1:]).astype(np.float64)
    return emb_mat


class LexSubBase(ABC):
    """Abstract base for lexical-substitution models over a vocabulary.

    On construction it records, in ``spec_words``, the vocabulary indices of
    English stop words and punctuation characters that have a positive index
    in ``vocab.stoi``.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        self.spec_words = set()
        # Stop words first, then each punctuation character.
        for group in (stopwords.words('english'), string.punctuation):
            self._add_spec_words(group)

    def _add_spec_words(self, words):
        """Add the indices of ``words`` to ``spec_words``; index <= 0 is skipped."""
        indices = (self.vocab.stoi[token] for token in words)
        self.spec_words.update(index for index in indices if index > 0)

    @abstractmethod
    def get_candidates(self):
        """Produce substitution candidates (implemented by subclasses)."""

    @abstractmethod
    def sort_by_substitutability(self):
        """Order candidates by how well they substitute (implemented by subclasses)."""
    

class LexSub(LexSubBase):
    """Embedding-based lexical substitution.

    Word vectors are loaded from ``vector_path`` (counter-fitted vectors by
    default) and a matrix of pairwise squared Euclidean distances between all
    vocabulary words is precomputed. Candidates are then ranked by distance:
    a lower score means a better substitute.

    Subclasses supply the candidate generator ``_get_candidates``.
    """

    def __init__(self, vocab,
                 vector_path='.vector_cache/counter-fitted-vectors.txt'):
        super().__init__(vocab)
        self.emb_mat = load_vocab_embeddings(vector_path, vocab)

        # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, computed for every pair of
        # columns of emb_mat at once.
        c_ = -2*np.dot(self.emb_mat.T, self.emb_mat)
        a = np.sum(np.square(self.emb_mat), axis=0).reshape((1,-1))
        b = a.T
        self.dist_mat = a+b+c_

    def get_candidates(self, words, n_substitutes=10,
                       n_candidates=10, sentence=None):
        """Return one candidate list per word.

        Args:
            words: Sequence of vocabulary indices, one per token.
            n_substitutes: Maximum number of candidates kept per word.
            n_candidates: Maximum number of raw candidates requested from
                ``_get_candidates`` per word.
            sentence: Optional token list. When truthy, the raw candidates
                are re-ranked with ``sort_by_substitutability`` using the
                other words as context; otherwise they are simply truncated.

        Returns:
            A list with one entry (a sequence of vocabulary indices) per
            element of ``words``.
        """
        cand_list = []
        for i, word in enumerate(words):
            cands = self._get_candidates(
                        target=word,
                        target_index=i,
                        n_candidates=n_candidates,
                        sentence=sentence
                     )
            if sentence:
                cands = self.sort_by_substitutability(cands, word, i, words,
                                                      n_substitutes)
            else:
                cands = cands[:n_substitutes]
            cand_list.append(cands)

        return cand_list

    def sort_by_substitutability(self, cands, target, target_index,
                                 sentence, n_substitutes):
        """Sort ``cands`` by ascending substitutability score and truncate.

        ``sentence`` is the sequence of vocabulary indices of the whole
        sentence; stop words, punctuation and the target itself are excluded
        from the context.
        """
        C = [c for c in sentence if c not in self.spec_words and c != target]
        scores = [self.get_substitutability(target, target_index, cand, C)
                  for cand in cands]
        # Scores are distances, so the smallest comes first.
        sorted_cands = sorted(zip(cands, scores), key = lambda x : x[1])
        return [sub for sub, _ in sorted_cands][:n_substitutes]

    def get_substitutability(self, t, ti, s, C):
        """
        t = target word
        ti = target index
        s = candidate substitution 
        C = list of context words 

        Returns the distance between target and candidate plus the mean
        distance between the candidate and the context words (lower is
        better).
        """
        tscore = self.dist_mat[t][s]
        
        if len(C) == 0:
            cscore = 0
        else:
            # The context term must depend on the candidate s. Measuring the
            # target against the context instead gives the same value for
            # every candidate and therefore never influences the ranking.
            cscores = [self.dist_mat[s][c] for c in C]
            cscore = sum(cscores) / (len(C))

        return tscore + cscore

    @abstractmethod
    def _get_candidates(self, **kwargs):
        """Return raw candidate indices for one target (subclass hook)."""
        pass


class SynonymModel(LexSub):
    """Candidate generator that proposes the nearest rows of ``dist_mat``."""

    def _get_candidates(self, target, n_candidates=10, **kwargs):
        """Return the ``n_candidates`` closest indices to ``target``.

        Index 0 yields an empty list. The closest entry of the ranking is
        skipped — presumably because it is the target itself at distance 0.
        """
        if target == 0:
            return []
        ranking = np.argsort(self.dist_mat[target, :])
        first, last = 1, 1 + n_candidates
        return ranking[first:last]


class WordnetModel(LexSub):
    """Candidate generator backed by WordNet synonyms of the target word."""

    def __init__(self, vocab,
                 vector_path='.vector_cache/counter-fitted-vectors.txt'):
        super().__init__(vocab, vector_path)
        self.tagger = WordNetTagger()

    def get_candidates(self, **kwargs):
        """POS-tag ``kwargs['sentence']`` and delegate to ``LexSub.get_candidates``.

        All arguments must be passed by keyword and ``sentence`` is required.
        """
        self.tagger.tag(kwargs['sentence'])
        return super().get_candidates(**kwargs)
    
    def _get_candidates(self, target, target_index, n_candidates, **kwargs):
        """Return up to ``n_candidates`` vocabulary indices of WordNet synonyms.

        Synonyms identical to the target word or mapping to index 0 are
        dropped. Candidates keep the order in which WordNet lists them, so
        the truncation is deterministic instead of depending on set ordering.
        """
        if target == 0: return []
        tag = self.tagger.get_tag(target_index)
        word = self.vocab.itos[target]
        syns = WordnetModel.wordnet_synonyms(word, tag)
        cands = []
        seen = set()
        for syn in syns:
            if syn == word:
                continue
            syn_id = self.vocab.stoi[syn]
            if syn_id != 0 and syn_id not in seen:
                seen.add(syn_id)
                cands.append(syn_id)
        return cands[:n_candidates]

    @staticmethod
    def wordnet_synonyms(word, pos_tag):
        """Return the lemma names of every synset of ``word`` for ``pos_tag``."""
        synset = wordnet.synsets(word, pos_tag)
        return [lemma.name() for s in synset for lemma in s.lemmas()]


class LinModel(LexSub):
    """Candidate generator backed by the Lin thesaurus corpus from NLTK."""

    def __init__(self, vocab,
                 vector_path='.vector_cache/counter-fitted-vectors.txt'):
        super().__init__(vocab, vector_path)
        self.tagger = WordNetTagger()

    def get_candidates(self, **kwargs):
        """POS-tag ``kwargs['sentence']`` and delegate to ``LexSub.get_candidates``.

        All arguments must be passed by keyword and ``sentence`` is required.
        """
        self.tagger.tag(kwargs['sentence'])
        return super().get_candidates(**kwargs)
    
    def _get_candidates(self, target, target_index, n_candidates, **kwargs):
        """Return up to ``n_candidates`` vocabulary indices of thesaurus synonyms.

        Synonyms identical to the target word or mapping to index 0 are
        dropped; the remaining ones keep the thesaurus order (best first).
        """
        if target == 0: return []
        tag = self.tagger.get_tag(target_index)
        word = self.vocab.itos[target]
        syns = LinModel.lin_synonyms(word, tag)
        cands = []
        for syn in syns:
            if syn != word:
                syn_id = self.vocab.stoi[syn]
                if syn_id != 0:
                    cands.append(syn_id)
        return cands[:n_candidates]

    @staticmethod
    def lin_synonyms(word, pos):
        """Return the synonyms of ``word`` for ``pos``, highest score first.

        ``pos`` is a WordNet POS value as returned by the tagger, or ``None``
        when the tagger has no mapping for the token; in that case there is
        no thesaurus file to consult and an empty list is returned.
        """
        if pos is None:
            return []
        # NOTE(review): builds 'simN.lsp', 'simV.lsp', ... from the POS letter;
        # confirm which of these files the corpus actually provides.
        fileid = 'sim%s.lsp' % pos.upper()
        thes_entry = lin_thesaurus.scored_synonyms(word, fileid=fileid)
        thes_entry = sorted(thes_entry, key = (lambda x : x[1]), reverse = True)
        return [syn for syn, score in thes_entry]

# Build the WordNet-based substitution model over the IMDb text vocabulary.
# Construction loads the counter-fitted vectors and precomputes the full
# pairwise distance matrix over the vocabulary.
sm = WordnetModel(TEXT.vocab)

In [0]:
# Toy sentence: look up each token's vocabulary index, then ask the model for
# substitution candidates per token (the raw tokens are passed along for
# POS tagging).
sent = ['dog', 'is', 'sitting']
words = [TEXT.vocab.stoi[token] for token in sent]
a = sm.get_candidates(
    words=words,
    sentence=sent,
    n_candidates=10,
    n_substitutes=10,
)

In [0]:
# Candidate vocabulary indices for the third token of the toy sentence.
print(a[2])

In [0]:
from nltk import pos_tag
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from collections import defaultdict

# NOTE(review): WordNetTagger is also defined in an earlier cell; running this
# cell rebinds the name to the definition below.
class WordNetTagger():
    """Translate the POS tags of a tagged sentence into WordNet POS values."""

    def __init__(self):
        known_prefixes = {
            'NN': wordnet.NOUN,
            'JJ': wordnet.ADJ,
            'VB': wordnet.VERB,
            'RB': wordnet.ADV,
        }
        # Any prefix that is not listed resolves to None.
        self.tag_map = defaultdict(lambda: None, known_prefixes)

    def tag(self, sentence):
        """Run the POS tagger on ``sentence`` and keep the result."""
        self.tags = pos_tag(sentence)

    def get_tag(self, index):
        """Return the WordNet POS of the token at ``index`` (``None`` if unmapped)."""
        full_tag = self.tags[index][1]
        prefix = full_tag[:2]
        return self.tag_map[prefix]

# Smoke test: tag a toy token list and print the WordNet POS value mapped for
# the token at index 3 ('friend').
s = WordNetTagger()
s.tag(['movie', 'my', 'dear', 'friend', '!', '?', 'mouse', 'is', 'rolling'])
print(s.get_tag(3))

In [0]:
import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet

# Exploration: walk every lemma of every synset of 'animal', printing its
# hypernyms and collecting lemma names and (first) antonym names.
synonyms = []
antonyms = []

for syn in wordnet.synsets('animal'):
    for lemma in syn.lemmas():
        print(lemma.hypernyms())
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# De-duplicated views of what was collected, plus the ADJ_SAT constant.
print(set(synonyms))
print(set(antonyms))
print(wordnet.ADJ_SAT)